From 1dffec7c394d99e19caf67c30b0a43fb5259ed9f Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Sat, 9 Aug 2025 01:47:27 +0000
Subject: [PATCH 1/6] Add accuracy evaluation for AutoDeploy

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 tensorrt_llm/_torch/auto_deploy/llm_args.py   | 16 ++++-
 tensorrt_llm/bench/benchmark/throughput.py    |  1 -
 .../defs/accuracy/accuracy_core.py            |  3 +-
 .../defs/accuracy/test_llm_api_autodeploy.py  | 60 +++++++++++++++++++
 4 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 tests/integration/defs/accuracy/test_llm_api_autodeploy.py

diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py
index 61337ae3f42..812dfea29cd 100644
--- a/tensorrt_llm/_torch/auto_deploy/llm_args.py
+++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -3,9 +3,11 @@
 from typing import Any, Dict, List, Literal, Optional, Type, Union
 
 import torch
-from pydantic import Field, ValidationInfo, field_validator, model_validator
+from pydantic import Field, PrivateAttr, ValidationInfo, field_validator, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
+from tensorrt_llm.models.modeling_utils import QuantConfig
+
 from ...llmapi.llm_args import BaseLlmArgs, BuildConfig, _ParallelConfig
 from ...llmapi.utils import get_type_repr
 from .models import ModelFactory, ModelFactoryRegistry
@@ -259,6 +261,18 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
     )
     garbage_collection_gen0_threshold: int = Field(default=20000, description="See TorchLlmArgs.")
 
+    _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
+
+    @property
+    def quant_config(self) -> QuantConfig:
+        if self._quant_config is None:
+            self._quant_config = QuantConfig()
+        return self._quant_config
+
+    @quant_config.setter
+    def quant_config(self, value: QuantConfig):
+        self._quant_config = value
+
     ### VALIDATION #################################################################################
     @field_validator("build_config", mode="before")
     @classmethod
diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py
index 2184919465c..3976e732f46 100755
--- a/tensorrt_llm/bench/benchmark/throughput.py
+++ b/tensorrt_llm/bench/benchmark/throughput.py
@@ -450,7 +450,6 @@ def ignore_trt_only_args(kwargs: dict):
     elif runtime_config.backend == "_autodeploy":
         ignore_trt_only_args(kwargs)
         kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)
-        llm = AutoDeployLLM(**kwargs)
     else:
         llm = LLM(**kwargs)
 
diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py
index 8f7f389b1d0..35234e42ef6 100644
--- a/tests/integration/defs/accuracy/accuracy_core.py
+++ b/tests/integration/defs/accuracy/accuracy_core.py
@@ -25,6 +25,7 @@
 import tensorrt_llm.evaluate
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
 from tensorrt_llm.builder import BuildConfig
 from tensorrt_llm.llmapi import SamplingParams
 from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
@@ -144,7 +145,7 @@ def get_num_samples_and_threshold(self, **acc_specs):
         return num_samples, threshold
 
     def evaluate(self,
-                 llm: Union[LLM, PyTorchLLM],
+                 llm: Union[LLM, PyTorchLLM, AutoDeployLLM],
                  extra_acc_spec: Optional[str] = None,
                  extra_evaluator_kwargs: Optional[dict] = None,
                  sampling_params: Optional[SamplingParams] = None,
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
new file mode 100644
index 00000000000..806f9065c1d
--- /dev/null
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from tensorrt_llm import LLM
+from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+from tensorrt_llm.quantization import QuantAlgo
+
+from ..conftest import llm_models_root, skip_pre_blackwell
+from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness
+
+
+class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "meta-llama/Llama-3.1-8B"
+    MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B"
+
+    @pytest.mark.skip_less_device_memory(32000)
+    def test_auto_dtype(self):
+        with AutoDeployLLM(self.MODEL_PATH) as llm:
+            # task = CnnDailymail(self.MODEL_NAME)
+            # task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    def test_nvfp4(self):
+        model_path = f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B"
+        with LLM(model_path) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.parametrize("stream_interval", [4, 64],
+                             ids=["stream_interval_4", "stream_interval_64"])
+    def test_nvfp4_streaming(self, stream_interval):
+        # When stream_interval < TLLM_STREAM_INTERVAL_THRESHOLD, hf incremental detokenization is used.
+        # When stream_interval >= TLLM_STREAM_INTERVAL_THRESHOLD, trtllm implemented incremental detokenization is used.
+        # The behavior is due to perf considerations, while both paths need to be tested.
+ with LLM(f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B", + stream_interval=stream_interval) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + assert llm.args.stream_interval == stream_interval + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm, streaming=True) From 6ce6b3f0a8fd1ab1bd19298ca3eb1475ada0b3e3 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 12 Aug 2025 22:05:16 +0000 Subject: [PATCH 2/6] Update llm args and sampling params Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_autodeploy.py | 65 +++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 806f9065c1d..2d573022a52 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -17,7 +17,10 @@ from tensorrt_llm import LLM from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM +from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy, + ContextChunkingPolicy) from tensorrt_llm.quantization import QuantAlgo +from tensorrt_llm.sampling_params import SamplingParams from ..conftest import llm_models_root, skip_pre_blackwell from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness @@ -27,13 +30,67 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B" + def get_default_kwargs(self): + return { + 'skip_tokenizer_init': False, + 'trust_remote_code': True, + 'kv_cache_config': { + 'enable_block_reuse': False, + 'max_tokens': None, + 'max_attention_window': None, + 'sink_token_length': None, + 'free_gpu_memory_fraction': 0.9, + 'host_cache_size': None, + 'onboard_blocks': True, + 'cross_kv_cache_fraction': None, + 'secondary_offload_min_priority': None, + 'event_buffer_max_size': 0, + 'attention_dp_events_gather_period_ms': 5, + 'enable_partial_reuse': True, + 'copy_on_partial_reuse': True, + 'use_uvm': False, + 'dtype': 'auto' + }, + 'enable_chunked_prefill': True, + 'scheduler_config': { + 'capacity_scheduler_policy': + CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, + 'context_chunking_policy': + ContextChunkingPolicy.FIRST_COME_FIRST_SERVED, + 'dynamic_batch_config': { + 'enable_batch_size_tuning': True, + 'enable_max_num_tokens_tuning': False, + 'dynamic_batch_moving_average_window': 128 + } + }, + 'max_batch_size': 512, + 'max_seq_len': 256, + 'max_num_tokens': 3840, + 'skip_loading_weights': False, + 'compile_backend': 'torch-opt', + 'free_mem_ratio': 0.7, + 'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256] + } + + def get_default_sampling_params(self): + eos_id = -1 + beam_width = 1 + return SamplingParams(end_id=eos_id, + pad_id=eos_id, + n=beam_width, + use_beam_search=beam_width > 1) + @pytest.mark.skip_less_device_memory(32000) def test_auto_dtype(self): - with AutoDeployLLM(self.MODEL_PATH) as llm: - # task = CnnDailymail(self.MODEL_NAME) - # task.evaluate(llm) - task = MMLU(self.MODEL_NAME) + kwargs = self.get_default_kwargs() + sampling_params = self.get_default_sampling_params() + with AutoDeployLLM(model=self.MODEL_PATH, + tokenizer=self.MODEL_PATH, + **kwargs) as llm: + task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm, 
+                          sampling_params=sampling_params)
 
     @skip_pre_blackwell
     def test_nvfp4(self):

From 21787a1e8be17ce114f09c2203031456ef0d5ad3 Mon Sep 17 00:00:00 2001
From: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com>
Date: Thu, 14 Aug 2025 00:52:10 -0700
Subject: [PATCH 3/6] fix test

Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_autodeploy.py  | 33 +++----------
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 2d573022a52..f466f5cc457 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -36,36 +36,13 @@ def get_default_kwargs(self):
             'trust_remote_code': True,
             'kv_cache_config': {
                 'enable_block_reuse': False,
-                'max_tokens': None,
-                'max_attention_window': None,
-                'sink_token_length': None,
-                'free_gpu_memory_fraction': 0.9,
-                'host_cache_size': None,
-                'onboard_blocks': True,
-                'cross_kv_cache_fraction': None,
-                'secondary_offload_min_priority': None,
-                'event_buffer_max_size': 0,
-                'attention_dp_events_gather_period_ms': 5,
-                'enable_partial_reuse': True,
-                'copy_on_partial_reuse': True,
-                'use_uvm': False,
-                'dtype': 'auto'
-            },
-            'enable_chunked_prefill': True,
-            'scheduler_config': {
-                'capacity_scheduler_policy':
-                CapacitySchedulerPolicy.GUARANTEED_NO_EVICT,
-                'context_chunking_policy':
-                ContextChunkingPolicy.FIRST_COME_FIRST_SERVED,
-                'dynamic_batch_config': {
-                    'enable_batch_size_tuning': True,
-                    'enable_max_num_tokens_tuning': False,
-                    'dynamic_batch_moving_average_window': 128
-                }
             },
             'max_batch_size': 512,
-            'max_seq_len': 256,
-            'max_num_tokens': 3840,
+            # 131072 is the max seq len for the model
+            'max_seq_len': 8192,
+            # max num tokens is derived in the build_config, which is not used by AutoDeploy llmargs. 
+            # Set it explicitly here to 8192 which is the default in build_config.
+            'max_num_tokens': 8192,
             'skip_loading_weights': False,
             'compile_backend': 'torch-opt',
             'free_mem_ratio': 0.7,

From a3d46b2cb885a59e7c7771d8f1c6d8ca2f3c0d4a Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Thu, 14 Aug 2025 21:51:47 +0000
Subject: [PATCH 4/6] Remove unused import

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 tensorrt_llm/bench/benchmark/throughput.py                 | 1 +
 tests/integration/defs/accuracy/test_llm_api_autodeploy.py | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py
index 3976e732f46..2184919465c 100755
--- a/tensorrt_llm/bench/benchmark/throughput.py
+++ b/tensorrt_llm/bench/benchmark/throughput.py
@@ -450,6 +450,7 @@ def ignore_trt_only_args(kwargs: dict):
     elif runtime_config.backend == "_autodeploy":
         ignore_trt_only_args(kwargs)
         kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)
+        llm = AutoDeployLLM(**kwargs)
     else:
         llm = LLM(**kwargs)
 
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index f466f5cc457..70022aae33d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -17,8 +17,6 @@
 
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
-from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy,
-                                          ContextChunkingPolicy)
 from tensorrt_llm.quantization import QuantAlgo
 from tensorrt_llm.sampling_params import SamplingParams
 
@@ -40,7 +38,7 @@ def get_default_kwargs(self):
             'max_batch_size': 512,
             # 131072 is the max seq len for the model
             'max_seq_len': 8192,
-            # max num tokens is derived in the build_config, which is not used by AutoDeploy llmargs. 
+            # max num tokens is derived in the build_config, which is not used by AutoDeploy llmargs.
             # Set it explicitly here to 8192 which is the default in build_config.
             'max_num_tokens': 8192,
             'skip_loading_weights': False,

From edc863557815d91830c9b12253600266a3e4ce48 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Thu, 14 Aug 2025 22:10:22 +0000
Subject: [PATCH 5/6] Remove nvfp4 streaming test

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_autodeploy.py  | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 70022aae33d..72dd38f65be 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -76,17 +76,3 @@ def test_nvfp4(self):
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-
-    @skip_pre_blackwell
-    @pytest.mark.parametrize("stream_interval", [4, 64],
-                             ids=["stream_interval_4", "stream_interval_64"])
-    def test_nvfp4_streaming(self, stream_interval):
-        # When stream_interval < TLLM_STREAM_INTERVAL_THRESHOLD, hf incremental detokenization is used.
-        # When stream_interval >= TLLM_STREAM_INTERVAL_THRESHOLD, trtllm implemented incremental detokenization is used.
-        # The behavior is due to perf considerations, while both paths need to be tested.
- with LLM(f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B", - stream_interval=stream_interval) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - assert llm.args.stream_interval == stream_interval - task = CnnDailymail(self.MODEL_NAME) - task.evaluate(llm, streaming=True) From a226efb52fa354af4df63369be93f8367e80352f Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Thu, 14 Aug 2025 23:57:33 +0000 Subject: [PATCH 6/6] Remove nvfp4 test Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_autodeploy.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 72dd38f65be..da64969337e 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -15,12 +15,10 @@ import pytest -from tensorrt_llm import LLM from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM -from tensorrt_llm.quantization import QuantAlgo from tensorrt_llm.sampling_params import SamplingParams -from ..conftest import llm_models_root, skip_pre_blackwell +from ..conftest import llm_models_root from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness @@ -66,13 +64,3 @@ def test_auto_dtype(self): task.evaluate(llm) task = MMLU(self.MODEL_NAME) task.evaluate(llm, sampling_params=sampling_params) - - @skip_pre_blackwell - def test_nvfp4(self): - model_path = f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B" - with LLM(model_path) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - task = CnnDailymail(self.MODEL_NAME) - task.evaluate(llm) - task = MMLU(self.MODEL_NAME) - task.evaluate(llm)