From 1dffec7c394d99e19caf67c30b0a43fb5259ed9f Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Sat, 9 Aug 2025 01:47:27 +0000
Subject: [PATCH 1/6] Add accuracy evaluation for AutoDeploy

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 tensorrt_llm/_torch/auto_deploy/llm_args.py   | 16 ++++-
 tensorrt_llm/bench/benchmark/throughput.py    |  1 -
 .../defs/accuracy/accuracy_core.py            |  3 +-
 .../defs/accuracy/test_llm_api_autodeploy.py  | 60 +++++++++++++++++++
 4 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 tests/integration/defs/accuracy/test_llm_api_autodeploy.py

diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py
index 61337ae3f42..812dfea29cd 100644
--- a/tensorrt_llm/_torch/auto_deploy/llm_args.py
+++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -3,9 +3,11 @@
 from typing import Any, Dict, List, Literal, Optional, Type, Union
 
 import torch
-from pydantic import Field, ValidationInfo, field_validator, model_validator
+from pydantic import Field, PrivateAttr, ValidationInfo, field_validator, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
+from tensorrt_llm.models.modeling_utils import QuantConfig
+
 from ...llmapi.llm_args import BaseLlmArgs, BuildConfig, _ParallelConfig
 from ...llmapi.utils import get_type_repr
 from .models import ModelFactory, ModelFactoryRegistry
@@ -259,6 +261,18 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
     )
     garbage_collection_gen0_threshold: int = Field(default=20000, description="See TorchLlmArgs.")
 
+    _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
+
+    @property
+    def quant_config(self) -> QuantConfig:
+        if self._quant_config is None:
+            self._quant_config = QuantConfig()
+        return self._quant_config
+
+    @quant_config.setter
+    def quant_config(self, value: QuantConfig):
+        self._quant_config = value
+
     ### VALIDATION #################################################################################
     @field_validator("build_config", mode="before")
     @classmethod
diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py
index 2184919465c..3976e732f46 100755
--- a/tensorrt_llm/bench/benchmark/throughput.py
+++ b/tensorrt_llm/bench/benchmark/throughput.py
@@ -450,7 +450,6 @@ def ignore_trt_only_args(kwargs: dict):
     elif runtime_config.backend == "_autodeploy":
         ignore_trt_only_args(kwargs)
         kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)
-        llm = AutoDeployLLM(**kwargs)
     else:
         llm = LLM(**kwargs)
 
diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py
index 8f7f389b1d0..35234e42ef6 100644
--- a/tests/integration/defs/accuracy/accuracy_core.py
+++ b/tests/integration/defs/accuracy/accuracy_core.py
@@ -25,6 +25,7 @@
 import tensorrt_llm.evaluate
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
 from tensorrt_llm.builder import BuildConfig
 from tensorrt_llm.llmapi import SamplingParams
 from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
@@ -144,7 +145,7 @@ def get_num_samples_and_threshold(self, **acc_specs):
         return num_samples, threshold
 
     def evaluate(self,
-                 llm: Union[LLM, PyTorchLLM],
+                 llm: Union[LLM, PyTorchLLM, AutoDeployLLM],
                  extra_acc_spec: Optional[str] = None,
                  extra_evaluator_kwargs: Optional[dict] = None,
                  sampling_params: Optional[SamplingParams] = None,
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
new file mode 100644
index 00000000000..806f9065c1d
--- /dev/null
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from tensorrt_llm import LLM
+from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+from tensorrt_llm.quantization import QuantAlgo
+
+from ..conftest import llm_models_root, skip_pre_blackwell
+from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness
+
+
+class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "meta-llama/Llama-3.1-8B"
+    MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B"
+
+    @pytest.mark.skip_less_device_memory(32000)
+    def test_auto_dtype(self):
+        with AutoDeployLLM(self.MODEL_PATH) as llm:
+            # task = CnnDailymail(self.MODEL_NAME)
+            # task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    def test_nvfp4(self):
+        model_path = f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B"
+        with LLM(model_path) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.parametrize("stream_interval", [4, 64],
+                             ids=["stream_interval_4", "stream_interval_64"])
+    def test_nvfp4_streaming(self, stream_interval):
+        # When stream_interval < TLLM_STREAM_INTERVAL_THRESHOLD, hf incremental detokenization is used.
+        # When stream_interval >= TLLM_STREAM_INTERVAL_THRESHOLD, trtllm implemented incremental detokenization is used.
+        # The behavior is due to perf considerations, while both paths need to be tested.
+ with LLM(f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B", + stream_interval=stream_interval) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + assert llm.args.stream_interval == stream_interval + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm, streaming=True) From 6ce6b3f0a8fd1ab1bd19298ca3eb1475ada0b3e3 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 12 Aug 2025 22:05:16 +0000 Subject: [PATCH 2/6] Update llm args and sampling params Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_autodeploy.py | 65 +++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 806f9065c1d..2d573022a52 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -17,7 +17,10 @@ from tensorrt_llm import LLM from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM +from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy, + ContextChunkingPolicy) from tensorrt_llm.quantization import QuantAlgo +from tensorrt_llm.sampling_params import SamplingParams from ..conftest import llm_models_root, skip_pre_blackwell from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness @@ -27,13 +30,67 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B" + def get_default_kwargs(self): + return { + 'skip_tokenizer_init': False, + 'trust_remote_code': True, + 'kv_cache_config': { + 'enable_block_reuse': False, + 'max_tokens': None, + 'max_attention_window': None, + 'sink_token_length': None, + 'free_gpu_memory_fraction': 0.9, + 'host_cache_size': None, + 'onboard_blocks': True, + 'cross_kv_cache_fraction': None, + 'secondary_offload_min_priority': None, + 'event_buffer_max_size': 0, + 'attention_dp_events_gather_period_ms': 5, + 'enable_partial_reuse': True, + 'copy_on_partial_reuse': True, + 'use_uvm': False, + 'dtype': 'auto' + }, + 'enable_chunked_prefill': True, + 'scheduler_config': { + 'capacity_scheduler_policy': + CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, + 'context_chunking_policy': + ContextChunkingPolicy.FIRST_COME_FIRST_SERVED, + 'dynamic_batch_config': { + 'enable_batch_size_tuning': True, + 'enable_max_num_tokens_tuning': False, + 'dynamic_batch_moving_average_window': 128 + } + }, + 'max_batch_size': 512, + 'max_seq_len': 256, + 'max_num_tokens': 3840, + 'skip_loading_weights': False, + 'compile_backend': 'torch-opt', + 'free_mem_ratio': 0.7, + 'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256] + } + + def get_default_sampling_params(self): + eos_id = -1 + beam_width = 1 + return SamplingParams(end_id=eos_id, + pad_id=eos_id, + n=beam_width, + use_beam_search=beam_width > 1) + @pytest.mark.skip_less_device_memory(32000) def test_auto_dtype(self): - with AutoDeployLLM(self.MODEL_PATH) as llm: - # task = CnnDailymail(self.MODEL_NAME) - # task.evaluate(llm) - task = MMLU(self.MODEL_NAME) + kwargs = self.get_default_kwargs() + sampling_params = self.get_default_sampling_params() + with AutoDeployLLM(model=self.MODEL_PATH, + tokenizer=self.MODEL_PATH, + **kwargs) as llm: + task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm, 
+                          sampling_params=sampling_params)
 
     @skip_pre_blackwell
     def test_nvfp4(self):

From 21787a1e8be17ce114f09c2203031456ef0d5ad3 Mon Sep 17 00:00:00 2001
From: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com>
Date: Thu, 14 Aug 2025 00:52:10 -0700
Subject: [PATCH 3/6] fix test

Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_autodeploy.py  | 33 +++----------
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 2d573022a52..f466f5cc457 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -36,36 +36,13 @@ def get_default_kwargs(self):
             'trust_remote_code': True,
             'kv_cache_config': {
                 'enable_block_reuse': False,
-                'max_tokens': None,
-                'max_attention_window': None,
-                'sink_token_length': None,
-                'free_gpu_memory_fraction': 0.9,
-                'host_cache_size': None,
-                'onboard_blocks': True,
-                'cross_kv_cache_fraction': None,
-                'secondary_offload_min_priority': None,
-                'event_buffer_max_size': 0,
-                'attention_dp_events_gather_period_ms': 5,
-                'enable_partial_reuse': True,
-                'copy_on_partial_reuse': True,
-                'use_uvm': False,
-                'dtype': 'auto'
-            },
-            'enable_chunked_prefill': True,
-            'scheduler_config': {
-                'capacity_scheduler_policy':
-                CapacitySchedulerPolicy.GUARANTEED_NO_EVICT,
-                'context_chunking_policy':
-                ContextChunkingPolicy.FIRST_COME_FIRST_SERVED,
-                'dynamic_batch_config': {
-                    'enable_batch_size_tuning': True,
-                    'enable_max_num_tokens_tuning': False,
-                    'dynamic_batch_moving_average_window': 128
-                }
             },
             'max_batch_size': 512,
-            'max_seq_len': 256,
-            'max_num_tokens': 3840,
+            # 131072 is the max seq len for the model
+            'max_seq_len': 8192,
+            # max num tokens is derived in the build_config, which is not used by AutoDeploy llmargs. 
+            # Set it explicitly here to 8192 which is the default in build_config.
+            'max_num_tokens': 8192,
             'skip_loading_weights': False,
             'compile_backend': 'torch-opt',
             'free_mem_ratio': 0.7,

From a3d46b2cb885a59e7c7771d8f1c6d8ca2f3c0d4a Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Thu, 14 Aug 2025 21:51:47 +0000
Subject: [PATCH 4/6] Remove unused import

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 tensorrt_llm/bench/benchmark/throughput.py                 | 1 +
 tests/integration/defs/accuracy/test_llm_api_autodeploy.py | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py
index 3976e732f46..2184919465c 100755
--- a/tensorrt_llm/bench/benchmark/throughput.py
+++ b/tensorrt_llm/bench/benchmark/throughput.py
@@ -450,6 +450,7 @@ def ignore_trt_only_args(kwargs: dict):
     elif runtime_config.backend == "_autodeploy":
         ignore_trt_only_args(kwargs)
         kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)
+        llm = AutoDeployLLM(**kwargs)
     else:
         llm = LLM(**kwargs)
 
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index f466f5cc457..70022aae33d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -17,8 +17,6 @@
 
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
-from tensorrt_llm.llmapi.llm_args import (CapacitySchedulerPolicy,
-                                          ContextChunkingPolicy)
 from tensorrt_llm.quantization import QuantAlgo
 from tensorrt_llm.sampling_params import SamplingParams
 
@@ -40,7 +38,7 @@ def get_default_kwargs(self):
             'max_batch_size': 512,
             # 131072 is the max seq len for the model
             'max_seq_len': 8192,
-            # max num tokens is derived in the build_config, which is not used by AutoDeploy llmargs. 
+            # max num tokens is derived in the build_config, which is not used by AutoDeploy llmargs.
             # Set it explicitly here to 8192 which is the default in build_config.
             'max_num_tokens': 8192,
             'skip_loading_weights': False,

From edc863557815d91830c9b12253600266a3e4ce48 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Thu, 14 Aug 2025 22:10:22 +0000
Subject: [PATCH 5/6] Remove nvfp4 streaming test

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_autodeploy.py  | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 70022aae33d..72dd38f65be 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -76,17 +76,3 @@ def test_nvfp4(self):
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
-
-    @skip_pre_blackwell
-    @pytest.mark.parametrize("stream_interval", [4, 64],
-                             ids=["stream_interval_4", "stream_interval_64"])
-    def test_nvfp4_streaming(self, stream_interval):
-        # When stream_interval < TLLM_STREAM_INTERVAL_THRESHOLD, hf incremental detokenization is used.
-        # When stream_interval >= TLLM_STREAM_INTERVAL_THRESHOLD, trtllm implemented incremental detokenization is used.
-        # The behavior is due to perf considerations, while both paths need to be tested.
- with LLM(f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B", - stream_interval=stream_interval) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - assert llm.args.stream_interval == stream_interval - task = CnnDailymail(self.MODEL_NAME) - task.evaluate(llm, streaming=True) From a226efb52fa354af4df63369be93f8367e80352f Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Thu, 14 Aug 2025 23:57:33 +0000 Subject: [PATCH 6/6] Remove nvfp4 test Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_autodeploy.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index 72dd38f65be..da64969337e 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -15,12 +15,10 @@ import pytest -from tensorrt_llm import LLM from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM -from tensorrt_llm.quantization import QuantAlgo from tensorrt_llm.sampling_params import SamplingParams -from ..conftest import llm_models_root, skip_pre_blackwell +from ..conftest import llm_models_root from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness @@ -66,13 +64,3 @@ def test_auto_dtype(self): task.evaluate(llm) task = MMLU(self.MODEL_NAME) task.evaluate(llm, sampling_params=sampling_params) - - @skip_pre_blackwell - def test_nvfp4(self): - model_path = f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B" - with LLM(model_path) as llm: - assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 - task = CnnDailymail(self.MODEL_NAME) - task.evaluate(llm) - task = MMLU(self.MODEL_NAME) - task.evaluate(llm)