
Commit 302d4b1 (parent: 952e012)

fix

Signed-off-by: Superjomn <[email protected]>

38 files changed: +194 −175 lines
docs/source/torch.md

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@ The PyTorch backend of TensorRT-LLM is available in version 0.17 and later. You
 
 ## Quick Start
 
-Here is a simple example to show how to use `tensorrt_llm._torch.LLM` API with Llama model.
+Here is a simple example to show how to use `tensorrt_llm.LLM` API with Llama model.
 
 ```{literalinclude} ../../examples/pytorch/quickstart.py
 :language: python
@@ -24,7 +24,7 @@ The PyTorch backend supports FP8 and NVFP4 quantization. You can pass quantized
 which are generated by [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
 llm.generate("Hello, my name is")
 ```
@@ -44,7 +44,7 @@ The PyTorch backend supports most of the sampling features that are supported on
 In order to use this feature, it is necessary to enable option `enable_trtllm_sampler` in the `LLM` class, and pass a `SamplingParams` object with the desired options as well. The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
           enable_trtllm_sampler=True)
 sampling_params = SamplingParams(
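
For reference, the unified import composes with the sampler option roughly like this (a minimal sketch assembled from the snippets above; the sampling values and prompts are illustrative, not from the commit):

```python
from tensorrt_llm import LLM, SamplingParams

# The PyTorch backend is the default for tensorrt_llm.LLM after this change.
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
          enable_trtllm_sampler=True)

# Illustrative sampling settings; two identical prompts may diverge.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
for output in llm.generate(["Hello, my name is", "Hello, my name is"],
                           sampling_params):
    print(output.outputs[0].text)
```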

docs/source/torch/adding_new_model.md

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ __all__ = [
 Alternatively, you can register the new model as an out-of-tree model, so that you can use the new model without touching the TensorRT-LLM codebase. To do so, place `modeling_mymodel.py` (and potentially `configuration_mymodel.py`) in your working directory, and import the modeling code in your script:
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 import modeling_mymodel
 
 def main():
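
The full out-of-tree flow then looks roughly like this (a sketch; `modeling_mymodel` and the checkpoint path are the doc's placeholders, not real modules):

```python
import modeling_mymodel  # noqa: F401 -- registers the custom model on import

from tensorrt_llm import LLM


def main():
    # The LLM resolves the architecture registered by modeling_mymodel.
    llm = LLM(model='path/to/mymodel_checkpoint')
    print(llm.generate("Hello, my name is"))


if __name__ == '__main__':
    main()
```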

docs/source/torch/arch_overview.md

Lines changed: 2 additions & 2 deletions
@@ -5,10 +5,10 @@ Besides TensorRT, PyTorch can also serve as the backend for TensorRT-LLM. This d
 
 ## Top Level API
 
-The interface for PyTorch backend is `tensorrt._torch.LLM`.
+The interface for PyTorch backend is `tensorrt_llm.LLM`.
 
 ```python
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 llm = LLM(model=<path_to_llama_from_hf>)
 ```
 
examples/pytorch/out_of_tree_example/main.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 import modeling_opt  # noqa
 
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM
 
 
 def main():

examples/pytorch/quickstart.py

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,4 @@
-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
 
 
 def main():

examples/pytorch/quickstart_advanced.py

Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,6 @@
 import argparse
 
-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm.llmapi import (DraftTargetDecodingConfig, EagleDecodingConfig,
                                  KvCacheConfig, MTPDecodingConfig,
                                  NGramDecodingConfig, TorchCompileConfig)

examples/pytorch/star_attention.py

Lines changed: 1 addition & 2 deletions
@@ -6,8 +6,7 @@
 
 import torch
 
-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch import LLM
+from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig
 
 
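
These quantization imports compose with the unified LLM import roughly as follows (a hypothetical sketch, not code from this commit; whether `quant_config` is accepted this way depends on the backend's argument set):

```python
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig

# Hypothetical: request FP8 quantization when loading the checkpoint.
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
llm = LLM(model='path/to/model', quant_config=quant_config)
print(llm.generate("Hello, my name is", SamplingParams(max_tokens=16)))
```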

tensorrt_llm/_torch/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
 from .llm import LLM
+from .model_config import MoeLoadBalancerConfig
 
-__all__ = ["LLM"]
+__all__ = ["LLM", "MoeLoadBalancerConfig"]

tensorrt_llm/bench/build/build.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 from tensorrt_llm.bench.utils.data import create_dataset_from_stream, initialize_tokenizer
 from tensorrt_llm.bench.utils import VALID_QUANT_ALGOS
 from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm.llmapi import LLM
+from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.llmapi.llm_utils import QuantConfig
 from tensorrt_llm.logger import logger
 from tensorrt_llm.quantization.mode import QuantAlgo
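
The bench build path keeps the TensorRT engine flavor of `LLM`, now reached through the private `_tensorrt_engine` module. A sketch of the resulting usage (model path and build options are illustrative, and `build_config` is assumed to remain a TensorRT-backend argument):

```python
from tensorrt_llm._tensorrt_engine import LLM  # TensorRT engine backend
from tensorrt_llm.builder import BuildConfig

# Illustrative engine-build configuration, as in the bench flow.
build_config = BuildConfig(max_batch_size=8)
llm = LLM(model='path/to/hf_model', build_config=build_config)
```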

tensorrt_llm/llmapi/llm.py

Lines changed: 24 additions & 0 deletions
@@ -97,6 +97,7 @@ def _repr_fields(self):
 
     Attributes:
         tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.
+        llm_id (str): The unique ID of the LLM instance.
     """
 
 
@@ -883,6 +884,9 @@ def __init__(self,
         # TODO: deprecate backend in LLM kwargs
         kwargs.pop("backend", None)
 
+        # Validate that users don't pass TrtLlmArgs-specific arguments
+        self._validate_args_for_torch_backend(kwargs)
+
         super().__init__(model,
                          tokenizer,
                          tokenizer_mode,
@@ -895,6 +899,26 @@ def __init__(self,
                          backend='pytorch',
                          **kwargs)
 
+    def _validate_args_for_torch_backend(self, kwargs: dict) -> None:
+        """Validate that users don't pass TrtLlmArgs-specific arguments when using PyTorch backend.
+        """
+        trtllm_fields = set(TrtLlmArgs.model_fields.keys())
+        torchllm_fields = set(TorchLlmArgs.model_fields.keys())
+
+        trtllm_specific_fields = trtllm_fields - torchllm_fields
+
+        # Check if any TrtLlmArgs-specific arguments are passed
+        trtllm_specific_args = []
+        for key in kwargs:
+            if key in trtllm_specific_fields:
+                trtllm_specific_args.append(key)
+
+        if trtllm_specific_args:
+            raise ValueError(
+                f"The following arguments are specific to TensorRT backend and cannot be used with PyTorch backend: {trtllm_specific_args}.\n"
+                f"Please use 'from tensorrt_llm._tensorrt_engine import LLM' instead to use the TensorRT backend."
+            )
+
 
 class LLM(_TorchLLM):
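
The net effect of the new validation, sketched below (assumes `build_config` is one of the TensorRT-only fields; the exact rejected set is computed at runtime as `TrtLlmArgs.model_fields` minus `TorchLlmArgs.model_fields`):

```python
from tensorrt_llm import LLM  # PyTorch-backend LLM

try:
    # Passing an assumed TensorRT-only argument now fails fast;
    # presence of the key is enough, whatever its value.
    llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8', build_config=None)
except ValueError as err:
    print(err)  # directs users to tensorrt_llm._tensorrt_engine.LLM
```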
