diff --git a/docs/source/torch/features/lora.md b/docs/source/torch/features/lora.md
index e8ef7196ce1..d00a27d49a9 100644
--- a/docs/source/torch/features/lora.md
+++ b/docs/source/torch/features/lora.md
@@ -33,7 +33,7 @@ The PyTorch backend provides LoRA support, allowing you to:
 
 ```python
 from tensorrt_llm import LLM
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.executor.request import LoRARequest
 from tensorrt_llm.sampling_params import SamplingParams
 
diff --git a/examples/llm-api/llm_multilora.py b/examples/llm-api/llm_multilora.py
index 60795b6c60a..0c6fa4f5417 100644
--- a/examples/llm-api/llm_multilora.py
+++ b/examples/llm-api/llm_multilora.py
@@ -5,7 +5,7 @@
 
 from tensorrt_llm import LLM
 from tensorrt_llm.executor import LoRARequest
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 
 
 def main():
diff --git a/tensorrt_llm/__init__.py b/tensorrt_llm/__init__.py
index f54026a8cbc..5cffa985460 100644
--- a/tensorrt_llm/__init__.py
+++ b/tensorrt_llm/__init__.py
@@ -33,6 +33,7 @@ def _add_trt_llm_dll_directory():
 # otherwise `MemoryError: std::bad_alloc` pattern error will be raised.
 import xgrammar  # noqa
 
+import tensorrt_llm._torch.models as torch_models
 import tensorrt_llm.functional as functional
 import tensorrt_llm.math_utils as math_utils
 import tensorrt_llm.models as models
@@ -82,6 +83,7 @@ def _add_trt_llm_dll_directory():
     'default_trtnet',
     'precision',
     'net_guard',
+    'torch_models',
     'Network',
     'Mapping',
     'MnnvlMemory',
diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py
index b54b64fc691..e32fdce9943 100644
--- a/tensorrt_llm/_torch/models/modeling_phi4mm.py
+++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py
@@ -22,7 +22,7 @@
 from ...inputs import (ExtraProcessedInputs, InputProcessor, TextPrompt,
                        register_input_processor)
 from ...logger import logger
-from ...lora_manager import LoraConfig
+from ...lora_helper import LoraConfig
 from ...sampling_params import SamplingParams
 from ..attention_backend import AttentionMetadata
 from ..model_config import ModelConfig
diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
index ca373c2ed18..129738e8d11 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
@@ -6,7 +6,7 @@
 import torch.nn.functional as F
 from torch import nn
 
-from tensorrt_llm import logger
+import tensorrt_llm.logger as trtllm_logger
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.quantization.utils.fp4_utils import (
     float4_sf_dtype, get_reorder_rows_for_gated_act_gemm_row_indices,
@@ -743,7 +743,7 @@ def load_weights(self, module: torch.nn.Module, weights: List[Dict],
             if int(name.split(".")[0]) not in expert_ids:
                 continue
             weight_name = name.replace("weight_scale_inv", "weight")
-            logger.debug(f"Resmoothing {weight_name}")
+            trtllm_logger.logger.debug(f"Resmoothing {weight_name}")
             weight = weights[weight_name][:]
             scale = weights[name][:]
             weights[weight_name], weights[name] = resmooth_to_fp8_e8m0(
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 52bd7089d74..0e76938e9f5 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -13,9 +13,9 @@
 from tensorrt_llm.bindings.executor import DecodingMode, ExecutorConfig
 from tensorrt_llm.llmapi.llm_args import PeftCacheConfig
 from tensorrt_llm.logger import logger
-from tensorrt_llm.lora_manager import (LoraConfig,
-                                       get_default_trtllm_modules_to_hf_modules,
-                                       load_torch_lora)
+from tensorrt_llm.lora_helper import (LoraConfig,
+                                      get_default_trtllm_modules_to_hf_modules)
+from tensorrt_llm.lora_manager import load_torch_lora
 from tensorrt_llm.mapping import Mapping
 
 from ..model_config import ModelConfig
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index d39ddc4f2c1..00f314a20bd 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -26,7 +26,8 @@
 from tensorrt_llm.inputs.multimodal import (MultimodalParams,
                                             MultimodalRuntimeData)
 from tensorrt_llm.logger import logger
-from tensorrt_llm.lora_manager import LoraConfig, LoraModelConfig
+from tensorrt_llm.lora_helper import LoraConfig
+from tensorrt_llm.lora_manager import LoraModelConfig
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantAlgo
 from tensorrt_llm.quantization.utils.fp4_utils import float4_e2m1x2
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index 8fe7d8a1aa3..de88631c9a5 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -13,7 +13,7 @@
 from tensorrt_llm.bindings.executor import ContextChunkingPolicy, ExecutorConfig
 from tensorrt_llm.bindings.internal.batch_manager import ContextChunkingConfig
 from tensorrt_llm.logger import logger
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.quantization import QuantAlgo
 
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index 56c4871542e..e5286671c1a 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -10,7 +10,8 @@
 import tensorrt_llm
 import tensorrt_llm.bindings
 from tensorrt_llm.bindings.BuildInfo import ENABLE_MULTI_DEVICE
-from tensorrt_llm.lora_manager import LoraConfig, LoraManager, LoraModelConfig
+from tensorrt_llm.lora_helper import LoraConfig
+from tensorrt_llm.lora_manager import LoraManager, LoraModelConfig
 from tensorrt_llm.sampling_params import SamplingParams
 
 from ..._utils import binding_dtype_size, binding_to_str_dtype, nvtx_range
diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py
index 11d528a853d..32a66b160d4 100644
--- a/tensorrt_llm/builder.py
+++ b/tensorrt_llm/builder.py
@@ -36,7 +36,7 @@
 from .functional import PositionEmbeddingType
 from .graph_rewriting import optimize
 from .logger import logger
-from .lora_manager import LoraConfig
+from .lora_helper import LoraConfig
 from .models import PretrainedConfig, PretrainedModel
 from .models.modeling_utils import SpeculativeDecodingMode, optimize_model
 from .network import Network, net_guard
diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py
index 9374883a9cb..7cca71c8793 100644
--- a/tensorrt_llm/commands/build.py
+++ b/tensorrt_llm/commands/build.py
@@ -31,7 +31,8 @@
 from tensorrt_llm.bindings import KVCacheType
 from tensorrt_llm.builder import BuildConfig, Engine, build
 from tensorrt_llm.logger import logger, severity_map
-from tensorrt_llm.lora_manager import LoraConfig, LoraManager
+from tensorrt_llm.lora_helper import LoraConfig
+from tensorrt_llm.lora_manager import LoraManager
 from tensorrt_llm.models import MODEL_MAP, PretrainedConfig
 from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
 from tensorrt_llm.plugin import PluginConfig, add_plugin_argument
diff --git a/tensorrt_llm/disaggregated_params.py b/tensorrt_llm/disaggregated_params.py
index 16cfb7d3844..4dfaa5bca45 100644
--- a/tensorrt_llm/disaggregated_params.py
+++ b/tensorrt_llm/disaggregated_params.py
@@ -1,6 +1,11 @@
 from dataclasses import dataclass
 from typing import List, Optional
 
+# isort: off
+# needed before trying to import bindings to load tensorrt_libs
+import tensorrt as trt  # noqa
+# isort: on
+
 from tensorrt_llm.bindings import executor as tllme
 
 
diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py
index 9ce4ad0d85c..14c8eeb3894 100644
--- a/tensorrt_llm/executor/executor.py
+++ b/tensorrt_llm/executor/executor.py
@@ -15,7 +15,7 @@
 
 from tensorrt_llm.inputs.multimodal import MultimodalParams
 from tensorrt_llm.logger import logger, set_level
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 
 from .._utils import mpi_world_size
 from ..bindings import executor as tllm
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index c3a827bb00b..7cd1ed7dee0 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -24,7 +24,8 @@
 from ..llmapi.utils import (AsyncQueue, ManagedThread, _SyncQueue,
                             clear_sched_affinity, print_colored_debug,
                             print_traceback_on_error)
-from ..lora_manager import LoraConfig, LoraManager
+from ..lora_helper import LoraConfig
+from ..lora_manager import LoraManager
 from ..metrics import RequestEventTiming
 from ..prompt_adapter_manager import PromptAdapterManager
 from ..runtime import ModelConfig
diff --git a/tensorrt_llm/llmapi/build_cache.py b/tensorrt_llm/llmapi/build_cache.py
index 86c9eb4e770..6b61d277734 100644
--- a/tensorrt_llm/llmapi/build_cache.py
+++ b/tensorrt_llm/llmapi/build_cache.py
@@ -12,7 +12,7 @@
 import filelock
 
 import tensorrt_llm
-from tensorrt_llm import BuildConfig
+from tensorrt_llm.builder import BuildConfig
 from tensorrt_llm.llmapi.utils import enable_llm_debug, print_colored
 from tensorrt_llm.logger import logger
 
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 279d26999b2..ce839d857cd 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -19,8 +19,8 @@
 from strenum import StrEnum
 from transformers import PreTrainedTokenizerBase
 
-from tensorrt_llm.lora_manager import (LoraConfig,
-                                       get_default_trtllm_modules_to_hf_modules)
+from tensorrt_llm.lora_helper import (LoraConfig,
+                                      get_default_trtllm_modules_to_hf_modules)
 
 from .._utils import mpi_rank
 from ..auto_parallel import AutoParallelConfig, infer_cluster_config
diff --git a/tensorrt_llm/lora_helper.py b/tensorrt_llm/lora_helper.py
new file mode 100644
index 00000000000..37f5d534f7d
--- /dev/null
+++ b/tensorrt_llm/lora_helper.py
@@ -0,0 +1,101 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+from ._utils import DictConversion
+
+
+def get_missing_qkv_modules_from_lora_modules(
+        lora_target_modules: List[str]) -> List[str]:
+    """Get missing QKV modules from LoRA target modules.
+
+    In current design, q_lora_params, k_lora_params and v_lora_params should be all enabled or
+    all disabled at the same time. However, some lora checkpoints (e.g. BART) only contain two of them,
+    so we use zero tensor to fill the missing ones.
+    """
+    missing_qkv_modules = []
+    if any(x in lora_target_modules for x in ["attn_q", "attn_k", "attn_v"]):
+        for lora_module in ["attn_q", "attn_k", "attn_v"]:
+            if lora_module not in lora_target_modules:
+                missing_qkv_modules.append(lora_module)
+    if any(x in lora_target_modules
+           for x in ["cross_attn_q", "cross_attn_k", "cross_attn_v"]):
+        for lora_module in ["cross_attn_q", "cross_attn_k", "cross_attn_v"]:
+            if lora_module not in lora_target_modules:
+                missing_qkv_modules.append(lora_module)
+    return missing_qkv_modules
+
+
+def get_default_trtllm_modules_to_hf_modules():
+    """Get default mapping from TensorRT-LLM module names to HuggingFace module names."""
+    return {
+        "attn_q": "q_proj",
+        "attn_k": "k_proj",
+        "attn_v": "v_proj",
+        "attn_dense": "o_proj",
+        "mlp_h_to_4h": "gate_proj",
+        "mlp_4h_to_h": "down_proj",
+        "mlp_gate": "up_proj",
+        "mlp_gate_up": "gate_up_proj",
+        "moe_h_to_4h": "w1",
+        "moe_4h_to_h": "w2",
+        "moe_gate": "w3",
+        "moe_router": "gate",
+    }
+
+
+def use_lora(
+    model,
+    lora_config: "LoraConfig",
+    trtllm_modules_to_hf_modules: Optional[Dict[str, str]] = None,
+):
+    """Use LoRA with the given model and configuration.
+
+    This function is a wrapper that delegates to the appropriate loading function
+    based on the LoRA checkpoint source.
+    """
+    if lora_config.lora_ckpt_source == "nemo":
+        from .lora_manager import load_nemo_lora
+        load_nemo_lora(model, lora_config)
+    elif lora_config.lora_ckpt_source == "hf":
+        from .lora_manager import load_hf_lora
+        load_hf_lora(model, lora_config, trtllm_modules_to_hf_modules)
+    else:
+        raise ValueError(
+            f"Unsupported lora_ckpt_source: {lora_config.lora_ckpt_source}")
+
+
+@dataclass
+class LoraConfig(DictConversion):
+    lora_dir: List[str] = field(default_factory=list)
+    lora_ckpt_source: str = "hf"
+    max_lora_rank: int = 64
+    lora_target_modules: List[str] = field(default_factory=list)
+    trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict)
+    max_loras: Optional[int] = None
+    max_cpu_loras: Optional[int] = None
+
+    def __post_init__(self):
+        assert self.lora_ckpt_source in [
+            "hf", "nemo"
+        ], (f"lora_ckpt_source must be one of 'hf' or 'nemo', got {self.lora_ckpt_source}"
+            )
+
+    @property
+    def missing_qkv_modules(self) -> List[str]:
+        return get_missing_qkv_modules_from_lora_modules(
+            self.lora_target_modules)
diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py
index f2e32047162..7440715474c 100644
--- a/tensorrt_llm/lora_manager.py
+++ b/tensorrt_llm/lora_manager.py
@@ -5,7 +5,7 @@
 import tarfile
 import warnings
 from collections import defaultdict
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
@@ -16,8 +16,13 @@
 
 from tensorrt_llm.bindings import internal as tb_internal
 
-from ._utils import DictConversion, pad_vocab_size, release_gc, str_dtype_to_torch, torch_to_numpy
+from ._utils import pad_vocab_size, release_gc, str_dtype_to_torch, torch_to_numpy
 from .layers.linear import ColumnLinear
+from .lora_helper import (
+    LoraConfig,
+    get_default_trtllm_modules_to_hf_modules,
+    get_missing_qkv_modules_from_lora_modules,
+)
 from .mapping import Mapping
 from .models.convert_utils import get_model_path, load_state_dict, split_matrix_tp
 
@@ -232,26 +237,6 @@ def norm_dora_magnitude(
     return norm_m
 
 
-@dataclass
-class LoraConfig(DictConversion):
-    lora_dir: List[str] = field(default_factory=list)
-    lora_ckpt_source: str = "hf"
-    max_lora_rank: int = 64
-    lora_target_modules: List[str] = field(default_factory=list)
-    trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict)
-    max_loras: int | None = None
-    max_cpu_loras: int | None = None
-
-    def __post_init__(self):
-        assert self.lora_ckpt_source in ["hf", "nemo"], (
-            f"lora_ckpt_source must be one of 'hf' or 'nemo', got {self.lora_ckpt_source}"
-        )
-
-    @property
-    def missing_qkv_modules(self) -> List[str]:
-        return LoraManager.get_missing_qkv_modules(self.lora_target_modules)
-
-
 @dataclass
 class LoraModelConfig:
     lora_target_modules: list[str]
@@ -430,23 +415,6 @@ def load_nemo_lora(model, lora_config: LoraConfig):
         lora_config.lora_target_modules = lora_loader.lora_target_modules
 
 
-def get_default_trtllm_modules_to_hf_modules():
-    return {
-        "attn_q": "q_proj",
-        "attn_k": "k_proj",
-        "attn_v": "v_proj",
-        "attn_dense": "o_proj",
-        "mlp_h_to_4h": "gate_proj",
-        "mlp_4h_to_h": "down_proj",
-        "mlp_gate": "up_proj",
-        "mlp_gate_up": "gate_up_proj",
-        "moe_h_to_4h": "w1",
-        "moe_4h_to_h": "w2",
-        "moe_gate": "w3",
-        "moe_router": "gate",
-    }
-
-
 def load_torch_hf_lora(lora_config: LoraConfig):
     """This is a shortned version of load_hf_lora that is used for torch models.
@@ -628,19 +596,6 @@ def load_hf_lora(
         ).to(torch_dtype)
 
 
-def use_lora(
-    model,
-    lora_config: LoraConfig,
-    trtllm_modules_to_hf_modules: Optional[Dict[str, str]] = None,
-):
-    if lora_config.lora_ckpt_source == "nemo":
-        load_nemo_lora(model, lora_config)
-    elif lora_config.lora_ckpt_source == "hf":
-        load_hf_lora(model, lora_config, trtllm_modules_to_hf_modules)
-    else:
-        raise ValueError(f"Unsupported lora_ckpt_source: {lora_config.lora_ckpt_source}")
-
-
 def unpack_nemo_weights(nemo_archive_path: str) -> Tuple[Dict, Dict[str, torch.Tensor]]:
     """Unpack model config and weights from a NeMo .nemo archive file.
@@ -762,21 +717,8 @@ def is_adapter_in_cpu_cache(self, adapter_uid: int) -> bool:
         )
 
     @staticmethod
-    def get_missing_qkv_modules(lora_target_modules):
-        # In current design, q_lora_params, k_lora_params and v_lora_params should be all enabled or
-        # all disabled at the same time.
-        # However, some lora checkpoint (e.g. BART) only contain two of them, so we use zero tensor
-        # to fill the missing ones.
-        missing_qkv_modules = []
-        if any(x in lora_target_modules for x in ["attn_q", "attn_k", "attn_v"]):
-            for lora_module in ["attn_q", "attn_k", "attn_v"]:
-                if lora_module not in lora_target_modules:
-                    missing_qkv_modules.append(lora_module)
-        if any(x in lora_target_modules for x in ["cross_attn_q", "cross_attn_k", "cross_attn_v"]):
-            for lora_module in ["cross_attn_q", "cross_attn_k", "cross_attn_v"]:
-                if lora_module not in lora_target_modules:
-                    missing_qkv_modules.append(lora_module)
-        return missing_qkv_modules
+    def get_missing_qkv_modules(lora_target_modules: List[str]) -> List[str]:
+        return get_missing_qkv_modules_from_lora_modules(lora_target_modules)
 
     @property
     def missing_qkv_modules(self) -> List[str]:
diff --git a/tensorrt_llm/models/enc_dec/model.py b/tensorrt_llm/models/enc_dec/model.py
index 65de761502a..be3c5afc49f 100644
--- a/tensorrt_llm/models/enc_dec/model.py
+++ b/tensorrt_llm/models/enc_dec/model.py
@@ -36,9 +36,9 @@
                       LanguageAdapterConfig, LayerNorm, LoraParams,
                       PromptTuningEmbedding, RmsNorm)
 # yapf: enable
-from tensorrt_llm.lora_manager import (LoraConfig,
-                                       get_default_trtllm_modules_to_hf_modules,
-                                       use_lora)
+from tensorrt_llm.lora_helper import (LoraConfig,
+                                      get_default_trtllm_modules_to_hf_modules,
+                                      use_lora)
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import PretrainedConfig, PretrainedModel
 from tensorrt_llm.module import Module, ModuleList
diff --git a/tensorrt_llm/models/gemma/model.py b/tensorrt_llm/models/gemma/model.py
index 494eb71799f..2091f111f85 100644
--- a/tensorrt_llm/models/gemma/model.py
+++ b/tensorrt_llm/models/gemma/model.py
@@ -28,7 +28,7 @@
 from ...layers import (Attention, AttentionMaskType, AttentionParams,
                        ColumnLinear, Embedding, GatedMLP, KeyValueCacheParams,
                        LoraParams, PositionEmbeddingType, RmsNorm)
-from ...lora_manager import LoraConfig, use_lora
+from ...lora_helper import LoraConfig, use_lora
 from ...mapping import Mapping
 from ...module import Module
 from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
diff --git a/tensorrt_llm/models/gpt/model.py b/tensorrt_llm/models/gpt/model.py
index aecfcda64c8..89267a90f71 100644
--- a/tensorrt_llm/models/gpt/model.py
+++ b/tensorrt_llm/models/gpt/model.py
@@ -21,7 +21,7 @@
 from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear,
                        Embedding, GatedMLP, LayerNorm, MoeConfig,
                        PositionEmbeddingType)
-from ...lora_manager import LoraConfig, use_lora
+from ...lora_helper import LoraConfig, use_lora
 from ...mapping import Mapping
 from ...module import Module
 from ...quantization import QuantMode
diff --git a/tensorrt_llm/models/grok/model.py b/tensorrt_llm/models/grok/model.py
index 8fc34349f92..9ff22cd71c7 100644
--- a/tensorrt_llm/models/grok/model.py
+++ b/tensorrt_llm/models/grok/model.py
@@ -18,7 +18,7 @@
 from ...functional import Tensor, recv, send
 from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear,
                        Embedding, MoeConfig, PositionEmbeddingType, RmsNorm)
-from ...lora_manager import LoraConfig, use_lora
+from ...lora_helper import LoraConfig, use_lora
 from ...mapping import Mapping
 from ...module import Module
 from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py
index 259f3e2f9ae..2e272772adb 100644
--- a/tensorrt_llm/models/llama/model.py
+++ b/tensorrt_llm/models/llama/model.py
@@ -25,7 +25,7 @@
 from ...layers import (MOE, Attention, AttentionMaskType, ColumnLinear,
                        Embedding, FusedGatedMLP, GatedMLP,
                        PositionEmbeddingType, RmsNorm)
-from ...lora_manager import LoraConfig, use_lora
+from ...lora_helper import LoraConfig, use_lora
 from ...mapping import Mapping
 from ...module import Module
 from ...quantization.functional import fused_layernorm
diff --git a/tensorrt_llm/models/mllama/model.py b/tensorrt_llm/models/mllama/model.py
index 5f9c622fa88..95a261350b8 100644
--- a/tensorrt_llm/models/mllama/model.py
+++ b/tensorrt_llm/models/mllama/model.py
@@ -32,9 +32,9 @@
                                 ColumnLinear, Embedding, FusedGatedMLP,
                                 GatedMLP, GroupNorm, KeyValueCacheParams,
                                 LayerNorm, LoraParams, RmsNorm)
-from tensorrt_llm.lora_manager import (LoraConfig,
-                                       get_default_trtllm_modules_to_hf_modules,
-                                       use_lora)
+from tensorrt_llm.lora_helper import (LoraConfig,
+                                      get_default_trtllm_modules_to_hf_modules,
+                                      use_lora)
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.model_weights_loader import ModelWeightsLoader
 from tensorrt_llm.models.modeling_utils import PretrainedModel, QuantConfig
diff --git a/tensorrt_llm/models/phi/model.py b/tensorrt_llm/models/phi/model.py
index 6e6dd3579bd..9c90e114e9c 100644
--- a/tensorrt_llm/models/phi/model.py
+++ b/tensorrt_llm/models/phi/model.py
@@ -20,7 +20,7 @@
 from ...functional import Tensor
 from ...layers import (MLP, Attention, AttentionMaskType, ColumnLinear,
                        Embedding, LayerNorm)
-from ...lora_manager import LoraConfig, use_lora
+from ...lora_helper import LoraConfig, use_lora
 from ...mapping import Mapping
 from ...module import Module
 from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
diff --git a/tensorrt_llm/models/phi3/model.py b/tensorrt_llm/models/phi3/model.py
index 5f058f147e8..5bdc24f8ed2 100644
--- a/tensorrt_llm/models/phi3/model.py
+++ b/tensorrt_llm/models/phi3/model.py
@@ -8,7 +8,7 @@
 from ...layers import (MLP, MOE, Attention, AttentionMaskType,
                        BlockSparseAttnParams, ColumnLinear, Embedding,
                        LayerNorm, MoeConfig, RmsNorm)
-from ...lora_manager import LoraConfig, use_lora
+from ...lora_helper import LoraConfig, use_lora
 from ...mapping import Mapping
 from ...module import Module
 from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
diff --git a/tensorrt_llm/models/qwen/model.py b/tensorrt_llm/models/qwen/model.py
index 0eb6e8ac449..f32a4036d84 100644
--- a/tensorrt_llm/models/qwen/model.py
+++ b/tensorrt_llm/models/qwen/model.py
@@ -26,8 +26,8 @@
                        Embedding, GatedMLP, RmsNorm, SharedMoE)
 from ...layers.moe import MOEWeightWrapper
 from ...logger import logger
-from ...lora_manager import (LoraConfig,
-                             get_default_trtllm_modules_to_hf_modules, use_lora)
+from ...lora_helper import (LoraConfig,
+                            get_default_trtllm_modules_to_hf_modules, use_lora)
 from ...mapping import Mapping
 from ...module import Module
 from ...quantization import QuantAlgo
diff --git a/tensorrt_llm/top_model_mixin.py b/tensorrt_llm/top_model_mixin.py
index 61e8dcfa4f3..4d3702dca5a 100644
--- a/tensorrt_llm/top_model_mixin.py
+++ b/tensorrt_llm/top_model_mixin.py
@@ -15,7 +15,7 @@
 
 from typing import Optional
 
-from .lora_manager import LoraConfig
+from .lora_helper import LoraConfig
 from .mapping import Mapping
 from .plugin.plugin import PluginConfig
 
diff --git a/tests/unittest/_torch/test_resource_manager.py b/tests/unittest/_torch/test_resource_manager.py
index 21edd013da1..24320a993b3 100644
--- a/tests/unittest/_torch/test_resource_manager.py
+++ b/tests/unittest/_torch/test_resource_manager.py
@@ -17,7 +17,7 @@
 from tensorrt_llm.bindings import executor as tllm
 from tensorrt_llm.bindings.internal.batch_manager import \
     PeftTaskNotCachedException
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 
 DataType = tensorrt_llm.bindings.DataType
 LoraModule = tensorrt_llm.bindings.LoraModule
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 5e82d10b43c..4f7488205a9 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -42,7 +42,7 @@
 from tensorrt_llm.llmapi.tokenizer import (TokenizerBase, TransformersTokenizer,
                                            load_hf_tokenizer)
 from tensorrt_llm.llmapi.utils import get_total_gpu_memory
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.models.automodel import AutoConfig, AutoModelForCausalLM
 from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
 from tensorrt_llm.sampling_params import (BatchedLogitsProcessor,
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index b498a8cd7f5..a92e640a8bb 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -12,7 +12,7 @@
 from tensorrt_llm.executor import GenerationExecutorProxy
 from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig, SamplingParams
 from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models import PretrainedConfig
 from tensorrt_llm.models.llama.model import LLaMAForCausalLM
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
index cb8dbf03c07..28d6bedf1ba 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -4,7 +4,7 @@
 from .test_llm import tinyllama_logits_processor_test_harness
 from tensorrt_llm import LLM
 from tensorrt_llm.llmapi import KvCacheConfig
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness
 from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness
 from .test_llm import _test_llm_capture_request_error
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index 541965b588f..7165aae2f65 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -25,7 +25,7 @@
                          skip_gpu_memory_less_than_80gb,
                          skip_gpu_memory_less_than_138gb)
 from utils.llm_data import llm_models_root
-from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.executor.request import LoRARequest
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization.mode import QuantAlgo
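
After this change, `LoraConfig` (along with `use_lora` and `get_default_trtllm_modules_to_hf_modules`) is imported from the lightweight `tensorrt_llm.lora_helper` module rather than `tensorrt_llm.lora_manager`. A minimal usage sketch against the new import path, mirroring the updated docs and `llm_multilora.py` example; the base-model path and adapter directory below are illustrative placeholders, not part of this diff:

```python
from tensorrt_llm import LLM
from tensorrt_llm.executor.request import LoRARequest
from tensorrt_llm.lora_helper import LoraConfig  # was: tensorrt_llm.lora_manager
from tensorrt_llm.sampling_params import SamplingParams

# Placeholder paths: substitute a real base model and LoRA adapter checkpoint.
lora_config = LoraConfig(lora_dir=["/path/to/lora-adapter"], max_lora_rank=64)
llm = LLM(model="/path/to/base-model", lora_config=lora_config)

outputs = llm.generate(
    ["What is the capital city of France?"],
    SamplingParams(max_tokens=32),
    lora_request=[LoRARequest("my-adapter", 1, "/path/to/lora-adapter")],
)
```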
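
The QKV back-fill logic moves verbatim into the new helper: if any of `attn_q`/`attn_k`/`attn_v` (or their cross-attention counterparts) is a LoRA target, the missing siblings are reported so they can be filled with zero tensors at load time. Its behavior, straight from the moved code:

```python
from tensorrt_llm.lora_helper import (LoraConfig,
                                      get_missing_qkv_modules_from_lora_modules)

# Targeting only attn_q reports the two missing siblings.
assert get_missing_qkv_modules_from_lora_modules(["attn_q"]) == ["attn_k", "attn_v"]
# Non-QKV target modules trigger no back-fill.
assert get_missing_qkv_modules_from_lora_modules(["mlp_h_to_4h"]) == []

# The same logic backs the LoraConfig property (and, for backward
# compatibility, the LoraManager.get_missing_qkv_modules static method).
assert LoraConfig(lora_target_modules=["attn_q", "attn_k"]).missing_qkv_modules == ["attn_v"]
```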
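
Note also that `use_lora` in `lora_helper` defers its `load_nemo_lora`/`load_hf_lora` imports into the function body, so importing `lora_helper` does not pull in the heavier `lora_manager` module; the `lora_ckpt_source` field itself is validated at construction, as a quick sketch of the moved `__post_init__` check shows:

```python
from tensorrt_llm.lora_helper import LoraConfig

# Only "hf" and "nemo" checkpoint sources pass __post_init__ validation.
try:
    LoraConfig(lora_ckpt_source="pt")
except AssertionError as err:
    print(err)  # lora_ckpt_source must be one of 'hf' or 'nemo', got pt
```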