Commit 6ff930b

[None][autodeploy] Add quantization_source to factory interface (#100)
* Move quant_config handling to _load_quantization_config
  - move kv_cache_dtype into _quant_config in the HF factory
  - remove the quant_source getter
* Add a QuantConfigReader class
  - temporary Llama4 FP8 patch for BMM testing, later reverted
  - move _quant_config into QuantConfigReader
* Move quantize and quantize_moe to the end of the pattern matcher
* Delegate quant_config processing to QuantConfigReader, pass the reader to the transformation, and split the transformation into config-based and graph-based passes
  - have QuantConfigReader return the dtype for NVFP4
  - move quantization target collection into a transform
  - temporary fix for the modelopt graph-based quantizer loading path, later removed
  - rebase quantization onto BaseTransform and remove QuantizationTarget
  - update the transform docstrings and fix unit tests
* Move quantization to the end of the transformations

Signed-off-by: Fridah-nv <[email protected]>
Signed-off-by: h-guo18 <[email protected]>
1 parent bdbfb05 commit 6ff930b
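
Taken together, the changes below wire a quantization source into the factory interface: the factory resolves a QuantConfigReader from a registry, the reader parses the checkpoint's hf_quant_config.json, and the new quantize_from_config transform consumes factory.get_quant_config(). A minimal sketch of that flow, assuming the reader module is importable as tensorrt_llm._torch.auto_deploy.models.quant_config_reader and using a placeholder checkpoint path:

from tensorrt_llm._torch.auto_deploy.models.quant_config_reader import QuantConfigReaderRegistry

ckpt_dir = "/path/to/modelopt/checkpoint"  # placeholder path, not part of this commit

# The factory does this inside _load_quantization_config():
reader_cls = QuantConfigReaderRegistry.get("modelopt")
result = reader_cls.from_file(ckpt_dir)  # None when hf_quant_config.json is absent
if result is not None:
    reader, extra_model_kwargs = result
    # quantize_from_config later reads this dict via factory.get_quant_config()
    print(reader.get_config().get("quant_algo"), extra_model_kwargs)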

File tree

6 files changed (+221 lines, -77 lines)


tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 6 additions & 4 deletions
@@ -19,10 +19,6 @@ transforms:
     stage: post_export
   cleanup_input_constraints:
     stage: post_export
-  quantize:
-    stage: pattern_matcher
-  quantize_moe:
-    stage: pattern_matcher
   match_repeat_kv:
     stage: pattern_matcher
   match_eager_attention:
@@ -41,3 +37,9 @@ transforms:
   # see https://github.com/NVIDIA/TensorRT-LLM/pull/3668#discussion_r2052714528
   optimize_rope:
     stage: pattern_matcher
+  quantize_from_config:
+    stage: pattern_matcher
+  quantize_from_graph:
+    stage: pattern_matcher
+  quantize_moe:
+    stage: pattern_matcher

tensorrt_llm/_torch/auto_deploy/models/hf.py

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Interface to initialize and load HF models."""
22

3-
import json
43
import os
54
import types
65
from contextlib import contextmanager, nullcontext
@@ -31,6 +30,7 @@
3130
from ..utils._config import deep_merge_dicts
3231
from ..utils.logger import ad_logger
3332
from .factory import ModelFactory, ModelFactoryRegistry
33+
from .quant_config_reader import QuantConfigReader, QuantConfigReaderRegistry
3434

3535

3636
@contextmanager
@@ -84,9 +84,7 @@ def _get_max_position_embeddings_config(self) -> Dict[str, Any]:
8484

8585
def __init__(self, *args, **kwargs):
8686
super().__init__(*args, **kwargs)
87-
88-
self._quant_config: Optional[Dict] = None
89-
87+
self._quant_config_reader: Optional[QuantConfigReader] = None
9088
# Ingest defaults for tokenizer and model kwargs
9189
self.tokenizer_kwargs = deep_merge_dicts(self._tokenizer_defaults, self.tokenizer_kwargs)
9290
self.model_kwargs = deep_merge_dicts(
@@ -156,9 +154,6 @@ def _recursive_update_config(self, config: PretrainedConfig, update_dict: Dict[s
156154

157155
def _build_model(self, device: DeviceLikeType) -> nn.Module:
158156
"""Build the model on the desired device."""
159-
# We only support fp16 to fp4 conversion.
160-
if self._quant_config and self._quant_config.get("quant_algo", None) == "NVFP4":
161-
self.model_kwargs["torch_dtype"] = torch.half
162157

163158
# NOTE (lucaslie): HF doesn't recursively update nested PreTrainedConfig objects. Instead,
164159
# the entire subconfig will be overwritten.
@@ -178,23 +173,24 @@ def _build_model(self, device: DeviceLikeType) -> nn.Module:
178173
model.forward = types.MethodType(self._simple_forward, model)
179174

180175
model.eval()
176+
181177
return model
182178

183179
def get_quant_config(self) -> Dict:
184-
return self._quant_config or {}
180+
"""Returns the quantization config for this model or None if not quantized."""
181+
if self._quant_config_reader is not None:
182+
return self._quant_config_reader.get_config()
183+
return {}
185184

186185
def get_cache_config(self):
187-
"""Setup cache information based on quantization information."""
188-
if self._quant_config is not None and "kv_cache_quant_algo" in self._quant_config.keys():
189-
kv_cache_format = self._quant_config.get("kv_cache_quant_algo", None)
190-
if kv_cache_format is not None:
191-
assert kv_cache_format == "FP8", (
192-
f"KV cache quantization format {kv_cache_format} is not supported."
193-
)
194-
kv_cache_dtype = torch.float8_e4m3fn if kv_cache_format is not None else None
195-
else:
196-
kv_cache_dtype = None
197-
return CacheConfig(dtype=kv_cache_dtype)
186+
"""Return kv cache dtype configuration."""
187+
if not self._quant_config_reader:
188+
return CacheConfig(dtype=None)
189+
190+
kv_cache_dtype = self._quant_config_reader.get_config().get("kv_cache_dtype")
191+
torch_dtype = {"float8_e4m3fn": torch.float8_e4m3fn}.get(kv_cache_dtype, None)
192+
193+
return CacheConfig(dtype=torch_dtype)
198194

199195
def init_tokenizer(self) -> Optional[Any]:
200196
"""Initialize the tokenizer—either a custom name or the model's default."""
@@ -325,22 +321,18 @@ def _load_checkpoint(self, model: nn.Module, device: DeviceLikeType):
325321

326322
def _load_quantization_config(self, fetched_dir: str):
327323
"""Load the quantization config from the model directory if not done already."""
328-
if self._quant_config is not None:
324+
if self._quant_config_reader is not None:
325+
return
326+
# TODO: specified by user or auto-detect
327+
reader_cls = QuantConfigReaderRegistry.get("modelopt")
328+
result = reader_cls.from_file(fetched_dir)
329+
if result is None:
329330
return
331+
reader, extra_model_kwargs = result
330332

331-
assert self.model
332-
hf_quant_config_file = os.path.join(fetched_dir, "hf_quant_config.json")
333-
if os.path.exists(hf_quant_config_file):
334-
with open(hf_quant_config_file, "r") as file:
335-
quantization_config = json.load(file)
336-
assert quantization_config.get("producer", {}).get("name", None) == "modelopt", (
337-
"Only support modelopt quantized checkpoint"
338-
)
339-
self._quant_config = quantization_config.get("quantization", {})
340-
341-
# We do not quantize lm_head.
342-
if "exclude_modules" not in self._quant_config:
343-
self._quant_config["exclude_modules"] = ["lm_head"]
333+
if reader is not None:
334+
self._quant_config_reader = reader
335+
self.model_kwargs = deep_merge_dicts(self.model_kwargs, extra_model_kwargs)
344336

345337

346338
@ModelFactoryRegistry.register("AutoModelForImageTextToText")
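
As a small illustration of the new get_cache_config() behavior above: the reader stores the kv cache dtype as a string, and the factory maps it to a torch dtype with a lookup table, falling back to None for unquantized models. A minimal standalone sketch of that mapping, using the values visible in the diff:

import torch

kv_cache_dtype = "float8_e4m3fn"  # written by the modelopt reader when kv_cache_quant_algo == "FP8"
torch_dtype = {"float8_e4m3fn": torch.float8_e4m3fn}.get(kv_cache_dtype, None)
assert torch_dtype is torch.float8_e4m3fn

# Any other value, or a missing key for unquantized models, maps to None:
assert {"float8_e4m3fn": torch.float8_e4m3fn}.get(None, None) is None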
tensorrt_llm/_torch/auto_deploy/models/quant_config_reader.py

Lines changed: 130 additions & 0 deletions

@@ -0,0 +1,130 @@
+"""
+Quantization Config Reader Registry.
+
+This module defines a registry system for parsing quantization configurations
+from various sources (e.g., 'modelopt'). It enables extensible support for different
+quantization producers by delegating parsing logic to dedicated subclasses.
+"""
+
+import json
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Optional, Tuple, Type
+
+import torch
+
+
+class QuantConfigReader(ABC):
+    """Base class for reading and parsing quantization config."""
+
+    def __init__(self):
+        self._quant_config: Optional[Dict] = None
+
+    def get_config(self) -> Dict:
+        """Return the parsed quantization config."""
+        return self._quant_config or {}
+
+    @abstractmethod
+    def read_config(self, config: Dict) -> Dict:
+        """
+        Parse and normalize a quantization config dictionary.
+
+        Args:
+            config: The raw "quantization" field from the JSON file.
+
+        Returns:
+            A processed and normalized config dictionary.
+        """
+        pass
+
+    @classmethod
+    @abstractmethod
+    def from_file(cls, file_path: str) -> Optional["QuantConfigReader"]:
+        """
+        Load and parse a quantization config file from disk.
+
+        This method is implemented by each reader to handle loading and parsing logic.
+
+        Args:
+            file_path: Path to the quant config JSON file.
+
+        Returns:
+            An initialized QuantConfigReader instance, or None if the file doesn't exist.
+        """
+        pass
+
+
+class QuantConfigReaderRegistry:
+    _registry: Dict[str, Type[QuantConfigReader]] = {}
+
+    @classmethod
+    def register(cls, name: str) -> Callable[[Type[QuantConfigReader]], Type[QuantConfigReader]]:
+        def inner(reader_cls: Type[QuantConfigReader]) -> Type[QuantConfigReader]:
+            cls._registry[name] = reader_cls
+            return reader_cls
+
+        return inner
+
+    @classmethod
+    def get(cls, name: str) -> Type[QuantConfigReader]:
+        if name not in cls._registry:
+            raise ValueError(f"QuantConfigReader for '{name}' not registered.")
+        return cls._registry[name]
+
+    @classmethod
+    def has(cls, reader_cls: str) -> bool:
+        return reader_cls in cls._registry
+
+
+@QuantConfigReaderRegistry.register("modelopt")
+class ModelOPTQuantConfigReader(QuantConfigReader):
+    def read_config(self, config: Dict) -> Dict:
+        # Inject default exclusion
+        config.setdefault("exclude_modules", ["lm_head"])
+
+        # Update dtype
+        if config.get("quant_algo") == "NVFP4":
+            config["torch_dtype"] = "float16"
+
+        # Handle kv cache
+        kv_algo = config.get("kv_cache_quant_algo")
+        if kv_algo:
+            if kv_algo != "FP8":
+                raise ValueError(f"KV cache quantization format {kv_algo} not supported.")
+            config["kv_cache_dtype"] = "float8_e4m3fn"
+
+        self._quant_config = config
+        return self._quant_config
+
+    @classmethod
+    def from_file(
+        cls, ckpt_dir: str
+    ) -> Optional[Tuple["ModelOPTQuantConfigReader", Optional[torch.dtype]]]:
+        """
+        Load and parse a modelopt-style quantization config from a checkpoint directory.
+
+        Args:
+            ckpt_dir: Path to the root directory containing the checkpoint.
+
+        Returns:
+            An initialized ModelOPTQuantConfigReader instance, or None if the file doesn't exist.
+        """
+        quant_file = os.path.join(ckpt_dir, "hf_quant_config.json")
+        if not os.path.exists(quant_file):
+            return None
+
+        with open(quant_file, "r") as f:
+            raw = json.load(f)
+
+        producer = raw.get("producer", {}).get("name")
+        # sanity check
+        if producer != "modelopt":
+            raise ValueError(f"Expected producer 'modelopt', got '{producer}'")
+
+        quant_config = raw.get("quantization", {})
+        reader = cls()
+        reader.read_config(quant_config)
+        extra_model_kwargs: Dict[str, Any] = {}
+        if quant_config and quant_config.get("quant_algo", None) == "NVFP4":
+            extra_model_kwargs["torch_dtype"] = "float16"
+        return reader, extra_model_kwargs
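
A usage sketch for the new reader, assuming it is importable from the models package referenced in hf.py; the JSON below is the minimal layout the parser expects, and real ModelOpt checkpoints may carry additional fields:

import json
import os
import tempfile

from tensorrt_llm._torch.auto_deploy.models.quant_config_reader import ModelOPTQuantConfigReader

example = {
    "producer": {"name": "modelopt"},
    "quantization": {"quant_algo": "FP8", "kv_cache_quant_algo": "FP8"},
}

with tempfile.TemporaryDirectory() as ckpt_dir:
    with open(os.path.join(ckpt_dir, "hf_quant_config.json"), "w") as f:
        json.dump(example, f)

    reader, extra_model_kwargs = ModelOPTQuantConfigReader.from_file(ckpt_dir)
    cfg = reader.get_config()
    assert cfg["exclude_modules"] == ["lm_head"]     # default exclusion injected by read_config()
    assert cfg["kv_cache_dtype"] == "float8_e4m3fn"  # derived from kv_cache_quant_algo == "FP8"
    assert extra_model_kwargs == {}                  # torch_dtype is only added for NVFP4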

tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py

Lines changed: 57 additions & 38 deletions
@@ -1,6 +1,5 @@
-from collections import defaultdict
 from functools import partial
-from typing import Dict, Tuple
+from typing import Tuple
 
 import torch.nn as nn
 from torch.fx import GraphModule, Node
@@ -166,67 +165,87 @@ def get_scale_name(scale_name):
         node.args = (*node.args, *scale_values)
 
 
-@TransformRegistry.register("quantize")
-class Quantization(BaseTransform):
-    """Quantize the GraphModule and replace linear/BMM with quantized linear/BMM."""
+@TransformRegistry.register("quantize_from_config")
+class QuantizationFromConfig(BaseTransform):
+    """
+    Quantize linear and BMM ops using a quantization config.
+
+    Replaces eligible ops with quantized equivalents based on the quantization algorithm
+    and exclude patterns defined in the config.
+    """
 
     def _apply(
         self, gm: GraphModule, cm: CachedSequenceInterface, factory: ModelFactory
     ) -> Tuple[GraphModule, TransformInfo]:
-        # extract info from quant_config
         quant_config = factory.get_quant_config()
-        if not quant_config:
+        quant_algo = quant_config.get("quant_algo")
+        excluded_patterns = quant_config.get("exclude_modules", [])
+
+        if not quant_config or not quant_algo:
             return gm, TransformInfo(
                 skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
             )
 
+        num_matches = 0
+
+        for n in gm.graph.nodes:
+            if should_skip_quantization(n, excluded_patterns):
+                continue
+
+            if is_linear_op(n, include_quantization=False):
+                impl = QuantizationImpl.create(quant_algo, is_bmm=False)
+                _insert_quantized_linear(gm, n, impl, False)
+                num_matches += 1
+
+            elif is_bmm_op(n):
+                impl = QuantizationImpl.create(quant_algo, is_bmm=True)
+                _insert_quantized_bmm(gm, n, impl, False)
+                num_matches += 1
+
+        info = TransformInfo(
+            skipped=False, num_matches=num_matches, is_clean=False, has_valid_shapes=True
+        )
+
+        return gm, info
+
+
+@TransformRegistry.register("quantize_from_graph")
+class QuantizationFromGraph(BaseTransform):
+    """
+    Replace ModelOpt-quantized linear ops with fused quantized implementations.
+
+    Detects quantized nodes in the graph of a ModelOpt checkpoint and replaces them with
+    fused linear ops based on the quantization type.
+    """
+
+    def _apply(
+        self, gm: GraphModule, cm: CachedSequenceInterface, factory: ModelFactory
+    ) -> Tuple[GraphModule, TransformInfo]:
        is_quant_graph = is_quantized_graph(gm)
-        quant_algo = quant_config.get("quant_algo")
-        excluded_patterns = quant_config.get("exclude_modules", [])
-        if not quant_algo:
+
+        # no quantization to do
+        if not is_quant_graph:
            return gm, TransformInfo(
                 skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
             )
 
         # tracking quantized operations in the graph
-        quantized_nodes: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
+        num_matches = 0
         for n in gm.graph.nodes:
-            if should_skip_quantization(n, excluded_patterns):
-                continue
-
             # Process linear operations
             if is_linear_op(n, include_quantization=False):
                 # get per-layer quantization format from the node
-                quant_algo_n: str = (
-                    get_quantization_from_linear_node(n) if is_quant_graph else quant_algo
-                )
+                quant_algo_n: str = get_quantization_from_linear_node(n)
                 if not quant_algo_n:
                     continue
 
                 # insert quantized linear node
-                _insert_quantized_linear(
-                    gm, n, QuantizationImpl.create(quant_algo_n), is_quant_graph
-                )
-                quantized_nodes[quant_algo_n]["linear"] += 1
+                _insert_quantized_linear(gm, n, QuantizationImpl.create(quant_algo_n), True)
+                num_matches += 1
 
-            # Process BMM operations
-            elif is_bmm_op(n):
-                if not quant_algo:
-                    continue
-
-                # insert quantized bmm node
-                _insert_quantized_bmm(
-                    gm, n, QuantizationImpl.create(quant_algo, is_bmm=True), is_quant_graph
-                )
-                quantized_nodes[quant_algo]["bmm"] += 1
+            # TODO: check whether quantized BMM also needs a graph-based pass
 
-        if is_quant_graph:
-            remove_output_quantizers(gm)
-
-        num_matches = 0
-        for quant_algo in quantized_nodes:
-            for op_type, count in quantized_nodes[quant_algo].items():
-                num_matches += count
+        remove_output_quantizers(gm)
 
         info = TransformInfo(
             skipped=False, num_matches=num_matches, is_clean=False, has_valid_shapes=True
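
The split above means quantize_from_config keys off factory.get_quant_config(), while quantize_from_graph keys off quantizer nodes already present in the exported graph. For reference, a hypothetical skeleton of a transform following the same registration pattern; it would live alongside the transforms in this file and reuse its imports, and the registry key, class name, and matching logic are illustrative only:

@TransformRegistry.register("my_custom_quantize")  # illustrative key, not part of this commit
class MyCustomQuantize(BaseTransform):
    """Skeleton showing how a quantization-style transform plugs into the registry."""

    def _apply(
        self, gm: GraphModule, cm: CachedSequenceInterface, factory: ModelFactory
    ) -> Tuple[GraphModule, TransformInfo]:
        quant_config = factory.get_quant_config()
        if not quant_config:
            # Nothing to do: report a skipped, clean pass so the pipeline moves on.
            return gm, TransformInfo(
                skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
            )

        num_matches = 0
        # ... rewrite matching nodes in gm.graph here, incrementing num_matches ...
        return gm, TransformInfo(
            skipped=False, num_matches=num_matches, is_clean=False, has_valid_shapes=True
        )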

tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ def __call__(self, cm: CachedSequenceInterface) -> nn.Module:
         ############################################################################################
 
         # Match MoE pattern
+        # TODO:remove quantized linear handling inside this transformation
         match_moe_pattern(egm)
 
         ############################################################################################
