From 73ea701fd7c3bba521d16f4af34c4d4fb10be4d1 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Fri, 7 Mar 2025 06:48:03 -0800
Subject: [PATCH 1/3] Update

[ghstack-poisoned]
---
 torchao/prototype/smoothquant/api.py | 153 +++++++++++++++------------
 1 file changed, 86 insertions(+), 67 deletions(-)

diff --git a/torchao/prototype/smoothquant/api.py b/torchao/prototype/smoothquant/api.py
index 1354a4be30..ab53a394a2 100644
--- a/torchao/prototype/smoothquant/api.py
+++ b/torchao/prototype/smoothquant/api.py
@@ -1,20 +1,30 @@
+import types
+from dataclasses import dataclass
 from typing import Dict, Optional
 
 import torch
 
+from torchao.core.config import AOBaseConfig
 from torchao.dtypes import to_affine_quantized_intx, to_affine_quantized_intx_static
 from torchao.prototype.smoothquant.core import (
     SmoothQuantObservedLinear,
     SmoothQuantObserver,
 )
+from torchao.quantization import quantize_
 from torchao.quantization.linear_activation_quantized_tensor import (
     to_linear_activation_quantized,
 )
 from torchao.quantization.linear_activation_scale import (
     to_weight_tensor_with_linear_activation_scale_metadata,
 )
-from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
+from torchao.quantization.quant_api import (
+    _linear_extra_repr,
+    _replace_with_custom_fn_if_matches_filter,
+)
 from torchao.quantization.quant_primitives import MappingType
+from torchao.quantization.transform_module import (
+    register_quantize_module_handler,
+)
 from torchao.quantization.utils import _get_per_token_block_size
 from torchao.quantization.weight_tensor_linear_activation_quantization import (
     to_weight_tensor_with_linear_activation_quantization_metadata,
 )
@@ -53,32 +63,6 @@ def replace_with_observer(layer):
     _replace_with_custom_fn_if_matches_filter(model, replace_with_observer, _is_linear)
 
 
-def _observed_linear_subclass_inserter(constructor):
-    """
-    Replaces unquantized observed linear instances with quantized linear instances.
-
-    Args:
-        constructor: the function which applies quantization to the observed linear layer
-    """
-
-    def insert_subclass(observed_linear):
-        # creates the new linear layer using constructor
-        linear = torch.nn.Linear(
-            observed_linear.in_features,
-            observed_linear.out_features,
-            observed_linear.bias is not None,
-            device=observed_linear.weight.device,
-            dtype=observed_linear.weight.dtype,
-        )
-        linear.weight = torch.nn.Parameter(
-            constructor(observed_linear), requires_grad=False
-        )
-        linear.bias = observed_linear.bias
-        return linear
-
-    return insert_subclass
-
-
 def save_smooth_quant_recipe(
     model: torch.nn.Module, save_path: str
 ) -> Dict[str, torch.Tensor]:
@@ -121,7 +105,14 @@ def recurse(module: torch.nn.Module, name: str = ""):
             # act_scales is None for dynamic quantization
             if any(x is None for x in (smoothing_factor, wei_scales)):
                 return module
-            return smooth_quant(smoothing_factor, act_scales, wei_scales)(module)
+            is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
+            wrapper = torch.nn.Sequential(module)
+            quantize_(
+                wrapper,
+                smooth_quant(smoothing_factor, act_scales, wei_scales),
+                is_observed_linear,
+            )
+            return wrapper[0]
 
         mod_new = module
 
@@ -158,13 +149,10 @@ def static_quantize(self, input, scale, zero_point):
         )
 
 
-def smooth_quant(
-    smoothing_factor: Optional[torch.Tensor] = None,
-    act_scales: Optional[torch.Tensor] = None,
-    wei_scales: Optional[torch.Tensor] = None,
-):
+@dataclass
+class SmoothQuantConfig(AOBaseConfig):
     """
-    Quantizes linear layers when passed into quantize_()
+    Configuration for quantizing linear layers when passed into quantize_()
 
     Args:
         smoothing_factor: The smoothing factor for the layer. Acquired from the layer's observer if None.
@@ -172,40 +160,71 @@ def smooth_quant(
         act_scales: The activation scales for the layer. Acquired from the layer's observer if None.
         wei_scales: The weight scales for the layer. Acquired from the layer's observer if None.
""" - def quantize_weight(observed_linear): - target_dtype = torch.int8 - # act_scales is None for dynamic quantization thus not checked - if any(x is None for x in (smoothing_factor, wei_scales)): - factor, x_scale, w_scales = observed_linear.obs.calculate_qparams() - weight = observed_linear.obs.weight * factor - else: - factor, x_scale, w_scales = smoothing_factor, act_scales, wei_scales - weight = observed_linear.weight * factor - weight = weight.to(observed_linear.weight.dtype) - block_size = (1, weight.size(1)) - wei_zero_points = torch.zeros_like(w_scales, dtype=torch.int64) - qw = to_affine_quantized_intx_static( - weight, - w_scales, - wei_zero_points, - block_size, - target_dtype, - ) + smoothing_factor: Optional[torch.Tensor] = None + act_scales: Optional[torch.Tensor] = None + wei_scales: Optional[torch.Tensor] = None + + +# for bc +smooth_quant = SmoothQuantConfig - if x_scale is None: - # dynamic quant - qw = to_linear_activation_quantized( - qw, _ActQuantizer(target_dtype).dynamic_quantize - ) - else: - # static quant - x_zero_point = torch.zeros_like(x_scale, dtype=torch.int64) - qw = to_weight_tensor_with_linear_activation_quantization_metadata( - qw, _ActQuantizer(target_dtype).static_quantize, x_scale, x_zero_point - ) - return to_weight_tensor_with_linear_activation_scale_metadata( - qw, factor.to(qw.dtype) +@register_quantize_module_handler(SmoothQuantConfig) +def _smooth_quant_transform( + module: torch.nn.Module, + config: SmoothQuantConfig, +): + smoothing_factor = config.smoothing_factor + act_scales = config.act_scales + wei_scales = config.wei_scales + # weight = module.weight + observed_linear = module + + linear = torch.nn.Linear( + observed_linear.in_features, + observed_linear.out_features, + observed_linear.bias is not None, + device=observed_linear.weight.device, + dtype=observed_linear.weight.dtype, + ) + # linear.weight = torch.nn.Parameter( + # constructor(observed_linear), requires_grad=False + # ) + linear.bias = observed_linear.bias + # return linear + + target_dtype = torch.int8 + # act_scales is None for dynamic quantization thus not checked + if any(x is None for x in (smoothing_factor, wei_scales)): + factor, x_scale, w_scales = observed_linear.obs.calculate_qparams() + weight = observed_linear.obs.weight * factor + else: + factor, x_scale, w_scales = smoothing_factor, act_scales, wei_scales + weight = observed_linear.weight * factor + weight = weight.to(observed_linear.weight.dtype) + block_size = (1, weight.size(1)) + wei_zero_points = torch.zeros_like(w_scales, dtype=torch.int64) + qw = to_affine_quantized_intx_static( + weight, + w_scales, + wei_zero_points, + block_size, + target_dtype, + ) + + if x_scale is None: + # dynamic quant + qw = to_linear_activation_quantized( + qw, _ActQuantizer(target_dtype).dynamic_quantize + ) + else: + # static quant + x_zero_point = torch.zeros_like(x_scale, dtype=torch.int64) + qw = to_weight_tensor_with_linear_activation_quantization_metadata( + qw, _ActQuantizer(target_dtype).static_quantize, x_scale, x_zero_point ) - return _observed_linear_subclass_inserter(quantize_weight) + qw = to_weight_tensor_with_linear_activation_scale_metadata(qw, factor.to(qw.dtype)) + linear.weight = torch.nn.Parameter(qw, requires_grad=False) + linear.extra_repr = types.MethodType(_linear_extra_repr, module) + return linear From 4f2c69de4098bdbf3a71dd48343d5432f50d559a Mon Sep 17 00:00:00 2001 From: vasiliy Date: Fri, 7 Mar 2025 07:22:52 -0800 Subject: [PATCH 2/3] Update [ghstack-poisoned] --- test/prototype/test_autoround.py | 
 test/prototype/test_autoround.py    | 10 ++++++++--
 torchao/prototype/autoround/core.py |  4 ++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/test/prototype/test_autoround.py b/test/prototype/test_autoround.py
index d706175246..e8d4de444d 100644
--- a/test/prototype/test_autoround.py
+++ b/test/prototype/test_autoround.py
@@ -86,7 +86,10 @@ def _check_params_and_buffers_type(module, check_fun):
 
 
 class TestAutoRound(TestCase):
-    @pytest.mark.skip(not TORCH_VERSION_AT_LEAST_2_5, "Requires torch 2.5 or later")
+    @pytest.mark.skip("these tests are broken on main branch")
+    @pytest.mark.skipif(
+        not TORCH_VERSION_AT_LEAST_2_5, reason="Requires torch 2.5 or later"
+    )
     @parametrize("device", _AVAILABLE_DEVICES)
     @torch.no_grad()
     def test_auto_round(self, device: str):
@@ -127,7 +130,10 @@ def test_auto_round(self, device: str):
         after_quant = m(*example_inputs)
         assert after_quant is not None, "Quantized model forward pass failed"
 
-    @pytest.mark.skip(not TORCH_VERSION_AT_LEAST_2_5, "Requires torch 2.5 or later")
+    @pytest.mark.skip("these tests are broken on main branch")
+    @pytest.mark.skipif(
+        not TORCH_VERSION_AT_LEAST_2_5, reason="Requires torch 2.5 or later"
+    )
     @parametrize("device", _AVAILABLE_DEVICES)
     @torch.no_grad()
     def test_wrap_model_with_multi_tensor(self, device: str):
diff --git a/torchao/prototype/autoround/core.py b/torchao/prototype/autoround/core.py
index 0504b5ea27..e14b817c13 100644
--- a/torchao/prototype/autoround/core.py
+++ b/torchao/prototype/autoround/core.py
@@ -165,6 +165,10 @@ def apply_auto_round():
     More details about the auto-round can be found at https://arxiv.org/abs/2309.05516.
     """
+    raise AssertionError(
+        "Please migrate this function to direct configuration, see https://github.com/pytorch/ao/issues/1690 for details"
+    )
+
 
 def _apply_auto_round(optimized_model: torch.nn.Module):
     """
     The `optimized_model` includes `Linear` layers optimized by auto-round, which includes `qdq_weight`, `scale`, `zp`.
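For reference, after PATCH 1/3 the end-to-end SmoothQuant flow is calibrate-then-quantize. A minimal sketch follows; the imports, the observed-linear filter, and the quantize_ call are taken from these patches, while the toy model, tensor shapes, and reliance on insert_smooth_quant_observer_'s default arguments are illustrative assumptions, not part of the diffs:

    import torch

    from torchao.prototype.smoothquant import (
        SmoothQuantConfig,
        SmoothQuantObservedLinear,
        insert_smooth_quant_observer_,
    )
    from torchao.quantization import quantize_

    # toy two-layer model; any module tree containing nn.Linear works the same way
    m = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 64)).eval()

    # step 1: swap nn.Linear for SmoothQuantObservedLinear, then run calibration data
    insert_smooth_quant_observer_(m)  # assumes the default alpha / quant-mode arguments
    for _ in range(8):
        m(torch.randn(16, 64))

    # step 2: replace each observed linear with a quantized linear;
    # SmoothQuantConfig() is the new spelling of the old smooth_quant() call
    is_observed_linear = lambda mod, fqn: isinstance(mod, SmoothQuantObservedLinear)
    quantize_(m, SmoothQuantConfig(), is_observed_linear)

PATCH 3/3 below updates the tests, README, and example script to this spelling.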
From 6f3d1278e33490854d27be113bf4bfaff8d9ec33 Mon Sep 17 00:00:00 2001
From: vasiliy
Date: Sat, 8 Mar 2025 06:15:24 -0800
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 test/prototype/test_smoothquant.py        |  6 +++---
 torchao/prototype/smoothquant/README.md   |  6 +++---
 torchao/prototype/smoothquant/__init__.py |  4 ++--
 torchao/prototype/smoothquant/api.py      | 11 +----------
 torchao/prototype/smoothquant/example.py  |  4 ++--
 5 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/test/prototype/test_smoothquant.py b/test/prototype/test_smoothquant.py
index d90990143c..aed1f6fcd8 100644
--- a/test/prototype/test_smoothquant.py
+++ b/test/prototype/test_smoothquant.py
@@ -5,11 +5,11 @@
 import torch
 
 from torchao.prototype.smoothquant import (
+    SmoothQuantConfig,
     SmoothQuantObservedLinear,
     insert_smooth_quant_observer_,
     load_smooth_quant_recipe,
     save_smooth_quant_recipe,
-    smooth_quant,
 )
 from torchao.quantization import quantize_
 from torchao.quantization.utils import (
@@ -85,7 +85,7 @@ def forward(self, x):
         m(data)
     # quantize
     is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
-    quantize_(m, smooth_quant(), is_observed_linear)
+    quantize_(m, SmoothQuantConfig(), is_observed_linear)
     with torch.inference_mode():
         if TORCH_VERSION_AT_LEAST_2_5:
             m = torch.compile(m, fullgraph=True)
@@ -173,7 +173,7 @@ def test_save_load_recipe(alpha, quant_mode, device, idtype):
 
     # quantize
     is_observed_linear = lambda m, fqn: isinstance(m, SmoothQuantObservedLinear)
-    quantize_(m, smooth_quant(), is_observed_linear)
+    quantize_(m, SmoothQuantConfig(), is_observed_linear)
     if TORCH_VERSION_AT_LEAST_2_5:
         # earlier versions are not compatible
         m = torch.compile(m, fullgraph=True)
diff --git a/torchao/prototype/smoothquant/README.md b/torchao/prototype/smoothquant/README.md
index fa64fc4460..c268a83504 100644
--- a/torchao/prototype/smoothquant/README.md
+++ b/torchao/prototype/smoothquant/README.md
@@ -27,7 +27,7 @@ python example.py -m MODEL_ID --device= --quant-mode=
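Taken together, the series converges on one extension pattern, which the error added in PATCH 2/3 (see issue #1690) asks other workflows to adopt as well: a config dataclass paired with a registered transform handler. A minimal sketch of that pattern, where MyWorkflowConfig, _my_workflow_transform, and the scale field are hypothetical stand-ins rather than torchao APIs:

    from dataclasses import dataclass

    import torch

    from torchao.core.config import AOBaseConfig
    from torchao.quantization import quantize_
    from torchao.quantization.transform_module import register_quantize_module_handler


    @dataclass
    class MyWorkflowConfig(AOBaseConfig):
        # workflow arguments become config fields instead of function parameters
        scale: float = 1.0


    @register_quantize_module_handler(MyWorkflowConfig)
    def _my_workflow_transform(
        module: torch.nn.Module, config: MyWorkflowConfig
    ) -> torch.nn.Module:
        # quantize_ dispatches here for every module matching its filter;
        # a real workflow would build and return a quantized replacement module
        module.weight = torch.nn.Parameter(
            module.weight * config.scale, requires_grad=False
        )
        return module


    # usage: pass the config object directly to quantize_
    m = torch.nn.Sequential(torch.nn.Linear(8, 8))
    quantize_(m, MyWorkflowConfig(scale=1.0))

SmoothQuantConfig and _smooth_quant_transform in PATCH 1/3 are the concrete instance of this pattern.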