@@ -20,7 +20,7 @@
 import math
 import os
 import re
-from collections import OrderedDict, UserDict, namedtuple
+from collections import OrderedDict, UserDict
 from functools import partial

 import yaml
@@ -1800,7 +1800,7 @@ def smooth_quant(
         assert folding, "IPEX version >= 2.1 is required for SmoothQuant folding=False."

         if not hasattr(self, "sq") or force_re_smooth:
-            from .torch_utils.smooth_quant import TorchSmoothQuant
+            from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant

             self.sq = TorchSmoothQuant(
                 model._model, dataloader=dataloader, example_inputs=self.example_inputs, q_func=self.q_func
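
# A compatibility sketch, not part of the patch itself: both import paths appear
# in the hunk above, so code that must run across the refactor could guard the
# import (the except branch assumes the pre-refactor module name shown on the
# removed line still exists in older releases).
try:
    from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant
except ImportError:  # older releases keep it under torch_utils.smooth_quant
    from neural_compressor.adaptor.torch_utils.smooth_quant import TorchSmoothQuant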
@@ -1813,17 +1813,18 @@ def smooth_quant(
             kwargs["percentile"] = percentile
         if scales_per_op is not None:
             kwargs["scales_per_op"] = scales_per_op
+        auto_alpha_args["init_alpha"] = default_alpha
         model._model = self.sq.transform(
             alpha=alpha,
             folding=folding,
             calib_iter=calib_iter,
             weight_clip=weight_clip,
-            default_alpha=default_alpha,
             auto_alpha_args=auto_alpha_args,
             **kwargs,
         )
         if self.sq.record_max_info:
             model.sq_max_info = self.sq.max_value_info
+            model.sq_scale_info = self.sq.sq_scale_info
         return model

     def _apply_pre_optimization(self, model, tune_cfg, recover=False):
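
# Caller-side view of the change above, as a sketch: default_alpha no longer
# travels as a transform() kwarg and instead seeds the auto-tuning search.
# Keys other than "init_alpha" are assumptions; values are placeholders.
default_alpha = 0.5
auto_alpha_args = {}  # any tuning knobs the caller already passes
auto_alpha_args["init_alpha"] = default_alpha  # replaces transform(default_alpha=...)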
@@ -1840,7 +1841,7 @@ def _apply_pre_optimization(self, model, tune_cfg, recover=False):
         q_model = model._model
         sq_max_info = model.sq_max_info
         if sq_max_info:
-            from .torch_utils.smooth_quant import TorchSmoothQuant
+            from neural_compressor.adaptor.torch_utils.waq import TorchSmoothQuant

             tsq = TorchSmoothQuant(q_model, None)
             alpha = tune_cfg["recipe_cfgs"]["smooth_quant_args"]["alpha"]
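
# A minimal sketch (placeholder values) of the recipe-config shape the hunk
# above reads its alpha from:
tune_cfg = {"recipe_cfgs": {"smooth_quant_args": {"alpha": 0.5}}}
alpha = tune_cfg["recipe_cfgs"]["smooth_quant_args"]["alpha"]  # -> 0.5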
@@ -1876,8 +1877,9 @@ def qdq_quantize(self, model, tune_cfg):
             model: qdq quantized model.
         """
         q_model = model._model
+        from neural_compressor.adaptor.torch_utils.waq import get_module, set_module
+
         from .torch_utils.model_wrapper import QDQLinear, SQLinearWrapper
-        from .torch_utils.smooth_quant import get_module, set_module

         smoothquant_scale_info = {}
         fallback_op_name_list = []
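
# A sketch of what dotted-name helpers like get_module/set_module conventionally
# do; illustrative only, the waq implementations may differ in details.
import torch

def get_module(model: torch.nn.Module, key: str) -> torch.nn.Module:
    """Walk a dotted name such as 'encoder.layer.0.fc' down to the submodule."""
    module = model
    for name in key.split("."):
        module = getattr(module, name)
    return module

def set_module(model: torch.nn.Module, key: str, new_module: torch.nn.Module) -> None:
    """Replace the submodule at a dotted name with new_module."""
    parent_key, _, child_key = key.rpartition(".")
    parent = get_module(model, parent_key) if parent_key else model
    setattr(parent, child_key, new_module)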
@@ -3317,37 +3319,7 @@ def qdq_quantize(self, model, q_model, tune_cfg, dataloader, q_func):
         inplace = True if self.performance_only else False

         # fetch SmoothQuant scale info from pre-optimized model
-        sq_max_info = model.sq_max_info
-        if sq_max_info:
-            smoothquant_scale_info = {}
-            from .torch_utils.model_wrapper import SQLinearWrapper
-            from .torch_utils.smooth_quant import get_module
-
-            for _, info in sq_max_info.items():
-                alpha = info["alpha"]
-                absorbed_layer = info["absorbed_layer"]
-                input_minmax = info["input_minmax"]
-                # for peft model, lora_B weights are 0.
-                weight_max = info["weight_max"]
-                if self.sq.weight_clip:
-                    weight_max = weight_max.clamp(min=1e-5)
-                abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
-                input_power = torch.pow(abs_input_max, alpha)
-                weight_power = torch.pow(weight_max, 1 - alpha)
-                scale = torch.clip(input_power / weight_power, min=1e-5)
-                for op_name in absorbed_layer:
-                    module = copy.deepcopy(get_module(q_model._model, op_name))
-                    new_module = SQLinearWrapper(module, 1.0 / scale, input_minmax, alpha)
-                    weight_scale = new_module._get_weight_scale()
-                    smoothquant_scale_info[op_name] = {
-                        "alpha": new_module.alpha,
-                        "input_scale_for_mul": new_module.input_scale,
-                        "input_scale_after_mul": new_module.scale,
-                        "input_zero_point_after_mul": new_module.zero_point,
-                        "input_dtype": new_module.dtype,
-                        "weight_scale_after_mul": weight_scale,
-                    }
-                    logger.debug(f"Current SmoothQuant alpha of {op_name} is {alpha}")
+        smoothquant_scale_info = model.sq_scale_info

         # Check save_qconf_summary part is a workaround for IPEX bug.
         # Sometimes the prepared model from get_op_capablitiy loses this attribute
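
# The deleted block above is what the centralized TorchSmoothQuant now provides
# as model.sq_scale_info. A standalone sketch of its per-channel scale rule
# (the function name and tensor values here are illustrative):
import torch

def smoothquant_scale(input_minmax, weight_max, alpha=0.5, weight_clip=True):
    """s = max(|X|)**alpha / max(|W|)**(1 - alpha), clipped away from zero."""
    if weight_clip:
        weight_max = weight_max.clamp(min=1e-5)  # e.g. PEFT lora_B starts at 0
    abs_input_max = torch.max(torch.abs(input_minmax[0]), torch.abs(input_minmax[1]))
    return torch.clip(abs_input_max.pow(alpha) / weight_max.pow(1 - alpha), min=1e-5)

# One 4-channel layer: channels with larger activation ranges get larger scales.
scale = smoothquant_scale(
    (torch.tensor([-3.0, -0.5, -2.0, -8.0]), torch.tensor([4.0, 1.0, 2.5, 6.0])),
    torch.tensor([0.2, 1.5, 0.8, 0.0]),
)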
@@ -4795,7 +4767,7 @@ def teq_quantize(self, model, tune_cfg, dataloader, calib_func):

         supported_layers = ["Linear"]
         if folding:  # pragma: no cover
-            from .torch_utils.smooth_quant import GraphTrace
+            from neural_compressor.adaptor.torch_utils.waq import GraphTrace

             tg = GraphTrace()
             absorb_to_layer, _ = tg.get_absorb_to_layer(model, self.example_inputs, supported_layers)
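
# A hedged illustration of how the traced mapping is typically consumed; the
# dict layout is inferred from the call above, and the layer names are made up.
absorb_to_layer = {
    "decoder.layers.0.input_layernorm": [
        "decoder.layers.0.self_attn.q_proj",
        "decoder.layers.0.self_attn.k_proj",
    ],
}
for absorber, linears in absorb_to_layer.items():
    # each absorbing layer can fold the smoothing scales of its trailing Linears
    print(absorber, "->", linears)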