@@ -694,21 +694,28 @@ def autoround_quantize(
     enable_minmax_tuning: bool = True,
     lr: float = None,
     minmax_lr: float = None,
-    low_gpu_mem_usage: bool = True,
+    low_gpu_mem_usage: bool = False,
     iters: int = 200,
     seqlen: int = 2048,
-    n_samples: int = 512,
+    nsamples: int = 128,
     sampler: str = "rand",
     seed: int = 42,
-    n_blocks: int = 1,
+    nblocks: int = 1,
     gradient_accumulate_steps: int = 1,
     not_use_best_mse: bool = False,
     dynamic_max_gap: int = -1,
     data_type: str = "int",  ##only support int for now
     scale_dtype: str = "fp16",
+    multimodal: bool = False,
+    act_bits: int = 32,
+    act_group_size: int = None,
+    act_sym: bool = None,
+    act_dynamic: bool = True,
+    use_layer_wise: bool = False,
     **kwargs,
 ):
     """Run autoround weight-only quantization.
+
     Args:
         model: The PyTorch model to be quantized.
         tokenizer: An optional tokenizer for processing input data. If none is provided, a dataloader must be supplied.
@@ -717,15 +724,19 @@ def autoround_quantize(
         sym (bool): Whether symmetric quantization is to be used (default is False).
         weight_config (dict): Configuration for weight quantization (default is an empty dictionary).
             weight_config={
-                'layer1':##layer_name
-                {
-                    'data_type': 'int',
-                    'bits': 4,
-                    'group_size': 32,
-                    'sym': False
-                }
-                ...
-            }
+                'layer1':##layer_name
+                {
+                    'data_type': 'int',
+                    'bits': 4,
+                    'group_size': 32,
+                    'sym': False,
+                    'act_data_type': None,
+                    'act_bits': 32,
+                    'act_sym': None,
+                    'act_dynamic': True,
+                }
+                ...,
+            }
         enable_full_range (bool): Whether to enable full range quantization (default is False).
         batch_size (int): Batch size for training (default is 8).
         amp (bool): Whether to use automatic mixed precision (default is True).
@@ -737,20 +748,24 @@ def autoround_quantize(
         enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True).
         lr (float): The learning rate (default is None, will be set to 1.0/iters).
         minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically).
-        low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True).
+        low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False).
         iters (int): Number of iterations (default is 200).
         seqlen (int): Data length of the sequence for tuning (default is 2048).
-        n_samples (int): Number of samples (default is 512).
+        nsamples (int): Number of samples (default is 128).
         sampler (str): The sampling method (default is "rand").
         seed (int): The random seed (default is 42).
-        n_blocks (int): Number of blocks (default is 1).
+        nblocks (int): Number of blocks (default is 1).
         gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
         not_use_best_mse (bool): Whether to use mean squared error (default is False).
         dynamic_max_gap (int): The dynamic maximum gap (default is -1).
         data_type (str): The data type to be used (default is "int").
         scale_dtype (str): The data type of quantization scale to be used (default is "float32"), different kernels
             have different choices.
-
+        multimodal (bool): Whether to enable multimodal model quantization (default is False).
+        act_bits (int): Number of bits for activation quantization. Default is 32.
+        act_group_size (int): Group size for activation quantization. Default is None.
+        act_sym (bool): Whether to use symmetric activation quantization. Default is None.
+        act_dynamic (bool): Whether to use dynamic activation quantization. Default is True.
     Returns:
         The quantized model.
     """
@@ -762,7 +777,7 @@ def autoround_quantize(
         bits=bits,
         group_size=group_size,
         sym=sym,
-        weight_config=weight_config,
+        layer_config=weight_config,
         enable_full_range=enable_full_range,  ##for symmetric, TODO support later
         batch_size=batch_size,
         amp=amp,
@@ -776,15 +791,21 @@ def autoround_quantize(
         low_gpu_mem_usage=low_gpu_mem_usage,
         iters=iters,
         seqlen=seqlen,
-        n_samples=n_samples,
+        nsamples=nsamples,
         sampler=sampler,
         seed=seed,
-        n_blocks=n_blocks,
+        nblocks=nblocks,
         gradient_accumulate_steps=gradient_accumulate_steps,
         not_use_best_mse=not_use_best_mse,
         dynamic_max_gap=dynamic_max_gap,
         data_type=data_type,  ## only support data_type
         scale_dtype=scale_dtype,
+        multimodal=multimodal,
+        act_bits=act_bits,
+        act_group_size=act_group_size,
+        act_sym=act_sym,
+        act_dynamic=act_dynamic,
+        low_cpu_mem_usage=use_layer_wise,
         **kwargs,
     )
     qdq_model, weight_config = rounder.quantize()
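As a usage sketch, the call below exercises the renamed and newly added keywords. The model choice, the single return value, and the import of `autoround_quantize` (path omitted; it depends on the repository layout) are assumptions for illustration, not part of this patch.

```python
# Minimal sketch, assuming `autoround_quantize` has been imported from the
# module this patch modifies and that a small causal LM is used for tuning.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "facebook/opt-125m"  # placeholder model for illustration
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

quantized_model = autoround_quantize(  # per the docstring, the quantized model is returned
    model,
    tokenizer,
    bits=4,
    group_size=128,
    sym=False,
    nsamples=128,             # renamed from n_samples; default dropped from 512 to 128
    nblocks=1,                # renamed from n_blocks
    low_gpu_mem_usage=False,  # default flipped from True to False
    act_bits=32,              # new activation-quantization knobs
    act_dynamic=True,
    use_layer_wise=False,     # forwarded to AutoRound as low_cpu_mem_usage
)
```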