From 46d3caab6a7695a8d90dc77467380bb7f0df69ef Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 15 Aug 2024 14:10:45 +0800
Subject: [PATCH 01/23] add awq example

Signed-off-by: Kaihui-intel
---
 .../weight_only/run_clm_no_trainer.py         | 35 ++++++++++++++++++-
 .../quantization/weight_only/run_quant.sh     |  7 ++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
index abd8228354e..d5eb7b2f1b5 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
@@ -77,6 +77,15 @@
                     help='Calibration dataset sequence max length, '
                          'this should align with your model config, '
                          'and your dataset builder args: args.pad_max_length')
+# =============AWQ configs====================
+parser.add_argument("--use_auto_scale", action="store_true",
+                    help="Enables searching for the best scales based on the activation distribution.")
+parser.add_argument("--use_auto_clip", action="store_true",
+                    help="Enables clip range search.")
+parser.add_argument("--folding", action="store_true",
+                    help="Allow inserting a mul before linear when the scale cannot be absorbed by the last layer.")
+parser.add_argument('--absorb_layer_dict', type=dict, default={},
+                    help="The dict of layers whose scale can be absorbed.")
 # =============DoubleQuant configs====================
 parser.add_argument("--double_quant_type",
@@ -223,9 +232,14 @@ def get_user_model():
         shuffle=False,
         collate_fn=calib_evaluator.collate_batch,
     )
+    def calib_func(prepared_model):
+        for i, calib_input in enumerate(calib_dataloader):
+            if i >= args.calib_iters:
+                break
+            prepared_model(calib_input[0])

     # 3.x api
-    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize
+    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, AWQConfig, prepare, convert
     from neural_compressor.torch.utils import get_double_quant_config_dict
     weight_sym = True if args.woq_scheme == "sym" else False
     if args.double_quant_type is not None:
@@ -311,6 +325,25 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         user_model = prepare(model=user_model, quant_config=quant_config)
         run_fn_for_gptq(user_model, dataloader_for_calibration)
         user_model = convert(user_model)
+    elif args.woq_algo == "AWQ":
+        quant_config = AWQConfig(
+            dtype=args.woq_dtype,
+            bits=args.woq_bits,
+            use_sym=weight_sym,
+            group_size=args.woq_group_size,
+            group_dim=args.woq_group_dim,
+            use_auto_scale=args.use_auto_scale,
+            use_auto_clip=args.use_auto_clip,
+            folding=args.folding,
+            absorb_layer_dict=args.absorb_layer_dict,
+        )
+        example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
+        run_fn = calib_func
+        user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
+        run_fn(user_model)
+        user_model = convert(user_model)
+
+    user_model.save(args.output_dir)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh
index a860712b697..50bed4bbb68 100644
---
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -85,6 +85,13 @@ function run_tuning { model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 ---calib_iters 128" fi python -u run_clm_no_trainer.py \ From 1f9bbab37ea06516fa2f4f47f87575e3cdb9803d Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 14:24:22 +0800 Subject: [PATCH 02/23] update extension test Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index c3ae3f6b5be..afc6b9408fb 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -84,6 +84,20 @@ "main_script": "run_clm_no_trainer.py", "batch_size": 8 }, + "gpt_j_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, "gpt_j_ipex":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", "dataset_location": "", From 9cb9795662585067b89ddbb9c39d0d4bd03a9073 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:10:55 +0800 Subject: [PATCH 03/23] update extension path Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index afc6b9408fb..0df06fb940d 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -85,14 +85,14 @@ "batch_size": 8 }, "gpt_j_woq_awq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", "batch_size": 1 }, "opt_125m_woq_awq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", From b40c2c90109dd47bce216cfb14c5231fc2951bdb Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:21:48 +0800 Subject: [PATCH 04/23] minor fix Signed-off-by: Kaihui-intel --- .../language-modeling/quantization/weight_only/run_quant.sh | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 50bed4bbb68..15baea53a77 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -91,7 +91,7 @@ function run_tuning { extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 ---calib_iters 128" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" fi python -u run_clm_no_trainer.py \ From 6382a489708b1b0618372e7db6976d000c827406 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:33:32 +0800 Subject: [PATCH 05/23] update awq benchmark Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 9e1d766128e..2ef001a5534 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -114,6 +114,13 @@ function run_benchmark { model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" fi python -u run_clm_no_trainer.py \ From 4552103e14952573da353567c8ee79a2a233ede8 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:49:46 +0800 Subject: [PATCH 06/23] update benchmarking Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 55 ++++++++----------- .../quantization/weight_only/run_quant.sh | 3 + 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 2ef001a5534..e5f8d54b644 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -70,65 +70,54 @@ function run_benchmark { fi echo $extra_cmd - if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then + if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 
--woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" + model_name_or_path="EleutherAI/gpt-j-6b"" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo 
AWQ --woq_bits 4 --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" fi - - python -u run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --output_dir ${tuned_checkpoint} \ - --task ${task} \ - --batch_size ${batch_size} \ - ${extra_cmd} ${mode_cmd} + + if [[ ${mode} == "accuracy" ]]; then + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} + elif [[ ${mode} == "performance" ]]; then + incbench --num_cores_per_instance 4 run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + ${extra_cmd} ${mode_cmd} + else + echo "Error: No such mode: ${mode}" + exit 1 } main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 15baea53a77..0847976a99c 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -92,6 +92,9 @@ function run_tuning { elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" fi python -u run_clm_no_trainer.py \ From 1a13c7231e9798cfff910452f07f3076af484ef2 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:50:37 +0800 Subject: [PATCH 07/23] add autoround Signed-off-by: Kaihui-intel --- .../weight_only/run_clm_no_trainer.py | 67 ++++++++++++++++++- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index d5eb7b2f1b5..761b86a8d46 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -52,7 +52,7 @@ type=str, help="tasks for accuracy validation") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============WeightOnly configs=============== -parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound'], help="Weight-only parameter.") parser.add_argument("--woq_bits", type=int, default=8) parser.add_argument("--woq_dtype", type=str, default="int") @@ -86,6 +86,26 @@ help="Allow insert mul before linear when the scale cannot be absorbed by last layer.") parser.add_argument('--absorb_layer_dict', type=dict, default={}, help="The layer dict that scale can be absorbed.") +# ============AUTOROUND configs============== +parser.add_argument( + "--lr", + type=float, + default=None, + help="learning rate, if None, it will be set to 1.0/iters automatically", +) 
+parser.add_argument(
+    "--minmax_lr",
+    type=float,
+    default=None,
+    help="minmax learning rate; if None, it will be set to the same value as lr",
+)
+parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.")
+parser.add_argument("--autoround_nsamples", default=128, type=int, help="num samples for autoround calibration.")
+parser.add_argument(
+    "--disable_quanted_input",
+    action="store_true",
+    help="disable using the output of the previous quantized block to tune the next block",
+)
 # =============DoubleQuant configs====================
 parser.add_argument("--double_quant_type",
@@ -239,7 +259,14 @@ def calib_func(prepared_model):
             prepared_model(calib_input[0])

     # 3.x api
-    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, AWQConfig, prepare, convert
+    from neural_compressor.torch.quantization import (
+        RTNConfig,
+        GPTQConfig,
+        AWQConfig,
+        AutoRoundConfig,
+        prepare,
+        convert
+    )
     from neural_compressor.torch.utils import get_double_quant_config_dict
     weight_sym = True if args.woq_scheme == "sym" else False
     if args.double_quant_type is not None:
@@ -342,7 +369,41 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
         run_fn(user_model)
         user_model = convert(user_model)
-
+    elif args.woq_algo == "AutoRound":
+        quant_config = AutoRoundConfig(
+            dtype=args.woq_dtype,
+            bits=args.woq_bits,
+            use_sym=weight_sym,
+            group_size=args.woq_group_size,
+            enable_quanted_input=not args.disable_quanted_input,
+            lr=args.lr,
+            minmax_lr=args.minmax_lr,
+            seqlen=args.pad_max_length,
+            nsamples=args.autoround_nsamples,
+            iters=args.autoround_iters,
+        )
+        quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
+        from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
+        dataloader = get_dataloader(tokenizer=tokenizer,
+                                    seqlen=args.pad_max_length,
+                                    dataset_name=args.dataset,
+                                    seed=args.seed,
+                                    bs=args.batch_size,
+                                    nsamples=args.autoround_nsamples)
+        @torch.no_grad()
+        def run_fn_for_autoround(model, dataloader):
+            for data in dataloader:
+                if isinstance(data, (tuple, list)):
+                    model(*data)
+                elif isinstance(data, dict):
+                    model(**data)
+                else:
+                    model(data)
+        run_fn = run_fn_for_autoround
+        run_args = (dataloader,)
+        user_model = prepare(model=user_model, quant_config=quant_config)
+        run_fn(user_model, *run_args)
+        user_model = convert(user_model)
     user_model.save(args.output_dir)

From 29f1ba9c1b21fc9ff9e2f87b00f6a99095da6f5d Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 15 Aug 2024 16:01:31 +0800
Subject: [PATCH 08/23] minor fix

Signed-off-by: Kaihui-intel
---
 .../language-modeling/quantization/weight_only/run_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
index e5f8d54b644..3f90f89244e 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
@@ -85,7 +85,7 @@ function run_benchmark {
     elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
-
model_name_or_path="EleutherAI/gpt-j-6b"" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then From b0da8224c80969faf34e53b37718df80097b04c4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 16:18:48 +0800 Subject: [PATCH 09/23] minor fix Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 3f90f89244e..d2e83223680 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -101,7 +101,7 @@ function run_benchmark { elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" fi - + if [[ ${mode} == "accuracy" ]]; then python -u run_clm_no_trainer.py \ --model ${model_name_or_path} \ @@ -118,6 +118,8 @@ function run_benchmark { else echo "Error: No such mode: ${mode}" exit 1 + fi + } main "$@" From 17e36fc6ade7f0a988ba206c6ce88db89401dbbc Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 21:25:21 +0800 Subject: [PATCH 10/23] update group_size Signed-off-by: Kaihui-intel --- .../language-modeling/quantization/weight_only/run_quant.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 0847976a99c..fa559ab4cbb 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -87,11 +87,11 @@ function run_tuning { extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" From 538f0089ec7e2297cea1131ae248a9f2735044a0 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 16 Aug 2024 08:43:55 +0800 Subject: [PATCH 11/23] add autoround extension test Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 0df06fb940d..a618faa6193 100644 --- 
a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -98,6 +98,13 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 1
     },
+    "opt_125m_woq_autoround_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
     "gpt_j_ipex":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
       "dataset_location": "",

From f703f60af891df874c01f113efc2b026a3b63cd8 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 16 Aug 2024 10:18:32 +0800
Subject: [PATCH 12/23] add teq

Signed-off-by: Kaihui-intel
---
 .../weight_only/run_clm_no_trainer.py         | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
index 761b86a8d46..6062845b045 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
@@ -83,9 +83,9 @@
 parser.add_argument("--use_auto_clip", action="store_true",
                     help="Enables clip range search.")
 parser.add_argument("--folding", action="store_true",
-                    help="Allow inserting a mul before linear when the scale cannot be absorbed by the last layer.")
+                    help="Allow inserting a mul before linear when the scale cannot be absorbed by the last layer, for TEQ/AWQ.")
 parser.add_argument('--absorb_layer_dict', type=dict, default={},
-                    help="The dict of layers whose scale can be absorbed.")
+                    help="The dict of layers whose scale can be absorbed, for TEQ/AWQ.")
 # ============AUTOROUND configs==============
 parser.add_argument(
     "--lr",
@@ -264,6 +264,7 @@ def calib_func(prepared_model):
         GPTQConfig,
         AWQConfig,
         AutoRoundConfig,
+        TEQConfig,
         prepare,
         convert
     )
@@ -369,6 +370,20 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
         run_fn(user_model)
         user_model = convert(user_model)
+    elif args.woq_algo == "TEQ":
+        quant_config = TEQConfig(
+            dtype=args.woq_dtype,
+            bits=args.woq_bits,
+            use_sym=weight_sym,
+            group_size=args.woq_group_size,
+            group_dim=args.woq_group_dim,
+            folding=args.folding,
+        )
+        example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
+        run_fn = calib_func
+        user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
+        run_fn(user_model)
+        user_model = convert(user_model)
     elif args.woq_algo == "AutoRound":
         quant_config = AutoRoundConfig(
             dtype=args.woq_dtype,
@@ -404,6 +419,7 @@ def run_fn_for_autoround(model, dataloader):
         user_model = prepare(model=user_model, quant_config=quant_config)
         run_fn(user_model, *run_args)
         user_model = convert(user_model)
+
     user_model.save(args.output_dir)

From 6c380326999f156f0006e3d160d8b2933c086cc1 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 16 Aug 2024 10:29:48 +0800
Subject: [PATCH 13/23] update readme

Signed-off-by: Kaihui-intel
---
 .../quantization/weight_only/README.md        | 73 ++++++++++++++++---
 1 file changed, 61 insertions(+), 12 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
index 889d7b42682..0519b490ff7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
@@ -35,9 +35,8 @@ python run_clm_no_trainer.py \
     --woq_group_size 128 \
     --gptq_max_seq_length 2048 \
     --gptq_use_max_length \
-    --accuracy \
-    --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --double_quant_type "BNB_NF4" \
+    --output_dir saved_results

 # "--woq_algo RTN" is used to enable RTN algorithms
 python run_clm_no_trainer.py \
@@ -48,9 +47,38 @@ python run_clm_no_trainer.py \
     --woq_bits 4 \
     --woq_scheme asym \
     --woq_group_size 128 \
+    --double_quant_type "BNB_NF4" \
+    --output_dir saved_results
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AWQ \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128 \
+    --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AutoRound \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --int8 \
     --accuracy \
     --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --output_dir saved_results
 ```

 **Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.
@@ -72,8 +100,6 @@ python run_clm_no_trainer.py \
     --woq_group_size 128 \
     --gptq_max_seq_length 2048 \
     --gptq_use_max_length \
-    --accuracy \
-    --tasks "lambada_openai" \
     --double_quant_type "BNB_NF4"

 # "--woq_algo RTN" is used to enable RTN algorithms
@@ -85,13 +111,40 @@ python run_clm_no_trainer.py \
     --woq_bits 4 \
     --woq_scheme asym \
     --woq_group_size 128 \
+    --double_quant_type "BNB_NF4"
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AWQ \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128 \
+    --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AutoRound \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --int8 \
     --accuracy \
     --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --output_dir saved_results
 ```

 ### LLAMA2-7b/13b/70b
>Note: LLAMA requires IPEX >= 2.1 to get better accuracy.
#### Quantization ```bash @@ -107,8 +160,6 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" # "--woq_algo RTN" is used to enable RTN algorithms @@ -120,8 +171,6 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" ``` From f8fbcf667ce855c271ec718c46e04ef70c00c2c2 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Fri, 16 Aug 2024 17:28:13 +0800 Subject: [PATCH 14/23] limit onnxruntime version Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_itrex.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/ut/run_itrex.sh b/.azure-pipelines/scripts/ut/run_itrex.sh index 2bbbf958398..5adaf86579b 100644 --- a/.azure-pipelines/scripts/ut/run_itrex.sh +++ b/.azure-pipelines/scripts/ut/run_itrex.sh @@ -18,7 +18,8 @@ bash /intel-extension-for-transformers/.github/workflows/script/install_binary.s sed -i '/neural-compressor.git/d' /intel-extension-for-transformers/tests/requirements.txt pip install -r /intel-extension-for-transformers/tests/requirements.txt # workaround -pip install onnx==1.15.0 +pip install onnx==1.16.0 +pip install onnxruntime==1.18.0 echo "pip list itrex ut deps..." pip list LOG_DIR=/neural-compressor/log_dir From 29e8dc3c43c6bab930921ed9d9fb45d7e353e18c Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 20 Aug 2024 12:32:33 +0800 Subject: [PATCH 15/23] rm float Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 6062845b045..a75cb907354 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -223,7 +223,7 @@ def get_user_model(): revision=args.revision, ) tokenizer = AutoTokenizer.from_pretrained(args.model) - user_model = user_model.float() + # user_model = user_model.float() # Set model's seq_len when GPTQ calibration is enabled. 
if args.woq_algo == 'GPTQ': From e8610c1d306dfc74ad62b7dff49b35a08f6f58e0 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 20 Aug 2024 13:28:37 +0800 Subject: [PATCH 16/23] fix ar dtype Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 1 + .../quantization/weight_only/run_clm_no_trainer.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index d2e83223680..5b4627a1d2d 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -100,6 +100,7 @@ function run_benchmark { model_name_or_path="facebook/opt-125m" elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound" fi if [[ ${mode} == "accuracy" ]]; then diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index a75cb907354..bba8ec57d6a 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -223,7 +223,9 @@ def get_user_model(): revision=args.revision, ) tokenizer = AutoTokenizer.from_pretrained(args.model) - # user_model = user_model.float() + user_model = user_model.float() + if args.woq_algo == 'AutoRound': + user_model.to(torch.float32) # Set model's seq_len when GPTQ calibration is enabled. 
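# NOTE (reviewer annotation, not part of the upstream patch): the float32 cast
# above is load-bearing for AutoRound, whose rounding values are tuned with small
# gradient-based updates that can underflow if the weights stay in half precision.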
if args.woq_algo == 'GPTQ': From a7ec2fe573bec0aaea1a9ca5e712f6e2394d699b Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:14:26 +0800 Subject: [PATCH 17/23] update autotune Signed-off-by: Kaihui-intel --- .../weight_only/run_clm_no_trainer.py | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 7773e73c988..1b44e42f0c8 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -62,6 +62,7 @@ parser.add_argument("--woq_scheme", default="sym") parser.add_argument("--woq_use_mse_search", action="store_true") parser.add_argument("--woq_use_full_range", action="store_true") +parser.add_argument("--quant_lm_head", action="store_true", help="whether to quant the lm_head layer in transformers") # =============GPTQ configs==================== parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") @@ -268,6 +269,9 @@ def calib_func(prepared_model): AWQConfig, AutoRoundConfig, TEQConfig, + TuningConfig, + autotune, + get_woq_tuning_config, prepare, convert ) @@ -283,6 +287,7 @@ def calib_func(prepared_model): # TODO: add group_dim into double quant config? "use_full_range": args.woq_use_full_range, "use_mse_search": args.woq_use_mse_search, + "quant_lm_head": args.quant_lm_head, } ) quant_config = RTNConfig.from_dict(double_quant_config_dict) @@ -300,8 +305,8 @@ def calib_func(prepared_model): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -332,6 +337,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): "act_order": args.gptq_actorder, "block_size": args.gptq_block_size, "static_groups": args.gptq_static_groups, + "quant_lm_head": args.quant_lm_head, } ) quant_config = GPTQConfig.from_dict(double_quant_config_dict) @@ -351,8 +357,8 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", GPTQConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) @@ -367,6 +373,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): use_auto_clip=args.use_auto_clip, folding=args.folding, absorb_layer_dict=args.absorb_layer_dict, + quant_lm_head=args.quant_lm_head, ) example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) run_fn = calib_func @@ -381,6 +388,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): group_size=args.woq_group_size, group_dim=args.woq_group_dim, folding=args.folding, + quant_lm_head=args.quant_lm_head, ) example_inputs = 
torch.ones([1, args.pad_max_length], dtype=torch.long) run_fn = calib_func @@ -422,8 +430,23 @@ def run_fn_for_autoround(model, dataloader): user_model = prepare(model=user_model, quant_config=quant_config) run_fn(user_model, *run_args) user_model = convert(user_model) - - + elif args.woq_algo == "AutoTune": + from utils import DataloaderPreprocessor + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=calib_dataloader, + use_max_length=args.gptq_use_max_length, + max_seq_length=args.gptq_max_seq_length, + ) + dataloader = dataloaderPreprocessor.get_prepared_dataloader() + custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) + best_model = autotune( + model=model, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + run_fn=run_fn_for_gptq, + run_args=(dataloader, True), # run_args should be a tuple, + example_inputs=example_inputs, + ) user_model.save(args.output_dir) From d27604f69cee2ce05dbf9aabcc3c37709f4760dc Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:23:29 +0800 Subject: [PATCH 18/23] update woq_algo Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 1b44e42f0c8..1a6ec1321ef 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -53,7 +53,7 @@ type=str, help="tasks for accuracy validation") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============WeightOnly configs=============== -parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound', 'AutoTune'], help="Weight-only parameter.") parser.add_argument("--woq_bits", type=int, default=8) parser.add_argument("--woq_dtype", type=str, default="int") From 1aa1b13365a67547b2d3206f66d2cf84b00b7102 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:25:32 +0800 Subject: [PATCH 19/23] update user_model Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 1a6ec1321ef..74b510e8f33 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -439,8 +439,8 @@ def run_fn_for_autoround(model, dataloader): ) dataloader = dataloaderPreprocessor.get_prepared_dataloader() custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) - best_model = autotune( - model=model, + user_model = autotune( + model=user_model, tune_config=custom_tune_config, eval_fn=eval_acc_fn, run_fn=run_fn_for_gptq, From f6b5810214706e0eaa7710a000c47b9796ef45c8 
Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:33:32 +0800 Subject: [PATCH 20/23] add eval fn Signed-off-by: Kaihui-intel --- .../weight_only/run_clm_no_trainer.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 74b510e8f33..afc2c2c6b57 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -242,6 +242,31 @@ def get_user_model(): user_model.eval() return user_model, tokenizer +def eval_fn(user_model=None): + user_model.eval() + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="hpu" if is_hpex_available() else "cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + return acc if args.quantize: # dataset @@ -439,10 +464,22 @@ def run_fn_for_autoround(model, dataloader): ) dataloader = dataloaderPreprocessor.get_prepared_dataloader() custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) + from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device + from tqdm import tqdm + def run_fn_for_gptq(model, dataloader_for_calibration, *args): + for batch in tqdm(dataloader_for_calibration): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + return user_model = autotune( model=user_model, tune_config=custom_tune_config, - eval_fn=eval_acc_fn, + eval_fn=eval_fn, run_fn=run_fn_for_gptq, run_args=(dataloader, True), # run_args should be a tuple, example_inputs=example_inputs, From 45e8d00f55dd8d89bcfd52f2dcc785d20e6405cb Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:42:26 +0800 Subject: [PATCH 21/23] update example_inputs Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index afc2c2c6b57..51be2900ba7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -476,6 +476,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): else: model(batch) return + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) user_model = autotune( model=user_model, tune_config=custom_tune_config, From 
4a22029d10bd0caf132adc435d815ce69ab50e53 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:47:37 +0800 Subject: [PATCH 22/23] update extension test Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 7 +++++++ .../quantization/weight_only/run_benchmark.sh | 2 ++ .../quantization/weight_only/run_quant.sh | 3 +++ 3 files changed, 12 insertions(+) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index a618faa6193..809b898d5e3 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -105,6 +105,13 @@ "main_script": "run_clm_no_trainer.py", "batch_size": 1 }, + "opt_125m_woq_autotune_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, "gpt_j_ipex":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", "dataset_location": "", diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 5b4627a1d2d..6c84e27ce88 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -101,6 +101,8 @@ function run_benchmark { elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoRound" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" fi if [[ ${mode} == "accuracy" ]]; then diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index fa559ab4cbb..3c5c7ff9594 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -95,6 +95,9 @@ function run_tuning { elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoTune --woq_bits 4" fi python -u run_clm_no_trainer.py \ From c9893dc05d6fffd43325fd4db69fd4d4ea018ee7 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 9 Oct 2024 13:53:45 +0800 Subject: [PATCH 23/23] update autoround config Signed-off-by: Kaihui-intel --- .../language-modeling/quantization/weight_only/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 3c5c7ff9594..ed4ee705726 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -94,7 +94,7 @@ function run_tuning { extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" + extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128 --woq_scheme asym --autoround_iters 200 --autoround_nsamples 500" elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoTune --woq_bits 4"
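Reviewer note: taken together, patches 17-23 reduce the new AutoTune path to a small API surface. Below is a condensed, self-contained sketch of that flow; the random-token calibration batches and the constant-accuracy eval stub are stand-ins for the pile-10k dataloader and the lm-eval harness used by the real script, and `best_model.save` mirrors the `user_model.save(args.output_dir)` call in the diff.

```python
import torch
from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import (
    TuningConfig,
    autotune,
    get_woq_tuning_config,
)

# Float32 CPU model, as in run_clm_no_trainer.py.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").float().eval()

# Stand-in calibration data shaped like the real pad_max_length-long batches.
calib_data = [torch.randint(0, model.config.vocab_size, (1, 512)) for _ in range(4)]

def run_fn(model, dataloader, *args):
    # Calibration pass over the prepared model (mirrors run_fn_for_gptq).
    with torch.no_grad():
        for batch in dataloader:
            model(batch)

def eval_fn(model):
    # Stub metric: autotune quantizes the model with each candidate WOQ config
    # in turn and keeps the best-scoring one; the example script computes
    # lambada_openai accuracy here instead of returning a constant.
    return 1.0

best_model = autotune(
    model=model,
    tune_config=TuningConfig(config_set=get_woq_tuning_config()),
    eval_fn=eval_fn,
    run_fn=run_fn,
    run_args=(calib_data, True),  # run_args must be a tuple, as noted in the diff
    example_inputs=torch.ones([1, 512], dtype=torch.long),
)
best_model.save("saved_results")
```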