From 4fb88435cfa1f226cfb5072e7f4c73f83641425f Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Sun, 9 Jun 2024 16:15:51 +0800 Subject: [PATCH 01/10] modify 3.x ipex example structure Signed-off-by: Cheng, Zixuan --- .../quantization/smooth_quant/README.md | 64 +++++ .../smooth_quant/requirements.txt | 13 + .../smooth_quant/run_benchmark.sh | 94 +++++++ .../smooth_quant/run_clm_no_trainer.py | 260 ++++++++++++++++++ .../quantization/smooth_quant/run_quant.sh | 67 +++++ .../quantization/smooth_quant/utils.py | 193 +++++++++++++ .../quantization/static_quant/README.md | 57 ++++ .../static_quant/requirements.txt | 13 + .../static_quant/run_benchmark.sh | 94 +++++++ .../static_quant/run_clm_no_trainer.py | 256 +++++++++++++++++ .../quantization/static_quant/run_quant.sh | 67 +++++ .../quantization/static_quant/utils.py | 193 +++++++++++++ 12 files changed, 1371 insertions(+) create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md new file mode 100644 index 00000000000..8900ea9fd9b --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md @@ -0,0 +1,64 @@ +Step-by-Step +============ +This document describes the step-by-step instructions to run large language models (LLMs) using Smooth Quantization on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. + +The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. + +# Prerequisite +## 1. 
Create Environment +``` +# Installation +pip install -r requirements.txt +``` + +# Run + +Here is how to run the scripts: + +**Causal Language Modeling (CLM)** + +`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows. +### GPT-J-6b + +#### Quantization +```bash +# "--sq" is used to enable smooth quant +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --quantize \ + --sq \ + --alpha 1.0 \ + --ipex \ + --output_dir "saved_results" +``` +**Notes**: Smooth quantization here is based on torch.jit. Without past key value in example_inputs, the quantized model cannot be used for text-generation. + +### OPT-125m + +#### Quantization + +```bash +# "--sq" is used to enable smooth quant +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --quantize \ + --sq \ + --alpha 0.5 \ + --ipex \ + --output_dir "saved_results" +``` + +### LLAMA2-7b/13b/70b +>Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. +#### Quantization + +```bash +# "--sq" is used to enable smooth quant +python run_clm_no_trainer.py \ + --model meta-llama/Llama-2-7b-hf \ + --quantize \ + --sq \ + --alpha 0.8 \ + --ipex \ + --output_dir "saved_results" +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt new file mode 100644 index 00000000000..f0b56e558d3 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt @@ -0,0 +1,13 @@ +accelerate +protobuf +sentencepiece != 0.1.92 +datasets >= 1.1.3 +torch >= 1.10 +transformers +pytest +wandb +einops +neural-compressor +intel-extension-for-transformers +lm_eval==0.4.2 +peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh new file mode 100644 index 00000000000..955ffd91456 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh @@ -0,0 +1,94 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + batch_size=16 + approach=static + tuned_checkpoint=saved_results + task=lambada_openai + echo ${max_eval_samples} + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd='' + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy " + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" 
--performance --iters "${iters} + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + if [ "${topology}" = "opt_125m_ipex_sq" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" + elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" + elif [ "${topology}" = "gpt_j_ipex_sq" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py new file mode 100644 index 00000000000..2afb74068f5 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -0,0 +1,260 @@ +import argparse +import os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' 
+) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# ============SmoothQuant configs============== +parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. 
+ total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + from neural_compressor.torch.quantization import SmoothQuantConfig + args.alpha = eval(args.alpha) + excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False, excluded_precisions=excluded_precisions) + + if re.search("gpt", user_model.config.model_type): + quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + for batch in tqdm(calib_dataloader): + batch = move_input_to_device(batch, device=None) + try: + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + except ValueError: + pass + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + + +# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result +if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = AutoTokenizer.from_pretrained(args.model) + config = 
AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) +else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh new file mode 100644 index 00000000000..774bb73b6f1 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex_sq" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" + elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" + elif [ "${topology}" = "gpt_j_ipex_sq" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git 
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py new file mode 100644 index 00000000000..38083129a65 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py @@ -0,0 +1,193 @@ +import random +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.common import logger +from neural_compressor.torch.utils import get_torch_version + +class DataloaderPreprocessor: + def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: + self.dataloader_original = dataloader_original + self.use_max_length = use_max_length + self.max_seq_length = max_seq_length + self.nsamples = nsamples + self.dataloader = [] + self.is_ready = False + + def get_prepared_dataloader(self): + if not self.is_ready: + self.prepare_dataloader() + return self.dataloader + + def prepare_dataloader(self): + if self.use_max_length: + # (Recommend) only take sequence whose length exceeds self.max_seq_length, + # which preserves calibration's tokens are all valid + # This is GPTQ official dataloader implementation + self.obtain_first_n_samples_fulllength() + else: + # general selection, no padding, not GPTQ original implementation. + self.obtain_first_n_samples() + self.is_ready = True + + def obtain_first_n_samples(self, seed=0): + """Get first nsample data as the real calibration dataset.""" + self.dataloader.clear() + random.seed(seed) + for batch in self.dataloader_original: + # process data, depends on its data type. + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list, tuple + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] > self.max_seq_length: + i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + batch_final = batch[:] + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim + else: + batch_final[key] = batch[key] + else: + batch_final = batch + # tensor + else: + if batch.shape[-1] > self.max_seq_length: + i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = batch[:, i:j] + else: + batch_final = batch + self.dataloader.append(batch_final) + + if len(self.dataloader) < self.nsamples: + logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") + + def obtain_first_n_samples_fulllength(self, seed=0): + self.dataloader.clear() + random.seed(seed) + unified_length = self.max_seq_length + for batch in self.dataloader_original: + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect 
{self.nsamples} calibration samples.") + break + # list & tuple, gpt-j-6b mlperf, etc. + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] == unified_length: + batch_final = batch[:] + elif batch[0].shape[-1] > unified_length: + i = random.randint(0, batch[0].shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + # not match max length, not include in target dataset + continue + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length == self.max_seq_length: + batch_final = batch + elif length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position + else: + batch_final[key] = batch[key] + else: + # not match max length, not include in target dataset + continue + # tensor + else: + if batch.shape[-1] == unified_length: + batch_final = batch + elif batch.shape[-1] > unified_length: + i = random.randint(0, batch.shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = batch[:, i:j] + else: + # not match max length, not include in target dataset + continue + self.dataloader.append(batch_final) + if len(self.dataloader) < self.nsamples: # pragma: no cover + logger.warning( + f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ + but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." + ) + + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." 
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md
new file mode 100644
index 00000000000..8ecdc6c5110
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md
@@ -0,0 +1,57 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for running large language models (LLMs) with Static Quantization on the 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) using PyTorch and Intel® Extension for PyTorch.
+
+The script `run_clm_no_trainer.py` currently supports quantization of `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon`, and validates last-word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.
+
+# Prerequisite
+## 1. Create Environment
+```
+# Installation
+pip install -r requirements.txt
+```
+
+# Run
+
+Here is how to run the scripts:
+
+**Causal Language Modeling (CLM)**
+
+`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets provided by lm_eval. An example command is shown below.
+### GPT-J-6b
+
+#### Quantization
+```bash
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --quantize \
+    --ipex \
+    --output_dir "saved_results"
+```
+
+### OPT-125m
+
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --quantize \
+    --ipex \
+    --output_dir "saved_results"
+```
+
+### LLAMA2-7b/13b/70b
+>Note: LLaMA requires IPEX >= 2.1 to get better accuracy.
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py \
+    --model meta-llama/Llama-2-7b-hf \
+    --quantize \
+    --ipex \
+    --output_dir "saved_results"
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt
new file mode 100644
index 00000000000..f0b56e558d3
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt
@@ -0,0 +1,13 @@
+accelerate
+protobuf
+sentencepiece != 0.1.92
+datasets >= 1.1.3
+torch >= 1.10
+transformers
+pytest
+wandb
+einops
+neural-compressor
+intel-extension-for-transformers
+lm_eval==0.4.2
+peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh
new file mode 100644
index 00000000000..87359e9a094
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  approach=static
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
+    if [ "${topology}" = "opt_125m_ipex" ]; then
+        model_name_or_path="facebook/opt-125m"
+        extra_cmd=$extra_cmd" --ipex"
+    elif [ "${topology}" = "llama2_7b_ipex" ]; then
+        model_name_or_path="meta-llama/Llama-2-7b-hf"
+        extra_cmd=$extra_cmd" --ipex"
+    elif [ "${topology}" = "gpt_j_ipex" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+        extra_cmd=$extra_cmd" --ipex"
+    fi
+
+    python -u run_clm_no_trainer.py \
+        --model ${model_name_or_path} \
+        --approach ${approach} \
+        --output_dir ${tuned_checkpoint} \
+        --task ${task} \
+        --batch_size ${batch_size} \
+        ${extra_cmd} ${mode_cmd}
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py
new file mode 100644
index 00000000000..5d13abb73fd
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py
@@ -0,0 +1,256 @@
+import argparse
+import 
os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' +) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. 
+ total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + + from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig + quant_config = get_default_static_config() + quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + if re.search("gpt", user_model.config.model_type): + quant_config.set_local(torch.add, StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + for batch in tqdm(calib_dataloader): + batch = move_input_to_device(batch, device=None) + try: + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + except ValueError: + pass + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + + +# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result +if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = 
load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) +else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh new file mode 100644 index 00000000000..a93d8220d64 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "llama2_7b_ipex" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "gpt_j_ipex" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py new file mode 100644 index 00000000000..38083129a65 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py @@ -0,0 +1,193 @@ +import random +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.common import logger +from neural_compressor.torch.utils import get_torch_version + +class DataloaderPreprocessor: + def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: + self.dataloader_original = dataloader_original + self.use_max_length = use_max_length + self.max_seq_length = max_seq_length + self.nsamples = nsamples + self.dataloader = [] + self.is_ready = False + + def get_prepared_dataloader(self): + if not self.is_ready: + self.prepare_dataloader() + return self.dataloader + + def prepare_dataloader(self): + if self.use_max_length: + # (Recommend) only take sequence whose length exceeds self.max_seq_length, + # which preserves calibration's tokens are all valid + # This is GPTQ official dataloader implementation + self.obtain_first_n_samples_fulllength() + else: + # general selection, no padding, not GPTQ original implementation. + self.obtain_first_n_samples() + self.is_ready = True + + def obtain_first_n_samples(self, seed=0): + """Get first nsample data as the real calibration dataset.""" + self.dataloader.clear() + random.seed(seed) + for batch in self.dataloader_original: + # process data, depends on its data type. + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list, tuple + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] > self.max_seq_length: + i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + batch_final = batch[:] + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim + else: + batch_final[key] = batch[key] + else: + batch_final = batch + # tensor + else: + if batch.shape[-1] > self.max_seq_length: + i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = batch[:, i:j] + else: + batch_final = batch + self.dataloader.append(batch_final) + + if len(self.dataloader) < self.nsamples: + logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") + + def obtain_first_n_samples_fulllength(self, seed=0): + self.dataloader.clear() + random.seed(seed) + unified_length = self.max_seq_length + for batch in self.dataloader_original: + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list & tuple, gpt-j-6b mlperf, etc. 
+ if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] == unified_length: + batch_final = batch[:] + elif batch[0].shape[-1] > unified_length: + i = random.randint(0, batch[0].shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + # not match max length, not include in target dataset + continue + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length == self.max_seq_length: + batch_final = batch + elif length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position + else: + batch_final[key] = batch[key] + else: + # not match max length, not include in target dataset + continue + # tensor + else: + if batch.shape[-1] == unified_length: + batch_final = batch + elif batch.shape[-1] > unified_length: + i = random.randint(0, batch.shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = batch[:, i:j] + else: + # not match max length, not include in target dataset + continue + self.dataloader.append(batch_final) + if len(self.dataloader) < self.nsamples: # pragma: no cover + logger.warning( + f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ + but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." + ) + + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." 
From 28be72b9cf902dea16eeab635eaa5f08db97a1fa Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Tue, 11 Jun 2024 10:18:51 +0800 Subject: [PATCH 02/10] add json path Signed-off-by: Cheng, Zixuan --- docs/3x/PT_SmoothQuant.md | 2 +- docs/3x/PT_StaticQuant.md | 2 +- examples/.config/model_params_pytorch_3x.json | 46 ++++++ .../quantization/smooth_quant/utils.py | 146 ------------------ .../quantization/static_quant/utils.py | 146 ------------------ 5 files changed, 48 insertions(+), 294 deletions(-) create mode 100644 examples/.config/model_params_pytorch_3x.json diff --git a/docs/3x/PT_SmoothQuant.md b/docs/3x/PT_SmoothQuant.md index 9e4ae3eb62f..e3a7262dcde 100644 --- a/docs/3x/PT_SmoothQuant.md +++ b/docs/3x/PT_SmoothQuant.md @@ -46,7 +46,7 @@ run_fn(prepared_model) q_model = convert(prepared_model) ``` -To get more information, please refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm). +To get more information, please refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant). ## Validated Models diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md index ec967a780d4..ff84cb6e247 100644 --- a/docs/3x/PT_StaticQuant.md +++ b/docs/3x/PT_StaticQuant.md @@ -68,7 +68,7 @@ q_model = convert(prepared_model) #### Model Examples -Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a new model. +Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant) on how to quantize a new model. 
### Static Quantization with PT2E Backend diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json new file mode 100644 index 00000000000..8520a9545b0 --- /dev/null +++ b/examples/.config/model_params_pytorch_3x.json @@ -0,0 +1,46 @@ +{ + "pytorch": { + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + } + } +} \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py index 38083129a65..76117f8b0b5 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py @@ -1,154 +1,8 @@ -import random import torch from collections import UserDict from packaging.version import Version -from neural_compressor.common import logger from neural_compressor.torch.utils import get_torch_version -class DataloaderPreprocessor: - def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: - self.dataloader_original = dataloader_original - self.use_max_length = use_max_length - self.max_seq_length = max_seq_length - self.nsamples = nsamples - self.dataloader = [] - self.is_ready = False - - def get_prepared_dataloader(self): - if not self.is_ready: - self.prepare_dataloader() - return self.dataloader - - def prepare_dataloader(self): - if self.use_max_length: - # (Recommend) only take sequence whose length exceeds self.max_seq_length, - # which preserves calibration's tokens are all valid - # This is GPTQ official dataloader implementation - self.obtain_first_n_samples_fulllength() - else: - # general selection, no padding, not GPTQ original implementation. - self.obtain_first_n_samples() - self.is_ready = True - - def obtain_first_n_samples(self, seed=0): - """Get first nsample data as the real calibration dataset.""" - self.dataloader.clear() - random.seed(seed) - for batch in self.dataloader_original: - # process data, depends on its data type. 
- if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list, tuple - if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] > self.max_seq_length: - i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - batch_final = batch[:] - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim - else: - batch_final[key] = batch[key] - else: - batch_final = batch - # tensor - else: - if batch.shape[-1] > self.max_seq_length: - i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = batch[:, i:j] - else: - batch_final = batch - self.dataloader.append(batch_final) - - if len(self.dataloader) < self.nsamples: - logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") - - def obtain_first_n_samples_fulllength(self, seed=0): - self.dataloader.clear() - random.seed(seed) - unified_length = self.max_seq_length - for batch in self.dataloader_original: - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list & tuple, gpt-j-6b mlperf, etc. 
- if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] == unified_length: - batch_final = batch[:] - elif batch[0].shape[-1] > unified_length: - i = random.randint(0, batch[0].shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - # not match max length, not include in target dataset - continue - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length == self.max_seq_length: - batch_final = batch - elif length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position - else: - batch_final[key] = batch[key] - else: - # not match max length, not include in target dataset - continue - # tensor - else: - if batch.shape[-1] == unified_length: - batch_final = batch - elif batch.shape[-1] > unified_length: - i = random.randint(0, batch.shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = batch[:, i:j] - else: - # not match max length, not include in target dataset - continue - self.dataloader.append(batch_final) - if len(self.dataloader) < self.nsamples: # pragma: no cover - logger.warning( - f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ - but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." - ) - - def get_example_inputs(model, dataloader): version = get_torch_version() from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py index 38083129a65..76117f8b0b5 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py @@ -1,154 +1,8 @@ -import random import torch from collections import UserDict from packaging.version import Version -from neural_compressor.common import logger from neural_compressor.torch.utils import get_torch_version -class DataloaderPreprocessor: - def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: - self.dataloader_original = dataloader_original - self.use_max_length = use_max_length - self.max_seq_length = max_seq_length - self.nsamples = nsamples - self.dataloader = [] - self.is_ready = False - - def get_prepared_dataloader(self): - if not self.is_ready: - self.prepare_dataloader() - return self.dataloader - - def prepare_dataloader(self): - if self.use_max_length: - # (Recommend) only take sequence whose length exceeds self.max_seq_length, - # which preserves calibration's tokens are all valid - # This is GPTQ official dataloader implementation - self.obtain_first_n_samples_fulllength() - else: - # general selection, no padding, not GPTQ original implementation. 
- self.obtain_first_n_samples() - self.is_ready = True - - def obtain_first_n_samples(self, seed=0): - """Get first nsample data as the real calibration dataset.""" - self.dataloader.clear() - random.seed(seed) - for batch in self.dataloader_original: - # process data, depends on its data type. - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list, tuple - if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] > self.max_seq_length: - i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - batch_final = batch[:] - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim - else: - batch_final[key] = batch[key] - else: - batch_final = batch - # tensor - else: - if batch.shape[-1] > self.max_seq_length: - i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = batch[:, i:j] - else: - batch_final = batch - self.dataloader.append(batch_final) - - if len(self.dataloader) < self.nsamples: - logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") - - def obtain_first_n_samples_fulllength(self, seed=0): - self.dataloader.clear() - random.seed(seed) - unified_length = self.max_seq_length - for batch in self.dataloader_original: - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list & tuple, gpt-j-6b mlperf, etc. 
- if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] == unified_length: - batch_final = batch[:] - elif batch[0].shape[-1] > unified_length: - i = random.randint(0, batch[0].shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - # not match max length, not include in target dataset - continue - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length == self.max_seq_length: - batch_final = batch - elif length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position - else: - batch_final[key] = batch[key] - else: - # not match max length, not include in target dataset - continue - # tensor - else: - if batch.shape[-1] == unified_length: - batch_final = batch - elif batch.shape[-1] > unified_length: - i = random.randint(0, batch.shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = batch[:, i:j] - else: - # not match max length, not include in target dataset - continue - self.dataloader.append(batch_final) - if len(self.dataloader) < self.nsamples: # pragma: no cover - logger.warning( - f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ - but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." 
- ) - - def get_example_inputs(model, dataloader): version = get_torch_version() from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device From 7236eb2a6df9d16f7ab602ca5b0bc74b418bfbab Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Wed, 12 Jun 2024 11:43:35 +0800 Subject: [PATCH 03/10] fix for sq Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/smooth_quant/smooth_quant.py | 9 +++++---- .../torch/algorithms/smooth_quant/utility.py | 6 +++--- .../torch/algorithms/static_quant/utility.py | 8 +++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index cd2686d0b0e..d27dfdd3fbc 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -82,13 +82,14 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): model.output_tensor_id_op_name, ) - # Update json file in ipex_config_path - cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name) - model.eval() - # check smoothquant alpha and act_algo value recipe_cfgs = self.quant_config.get("recipe_cfgs", None) alpha = recipe_cfgs["smooth_quant_args"]["alpha"] + + # Update json file in ipex_config_path + cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha, smooth_quant=True) + model.eval() + for op, _ in self.quant_config["op"].items(): act_algo = self.quant_config["op"][op]["activation"]["algorithm"] diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index 7dc647dbc95..d25b14444ec 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -164,7 +164,7 @@ def get_quantizable_ops_recursively(model, example_inputs, alpha, act_algo, inpl def check_cfg_and_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, smooth_quant=False + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, alpha, smooth_quant=True ): # pragma: no cover """Check configs and quantization configs. @@ -205,7 +205,7 @@ def check_cfg_and_qconfig( else: smooth_quant_enable = False activation_observer = generate_activation_observer( - inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable + inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable, alpha ) if not smooth_quant: if inc_scheme == "sym": @@ -241,7 +241,7 @@ def check_cfg_and_qconfig( def cfg_to_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, smooth_quant=False + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha, smooth_quant=True ): # pragma: no cover assert cfgs is not None, "No configure for IPEX int8 model..." 
op_infos = copy.deepcopy(op_infos_from_cfgs) diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index f90471539fd..2c7fd753e65 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -157,7 +157,9 @@ def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_ return cfgs, user_cfg -def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no cover +def generate_activation_observer( + scheme, algorithm, smooth_quant=False, smooth_quant_enable=False, alpha=0.5 +): # pragma: no cover """This is a helper method to generate an activation observer. Args: @@ -193,7 +195,7 @@ def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_q "reduce_range": False, "quant_min": 0, "quant_max": 255, - "alpha": 0.5, + "alpha": 0.5 if alpha == "auto" else alpha, "act_observer": kl_activation_observer, "act_ic_observer": { "name": "PerChannelMinMaxObserver", @@ -213,7 +215,7 @@ def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_q "reduce_range": False, "quant_min": 0, "quant_max": 255, - "alpha": 0.5, + "alpha": 0.5 if alpha == "auto" else alpha, "act_observer": minmax_activation_observer, "act_ic_observer": { "name": "PerChannelMinMaxObserver", From 5b5ba7d16cf8c11a08c3bfc47ac446839f66b0dd Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Thu, 13 Jun 2024 14:02:55 +0800 Subject: [PATCH 04/10] minor fix Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/smooth_quant/smooth_quant.py | 1 - neural_compressor/torch/algorithms/smooth_quant/utility.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index d27dfdd3fbc..fdfb51640ac 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -121,7 +121,6 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): else: model = ipex.quantization.prepare(model, static_qconfig, example_inputs=example_inputs, inplace=inplace) - cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, smooth_quant=True) model.load_qconf_summary(qconf_summary=ipex_config_path) return model diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index d25b14444ec..21253f26892 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -164,7 +164,7 @@ def get_quantizable_ops_recursively(model, example_inputs, alpha, act_algo, inpl def check_cfg_and_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, alpha, smooth_quant=True + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, alpha=0.5, smooth_quant=True ): # pragma: no cover """Check configs and quantization configs. @@ -241,11 +241,11 @@ def check_cfg_and_qconfig( def cfg_to_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha, smooth_quant=True + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha=0.5, smooth_quant=True ): # pragma: no cover assert cfgs is not None, "No configure for IPEX int8 model..." 
op_infos = copy.deepcopy(op_infos_from_cfgs) - cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, smooth_quant) + cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, alpha, smooth_quant) with open(ipex_config_path, "w") as write_f: json.dump(cfgs, write_f, indent=4) return None From c9827399808acb82384e4c5177cd259af6bf08a5 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:16:13 +0800 Subject: [PATCH 05/10] Update run_clm_no_trainer.py --- .../quantization/static_quant/run_clm_no_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py index 5d13abb73fd..75dfeb34de8 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py @@ -167,7 +167,7 @@ def get_user_model(): quant_config = get_default_static_config() quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] if re.search("gpt", user_model.config.model_type): - quant_config.set_local(torch.add, StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) + quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device from tqdm import tqdm @@ -253,4 +253,4 @@ def run_fn(model): print("Accuracy: %.5f" % acc) print('Throughput: %.3f samples/sec' % (samples / (end - start))) print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) - print('Batch size = %d' % args.batch_size) \ No newline at end of file + print('Batch size = %d' % args.batch_size) From 34282d0903f6e5b2162a87c383b32f9871f286f7 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:38:57 +0800 Subject: [PATCH 06/10] Update run_clm_no_trainer.py --- .../smooth_quant/run_clm_no_trainer.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 2afb74068f5..dbe4ae5fc78 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -178,15 +178,12 @@ def get_user_model(): def run_fn(model): for batch in tqdm(calib_dataloader): batch = move_input_to_device(batch, device=None) - try: - if isinstance(batch, tuple) or isinstance(batch, list): - model(batch[0]) - elif isinstance(batch, dict): - model(**batch) - else: - model(batch) - except ValueError: - pass + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) return from utils import get_example_inputs @@ -257,4 +254,4 @@ def run_fn(model): print("Accuracy: %.5f" % acc) print('Throughput: %.3f samples/sec' % (samples / (end - start))) print('Latency: 
%.3f ms' % ((end - start) * 1000 / samples)) - print('Batch size = %d' % args.batch_size) \ No newline at end of file + print('Batch size = %d' % args.batch_size) From 383b6a221ad040535b795cef75ef35f53c47542c Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:47:20 +0800 Subject: [PATCH 07/10] Update run_clm_no_trainer.py --- .../static_quant/run_clm_no_trainer.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py index 75dfeb34de8..9aee3fbfe55 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py @@ -174,15 +174,12 @@ def get_user_model(): def run_fn(model): for batch in tqdm(calib_dataloader): batch = move_input_to_device(batch, device=None) - try: - if isinstance(batch, tuple) or isinstance(batch, list): - model(batch[0]) - elif isinstance(batch, dict): - model(**batch) - else: - model(batch) - except ValueError: - pass + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) return from utils import get_example_inputs From 6b83c9eb36300a5e6de5fb7b0800e0839b2ea0f8 Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Fri, 14 Jun 2024 12:38:45 +0800 Subject: [PATCH 08/10] minor fix Signed-off-by: Cheng, Zixuan --- docs/3x/PT_StaticQuant.md | 2 +- .../smooth_quant/run_benchmark.sh | 2 + .../smooth_quant/run_clm_no_trainer.py | 29 +- .../quantization/static_quant/ipex/README.md | 57 ++++ .../static_quant/ipex/requirements.txt | 13 + .../static_quant/ipex/run_benchmark.sh | 96 +++++++ .../static_quant/ipex/run_clm_no_trainer.py | 259 ++++++++++++++++++ .../static_quant/ipex/run_quant.sh | 67 +++++ .../quantization/static_quant/ipex/utils.py | 47 ++++ 9 files changed, 560 insertions(+), 12 deletions(-) create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md index ff84cb6e247..7d56f817296 100644 --- a/docs/3x/PT_StaticQuant.md +++ b/docs/3x/PT_StaticQuant.md @@ -68,7 +68,7 @@ q_model = convert(prepared_model) #### Model Examples -Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant) on how to quantize a new model. 
+Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex) on how to quantize a new model. ### Static Quantization with PT2E Backend diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh index 955ffd91456..61c50611090 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh @@ -59,8 +59,10 @@ function run_benchmark { if [[ ${mode} == "accuracy" ]]; then mode_cmd=" --accuracy " + extra_cmd=$extra_cmd" --load" elif [[ ${mode} == "performance" ]]; then mode_cmd=" --performance --iters "${iters} + extra_cmd=$extra_cmd" --load" else echo "Error: No such mode: ${mode}" exit 1 diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index dbe4ae5fc78..ef0590e2982 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -37,6 +37,7 @@ help="Select from ['dynamic', 'static', 'weight-only']") parser.add_argument("--int8", action="store_true") parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--load", action="store_true", help="Load quantized model.") parser.add_argument("--accuracy", action="store_true") parser.add_argument("--performance", action="store_true") parser.add_argument("--iters", default=100, type=int, @@ -176,7 +177,8 @@ def get_user_model(): from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device from tqdm import tqdm def run_fn(model): - for batch in tqdm(calib_dataloader): + calib_iter = 0 + for batch in tqdm(calib_dataloader, total=args.calib_iters): batch = move_input_to_device(batch, device=None) if isinstance(batch, tuple) or isinstance(batch, list): model(batch[0]) @@ -184,6 +186,10 @@ def run_fn(model): model(**batch) else: model(batch) + + calib_iter += 1 + if calib_iter >= args.calib_iters: + break return from utils import get_example_inputs @@ -196,16 +202,17 @@ def run_fn(model): user_model.save(args.output_dir) -# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result -if args.int8 or args.int8_bf16_mixed: - print("load int8 model") - from neural_compressor.torch.quantization import load - tokenizer = AutoTokenizer.from_pretrained(args.model) - config = AutoConfig.from_pretrained(args.model) - user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) - setattr(user_model, "config", config) -else: - user_model, tokenizer = get_user_model() +if args.load: + # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result + if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = 
AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) + else: + user_model, tokenizer = get_user_model() if args.accuracy: diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md new file mode 100644 index 00000000000..8ecdc6c5110 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md @@ -0,0 +1,57 @@ +Step-by-Step +============ +This document describes the step-by-step instructions to run large language models (LLMs) using Static Quantization on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. + +The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. + +# Prerequisite +## 1. Create Environment +``` +# Installation +pip install -r requirements.txt +``` + +# Run + +Here is how to run the scripts: + +**Causal Language Modeling (CLM)** + +`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows. +### GPT-J-6b + +#### Quantization +```bash +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --quantize \ + --alpha 1.0 \ + --ipex \ + --output_dir "saved_results" +``` + +### OPT-125m + +#### Quantization + +```bash +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --quantize \ + --alpha 0.5 \ + --ipex \ + --output_dir "saved_results" +``` + +### LLAMA2-7b/13b/70b +>Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. 
+#### Quantization + +```bash +python run_clm_no_trainer.py \ + --model meta-llama/Llama-2-7b-hf \ + --quantize \ + --alpha 0.8 \ + --ipex \ + --output_dir "saved_results" +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt new file mode 100644 index 00000000000..f0b56e558d3 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt @@ -0,0 +1,13 @@ +accelerate +protobuf +sentencepiece != 0.1.92 +datasets >= 1.1.3 +torch >= 1.10 +transformers +pytest +wandb +einops +neural-compressor +intel-extension-for-transformers +lm_eval==0.4.2 +peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh new file mode 100644 index 00000000000..b62a6381b20 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh @@ -0,0 +1,96 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + batch_size=16 + approach=static + tuned_checkpoint=saved_results + task=lambada_openai + echo ${max_eval_samples} + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd='' + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy " + extra_cmd=$extra_cmd" --load" + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" --performance --iters "${iters} + extra_cmd=$extra_cmd" --load" + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + if [ "${topology}" = "opt_125m_ipex" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "llama2_7b_ipex" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "gpt_j_ipex" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py new file mode 100644 index 00000000000..0ccb2093537 --- /dev/null +++ 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py @@ -0,0 +1,259 @@ +import argparse +import os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' +) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--load", action="store_true", help="Load quantized model.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), 
value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. + total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + + from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig + quant_config = get_default_static_config() + quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + if re.search("gpt", user_model.config.model_type): + quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + calib_iter = 0 + for batch in tqdm(calib_dataloader, total=args.calib_iters): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + + calib_iter += 1 + if calib_iter >= args.calib_iters: + break + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + +if args.load: + # TODO: we need run_benchmark.sh for loading and remove --accuracy in 
run_quant.sh, currently run_quant.sh will get fp32 result + if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) + else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh new file mode 100644 index 00000000000..a93d8220d64 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "llama2_7b_ipex" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "gpt_j_ipex" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir 
${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py new file mode 100644 index 00000000000..76117f8b0b5 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py @@ -0,0 +1,47 @@ +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.torch.utils import get_torch_version + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." 
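
For readers following the new `static_quant/ipex` example, the sketch below illustrates the contract of the `get_example_inputs` helper added in `utils.py` just above. It is a toy illustration and not part of the patch: the `Linear` model and random tensors are placeholders, and it assumes the example's `utils.py` is importable from the current directory.

```python
# Toy sketch of get_example_inputs from static_quant/ipex/utils.py (not part of the patch).
import torch
from torch.utils.data import DataLoader, TensorDataset

from utils import get_example_inputs  # helper added above

model = torch.nn.Linear(8, 8).eval()

# (input, label) batches: the helper unpacks the pair, moves the input to the
# model's device, and returns the first input tensor as example_inputs.
loader = DataLoader(TensorDataset(torch.randn(4, 8), torch.zeros(4)), batch_size=2)
example_inputs = get_example_inputs(model, loader)
print(type(example_inputs), example_inputs.shape)  # <class 'torch.Tensor'> torch.Size([2, 8])

# dict batches (e.g. tokenizer output) come back with any "label" entry dropped,
# as a tuple of values on torch <= 2.0.1 and as a dict on newer torch, which is the
# form passed to prepare(..., example_inputs=...) in run_clm_no_trainer.py.
```
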
From 00fe9d988aa323b57ab8439d5cd6f762fa6ec61f Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Fri, 14 Jun 2024 12:41:14 +0800 Subject: [PATCH 09/10] remove old files Signed-off-by: Cheng, Zixuan --- .../quantization/static_quant/README.md | 57 ---- .../static_quant/requirements.txt | 13 - .../static_quant/run_benchmark.sh | 94 ------- .../static_quant/run_clm_no_trainer.py | 253 ------------------ .../quantization/static_quant/run_quant.sh | 67 ----- .../quantization/static_quant/utils.py | 47 ---- 6 files changed, 531 deletions(-) delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md deleted file mode 100644 index 8ecdc6c5110..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md +++ /dev/null @@ -1,57 +0,0 @@ -Step-by-Step -============ -This document describes the step-by-step instructions to run large language models (LLMs) using Static Quantization on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. - -The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. - -# Prerequisite -## 1. Create Environment -``` -# Installation -pip install -r requirements.txt -``` - -# Run - -Here is how to run the scripts: - -**Causal Language Modeling (CLM)** - -`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows. -### GPT-J-6b - -#### Quantization -```bash -python run_clm_no_trainer.py \ - --model EleutherAI/gpt-j-6B \ - --quantize \ - --alpha 1.0 \ - --ipex \ - --output_dir "saved_results" -``` - -### OPT-125m - -#### Quantization - -```bash -python run_clm_no_trainer.py \ - --model facebook/opt-125m \ - --quantize \ - --alpha 0.5 \ - --ipex \ - --output_dir "saved_results" -``` - -### LLAMA2-7b/13b/70b ->Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. 
-#### Quantization - -```bash -python run_clm_no_trainer.py \ - --model meta-llama/Llama-2-7b-hf \ - --quantize \ - --alpha 0.8 \ - --ipex \ - --output_dir "saved_results" -``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt deleted file mode 100644 index f0b56e558d3..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -accelerate -protobuf -sentencepiece != 0.1.92 -datasets >= 1.1.3 -torch >= 1.10 -transformers -pytest -wandb -einops -neural-compressor -intel-extension-for-transformers -lm_eval==0.4.2 -peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh deleted file mode 100644 index 87359e9a094..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - iters=100 - batch_size=16 - approach=static - tuned_checkpoint=saved_results - task=lambada_openai - echo ${max_eval_samples} - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy " - elif [[ ${mode} == "performance" ]]; then - mode_cmd=" --performance --iters "${iters} - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - if [ "${topology}" = "opt_125m_ipex" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "llama2_7b_ipex" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "gpt_j_ipex" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex" - fi - - python -u run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --approach ${approach} \ - --output_dir ${tuned_checkpoint} \ - --task ${task} \ - --batch_size ${batch_size} \ - ${extra_cmd} ${mode_cmd} -} - -main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py deleted file mode 100644 index 9aee3fbfe55..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py +++ /dev/null @@ -1,253 +0,0 @@ -import 
argparse -import os -import sys - -sys.path.append('./') -import time -import re -import torch -from datasets import load_dataset -from torch.nn.functional import pad -from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer - -parser = argparse.ArgumentParser() -parser.add_argument( - "--model", nargs="?", default="EleutherAI/gpt-j-6b" -) -parser.add_argument( - "--trust_remote_code", default=True, - help="Transformers parameter: use the external repo") -parser.add_argument( - "--revision", default=None, - help="Transformers parameter: set the model hub commit number") -parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") -parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--quantize", action="store_true") -parser.add_argument( - "--int8_bf16_mixed", - action="store_true", - help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", -) -parser.add_argument( - '--seed', - type=int, default=42, help='Seed for sampling the calibration data.' -) -parser.add_argument("--approach", type=str, default='static', - help="Select from ['dynamic', 'static', 'weight-only']") -parser.add_argument("--int8", action="store_true") -parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--performance", action="store_true") -parser.add_argument("--iters", default=100, type=int, - help="For accuracy measurement only.") -parser.add_argument("--batch_size", default=1, type=int, - help="For accuracy measurement only.") -parser.add_argument("--save_accuracy_path", default=None, - help="Save accuracy results path.") -parser.add_argument("--pad_max_length", default=512, type=int, - help="Pad input ids to max length.") -parser.add_argument("--calib_iters", default=512, type=int, - help="calibration iters.") -parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", - type=str, help="tasks for accuracy validation") -parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") - -args = parser.parse_args() -if args.ipex: - import intel_extension_for_pytorch as ipex -calib_size = 1 - - -class Evaluator: - def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): - self.dataset = dataset - self.tokenizer = tokenizer - self.batch_size = batch_size - self.pad_val = pad_val - self.pad_max = pad_max - self.is_calib = is_calib - - # tokenize the dataset - self.dataset = self.dataset.map(self.tokenize_function, batched=True) - self.dataset.set_format(type="torch", columns=["input_ids"]) - - @torch.no_grad() - def tokenize_function(self, examples): - return self.tokenizer(examples["text"]) - - @torch.no_grad() - def collate_batch(self, batch): - - input_ids_padded = [] - last_ind = [] - - for text in batch: - input_ids = text["input_ids"] - pad_len = self.pad_max - input_ids.shape[0] - last_ind.append(input_ids.shape[0] - 1) - if self.is_calib: - input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids - else: - input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) - input_ids_padded.append(input_ids) - - return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) - - @torch.no_grad() - def evaluate(self, model): - model.eval() - # The task is to predict the last word of the 
input. - total, hit = 0, 0 - latency = 0 - test_dataloader = DataLoader( - self.dataset, - batch_size=self.batch_size, - shuffle=False, - collate_fn=self.collate_batch, - ) - for i, (input_ids, last_ind) in enumerate(test_dataloader): - label = input_ids[torch.arange(len(last_ind)), last_ind] - input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val - pad_len = self.pad_max - last_ind - 1 - - start = time.time() - outputs = model(input_ids) - latency += time.time() - start - - last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] - pred = last_token_logits.argmax(dim=-1) - total += label.size(0) - hit += (pred == label).sum().item() - if (i + 1) % 50 == 0: - print(hit / total) - print("Processed minibatch:", i) - - acc = hit / total - print("Accuracy: ", acc) - print("Latency: ", latency) - return acc - - -def get_user_model(): - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors - trust_remote_code=args.trust_remote_code, - revision=args.revision, - ) - tokenizer = AutoTokenizer.from_pretrained(args.model) - - if args.peft_model_id is not None: - from peft import PeftModel - user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) - - # to channels last - user_model = user_model.to(memory_format=torch.channels_last) - user_model.eval() - return user_model, tokenizer - - -if args.quantize: - # dataset - user_model, tokenizer = get_user_model() - calib_dataset = load_dataset(args.dataset, split="train") - # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF - calib_dataset = calib_dataset.shuffle(seed=args.seed) - calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) - calib_dataloader = DataLoader( - calib_evaluator.dataset, - batch_size=calib_size, - shuffle=False, - collate_fn=calib_evaluator.collate_batch, - ) - - - from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig - quant_config = get_default_static_config() - quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - if re.search("gpt", user_model.config.model_type): - quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) - - from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device - from tqdm import tqdm - def run_fn(model): - for batch in tqdm(calib_dataloader): - batch = move_input_to_device(batch, device=None) - if isinstance(batch, tuple) or isinstance(batch, list): - model(batch[0]) - elif isinstance(batch, dict): - model(**batch) - else: - model(batch) - return - - from utils import get_example_inputs - example_inputs = get_example_inputs(user_model, calib_dataloader) - - from neural_compressor.torch.quantization import prepare, convert - user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) - run_fn(user_model) - user_model = convert(user_model) - user_model.save(args.output_dir) - - -# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result -if args.int8 or args.int8_bf16_mixed: - print("load int8 model") - from neural_compressor.torch.quantization import load - tokenizer = AutoTokenizer.from_pretrained(args.model) - config = AutoConfig.from_pretrained(args.model) - user_model = 
load(os.path.abspath(os.path.expanduser(args.output_dir))) - setattr(user_model, "config", config) -else: - user_model, tokenizer = get_user_model() - - -if args.accuracy: - user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser - eval_args = LMEvalParser( - model="hf", - user_model=user_model, - tokenizer=tokenizer, - batch_size=args.batch_size, - tasks=args.tasks, - device="cpu", - ) - results = evaluate(eval_args) - for task_name in args.tasks.split(","): - if task_name == "wikitext": - acc = results["results"][task_name]["word_perplexity,none"] - else: - acc = results["results"][task_name]["acc,none"] - print("Accuracy: %.5f" % acc) - print('Batch size = %d' % args.batch_size) - -if args.performance: - user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser - import time - - samples = args.iters * args.batch_size - eval_args = LMEvalParser( - model="hf", - user_model=user_model, - tokenizer=tokenizer, - batch_size=args.batch_size, - tasks=args.tasks, - limit=samples, - device="cpu", - ) - start = time.time() - results = evaluate(eval_args) - end = time.time() - for task_name in args.tasks.split(","): - if task_name == "wikitext": - acc = results["results"][task_name]["word_perplexity,none"] - else: - acc = results["results"][task_name]["acc,none"] - print("Accuracy: %.5f" % acc) - print('Throughput: %.3f samples/sec' % (samples / (end - start))) - print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) - print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh deleted file mode 100644 index a93d8220d64..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - extra_cmd='' - batch_size=8 - approach='static' - DATASET_NAME="NeelNanda/pile-10k" - tuned_checkpoint="saved_results" - - if [ "${topology}" = "opt_125m_ipex" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "llama2_7b_ipex" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "gpt_j_ipex" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex" - fi - - python -u run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --dataset ${DATASET_NAME} \ - --quantize \ - --approach ${approach} \ - --output_dir ${tuned_checkpoint} \ - --tasks "lambada_openai" \ - --batch_size ${batch_size} \ - ${extra_cmd} -} - -main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py deleted file mode 100644 index 76117f8b0b5..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -from collections import UserDict -from packaging.version import Version -from neural_compressor.torch.utils import get_torch_version - -def get_example_inputs(model, dataloader): - version = get_torch_version() - from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device - - # Suggest set dataloader like calib_dataloader - if dataloader is None: - return None - device = next(model.parameters()).device - try: - for idx, (input, label) in enumerate(dataloader): - input = move_input_to_device(input, device) - if isinstance(input, (dict, UserDict)): # pragma: no cover - assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" - if "label" in input.keys(): - input.pop("label") - if version.release <= Version("2.0.1").release: - return tuple(input.values()) - else: - return dict(input) - if isinstance(input, (list, tuple)): - return tuple(input) - if isinstance(input, torch.Tensor): - return input - break - except Exception as e: # pragma: no cover - for idx, input in enumerate(dataloader): - input = move_input_to_device(input, device) - if isinstance(input, (dict, UserDict)): # pragma: no cover - assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" - if "label" in input.keys(): - input.pop("label") - if version.release <= Version("2.0.1").release: - return tuple(input.values()) - else: - return dict(input) - if isinstance(input, list) or isinstance(input, tuple): - return tuple(input) - if isinstance(input, torch.Tensor): - return input - break - if idx == 0: - assert False, "Please checkout the example_inputs format." From 959170d961a2b8eca1b9bbf95c13080059f00f03 Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Fri, 14 Jun 2024 12:44:13 +0800 Subject: [PATCH 10/10] fix act_algo Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/static_quant/static_quant.py | 10 +++++++++- neural_compressor/torch/quantization/config.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/static_quant/static_quant.py b/neural_compressor/torch/algorithms/static_quant/static_quant.py index e2eac7f236d..efd1880666c 100644 --- a/neural_compressor/torch/algorithms/static_quant/static_quant.py +++ b/neural_compressor/torch/algorithms/static_quant/static_quant.py @@ -85,7 +85,15 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig if ipex_ver.release >= Version("2.1").release: - static_qconfig = ipex.quantization.default_static_qconfig_mapping + # HistogramObserver will cause a performance issue. 
+ # static_qconfig = ipex.quantization.default_static_qconfig_mapping + qconfig = QConfig( + activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), + weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric), + ) + from torch.ao.quantization import QConfigMapping + + static_qconfig = QConfigMapping().set_global(qconfig) else: static_qconfig = QConfig( activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index a8bab76b972..27a056d3284 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -980,7 +980,7 @@ def __init__( act_dtype: str = "uint8", act_sym: bool = False, act_granularity: str = "per_tensor", - act_algo: str = "kl", + act_algo: str = "minmax", excluded_precisions: list = [], white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ):
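
As a closing note on the last hunk: `StaticQuantConfig` now defaults to `act_algo="minmax"`, so KL calibration becomes opt-in. The fragment below is a usage sketch only, not code from this series; it uses a toy model instead of the LLM scripts above and assumes the same `prepare`/`convert` flow with Intel Extension for PyTorch installed.

```python
# Usage sketch for the act_algo knob after this change (toy model; assumptions noted above).
import torch
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
example_inputs = torch.randn(1, 8)

quant_config = StaticQuantConfig()                 # new default: act_algo="minmax"
# quant_config = StaticQuantConfig(act_algo="kl")  # explicitly request the previous KL calibration

prepared = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs)
for _ in range(4):                                 # calibration pass with toy data
    prepared(torch.randn(1, 8))
quantized = convert(prepared)
```
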