From 4fb88435cfa1f226cfb5072e7f4c73f83641425f Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Sun, 9 Jun 2024 16:15:51 +0800 Subject: [PATCH 01/10] modify 3.x ipex example structure Signed-off-by: Cheng, Zixuan --- .../quantization/smooth_quant/README.md | 64 +++++ .../smooth_quant/requirements.txt | 13 + .../smooth_quant/run_benchmark.sh | 94 +++++++ .../smooth_quant/run_clm_no_trainer.py | 260 ++++++++++++++++++ .../quantization/smooth_quant/run_quant.sh | 67 +++++ .../quantization/smooth_quant/utils.py | 193 +++++++++++++ .../quantization/static_quant/README.md | 57 ++++ .../static_quant/requirements.txt | 13 + .../static_quant/run_benchmark.sh | 94 +++++++ .../static_quant/run_clm_no_trainer.py | 256 +++++++++++++++++ .../quantization/static_quant/run_quant.sh | 67 +++++ .../quantization/static_quant/utils.py | 193 +++++++++++++ 12 files changed, 1371 insertions(+) create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md new file mode 100644 index 00000000000..8900ea9fd9b --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md @@ -0,0 +1,64 @@ +Step-by-Step +============ +This document describes the step-by-step instructions to run large language models (LLMs) using Smooth Quantization on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. + +The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. + +# Prerequisite +## 1. 
Create Environment +``` +# Installation +pip install -r requirements.txt +``` + +# Run + +Here is how to run the scripts: + +**Causal Language Modeling (CLM)** + +`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows. +### GPT-J-6b + +#### Quantization +```bash +# "--sq" is used to enable smooth quant +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --quantize \ + --sq \ + --alpha 1.0 \ + --ipex \ + --output_dir "saved_results" +``` +**Notes**: Smooth quantization here is based on torch.jit. Without past key value in example_inputs, the quantized model cannot be used for text-generation. + +### OPT-125m + +#### Quantization + +```bash +# "--sq" is used to enable smooth quant +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --quantize \ + --sq \ + --alpha 0.5 \ + --ipex \ + --output_dir "saved_results" +``` + +### LLAMA2-7b/13b/70b +>Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. +#### Quantization + +```bash +# "--sq" is used to enable smooth quant +python run_clm_no_trainer.py \ + --model meta-llama/Llama-2-7b-hf \ + --quantize \ + --sq \ + --alpha 0.8 \ + --ipex \ + --output_dir "saved_results" +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt new file mode 100644 index 00000000000..f0b56e558d3 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt @@ -0,0 +1,13 @@ +accelerate +protobuf +sentencepiece != 0.1.92 +datasets >= 1.1.3 +torch >= 1.10 +transformers +pytest +wandb +einops +neural-compressor +intel-extension-for-transformers +lm_eval==0.4.2 +peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh new file mode 100644 index 00000000000..955ffd91456 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh @@ -0,0 +1,94 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + batch_size=16 + approach=static + tuned_checkpoint=saved_results + task=lambada_openai + echo ${max_eval_samples} + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd='' + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy " + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" 
--performance --iters "${iters} + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + if [ "${topology}" = "opt_125m_ipex_sq" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" + elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" + elif [ "${topology}" = "gpt_j_ipex_sq" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py new file mode 100644 index 00000000000..2afb74068f5 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -0,0 +1,260 @@ +import argparse +import os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' 
+) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") +# ============SmoothQuant configs============== +parser.add_argument("--sq", action="store_true") +parser.add_argument("--alpha", default="auto", help="Smooth quant parameter.") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. 
+ total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + from neural_compressor.torch.quantization import SmoothQuantConfig + args.alpha = eval(args.alpha) + excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False, excluded_precisions=excluded_precisions) + + if re.search("gpt", user_model.config.model_type): + quant_config.set_local(torch.add, SmoothQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + for batch in tqdm(calib_dataloader): + batch = move_input_to_device(batch, device=None) + try: + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + except ValueError: + pass + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + + +# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result +if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = AutoTokenizer.from_pretrained(args.model) + config = 
AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) +else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh new file mode 100644 index 00000000000..774bb73b6f1 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex_sq" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5" + elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8" + elif [ "${topology}" = "gpt_j_ipex_sq" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git 
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py new file mode 100644 index 00000000000..38083129a65 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py @@ -0,0 +1,193 @@ +import random +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.common import logger +from neural_compressor.torch.utils import get_torch_version + +class DataloaderPreprocessor: + def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: + self.dataloader_original = dataloader_original + self.use_max_length = use_max_length + self.max_seq_length = max_seq_length + self.nsamples = nsamples + self.dataloader = [] + self.is_ready = False + + def get_prepared_dataloader(self): + if not self.is_ready: + self.prepare_dataloader() + return self.dataloader + + def prepare_dataloader(self): + if self.use_max_length: + # (Recommend) only take sequence whose length exceeds self.max_seq_length, + # which preserves calibration's tokens are all valid + # This is GPTQ official dataloader implementation + self.obtain_first_n_samples_fulllength() + else: + # general selection, no padding, not GPTQ original implementation. + self.obtain_first_n_samples() + self.is_ready = True + + def obtain_first_n_samples(self, seed=0): + """Get first nsample data as the real calibration dataset.""" + self.dataloader.clear() + random.seed(seed) + for batch in self.dataloader_original: + # process data, depends on its data type. + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list, tuple + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] > self.max_seq_length: + i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + batch_final = batch[:] + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim + else: + batch_final[key] = batch[key] + else: + batch_final = batch + # tensor + else: + if batch.shape[-1] > self.max_seq_length: + i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = batch[:, i:j] + else: + batch_final = batch + self.dataloader.append(batch_final) + + if len(self.dataloader) < self.nsamples: + logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") + + def obtain_first_n_samples_fulllength(self, seed=0): + self.dataloader.clear() + random.seed(seed) + unified_length = self.max_seq_length + for batch in self.dataloader_original: + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect 
{self.nsamples} calibration samples.") + break + # list & tuple, gpt-j-6b mlperf, etc. + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] == unified_length: + batch_final = batch[:] + elif batch[0].shape[-1] > unified_length: + i = random.randint(0, batch[0].shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + # not match max length, not include in target dataset + continue + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length == self.max_seq_length: + batch_final = batch + elif length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position + else: + batch_final[key] = batch[key] + else: + # not match max length, not include in target dataset + continue + # tensor + else: + if batch.shape[-1] == unified_length: + batch_final = batch + elif batch.shape[-1] > unified_length: + i = random.randint(0, batch.shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = batch[:, i:j] + else: + # not match max length, not include in target dataset + continue + self.dataloader.append(batch_final) + if len(self.dataloader) < self.nsamples: # pragma: no cover + logger.warning( + f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ + but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." + ) + + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." 
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md
new file mode 100644
index 00000000000..8ecdc6c5110
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md
@@ -0,0 +1,57 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for running large language models (LLMs) with Static Quantization on the 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) using PyTorch and Intel® Extension for PyTorch.
+
+The script `run_clm_no_trainer.py` currently supports quantization of `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon`, and validates last-word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); more models are being added.
+
+# Prerequisite
+## 1. Create Environment
+```
+# Installation
+pip install -r requirements.txt
+```
+
+# Run
+
+Here is how to run the scripts:
+
+**Causal Language Modeling (CLM)**
+
+`run_clm_no_trainer.py` quantizes large language models using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset for calibration and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets provided by lm_eval. An example command is shown below.
+### GPT-J-6b
+
+#### Quantization
+```bash
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --quantize \
+    --ipex \
+    --output_dir "saved_results"
+```
+
+### OPT-125m
+
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --quantize \
+    --ipex \
+    --output_dir "saved_results"
+```
+
+### LLAMA2-7b/13b/70b
+>Note: LLaMA requires IPEX >= 2.1 to get better accuracy.
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py \
+    --model meta-llama/Llama-2-7b-hf \
+    --quantize \
+    --ipex \
+    --output_dir "saved_results"
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt
new file mode 100644
index 00000000000..f0b56e558d3
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt
@@ -0,0 +1,13 @@
+accelerate
+protobuf
+sentencepiece != 0.1.92
+datasets >= 1.1.3
+torch >= 1.10
+transformers
+pytest
+wandb
+einops
+neural-compressor
+intel-extension-for-transformers
+lm_eval==0.4.2
+peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh
new file mode 100644
index 00000000000..87359e9a094
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_benchmark
+
+}
+
+# init params
+function init_params {
+  iters=100
+  batch_size=16
+  approach=static
+  tuned_checkpoint=saved_results
+  task=lambada_openai
+  echo ${max_eval_samples}
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --mode=*)
+          mode=$(echo $var |cut -f2 -d=)
+      ;;
+      --batch_size=*)
+          batch_size=$(echo $var |cut -f2 -d=)
+      ;;
+      --iters=*)
+          iters=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --int8=*)
+          int8=$(echo ${var} |cut -f2 -d=)
+      ;;
+      --config=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+
+# run_benchmark
+function run_benchmark {
+    extra_cmd=''
+
+    if [[ ${mode} == "accuracy" ]]; then
+        mode_cmd=" --accuracy "
+    elif [[ ${mode} == "performance" ]]; then
+        mode_cmd=" --performance --iters "${iters}
+    else
+        echo "Error: No such mode: ${mode}"
+        exit 1
+    fi
+
+    if [[ ${int8} == "true" ]]; then
+        extra_cmd=$extra_cmd" --int8"
+    fi
+    echo $extra_cmd
+
+    if [ "${topology}" = "opt_125m_ipex" ]; then
+        model_name_or_path="facebook/opt-125m"
+        extra_cmd=$extra_cmd" --ipex"
+    elif [ "${topology}" = "llama2_7b_ipex" ]; then
+        model_name_or_path="meta-llama/Llama-2-7b-hf"
+        extra_cmd=$extra_cmd" --ipex"
+    elif [ "${topology}" = "gpt_j_ipex" ]; then
+        model_name_or_path="EleutherAI/gpt-j-6b"
+        extra_cmd=$extra_cmd" --ipex"
+    fi
+
+    python -u run_clm_no_trainer.py \
+        --model ${model_name_or_path} \
+        --approach ${approach} \
+        --output_dir ${tuned_checkpoint} \
+        --task ${task} \
+        --batch_size ${batch_size} \
+        ${extra_cmd} ${mode_cmd}
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py
new file mode 100644
index 00000000000..5d13abb73fd
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py
@@ -0,0 +1,256 @@
+import argparse
+import 
os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' +) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. 
+ total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + + from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig + quant_config = get_default_static_config() + quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + if re.search("gpt", user_model.config.model_type): + quant_config.set_local(torch.add, StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + for batch in tqdm(calib_dataloader): + batch = move_input_to_device(batch, device=None) + try: + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + except ValueError: + pass + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + + +# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result +if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = 
load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) +else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh new file mode 100644 index 00000000000..a93d8220d64 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "llama2_7b_ipex" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "gpt_j_ipex" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py new file mode 100644 index 00000000000..38083129a65 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py @@ -0,0 +1,193 @@ +import random +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.common import logger +from neural_compressor.torch.utils import get_torch_version + +class DataloaderPreprocessor: + def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: + self.dataloader_original = dataloader_original + self.use_max_length = use_max_length + self.max_seq_length = max_seq_length + self.nsamples = nsamples + self.dataloader = [] + self.is_ready = False + + def get_prepared_dataloader(self): + if not self.is_ready: + self.prepare_dataloader() + return self.dataloader + + def prepare_dataloader(self): + if self.use_max_length: + # (Recommend) only take sequence whose length exceeds self.max_seq_length, + # which preserves calibration's tokens are all valid + # This is GPTQ official dataloader implementation + self.obtain_first_n_samples_fulllength() + else: + # general selection, no padding, not GPTQ original implementation. + self.obtain_first_n_samples() + self.is_ready = True + + def obtain_first_n_samples(self, seed=0): + """Get first nsample data as the real calibration dataset.""" + self.dataloader.clear() + random.seed(seed) + for batch in self.dataloader_original: + # process data, depends on its data type. + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list, tuple + if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] > self.max_seq_length: + i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + batch_final = batch[:] + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim + else: + batch_final[key] = batch[key] + else: + batch_final = batch + # tensor + else: + if batch.shape[-1] > self.max_seq_length: + i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) + j = i + self.max_seq_length + batch_final = batch[:, i:j] + else: + batch_final = batch + self.dataloader.append(batch_final) + + if len(self.dataloader) < self.nsamples: + logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") + + def obtain_first_n_samples_fulllength(self, seed=0): + self.dataloader.clear() + random.seed(seed) + unified_length = self.max_seq_length + for batch in self.dataloader_original: + if len(self.dataloader) == self.nsamples: + logger.info(f"Successfully collect {self.nsamples} calibration samples.") + break + # list & tuple, gpt-j-6b mlperf, etc. 
+ if isinstance(batch, list) or isinstance(batch, tuple): + if batch[0].shape[-1] == unified_length: + batch_final = batch[:] + elif batch[0].shape[-1] > unified_length: + i = random.randint(0, batch[0].shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = [] + for item in batch: + if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: + batch_final.append(item[:, i:j]) + else: + batch_final.append(item) + else: + # not match max length, not include in target dataset + continue + # dict + elif isinstance(batch, dict): + try: + length = batch["input_ids"].shape[-1] + except: + logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") + continue + batch_final = {} + if length == self.max_seq_length: + batch_final = batch + elif length > self.max_seq_length: + i = random.randint(0, length - self.max_seq_length - 1) + j = i + self.max_seq_length + # may have to slice every sequence related data + for key in batch.keys(): + if isinstance(batch[key], torch.Tensor): + batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position + else: + batch_final[key] = batch[key] + else: + # not match max length, not include in target dataset + continue + # tensor + else: + if batch.shape[-1] == unified_length: + batch_final = batch + elif batch.shape[-1] > unified_length: + i = random.randint(0, batch.shape[-1] - unified_length - 1) + j = i + unified_length + batch_final = batch[:, i:j] + else: + # not match max length, not include in target dataset + continue + self.dataloader.append(batch_final) + if len(self.dataloader) < self.nsamples: # pragma: no cover + logger.warning( + f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ + but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." + ) + + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." 
From 28be72b9cf902dea16eeab635eaa5f08db97a1fa Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Tue, 11 Jun 2024 10:18:51 +0800 Subject: [PATCH 02/10] add json path Signed-off-by: Cheng, Zixuan --- docs/3x/PT_SmoothQuant.md | 2 +- docs/3x/PT_StaticQuant.md | 2 +- examples/.config/model_params_pytorch_3x.json | 46 ++++++ .../quantization/smooth_quant/utils.py | 146 ------------------ .../quantization/static_quant/utils.py | 146 ------------------ 5 files changed, 48 insertions(+), 294 deletions(-) create mode 100644 examples/.config/model_params_pytorch_3x.json diff --git a/docs/3x/PT_SmoothQuant.md b/docs/3x/PT_SmoothQuant.md index 9e4ae3eb62f..e3a7262dcde 100644 --- a/docs/3x/PT_SmoothQuant.md +++ b/docs/3x/PT_SmoothQuant.md @@ -46,7 +46,7 @@ run_fn(prepared_model) q_model = convert(prepared_model) ``` -To get more information, please refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm). +To get more information, please refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant). ## Validated Models diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md index ec967a780d4..ff84cb6e247 100644 --- a/docs/3x/PT_StaticQuant.md +++ b/docs/3x/PT_StaticQuant.md @@ -68,7 +68,7 @@ q_model = convert(prepared_model) #### Model Examples -Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a new model. +Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant) on how to quantize a new model. 
### Static Quantization with PT2E Backend diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json new file mode 100644 index 00000000000..8520a9545b0 --- /dev/null +++ b/examples/.config/model_params_pytorch_3x.json @@ -0,0 +1,46 @@ +{ + "pytorch": { + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + } + } +} \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py index 38083129a65..76117f8b0b5 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/utils.py @@ -1,154 +1,8 @@ -import random import torch from collections import UserDict from packaging.version import Version -from neural_compressor.common import logger from neural_compressor.torch.utils import get_torch_version -class DataloaderPreprocessor: - def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: - self.dataloader_original = dataloader_original - self.use_max_length = use_max_length - self.max_seq_length = max_seq_length - self.nsamples = nsamples - self.dataloader = [] - self.is_ready = False - - def get_prepared_dataloader(self): - if not self.is_ready: - self.prepare_dataloader() - return self.dataloader - - def prepare_dataloader(self): - if self.use_max_length: - # (Recommend) only take sequence whose length exceeds self.max_seq_length, - # which preserves calibration's tokens are all valid - # This is GPTQ official dataloader implementation - self.obtain_first_n_samples_fulllength() - else: - # general selection, no padding, not GPTQ original implementation. - self.obtain_first_n_samples() - self.is_ready = True - - def obtain_first_n_samples(self, seed=0): - """Get first nsample data as the real calibration dataset.""" - self.dataloader.clear() - random.seed(seed) - for batch in self.dataloader_original: - # process data, depends on its data type. 
- if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list, tuple - if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] > self.max_seq_length: - i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - batch_final = batch[:] - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim - else: - batch_final[key] = batch[key] - else: - batch_final = batch - # tensor - else: - if batch.shape[-1] > self.max_seq_length: - i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = batch[:, i:j] - else: - batch_final = batch - self.dataloader.append(batch_final) - - if len(self.dataloader) < self.nsamples: - logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") - - def obtain_first_n_samples_fulllength(self, seed=0): - self.dataloader.clear() - random.seed(seed) - unified_length = self.max_seq_length - for batch in self.dataloader_original: - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list & tuple, gpt-j-6b mlperf, etc. 
- if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] == unified_length: - batch_final = batch[:] - elif batch[0].shape[-1] > unified_length: - i = random.randint(0, batch[0].shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - # not match max length, not include in target dataset - continue - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length == self.max_seq_length: - batch_final = batch - elif length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position - else: - batch_final[key] = batch[key] - else: - # not match max length, not include in target dataset - continue - # tensor - else: - if batch.shape[-1] == unified_length: - batch_final = batch - elif batch.shape[-1] > unified_length: - i = random.randint(0, batch.shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = batch[:, i:j] - else: - # not match max length, not include in target dataset - continue - self.dataloader.append(batch_final) - if len(self.dataloader) < self.nsamples: # pragma: no cover - logger.warning( - f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ - but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." - ) - - def get_example_inputs(model, dataloader): version = get_torch_version() from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py index 38083129a65..76117f8b0b5 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py @@ -1,154 +1,8 @@ -import random import torch from collections import UserDict from packaging.version import Version -from neural_compressor.common import logger from neural_compressor.torch.utils import get_torch_version -class DataloaderPreprocessor: - def __init__(self, dataloader_original, use_max_length=False, max_seq_length=2048, nsamples=128) -> None: - self.dataloader_original = dataloader_original - self.use_max_length = use_max_length - self.max_seq_length = max_seq_length - self.nsamples = nsamples - self.dataloader = [] - self.is_ready = False - - def get_prepared_dataloader(self): - if not self.is_ready: - self.prepare_dataloader() - return self.dataloader - - def prepare_dataloader(self): - if self.use_max_length: - # (Recommend) only take sequence whose length exceeds self.max_seq_length, - # which preserves calibration's tokens are all valid - # This is GPTQ official dataloader implementation - self.obtain_first_n_samples_fulllength() - else: - # general selection, no padding, not GPTQ original implementation. 
- self.obtain_first_n_samples() - self.is_ready = True - - def obtain_first_n_samples(self, seed=0): - """Get first nsample data as the real calibration dataset.""" - self.dataloader.clear() - random.seed(seed) - for batch in self.dataloader_original: - # process data, depends on its data type. - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list, tuple - if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] > self.max_seq_length: - i = random.randint(0, batch[0].shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - batch_final = batch[:] - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim - else: - batch_final[key] = batch[key] - else: - batch_final = batch - # tensor - else: - if batch.shape[-1] > self.max_seq_length: - i = random.randint(0, batch.shape[-1] - self.max_seq_length - 1) - j = i + self.max_seq_length - batch_final = batch[:, i:j] - else: - batch_final = batch - self.dataloader.append(batch_final) - - if len(self.dataloader) < self.nsamples: - logger.warning(f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.") - - def obtain_first_n_samples_fulllength(self, seed=0): - self.dataloader.clear() - random.seed(seed) - unified_length = self.max_seq_length - for batch in self.dataloader_original: - if len(self.dataloader) == self.nsamples: - logger.info(f"Successfully collect {self.nsamples} calibration samples.") - break - # list & tuple, gpt-j-6b mlperf, etc. 
- if isinstance(batch, list) or isinstance(batch, tuple): - if batch[0].shape[-1] == unified_length: - batch_final = batch[:] - elif batch[0].shape[-1] > unified_length: - i = random.randint(0, batch[0].shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = [] - for item in batch: - if isinstance(item, torch.Tensor) and item.shape.__len__() == 2: - batch_final.append(item[:, i:j]) - else: - batch_final.append(item) - else: - # not match max length, not include in target dataset - continue - # dict - elif isinstance(batch, dict): - try: - length = batch["input_ids"].shape[-1] - except: - logger.warning("Please make sure your dict'like data contains key of 'input_ids'.") - continue - batch_final = {} - if length == self.max_seq_length: - batch_final = batch - elif length > self.max_seq_length: - i = random.randint(0, length - self.max_seq_length - 1) - j = i + self.max_seq_length - # may have to slice every sequence related data - for key in batch.keys(): - if isinstance(batch[key], torch.Tensor): - batch_final[key] = batch[key][:, i:j] # slice on sequence length dim with same position - else: - batch_final[key] = batch[key] - else: - # not match max length, not include in target dataset - continue - # tensor - else: - if batch.shape[-1] == unified_length: - batch_final = batch - elif batch.shape[-1] > unified_length: - i = random.randint(0, batch.shape[-1] - unified_length - 1) - j = i + unified_length - batch_final = batch[:, i:j] - else: - # not match max length, not include in target dataset - continue - self.dataloader.append(batch_final) - if len(self.dataloader) < self.nsamples: # pragma: no cover - logger.warning( - f"Trying to allocate {self.nsamples} data with fixed length {unified_length}, \ - but only {len(self.dataloader)} samples are found. Please use smaller 'self.max_seq_length' value." 
- ) - - def get_example_inputs(model, dataloader): version = get_torch_version() from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device From 7236eb2a6df9d16f7ab602ca5b0bc74b418bfbab Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Wed, 12 Jun 2024 11:43:35 +0800 Subject: [PATCH 03/10] fix for sq Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/smooth_quant/smooth_quant.py | 9 +++++---- .../torch/algorithms/smooth_quant/utility.py | 6 +++--- .../torch/algorithms/static_quant/utility.py | 8 +++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index cd2686d0b0e..d27dfdd3fbc 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -82,13 +82,14 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): model.output_tensor_id_op_name, ) - # Update json file in ipex_config_path - cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name) - model.eval() - # check smoothquant alpha and act_algo value recipe_cfgs = self.quant_config.get("recipe_cfgs", None) alpha = recipe_cfgs["smooth_quant_args"]["alpha"] + + # Update json file in ipex_config_path + cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha, smooth_quant=True) + model.eval() + for op, _ in self.quant_config["op"].items(): act_algo = self.quant_config["op"][op]["activation"]["algorithm"] diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index 7dc647dbc95..d25b14444ec 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -164,7 +164,7 @@ def get_quantizable_ops_recursively(model, example_inputs, alpha, act_algo, inpl def check_cfg_and_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, smooth_quant=False + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, alpha, smooth_quant=True ): # pragma: no cover """Check configs and quantization configs. @@ -205,7 +205,7 @@ def check_cfg_and_qconfig( else: smooth_quant_enable = False activation_observer = generate_activation_observer( - inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable + inc_scheme, inc_algorithm, smooth_quant, smooth_quant_enable, alpha ) if not smooth_quant: if inc_scheme == "sym": @@ -241,7 +241,7 @@ def check_cfg_and_qconfig( def cfg_to_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, smooth_quant=False + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha, smooth_quant=True ): # pragma: no cover assert cfgs is not None, "No configure for IPEX int8 model..." 
op_infos = copy.deepcopy(op_infos_from_cfgs) diff --git a/neural_compressor/torch/algorithms/static_quant/utility.py b/neural_compressor/torch/algorithms/static_quant/utility.py index f90471539fd..2c7fd753e65 100644 --- a/neural_compressor/torch/algorithms/static_quant/utility.py +++ b/neural_compressor/torch/algorithms/static_quant/utility.py @@ -157,7 +157,9 @@ def check_cfg_and_qconfig(user_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_ return cfgs, user_cfg -def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_quant_enable=False): # pragma: no cover +def generate_activation_observer( + scheme, algorithm, smooth_quant=False, smooth_quant_enable=False, alpha=0.5 +): # pragma: no cover """This is a helper method to generate an activation observer. Args: @@ -193,7 +195,7 @@ def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_q "reduce_range": False, "quant_min": 0, "quant_max": 255, - "alpha": 0.5, + "alpha": 0.5 if alpha == "auto" else alpha, "act_observer": kl_activation_observer, "act_ic_observer": { "name": "PerChannelMinMaxObserver", @@ -213,7 +215,7 @@ def generate_activation_observer(scheme, algorithm, smooth_quant=False, smooth_q "reduce_range": False, "quant_min": 0, "quant_max": 255, - "alpha": 0.5, + "alpha": 0.5 if alpha == "auto" else alpha, "act_observer": minmax_activation_observer, "act_ic_observer": { "name": "PerChannelMinMaxObserver", From 5b5ba7d16cf8c11a08c3bfc47ac446839f66b0dd Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Thu, 13 Jun 2024 14:02:55 +0800 Subject: [PATCH 04/10] minor fix Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/smooth_quant/smooth_quant.py | 1 - neural_compressor/torch/algorithms/smooth_quant/utility.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py index d27dfdd3fbc..fdfb51640ac 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py +++ b/neural_compressor/torch/algorithms/smooth_quant/smooth_quant.py @@ -121,7 +121,6 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): else: model = ipex.quantization.prepare(model, static_qconfig, example_inputs=example_inputs, inplace=inplace) - cfg_to_qconfig(self.quant_config, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, smooth_quant=True) model.load_qconf_summary(qconf_summary=ipex_config_path) return model diff --git a/neural_compressor/torch/algorithms/smooth_quant/utility.py b/neural_compressor/torch/algorithms/smooth_quant/utility.py index d25b14444ec..21253f26892 100644 --- a/neural_compressor/torch/algorithms/smooth_quant/utility.py +++ b/neural_compressor/torch/algorithms/smooth_quant/utility.py @@ -164,7 +164,7 @@ def get_quantizable_ops_recursively(model, example_inputs, alpha, act_algo, inpl def check_cfg_and_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, alpha, smooth_quant=True + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_ids_op_name, alpha=0.5, smooth_quant=True ): # pragma: no cover """Check configs and quantization configs. @@ -241,11 +241,11 @@ def check_cfg_and_qconfig( def cfg_to_qconfig( - tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha, smooth_quant=True + tune_cfg, cfgs, op_infos_from_cfgs, output_tensor_id_op_name, alpha=0.5, smooth_quant=True ): # pragma: no cover assert cfgs is not None, "No configure for IPEX int8 model..." 
op_infos = copy.deepcopy(op_infos_from_cfgs) - cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, smooth_quant) + cfgs = check_cfg_and_qconfig(tune_cfg["op"], cfgs, op_infos, output_tensor_id_op_name, alpha, smooth_quant) with open(ipex_config_path, "w") as write_f: json.dump(cfgs, write_f, indent=4) return None From c9827399808acb82384e4c5177cd259af6bf08a5 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:16:13 +0800 Subject: [PATCH 05/10] Update run_clm_no_trainer.py --- .../quantization/static_quant/run_clm_no_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py index 5d13abb73fd..75dfeb34de8 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py @@ -167,7 +167,7 @@ def get_user_model(): quant_config = get_default_static_config() quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] if re.search("gpt", user_model.config.model_type): - quant_config.set_local(torch.add, StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) + quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device from tqdm import tqdm @@ -253,4 +253,4 @@ def run_fn(model): print("Accuracy: %.5f" % acc) print('Throughput: %.3f samples/sec' % (samples / (end - start))) print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) - print('Batch size = %d' % args.batch_size) \ No newline at end of file + print('Batch size = %d' % args.batch_size) From 34282d0903f6e5b2162a87c383b32f9871f286f7 Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:38:57 +0800 Subject: [PATCH 06/10] Update run_clm_no_trainer.py --- .../smooth_quant/run_clm_no_trainer.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 2afb74068f5..dbe4ae5fc78 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -178,15 +178,12 @@ def get_user_model(): def run_fn(model): for batch in tqdm(calib_dataloader): batch = move_input_to_device(batch, device=None) - try: - if isinstance(batch, tuple) or isinstance(batch, list): - model(batch[0]) - elif isinstance(batch, dict): - model(**batch) - else: - model(batch) - except ValueError: - pass + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) return from utils import get_example_inputs @@ -257,4 +254,4 @@ def run_fn(model): print("Accuracy: %.5f" % acc) print('Throughput: %.3f samples/sec' % (samples / (end - start))) print('Latency: 
%.3f ms' % ((end - start) * 1000 / samples)) - print('Batch size = %d' % args.batch_size) \ No newline at end of file + print('Batch size = %d' % args.batch_size) From 383b6a221ad040535b795cef75ef35f53c47542c Mon Sep 17 00:00:00 2001 From: Zixuan Cheng <110808245+violetch24@users.noreply.github.com> Date: Thu, 13 Jun 2024 14:47:20 +0800 Subject: [PATCH 07/10] Update run_clm_no_trainer.py --- .../static_quant/run_clm_no_trainer.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py index 75dfeb34de8..9aee3fbfe55 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py @@ -174,15 +174,12 @@ def get_user_model(): def run_fn(model): for batch in tqdm(calib_dataloader): batch = move_input_to_device(batch, device=None) - try: - if isinstance(batch, tuple) or isinstance(batch, list): - model(batch[0]) - elif isinstance(batch, dict): - model(**batch) - else: - model(batch) - except ValueError: - pass + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) return from utils import get_example_inputs From 6b83c9eb36300a5e6de5fb7b0800e0839b2ea0f8 Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Fri, 14 Jun 2024 12:38:45 +0800 Subject: [PATCH 08/10] minor fix Signed-off-by: Cheng, Zixuan --- docs/3x/PT_StaticQuant.md | 2 +- .../smooth_quant/run_benchmark.sh | 2 + .../smooth_quant/run_clm_no_trainer.py | 29 +- .../quantization/static_quant/ipex/README.md | 57 ++++ .../static_quant/ipex/requirements.txt | 13 + .../static_quant/ipex/run_benchmark.sh | 96 +++++++ .../static_quant/ipex/run_clm_no_trainer.py | 259 ++++++++++++++++++ .../static_quant/ipex/run_quant.sh | 67 +++++ .../quantization/static_quant/ipex/utils.py | 47 ++++ 9 files changed, 560 insertions(+), 12 deletions(-) create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md index ff84cb6e247..7d56f817296 100644 --- a/docs/3x/PT_StaticQuant.md +++ b/docs/3x/PT_StaticQuant.md @@ -68,7 +68,7 @@ q_model = convert(prepared_model) #### Model Examples -Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant) on how to quantize a new model. 
+Users could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex) on how to quantize a new model. ### Static Quantization with PT2E Backend diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh index 955ffd91456..61c50611090 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_benchmark.sh @@ -59,8 +59,10 @@ function run_benchmark { if [[ ${mode} == "accuracy" ]]; then mode_cmd=" --accuracy " + extra_cmd=$extra_cmd" --load" elif [[ ${mode} == "performance" ]]; then mode_cmd=" --performance --iters "${iters} + extra_cmd=$extra_cmd" --load" else echo "Error: No such mode: ${mode}" exit 1 diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index dbe4ae5fc78..ef0590e2982 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -37,6 +37,7 @@ help="Select from ['dynamic', 'static', 'weight-only']") parser.add_argument("--int8", action="store_true") parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--load", action="store_true", help="Load quantized model.") parser.add_argument("--accuracy", action="store_true") parser.add_argument("--performance", action="store_true") parser.add_argument("--iters", default=100, type=int, @@ -176,7 +177,8 @@ def get_user_model(): from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device from tqdm import tqdm def run_fn(model): - for batch in tqdm(calib_dataloader): + calib_iter = 0 + for batch in tqdm(calib_dataloader, total=args.calib_iters): batch = move_input_to_device(batch, device=None) if isinstance(batch, tuple) or isinstance(batch, list): model(batch[0]) @@ -184,6 +186,10 @@ def run_fn(model): model(**batch) else: model(batch) + + calib_iter += 1 + if calib_iter >= args.calib_iters: + break return from utils import get_example_inputs @@ -196,16 +202,17 @@ def run_fn(model): user_model.save(args.output_dir) -# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result -if args.int8 or args.int8_bf16_mixed: - print("load int8 model") - from neural_compressor.torch.quantization import load - tokenizer = AutoTokenizer.from_pretrained(args.model) - config = AutoConfig.from_pretrained(args.model) - user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) - setattr(user_model, "config", config) -else: - user_model, tokenizer = get_user_model() +if args.load: + # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result + if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = 
AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) + else: + user_model, tokenizer = get_user_model() if args.accuracy: diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md new file mode 100644 index 00000000000..8ecdc6c5110 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/README.md @@ -0,0 +1,57 @@ +Step-by-Step +============ +This document describes the step-by-step instructions to run large language models (LLMs) using Static Quantization on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. + +The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. + +# Prerequisite +## 1. Create Environment +``` +# Installation +pip install -r requirements.txt +``` + +# Run + +Here is how to run the scripts: + +**Causal Language Modeling (CLM)** + +`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows. +### GPT-J-6b + +#### Quantization +```bash +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --quantize \ + --alpha 1.0 \ + --ipex \ + --output_dir "saved_results" +``` + +### OPT-125m + +#### Quantization + +```bash +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --quantize \ + --alpha 0.5 \ + --ipex \ + --output_dir "saved_results" +``` + +### LLAMA2-7b/13b/70b +>Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. 
+#### Quantization + +```bash +python run_clm_no_trainer.py \ + --model meta-llama/Llama-2-7b-hf \ + --quantize \ + --alpha 0.8 \ + --ipex \ + --output_dir "saved_results" +``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt new file mode 100644 index 00000000000..f0b56e558d3 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt @@ -0,0 +1,13 @@ +accelerate +protobuf +sentencepiece != 0.1.92 +datasets >= 1.1.3 +torch >= 1.10 +transformers +pytest +wandb +einops +neural-compressor +intel-extension-for-transformers +lm_eval==0.4.2 +peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh new file mode 100644 index 00000000000..b62a6381b20 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_benchmark.sh @@ -0,0 +1,96 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + batch_size=16 + approach=static + tuned_checkpoint=saved_results + task=lambada_openai + echo ${max_eval_samples} + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --batch_size=*) + batch_size=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + --config=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + extra_cmd='' + + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy " + extra_cmd=$extra_cmd" --load" + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" --performance --iters "${iters} + extra_cmd=$extra_cmd" --load" + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [[ ${int8} == "true" ]]; then + extra_cmd=$extra_cmd" --int8" + fi + echo $extra_cmd + + if [ "${topology}" = "opt_125m_ipex" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "llama2_7b_ipex" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "gpt_j_ipex" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --approach ${approach} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py new file mode 100644 index 00000000000..0ccb2093537 --- /dev/null +++ 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py @@ -0,0 +1,259 @@ +import argparse +import os +import sys + +sys.path.append('./') +import time +import re +import torch +from datasets import load_dataset +from torch.nn.functional import pad +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model", nargs="?", default="EleutherAI/gpt-j-6b" +) +parser.add_argument( + "--trust_remote_code", default=True, + help="Transformers parameter: use the external repo") +parser.add_argument( + "--revision", default=None, + help="Transformers parameter: set the model hub commit number") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--quantize", action="store_true") +parser.add_argument( + "--int8_bf16_mixed", + action="store_true", + help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + '--seed', + type=int, default=42, help='Seed for sampling the calibration data.' +) +parser.add_argument("--approach", type=str, default='static', + help="Select from ['dynamic', 'static', 'weight-only']") +parser.add_argument("--int8", action="store_true") +parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") +parser.add_argument("--load", action="store_true", help="Load quantized model.") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--performance", action="store_true") +parser.add_argument("--iters", default=100, type=int, + help="For accuracy measurement only.") +parser.add_argument("--batch_size", default=1, type=int, + help="For accuracy measurement only.") +parser.add_argument("--save_accuracy_path", default=None, + help="Save accuracy results path.") +parser.add_argument("--pad_max_length", default=512, type=int, + help="Pad input ids to max length.") +parser.add_argument("--calib_iters", default=512, type=int, + help="calibration iters.") +parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", + type=str, help="tasks for accuracy validation") +parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") + +args = parser.parse_args() +if args.ipex: + import intel_extension_for_pytorch as ipex +calib_size = 1 + + +class Evaluator: + def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + self.dataset = dataset + self.tokenizer = tokenizer + self.batch_size = batch_size + self.pad_val = pad_val + self.pad_max = pad_max + self.is_calib = is_calib + + # tokenize the dataset + self.dataset = self.dataset.map(self.tokenize_function, batched=True) + self.dataset.set_format(type="torch", columns=["input_ids"]) + + @torch.no_grad() + def tokenize_function(self, examples): + return self.tokenizer(examples["text"]) + + @torch.no_grad() + def collate_batch(self, batch): + + input_ids_padded = [] + last_ind = [] + + for text in batch: + input_ids = text["input_ids"] + pad_len = self.pad_max - input_ids.shape[0] + last_ind.append(input_ids.shape[0] - 1) + if self.is_calib: + input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + else: + input_ids = pad(input_ids, (0, pad_len), 
value=self.pad_val) + input_ids_padded.append(input_ids) + + return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) + + @torch.no_grad() + def evaluate(self, model): + model.eval() + # The task is to predict the last word of the input. + total, hit = 0, 0 + latency = 0 + test_dataloader = DataLoader( + self.dataset, + batch_size=self.batch_size, + shuffle=False, + collate_fn=self.collate_batch, + ) + for i, (input_ids, last_ind) in enumerate(test_dataloader): + label = input_ids[torch.arange(len(last_ind)), last_ind] + input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val + pad_len = self.pad_max - last_ind - 1 + + start = time.time() + outputs = model(input_ids) + latency += time.time() - start + + last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] + pred = last_token_logits.argmax(dim=-1) + total += label.size(0) + hit += (pred == label).sum().item() + if (i + 1) % 50 == 0: + print(hit / total) + print("Processed minibatch:", i) + + acc = hit / total + print("Accuracy: ", acc) + print("Latency: ", latency) + return acc + + +def get_user_model(): + user_model = AutoModelForCausalLM.from_pretrained( + args.model, + torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors + trust_remote_code=args.trust_remote_code, + revision=args.revision, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model) + + if args.peft_model_id is not None: + from peft import PeftModel + user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) + + # to channels last + user_model = user_model.to(memory_format=torch.channels_last) + user_model.eval() + return user_model, tokenizer + + +if args.quantize: + # dataset + user_model, tokenizer = get_user_model() + calib_dataset = load_dataset(args.dataset, split="train") + # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF + calib_dataset = calib_dataset.shuffle(seed=args.seed) + calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) + calib_dataloader = DataLoader( + calib_evaluator.dataset, + batch_size=calib_size, + shuffle=False, + collate_fn=calib_evaluator.collate_batch, + ) + + + from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig + quant_config = get_default_static_config() + quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] + if re.search("gpt", user_model.config.model_type): + quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) + + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + from tqdm import tqdm + def run_fn(model): + calib_iter = 0 + for batch in tqdm(calib_dataloader, total=args.calib_iters): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + + calib_iter += 1 + if calib_iter >= args.calib_iters: + break + return + + from utils import get_example_inputs + example_inputs = get_example_inputs(user_model, calib_dataloader) + + from neural_compressor.torch.quantization import prepare, convert + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + user_model.save(args.output_dir) + +if args.load: + # TODO: we need run_benchmark.sh for loading and remove --accuracy in 
run_quant.sh, currently run_quant.sh will get fp32 result + if args.int8 or args.int8_bf16_mixed: + print("load int8 model") + from neural_compressor.torch.quantization import load + tokenizer = AutoTokenizer.from_pretrained(args.model) + config = AutoConfig.from_pretrained(args.model) + user_model = load(os.path.abspath(os.path.expanduser(args.output_dir))) + setattr(user_model, "config", config) + else: + user_model, tokenizer = get_user_model() + + +if args.accuracy: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + device="cpu", + ) + results = evaluate(eval_args) + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Batch size = %d' % args.batch_size) + +if args.performance: + user_model.eval() + from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + print('Throughput: %.3f samples/sec' % (samples / (end - start))) + print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) + print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh new file mode 100644 index 00000000000..a93d8220d64 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_quant.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + extra_cmd='' + batch_size=8 + approach='static' + DATASET_NAME="NeelNanda/pile-10k" + tuned_checkpoint="saved_results" + + if [ "${topology}" = "opt_125m_ipex" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "llama2_7b_ipex" ]; then + model_name_or_path="meta-llama/Llama-2-7b-hf" + extra_cmd=$extra_cmd" --ipex" + elif [ "${topology}" = "gpt_j_ipex" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --ipex" + fi + + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --dataset ${DATASET_NAME} \ + --quantize \ + --approach ${approach} \ + --output_dir 
${tuned_checkpoint} \ + --tasks "lambada_openai" \ + --batch_size ${batch_size} \ + ${extra_cmd} +} + +main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py new file mode 100644 index 00000000000..76117f8b0b5 --- /dev/null +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/utils.py @@ -0,0 +1,47 @@ +import torch +from collections import UserDict +from packaging.version import Version +from neural_compressor.torch.utils import get_torch_version + +def get_example_inputs(model, dataloader): + version = get_torch_version() + from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device + + # Suggest set dataloader like calib_dataloader + if dataloader is None: + return None + device = next(model.parameters()).device + try: + for idx, (input, label) in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, (list, tuple)): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + except Exception as e: # pragma: no cover + for idx, input in enumerate(dataloader): + input = move_input_to_device(input, device) + if isinstance(input, (dict, UserDict)): # pragma: no cover + assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" + if "label" in input.keys(): + input.pop("label") + if version.release <= Version("2.0.1").release: + return tuple(input.values()) + else: + return dict(input) + if isinstance(input, list) or isinstance(input, tuple): + return tuple(input) + if isinstance(input, torch.Tensor): + return input + break + if idx == 0: + assert False, "Please checkout the example_inputs format." 
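
For readers following the new `static_quant/ipex` example, the sketch below illustrates the contract of the `get_example_inputs` helper added in `utils.py` just above. It is a toy illustration and not part of the patch: the `Linear` model and random tensors are placeholders, and it assumes the example's `utils.py` is importable from the current directory.

```python
# Toy sketch of get_example_inputs from static_quant/ipex/utils.py (not part of the patch).
import torch
from torch.utils.data import DataLoader, TensorDataset

from utils import get_example_inputs  # helper added above

model = torch.nn.Linear(8, 8).eval()

# (input, label) batches: the helper unpacks the pair, moves the input to the
# model's device, and returns the first input tensor as example_inputs.
loader = DataLoader(TensorDataset(torch.randn(4, 8), torch.zeros(4)), batch_size=2)
example_inputs = get_example_inputs(model, loader)
print(type(example_inputs), example_inputs.shape)  # <class 'torch.Tensor'> torch.Size([2, 8])

# dict batches (e.g. tokenizer output) come back with any "label" entry dropped,
# as a tuple of values on torch <= 2.0.1 and as a dict on newer torch, which is the
# form passed to prepare(..., example_inputs=...) in run_clm_no_trainer.py.
```
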
From 00fe9d988aa323b57ab8439d5cd6f762fa6ec61f Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Fri, 14 Jun 2024 12:41:14 +0800 Subject: [PATCH 09/10] remove old files Signed-off-by: Cheng, Zixuan --- .../quantization/static_quant/README.md | 57 ---- .../static_quant/requirements.txt | 13 - .../static_quant/run_benchmark.sh | 94 ------- .../static_quant/run_clm_no_trainer.py | 253 ------------------ .../quantization/static_quant/run_quant.sh | 67 ----- .../quantization/static_quant/utils.py | 47 ---- 6 files changed, 531 deletions(-) delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md deleted file mode 100644 index 8ecdc6c5110..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/README.md +++ /dev/null @@ -1,57 +0,0 @@ -Step-by-Step -============ -This document describes the step-by-step instructions to run large language models (LLMs) using Static Quantization on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch and Intel® Extension for PyTorch. - -The script `run_clm_no_trainer.py` supports `GPTJ`, `OPT`, `LLaMA2`, `BLOOM` and `Falcon` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. - -# Prerequisite -## 1. Create Environment -``` -# Installation -pip install -r requirements.txt -``` - -# Run - -Here is how to run the scripts: - -**Causal Language Modeling (CLM)** - -`run_clm_no_trainer.py` quantizes the large language models using the dataset [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) calibration and validates `lambada_openai`, `piqa`, `winogrande`, `hellaswag` and other datasets accuracy provided by lm_eval, an example command is as follows. -### GPT-J-6b - -#### Quantization -```bash -python run_clm_no_trainer.py \ - --model EleutherAI/gpt-j-6B \ - --quantize \ - --alpha 1.0 \ - --ipex \ - --output_dir "saved_results" -``` - -### OPT-125m - -#### Quantization - -```bash -python run_clm_no_trainer.py \ - --model facebook/opt-125m \ - --quantize \ - --alpha 0.5 \ - --ipex \ - --output_dir "saved_results" -``` - -### LLAMA2-7b/13b/70b ->Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. 
-#### Quantization - -```bash -python run_clm_no_trainer.py \ - --model meta-llama/Llama-2-7b-hf \ - --quantize \ - --alpha 0.8 \ - --ipex \ - --output_dir "saved_results" -``` \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt deleted file mode 100644 index f0b56e558d3..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -accelerate -protobuf -sentencepiece != 0.1.92 -datasets >= 1.1.3 -torch >= 1.10 -transformers -pytest -wandb -einops -neural-compressor -intel-extension-for-transformers -lm_eval==0.4.2 -peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh deleted file mode 100644 index 87359e9a094..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_benchmark.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_benchmark - -} - -# init params -function init_params { - iters=100 - batch_size=16 - approach=static - tuned_checkpoint=saved_results - task=lambada_openai - echo ${max_eval_samples} - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --mode=*) - mode=$(echo $var |cut -f2 -d=) - ;; - --batch_size=*) - batch_size=$(echo $var |cut -f2 -d=) - ;; - --iters=*) - iters=$(echo ${var} |cut -f2 -d=) - ;; - --int8=*) - int8=$(echo ${var} |cut -f2 -d=) - ;; - --config=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - - -# run_benchmark -function run_benchmark { - extra_cmd='' - - if [[ ${mode} == "accuracy" ]]; then - mode_cmd=" --accuracy " - elif [[ ${mode} == "performance" ]]; then - mode_cmd=" --performance --iters "${iters} - else - echo "Error: No such mode: ${mode}" - exit 1 - fi - - if [[ ${int8} == "true" ]]; then - extra_cmd=$extra_cmd" --int8" - fi - echo $extra_cmd - - if [ "${topology}" = "opt_125m_ipex" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "llama2_7b_ipex" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "gpt_j_ipex" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex" - fi - - python -u run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --approach ${approach} \ - --output_dir ${tuned_checkpoint} \ - --task ${task} \ - --batch_size ${batch_size} \ - ${extra_cmd} ${mode_cmd} -} - -main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py deleted file mode 100644 index 9aee3fbfe55..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_clm_no_trainer.py +++ /dev/null @@ -1,253 +0,0 @@ -import 
argparse -import os -import sys - -sys.path.append('./') -import time -import re -import torch -from datasets import load_dataset -from torch.nn.functional import pad -from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer - -parser = argparse.ArgumentParser() -parser.add_argument( - "--model", nargs="?", default="EleutherAI/gpt-j-6b" -) -parser.add_argument( - "--trust_remote_code", default=True, - help="Transformers parameter: use the external repo") -parser.add_argument( - "--revision", default=None, - help="Transformers parameter: set the model hub commit number") -parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") -parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--quantize", action="store_true") -parser.add_argument( - "--int8_bf16_mixed", - action="store_true", - help="By default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", -) -parser.add_argument( - '--seed', - type=int, default=42, help='Seed for sampling the calibration data.' -) -parser.add_argument("--approach", type=str, default='static', - help="Select from ['dynamic', 'static', 'weight-only']") -parser.add_argument("--int8", action="store_true") -parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.") -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--performance", action="store_true") -parser.add_argument("--iters", default=100, type=int, - help="For accuracy measurement only.") -parser.add_argument("--batch_size", default=1, type=int, - help="For accuracy measurement only.") -parser.add_argument("--save_accuracy_path", default=None, - help="Save accuracy results path.") -parser.add_argument("--pad_max_length", default=512, type=int, - help="Pad input ids to max length.") -parser.add_argument("--calib_iters", default=512, type=int, - help="calibration iters.") -parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext", - type=str, help="tasks for accuracy validation") -parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") - -args = parser.parse_args() -if args.ipex: - import intel_extension_for_pytorch as ipex -calib_size = 1 - - -class Evaluator: - def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): - self.dataset = dataset - self.tokenizer = tokenizer - self.batch_size = batch_size - self.pad_val = pad_val - self.pad_max = pad_max - self.is_calib = is_calib - - # tokenize the dataset - self.dataset = self.dataset.map(self.tokenize_function, batched=True) - self.dataset.set_format(type="torch", columns=["input_ids"]) - - @torch.no_grad() - def tokenize_function(self, examples): - return self.tokenizer(examples["text"]) - - @torch.no_grad() - def collate_batch(self, batch): - - input_ids_padded = [] - last_ind = [] - - for text in batch: - input_ids = text["input_ids"] - pad_len = self.pad_max - input_ids.shape[0] - last_ind.append(input_ids.shape[0] - 1) - if self.is_calib: - input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids - else: - input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) - input_ids_padded.append(input_ids) - - return (torch.vstack(input_ids_padded), torch.tensor(last_ind)) - - @torch.no_grad() - def evaluate(self, model): - model.eval() - # The task is to predict the last word of the 
input. - total, hit = 0, 0 - latency = 0 - test_dataloader = DataLoader( - self.dataset, - batch_size=self.batch_size, - shuffle=False, - collate_fn=self.collate_batch, - ) - for i, (input_ids, last_ind) in enumerate(test_dataloader): - label = input_ids[torch.arange(len(last_ind)), last_ind] - input_ids[torch.arange(len(last_ind)), last_ind] = self.pad_val - pad_len = self.pad_max - last_ind - 1 - - start = time.time() - outputs = model(input_ids) - latency += time.time() - start - - last_token_logits = outputs[0][torch.arange(len(last_ind)), -2 - pad_len, :] - pred = last_token_logits.argmax(dim=-1) - total += label.size(0) - hit += (pred == label).sum().item() - if (i + 1) % 50 == 0: - print(hit / total) - print("Processed minibatch:", i) - - acc = hit / total - print("Accuracy: ", acc) - print("Latency: ", latency) - return acc - - -def get_user_model(): - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - torchscript=True, # torchscript will force `return_dict=False` to avoid jit errors - trust_remote_code=args.trust_remote_code, - revision=args.revision, - ) - tokenizer = AutoTokenizer.from_pretrained(args.model) - - if args.peft_model_id is not None: - from peft import PeftModel - user_model = PeftModel.from_pretrained(user_model, args.peft_model_id) - - # to channels last - user_model = user_model.to(memory_format=torch.channels_last) - user_model.eval() - return user_model, tokenizer - - -if args.quantize: - # dataset - user_model, tokenizer = get_user_model() - calib_dataset = load_dataset(args.dataset, split="train") - # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF - calib_dataset = calib_dataset.shuffle(seed=args.seed) - calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=args.pad_max_length, is_calib=True) - calib_dataloader = DataLoader( - calib_evaluator.dataset, - batch_size=calib_size, - shuffle=False, - collate_fn=calib_evaluator.collate_batch, - ) - - - from neural_compressor.torch.quantization import get_default_static_config, StaticQuantConfig - quant_config = get_default_static_config() - quant_config.excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"] - if re.search("gpt", user_model.config.model_type): - quant_config.set_local("add", StaticQuantConfig(w_dtype="fp32", act_dtype="fp32")) - - from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device - from tqdm import tqdm - def run_fn(model): - for batch in tqdm(calib_dataloader): - batch = move_input_to_device(batch, device=None) - if isinstance(batch, tuple) or isinstance(batch, list): - model(batch[0]) - elif isinstance(batch, dict): - model(**batch) - else: - model(batch) - return - - from utils import get_example_inputs - example_inputs = get_example_inputs(user_model, calib_dataloader) - - from neural_compressor.torch.quantization import prepare, convert - user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) - run_fn(user_model) - user_model = convert(user_model) - user_model.save(args.output_dir) - - -# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result -if args.int8 or args.int8_bf16_mixed: - print("load int8 model") - from neural_compressor.torch.quantization import load - tokenizer = AutoTokenizer.from_pretrained(args.model) - config = AutoConfig.from_pretrained(args.model) - user_model = 
load(os.path.abspath(os.path.expanduser(args.output_dir))) - setattr(user_model, "config", config) -else: - user_model, tokenizer = get_user_model() - - -if args.accuracy: - user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser - eval_args = LMEvalParser( - model="hf", - user_model=user_model, - tokenizer=tokenizer, - batch_size=args.batch_size, - tasks=args.tasks, - device="cpu", - ) - results = evaluate(eval_args) - for task_name in args.tasks.split(","): - if task_name == "wikitext": - acc = results["results"][task_name]["word_perplexity,none"] - else: - acc = results["results"][task_name]["acc,none"] - print("Accuracy: %.5f" % acc) - print('Batch size = %d' % args.batch_size) - -if args.performance: - user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser - import time - - samples = args.iters * args.batch_size - eval_args = LMEvalParser( - model="hf", - user_model=user_model, - tokenizer=tokenizer, - batch_size=args.batch_size, - tasks=args.tasks, - limit=samples, - device="cpu", - ) - start = time.time() - results = evaluate(eval_args) - end = time.time() - for task_name in args.tasks.split(","): - if task_name == "wikitext": - acc = results["results"][task_name]["word_perplexity,none"] - else: - acc = results["results"][task_name]["acc,none"] - print("Accuracy: %.5f" % acc) - print('Throughput: %.3f samples/sec' % (samples / (end - start))) - print('Latency: %.3f ms' % ((end - start) * 1000 / samples)) - print('Batch size = %d' % args.batch_size) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh deleted file mode 100644 index a93d8220d64..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/run_quant.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -set -x - -function main { - - init_params "$@" - run_tuning - -} - -# init params -function init_params { - for var in "$@" - do - case $var in - --topology=*) - topology=$(echo $var |cut -f2 -d=) - ;; - --dataset_location=*) - dataset_location=$(echo $var |cut -f2 -d=) - ;; - --input_model=*) - input_model=$(echo $var |cut -f2 -d=) - ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - *) - echo "Error: No such parameter: ${var}" - exit 1 - ;; - esac - done - -} - -# run_tuning -function run_tuning { - extra_cmd='' - batch_size=8 - approach='static' - DATASET_NAME="NeelNanda/pile-10k" - tuned_checkpoint="saved_results" - - if [ "${topology}" = "opt_125m_ipex" ]; then - model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "llama2_7b_ipex" ]; then - model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --ipex" - elif [ "${topology}" = "gpt_j_ipex" ]; then - model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --ipex" - fi - - python -u run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --dataset ${DATASET_NAME} \ - --quantize \ - --approach ${approach} \ - --output_dir ${tuned_checkpoint} \ - --tasks "lambada_openai" \ - --batch_size ${batch_size} \ - ${extra_cmd} -} - -main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py deleted file mode 100644 index 76117f8b0b5..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/utils.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -from collections import UserDict -from packaging.version import Version -from neural_compressor.torch.utils import get_torch_version - -def get_example_inputs(model, dataloader): - version = get_torch_version() - from neural_compressor.torch.algorithms.smooth_quant import move_input_to_device - - # Suggest set dataloader like calib_dataloader - if dataloader is None: - return None - device = next(model.parameters()).device - try: - for idx, (input, label) in enumerate(dataloader): - input = move_input_to_device(input, device) - if isinstance(input, (dict, UserDict)): # pragma: no cover - assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" - if "label" in input.keys(): - input.pop("label") - if version.release <= Version("2.0.1").release: - return tuple(input.values()) - else: - return dict(input) - if isinstance(input, (list, tuple)): - return tuple(input) - if isinstance(input, torch.Tensor): - return input - break - except Exception as e: # pragma: no cover - for idx, input in enumerate(dataloader): - input = move_input_to_device(input, device) - if isinstance(input, (dict, UserDict)): # pragma: no cover - assert version.release >= Version("1.12.0").release, "INC support IPEX version >= 1.12.0" - if "label" in input.keys(): - input.pop("label") - if version.release <= Version("2.0.1").release: - return tuple(input.values()) - else: - return dict(input) - if isinstance(input, list) or isinstance(input, tuple): - return tuple(input) - if isinstance(input, torch.Tensor): - return input - break - if idx == 0: - assert False, "Please checkout the example_inputs format." From 959170d961a2b8eca1b9bbf95c13080059f00f03 Mon Sep 17 00:00:00 2001 From: "Cheng, Zixuan" Date: Fri, 14 Jun 2024 12:44:13 +0800 Subject: [PATCH 10/10] fix act_algo Signed-off-by: Cheng, Zixuan --- .../torch/algorithms/static_quant/static_quant.py | 10 +++++++++- neural_compressor/torch/quantization/config.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/neural_compressor/torch/algorithms/static_quant/static_quant.py b/neural_compressor/torch/algorithms/static_quant/static_quant.py index e2eac7f236d..efd1880666c 100644 --- a/neural_compressor/torch/algorithms/static_quant/static_quant.py +++ b/neural_compressor/torch/algorithms/static_quant/static_quant.py @@ -85,7 +85,15 @@ def prepare(self, model, example_inputs, inplace=True, *args, **kwargs): from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig if ipex_ver.release >= Version("2.1").release: - static_qconfig = ipex.quantization.default_static_qconfig_mapping + # HistogramObserver will cause a performance issue. 
+ # static_qconfig = ipex.quantization.default_static_qconfig_mapping + qconfig = QConfig( + activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), + weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric), + ) + from torch.ao.quantization import QConfigMapping + + static_qconfig = QConfigMapping().set_global(qconfig) else: static_qconfig = QConfig( activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index a8bab76b972..27a056d3284 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -980,7 +980,7 @@ def __init__( act_dtype: str = "uint8", act_sym: bool = False, act_granularity: str = "per_tensor", - act_algo: str = "kl", + act_algo: str = "minmax", excluded_precisions: list = [], white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ):
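
As a closing note on the last hunk: `StaticQuantConfig` now defaults to `act_algo="minmax"`, so KL calibration becomes opt-in. The fragment below is a usage sketch only, not code from this series; it uses a toy model instead of the LLM scripts above and assumes the same `prepare`/`convert` flow with Intel Extension for PyTorch installed.

```python
# Usage sketch for the act_algo knob after this change (toy model; assumptions noted above).
import torch
from neural_compressor.torch.quantization import StaticQuantConfig, prepare, convert

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
example_inputs = torch.randn(1, 8)

quant_config = StaticQuantConfig()                 # new default: act_algo="minmax"
# quant_config = StaticQuantConfig(act_algo="kl")  # explicitly request the previous KL calibration

prepared = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs)
for _ in range(4):                                 # calibration pass with toy data
    prepared(torch.randn(1, 8))
quantized = convert(prepared)
```
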