From 46d3caab6a7695a8d90dc77467380bb7f0df69ef Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 15 Aug 2024 14:10:45 +0800
Subject: [PATCH 01/23] add awq example

Signed-off-by: Kaihui-intel
---
 .../weight_only/run_clm_no_trainer.py         | 35 ++++++++++++++++++-
 .../quantization/weight_only/run_quant.sh     |  7 ++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
index abd8228354e..d5eb7b2f1b5 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
@@ -77,6 +77,15 @@
                     help='Calibration dataset sequence max length, '
                          'this should align with your model config, '
                          'and your dataset builder args: args.pad_max_length')
+# =============AWQ configs====================
+parser.add_argument("--use_auto_scale", action="store_true",
+                    help="Enables searching for the best scales based on the activation distribution.")
+parser.add_argument("--use_auto_clip", action="store_true",
+                    help="Enables clip range search.")
+parser.add_argument("--folding", action="store_true",
+                    help="Allow inserting a mul before linear when the scale cannot be absorbed by the last layer.")
+parser.add_argument('--absorb_layer_dict', type=dict, default={},
+                    help="The dict of layers whose scale can be absorbed.")
 # =============DoubleQuant configs====================
 parser.add_argument("--double_quant_type",
@@ -223,9 +232,14 @@ def get_user_model():
         shuffle=False,
         collate_fn=calib_evaluator.collate_batch,
     )
+    def calib_func(prepared_model):
+        for i, calib_input in enumerate(calib_dataloader):
+            if i >= args.calib_iters:
+                break
+            prepared_model(calib_input[0])

     # 3.x api
-    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize
+    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, AWQConfig, prepare, convert
     from neural_compressor.torch.utils import get_double_quant_config_dict
     weight_sym = True if args.woq_scheme == "sym" else False
     if args.double_quant_type is not None:
@@ -311,6 +325,25 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         user_model = prepare(model=user_model, quant_config=quant_config)
         run_fn_for_gptq(user_model, dataloader_for_calibration)
         user_model = convert(user_model)
+    elif args.woq_algo == "AWQ":
+        quant_config = AWQConfig(
+            dtype=args.woq_dtype,
+            bits=args.woq_bits,
+            use_sym=weight_sym,
+            group_size=args.woq_group_size,
+            group_dim=args.woq_group_dim,
+            use_auto_scale=args.use_auto_scale,
+            use_auto_clip=args.use_auto_clip,
+            folding=args.folding,
+            absorb_layer_dict=args.absorb_layer_dict,
+        )
+        example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
+        run_fn = calib_func
+        user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
+        run_fn(user_model)
+        user_model = convert(user_model)
+
+    user_model.save(args.output_dir)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh
index a860712b697..50bed4bbb68 100644
---
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -85,6 +85,13 @@ function run_tuning { model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 ---calib_iters 128" fi python -u run_clm_no_trainer.py \ From 1f9bbab37ea06516fa2f4f47f87575e3cdb9803d Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 14:24:22 +0800 Subject: [PATCH 02/23] update extension test Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index c3ae3f6b5be..afc6b9408fb 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -84,6 +84,20 @@ "main_script": "run_clm_no_trainer.py", "batch_size": 8 }, + "gpt_j_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, "gpt_j_ipex":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", "dataset_location": "", From 9cb9795662585067b89ddbb9c39d0d4bd03a9073 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:10:55 +0800 Subject: [PATCH 03/23] update extension path Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index afc6b9408fb..0df06fb940d 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -85,14 +85,14 @@ "batch_size": 8 }, "gpt_j_woq_awq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", "batch_size": 1 }, "opt_125m_woq_awq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/llm", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", From b40c2c90109dd47bce216cfb14c5231fc2951bdb Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:21:48 +0800 Subject: [PATCH 04/23] minor fix Signed-off-by: Kaihui-intel --- .../language-modeling/quantization/weight_only/run_quant.sh | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 50bed4bbb68..15baea53a77 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -91,7 +91,7 @@ function run_tuning { extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 ---calib_iters 128" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" fi python -u run_clm_no_trainer.py \ From 6382a489708b1b0618372e7db6976d000c827406 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:33:32 +0800 Subject: [PATCH 05/23] update awq benchmark Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 9e1d766128e..2ef001a5534 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -114,6 +114,13 @@ function run_benchmark { model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" fi python -u run_clm_no_trainer.py \ From 4552103e14952573da353567c8ee79a2a233ede8 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:49:46 +0800 Subject: [PATCH 06/23] update benchmarking Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 55 ++++++++----------- .../quantization/weight_only/run_quant.sh | 3 + 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 2ef001a5534..e5f8d54b644 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -70,65 +70,54 @@ function run_benchmark { fi echo $extra_cmd - if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then + if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 
--woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" + model_name_or_path="EleutherAI/gpt-j-6b"" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo 
AWQ --woq_bits 4 --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" fi - - python -u run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --output_dir ${tuned_checkpoint} \ - --task ${task} \ - --batch_size ${batch_size} \ - ${extra_cmd} ${mode_cmd} + + if [[ ${mode} == "accuracy" ]]; then + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} + elif [[ ${mode} == "performance" ]]; then + incbench --num_cores_per_instance 4 run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + ${extra_cmd} ${mode_cmd} + else + echo "Error: No such mode: ${mode}" + exit 1 } main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 15baea53a77..0847976a99c 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -92,6 +92,9 @@ function run_tuning { elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" fi python -u run_clm_no_trainer.py \ From 1a13c7231e9798cfff910452f07f3076af484ef2 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 15:50:37 +0800 Subject: [PATCH 07/23] add autoround Signed-off-by: Kaihui-intel --- .../weight_only/run_clm_no_trainer.py | 67 ++++++++++++++++++- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index d5eb7b2f1b5..761b86a8d46 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -52,7 +52,7 @@ type=str, help="tasks for accuracy validation") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============WeightOnly configs=============== -parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound'], help="Weight-only parameter.") parser.add_argument("--woq_bits", type=int, default=8) parser.add_argument("--woq_dtype", type=str, default="int") @@ -86,6 +86,26 @@ help="Allow insert mul before linear when the scale cannot be absorbed by last layer.") parser.add_argument('--absorb_layer_dict', type=dict, default={}, help="The layer dict that scale can be absorbed.") +# ============AUTOROUND configs============== +parser.add_argument( + "--lr", + type=float, + default=None, + help="learning rate, if None, it will be set to 1.0/iters automatically", +) 
+parser.add_argument(
+    "--minmax_lr",
+    type=float,
+    default=None,
+    help="minmax learning rate; if None, it will be set to the same value as lr",
+)
+parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.")
+parser.add_argument("--autoround_nsamples", default=128, type=int, help="num samples for autoround calibration.")
+parser.add_argument(
+    "--disable_quanted_input",
+    action="store_true",
+    help="disable using the output of the previous quantized block to tune the next block",
+)
 # =============DoubleQuant configs====================
 parser.add_argument("--double_quant_type",
@@ -239,7 +259,14 @@ def calib_func(prepared_model):
             prepared_model(calib_input[0])

     # 3.x api
-    from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, AWQConfig, prepare, convert
+    from neural_compressor.torch.quantization import (
+        RTNConfig,
+        GPTQConfig,
+        AWQConfig,
+        AutoRoundConfig,
+        prepare,
+        convert
+    )
     from neural_compressor.torch.utils import get_double_quant_config_dict
     weight_sym = True if args.woq_scheme == "sym" else False
     if args.double_quant_type is not None:
@@ -342,7 +369,41 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
         run_fn(user_model)
         user_model = convert(user_model)
-
+    elif args.woq_algo == "AutoRound":
+        quant_config = AutoRoundConfig(
+            dtype=args.woq_dtype,
+            bits=args.woq_bits,
+            use_sym=weight_sym,
+            group_size=args.woq_group_size,
+            enable_quanted_input=not args.disable_quanted_input,
+            lr=args.lr,
+            minmax_lr=args.minmax_lr,
+            seqlen=args.pad_max_length,
+            nsamples=args.autoround_nsamples,
+            iters=args.autoround_iters,
+        )
+        quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
+        from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
+        dataloader = get_dataloader(tokenizer=tokenizer,
+                                    seqlen=args.pad_max_length,
+                                    dataset_name=args.dataset,
+                                    seed=args.seed,
+                                    bs=args.batch_size,
+                                    nsamples=args.autoround_nsamples)
+        @torch.no_grad()
+        def run_fn_for_autoround(model, dataloader):
+            for data in dataloader:
+                if isinstance(data, (tuple, list)):
+                    model(*data)
+                elif isinstance(data, dict):
+                    model(**data)
+                else:
+                    model(data)
+        run_fn = run_fn_for_autoround
+        run_args = (dataloader,)
+        user_model = prepare(model=user_model, quant_config=quant_config)
+        run_fn(user_model, *run_args)
+        user_model = convert(user_model)
     user_model.save(args.output_dir)

From 29f1ba9c1b21fc9ff9e2f87b00f6a99095da6f5d Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 15 Aug 2024 16:01:31 +0800
Subject: [PATCH 08/23] minor fix

Signed-off-by: Kaihui-intel
---
 .../language-modeling/quantization/weight_only/run_benchmark.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
index e5f8d54b644..3f90f89244e 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
@@ -85,7 +85,7 @@ function run_benchmark {
     elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
         model_name_or_path="EleutherAI/gpt-j-6b"
     elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
-
model_name_or_path="EleutherAI/gpt-j-6b"" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then From b0da8224c80969faf34e53b37718df80097b04c4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 16:18:48 +0800 Subject: [PATCH 09/23] minor fix Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 3f90f89244e..d2e83223680 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -101,7 +101,7 @@ function run_benchmark { elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" fi - + if [[ ${mode} == "accuracy" ]]; then python -u run_clm_no_trainer.py \ --model ${model_name_or_path} \ @@ -118,6 +118,8 @@ function run_benchmark { else echo "Error: No such mode: ${mode}" exit 1 + fi + } main "$@" From 17e36fc6ade7f0a988ba206c6ce88db89401dbbc Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 15 Aug 2024 21:25:21 +0800 Subject: [PATCH 10/23] update group_size Signed-off-by: Kaihui-intel --- .../language-modeling/quantization/weight_only/run_quant.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 0847976a99c..fa559ab4cbb 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -87,11 +87,11 @@ function run_tuning { extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --calib_iters 128" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" From 538f0089ec7e2297cea1131ae248a9f2735044a0 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 16 Aug 2024 08:43:55 +0800 Subject: [PATCH 11/23] add autoround extension test Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 0df06fb940d..a618faa6193 100644 --- 
a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -98,6 +98,13 @@
       "main_script": "run_clm_no_trainer.py",
       "batch_size": 1
     },
+    "opt_125m_woq_autoround_int4":{
+      "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+      "dataset_location": "",
+      "input_model": "",
+      "main_script": "run_clm_no_trainer.py",
+      "batch_size": 1
+    },
     "gpt_j_ipex":{
       "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
       "dataset_location": "",

From f703f60af891df874c01f113efc2b026a3b63cd8 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 16 Aug 2024 10:18:32 +0800
Subject: [PATCH 12/23] add teq

Signed-off-by: Kaihui-intel
---
 .../weight_only/run_clm_no_trainer.py         | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
index 761b86a8d46..6062845b045 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
@@ -83,9 +83,9 @@
 parser.add_argument("--use_auto_clip", action="store_true",
                     help="Enables clip range search.")
 parser.add_argument("--folding", action="store_true",
-                    help="Allow inserting a mul before linear when the scale cannot be absorbed by the last layer.")
+                    help="Allow inserting a mul before linear when the scale cannot be absorbed by the last layer, for TEQ/AWQ.")
 parser.add_argument('--absorb_layer_dict', type=dict, default={},
-                    help="The dict of layers whose scale can be absorbed.")
+                    help="The dict of layers whose scale can be absorbed, for TEQ/AWQ.")
 # ============AUTOROUND configs==============
 parser.add_argument(
     "--lr",
@@ -264,6 +264,7 @@ def calib_func(prepared_model):
         GPTQConfig,
         AWQConfig,
         AutoRoundConfig,
+        TEQConfig,
         prepare,
         convert
     )
@@ -369,6 +370,20 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
         user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
         run_fn(user_model)
         user_model = convert(user_model)
+    elif args.woq_algo == "TEQ":
+        quant_config = TEQConfig(
+            dtype=args.woq_dtype,
+            bits=args.woq_bits,
+            use_sym=weight_sym,
+            group_size=args.woq_group_size,
+            group_dim=args.woq_group_dim,
+            folding=args.folding,
+        )
+        example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
+        run_fn = calib_func
+        user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
+        run_fn(user_model)
+        user_model = convert(user_model)
     elif args.woq_algo == "AutoRound":
         quant_config = AutoRoundConfig(
             dtype=args.woq_dtype,
@@ -404,6 +419,7 @@ def run_fn_for_autoround(model, dataloader):
         user_model = prepare(model=user_model, quant_config=quant_config)
         run_fn(user_model, *run_args)
         user_model = convert(user_model)
+
     user_model.save(args.output_dir)

From 6c380326999f156f0006e3d160d8b2933c086cc1 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 16 Aug 2024 10:29:48 +0800
Subject: [PATCH 13/23] update readme

Signed-off-by: Kaihui-intel
---
 .../quantization/weight_only/README.md        | 73 ++++++++++++++++---
 1 file changed, 61 insertions(+), 12 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
index 889d7b42682..0519b490ff7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
@@ -35,9 +35,8 @@ python run_clm_no_trainer.py \
     --woq_group_size 128 \
     --gptq_max_seq_length 2048 \
     --gptq_use_max_length \
-    --accuracy \
-    --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --double_quant_type "BNB_NF4" \
+    --output_dir saved_results

 # "--woq_algo RTN" is used to enable RTN algorithms
 python run_clm_no_trainer.py \
@@ -48,9 +47,38 @@ python run_clm_no_trainer.py \
     --woq_bits 4 \
     --woq_scheme asym \
     --woq_group_size 128 \
+    --double_quant_type "BNB_NF4" \
+    --output_dir saved_results
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AWQ \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128 \
+    --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AutoRound \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+    --model EleutherAI/gpt-j-6B \
+    --dataset NeelNanda/pile-10k \
+    --int8 \
     --accuracy \
     --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --output_dir saved_results
 ```

 **Notes**: Weight-only quantization based on fake quantization is supported as a preview and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.
@@ -72,8 +100,6 @@ python run_clm_no_trainer.py \
     --woq_group_size 128 \
     --gptq_max_seq_length 2048 \
     --gptq_use_max_length \
-    --accuracy \
-    --tasks "lambada_openai" \
     --double_quant_type "BNB_NF4"

 # "--woq_algo RTN" is used to enable RTN algorithms
@@ -85,13 +111,40 @@ python run_clm_no_trainer.py \
     --woq_bits 4 \
     --woq_scheme asym \
     --woq_group_size 128 \
+    --double_quant_type "BNB_NF4"
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AWQ \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128 \
+    --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --quantize \
+    --woq_algo AutoRound \
+    --woq_bits 4 \
+    --woq_scheme asym \
+    --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+    --model facebook/opt-125m \
+    --dataset NeelNanda/pile-10k \
+    --int8 \
     --accuracy \
     --tasks "lambada_openai" \
-    --double_quant_type "BNB_NF4"
+    --output_dir saved_results
 ```

 ### LLAMA2-7b/13b/70b
>Note: LLAMA requires IPEX >= 2.1 to get better accuracy.
#### Quantization ```bash @@ -107,8 +160,6 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" # "--woq_algo RTN" is used to enable RTN algorithms @@ -120,8 +171,6 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" ``` From f8fbcf667ce855c271ec718c46e04ef70c00c2c2 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Fri, 16 Aug 2024 17:28:13 +0800 Subject: [PATCH 14/23] limit onnxruntime version Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_itrex.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/scripts/ut/run_itrex.sh b/.azure-pipelines/scripts/ut/run_itrex.sh index 2bbbf958398..5adaf86579b 100644 --- a/.azure-pipelines/scripts/ut/run_itrex.sh +++ b/.azure-pipelines/scripts/ut/run_itrex.sh @@ -18,7 +18,8 @@ bash /intel-extension-for-transformers/.github/workflows/script/install_binary.s sed -i '/neural-compressor.git/d' /intel-extension-for-transformers/tests/requirements.txt pip install -r /intel-extension-for-transformers/tests/requirements.txt # workaround -pip install onnx==1.15.0 +pip install onnx==1.16.0 +pip install onnxruntime==1.18.0 echo "pip list itrex ut deps..." pip list LOG_DIR=/neural-compressor/log_dir From 29e8dc3c43c6bab930921ed9d9fb45d7e353e18c Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 20 Aug 2024 12:32:33 +0800 Subject: [PATCH 15/23] rm float Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 6062845b045..a75cb907354 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -223,7 +223,7 @@ def get_user_model(): revision=args.revision, ) tokenizer = AutoTokenizer.from_pretrained(args.model) - user_model = user_model.float() + # user_model = user_model.float() # Set model's seq_len when GPTQ calibration is enabled. 
if args.woq_algo == 'GPTQ': From e8610c1d306dfc74ad62b7dff49b35a08f6f58e0 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 20 Aug 2024 13:28:37 +0800 Subject: [PATCH 16/23] fix ar dtype Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_benchmark.sh | 1 + .../quantization/weight_only/run_clm_no_trainer.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index d2e83223680..5b4627a1d2d 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -100,6 +100,7 @@ function run_benchmark { model_name_or_path="facebook/opt-125m" elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound" fi if [[ ${mode} == "accuracy" ]]; then diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index a75cb907354..bba8ec57d6a 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -223,7 +223,9 @@ def get_user_model(): revision=args.revision, ) tokenizer = AutoTokenizer.from_pretrained(args.model) - # user_model = user_model.float() + user_model = user_model.float() + if args.woq_algo == 'AutoRound': + user_model.to(torch.float32) # Set model's seq_len when GPTQ calibration is enabled. 
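# NOTE (reviewer annotation, not part of the upstream patch): the float32 cast
# above is load-bearing for AutoRound, whose rounding values are tuned with small
# gradient-based updates that can underflow if the weights stay in half precision.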
if args.woq_algo == 'GPTQ': From a7ec2fe573bec0aaea1a9ca5e712f6e2394d699b Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:14:26 +0800 Subject: [PATCH 17/23] update autotune Signed-off-by: Kaihui-intel --- .../weight_only/run_clm_no_trainer.py | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 7773e73c988..1b44e42f0c8 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -62,6 +62,7 @@ parser.add_argument("--woq_scheme", default="sym") parser.add_argument("--woq_use_mse_search", action="store_true") parser.add_argument("--woq_use_full_range", action="store_true") +parser.add_argument("--quant_lm_head", action="store_true", help="whether to quant the lm_head layer in transformers") # =============GPTQ configs==================== parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") @@ -268,6 +269,9 @@ def calib_func(prepared_model): AWQConfig, AutoRoundConfig, TEQConfig, + TuningConfig, + autotune, + get_woq_tuning_config, prepare, convert ) @@ -283,6 +287,7 @@ def calib_func(prepared_model): # TODO: add group_dim into double quant config? "use_full_range": args.woq_use_full_range, "use_mse_search": args.woq_use_mse_search, + "quant_lm_head": args.quant_lm_head, } ) quant_config = RTNConfig.from_dict(double_quant_config_dict) @@ -300,8 +305,8 @@ def calib_func(prepared_model): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -332,6 +337,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): "act_order": args.gptq_actorder, "block_size": args.gptq_block_size, "static_groups": args.gptq_static_groups, + "quant_lm_head": args.quant_lm_head, } ) quant_config = GPTQConfig.from_dict(double_quant_config_dict) @@ -351,8 +357,8 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", GPTQConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) @@ -367,6 +373,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): use_auto_clip=args.use_auto_clip, folding=args.folding, absorb_layer_dict=args.absorb_layer_dict, + quant_lm_head=args.quant_lm_head, ) example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) run_fn = calib_func @@ -381,6 +388,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): group_size=args.woq_group_size, group_dim=args.woq_group_dim, folding=args.folding, + quant_lm_head=args.quant_lm_head, ) example_inputs = 
torch.ones([1, args.pad_max_length], dtype=torch.long) run_fn = calib_func @@ -422,8 +430,23 @@ def run_fn_for_autoround(model, dataloader): user_model = prepare(model=user_model, quant_config=quant_config) run_fn(user_model, *run_args) user_model = convert(user_model) - - + elif args.woq_algo == "AutoTune": + from utils import DataloaderPreprocessor + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=calib_dataloader, + use_max_length=args.gptq_use_max_length, + max_seq_length=args.gptq_max_seq_length, + ) + dataloader = dataloaderPreprocessor.get_prepared_dataloader() + custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) + best_model = autotune( + model=model, + tune_config=custom_tune_config, + eval_fn=eval_acc_fn, + run_fn=run_fn_for_gptq, + run_args=(dataloader, True), # run_args should be a tuple, + example_inputs=example_inputs, + ) user_model.save(args.output_dir) From d27604f69cee2ce05dbf9aabcc3c37709f4760dc Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:23:29 +0800 Subject: [PATCH 18/23] update woq_algo Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 1b44e42f0c8..1a6ec1321ef 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -53,7 +53,7 @@ type=str, help="tasks for accuracy validation") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============WeightOnly configs=============== -parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound', 'AutoTune'], help="Weight-only parameter.") parser.add_argument("--woq_bits", type=int, default=8) parser.add_argument("--woq_dtype", type=str, default="int") From 1aa1b13365a67547b2d3206f66d2cf84b00b7102 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:25:32 +0800 Subject: [PATCH 19/23] update user_model Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 1a6ec1321ef..74b510e8f33 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -439,8 +439,8 @@ def run_fn_for_autoround(model, dataloader): ) dataloader = dataloaderPreprocessor.get_prepared_dataloader() custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) - best_model = autotune( - model=model, + user_model = autotune( + model=user_model, tune_config=custom_tune_config, eval_fn=eval_acc_fn, run_fn=run_fn_for_gptq, From f6b5810214706e0eaa7710a000c47b9796ef45c8 
Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:33:32 +0800 Subject: [PATCH 20/23] add eval fn Signed-off-by: Kaihui-intel --- .../weight_only/run_clm_no_trainer.py | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 74b510e8f33..afc2c2c6b57 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -242,6 +242,31 @@ def get_user_model(): user_model.eval() return user_model, tokenizer +def eval_fn(user_model=None): + user_model.eval() + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="hpu" if is_hpex_available() else "cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + return acc if args.quantize: # dataset @@ -439,10 +464,22 @@ def run_fn_for_autoround(model, dataloader): ) dataloader = dataloaderPreprocessor.get_prepared_dataloader() custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) + from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device + from tqdm import tqdm + def run_fn_for_gptq(model, dataloader_for_calibration, *args): + for batch in tqdm(dataloader_for_calibration): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + return user_model = autotune( model=user_model, tune_config=custom_tune_config, - eval_fn=eval_acc_fn, + eval_fn=eval_fn, run_fn=run_fn_for_gptq, run_args=(dataloader, True), # run_args should be a tuple, example_inputs=example_inputs, From 45e8d00f55dd8d89bcfd52f2dcc785d20e6405cb Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:42:26 +0800 Subject: [PATCH 21/23] update example_inputs Signed-off-by: Kaihui-intel --- .../quantization/weight_only/run_clm_no_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index afc2c2c6b57..51be2900ba7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -476,6 +476,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): else: model(batch) return + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) user_model = autotune( model=user_model, tune_config=custom_tune_config, From 
4a22029d10bd0caf132adc435d815ce69ab50e53 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 8 Oct 2024 13:47:37 +0800 Subject: [PATCH 22/23] update extension test Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 7 +++++++ .../quantization/weight_only/run_benchmark.sh | 2 ++ .../quantization/weight_only/run_quant.sh | 3 +++ 3 files changed, 12 insertions(+) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index a618faa6193..809b898d5e3 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -105,6 +105,13 @@ "main_script": "run_clm_no_trainer.py", "batch_size": 1 }, + "opt_125m_woq_autotune_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, "gpt_j_ipex":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", "dataset_location": "", diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 5b4627a1d2d..6c84e27ce88 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -101,6 +101,8 @@ function run_benchmark { elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoRound" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" fi if [[ ${mode} == "accuracy" ]]; then diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index fa559ab4cbb..3c5c7ff9594 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -95,6 +95,9 @@ function run_tuning { elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoTune --woq_bits 4" fi python -u run_clm_no_trainer.py \ From c9893dc05d6fffd43325fd4db69fd4d4ea018ee7 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Wed, 9 Oct 2024 13:53:45 +0800 Subject: [PATCH 23/23] update autoround config Signed-off-by: Kaihui-intel --- .../language-modeling/quantization/weight_only/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index 3c5c7ff9594..ed4ee705726 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -94,7 +94,7 @@ function run_tuning { extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128" + extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128 --woq_scheme asym --autoround_iters 200 --autoround_nsamples 500" elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then model_name_or_path="facebook/opt-125m" extra_cmd=$extra_cmd" --woq_algo AutoTune --woq_bits 4"
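Reviewer note: taken together, patches 17-23 reduce the new AutoTune path to a small API surface. Below is a condensed, self-contained sketch of that flow; the random-token calibration batches and the constant-accuracy eval stub are stand-ins for the pile-10k dataloader and the lm-eval harness used by the real script, and `best_model.save` mirrors the `user_model.save(args.output_dir)` call in the diff.

```python
import torch
from transformers import AutoModelForCausalLM
from neural_compressor.torch.quantization import (
    TuningConfig,
    autotune,
    get_woq_tuning_config,
)

# Float32 CPU model, as in run_clm_no_trainer.py.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").float().eval()

# Stand-in calibration data shaped like the real pad_max_length-long batches.
calib_data = [torch.randint(0, model.config.vocab_size, (1, 512)) for _ in range(4)]

def run_fn(model, dataloader, *args):
    # Calibration pass over the prepared model (mirrors run_fn_for_gptq).
    with torch.no_grad():
        for batch in dataloader:
            model(batch)

def eval_fn(model):
    # Stub metric: autotune quantizes the model with each candidate WOQ config
    # in turn and keeps the best-scoring one; the example script computes
    # lambada_openai accuracy here instead of returning a constant.
    return 1.0

best_model = autotune(
    model=model,
    tune_config=TuningConfig(config_set=get_woq_tuning_config()),
    eval_fn=eval_fn,
    run_fn=run_fn,
    run_args=(calib_data, True),  # run_args must be a tuple, as noted in the diff
    example_inputs=torch.ones([1, 512], dtype=torch.long),
)
best_model.save("saved_results")
```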