28 changes: 28 additions & 0 deletions examples/.config/model_params_pytorch_3x.json
@@ -84,6 +84,34 @@
"main_script": "run_clm_no_trainer.py",
"batch_size": 8
},
"gpt_j_woq_awq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_awq_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_autoround_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"opt_125m_woq_autotune_int4":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
"dataset_location": "",
"input_model": "",
"main_script": "run_clm_no_trainer.py",
"batch_size": 1
},
"gpt_j_ipex":{
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
"dataset_location": "",
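The new entries all point at the same example directory and main script; only the topology name and batch size differ. A minimal sketch of exercising one of them by hand, using flags that appear in the README changes later in this diff (the working directory is the entry's `model_src_dir`, assumed here to be relative to the examples root):

```bash
# Sketch for the "opt_125m_woq_awq_int4" entry; all flags are taken from the README below.
cd nlp/huggingface_models/language-modeling/quantization/weight_only  # model_src_dir
python run_clm_no_trainer.py \
    --model facebook/opt-125m \
    --dataset NeelNanda/pile-10k \
    --quantize \
    --woq_algo AWQ \
    --woq_bits 4 \
    --woq_scheme asym \
    --woq_group_size 128 \
    --calib_iters 128 \
    --batch_size 1   # batch_size from the JSON entry
```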
@@ -35,9 +35,8 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4" \
--output_dir saved_results

# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
@@ -48,9 +47,38 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4" \
--output_dir saved_results

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128

# "--accuracy" for eval
python run_clm_no_trainer.py \
--model EleutherAI/gpt-j-6B \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--double_quant_type "BNB_NF4" \
--output_dir saved_results
```
**Notes**: Weight-only quantization based on fake quantization is currently supported as a preview feature and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API supports various CLMs, including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ, as sketched below.

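A sketch of the model swap mentioned above, reusing the GPTQ flags from the first command in this README; `bigscience/bloom-560m` is only an illustrative choice from the supported Bloom family:

```bash
# Same GPTQ recipe, different CLM (illustrative model choice).
python run_clm_no_trainer.py \
    --model bigscience/bloom-560m \
    --dataset NeelNanda/pile-10k \
    --quantize \
    --woq_algo GPTQ \
    --woq_bits 4 \
    --woq_scheme asym \
    --woq_group_size 128 \
    --gptq_max_seq_length 2048 \
    --gptq_use_max_length \
    --output_dir saved_results
```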
@@ -72,8 +100,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
@@ -85,13 +111,40 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"

# "--woq_algo AWQ" is used to enable AWQ algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AWQ \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--calib_iters 128

# "--woq_algo AutoRound" is used to enable AutoRound algorithms
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--quantize \
--woq_algo AutoRound \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128

# "--accuracy" for eval
python run_clm_no_trainer.py \
--model facebook/opt-125m \
--dataset NeelNanda/pile-10k \
--int8 \
--accuracy \
--tasks "lambada_openai" \
--double_quant_type "BNB_NF4" \
--output_dir saved_results
```

### LLAMA2-7b/13b/70b
>Note: LLAMA requires IPEX >= 2.1 for better accuracy.
#### Quantization

```bash
@@ -107,8 +160,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
--double_quant_type "BNB_NF4"

# "--woq_algo RTN" is used to enable RTN algorithms
@@ -120,8 +171,6 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
--double_quant_type "BNB_NF4"
```

@@ -70,58 +70,59 @@ function run_benchmark {
fi
echo $extra_cmd

if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"\
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
model_name_or_path="EleutherAI/gpt-j-6b"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"\
extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
model_name_or_path="EleutherAI/gpt-j-6b"
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then
model_name_or_path="facebook/opt-125m"
elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo AutoRound"
elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
model_name_or_path="facebook/opt-125m"
fi

if [[ ${mode} == "accuracy" ]]; then
python -u run_clm_no_trainer.py \
--model ${model_name_or_path} \
--output_dir ${tuned_checkpoint} \
--task ${task} \
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
elif [[ ${mode} == "performance" ]]; then
incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
--model ${model_name_or_path} \
--batch_size ${batch_size} \
--output_dir ${tuned_checkpoint} \
${extra_cmd} ${mode_cmd}
else
echo "Error: No such mode: ${mode}"
exit 1
fi

}

main "$@"