Molly/v2 bkc (#2448)

DiweiSun · web-flow · commit 12dd0fdf3260 · 2024-09-12T11:11:35.000-07:00
* update for dlrm bkc

* bugfix for dlrm int8

* dlrm_v2 bkc update

* update chatglm bkc

* tuning blocktime for llm optimized perf

* license for dlrm_v2 bkc

* bug fix for chatglm int8

* bugfix for chatglm quantization

---------
diff --git a/models_v2/pytorch/chatglm/inference/cpu/do_quantization.sh b/models_v2/pytorch/chatglm/inference/cpu/do_quantization.sh
@@ -79,7 +79,7 @@ echo "### running with jit mode"
 
 FINETUNED_MODEL=${FINETUNED_MODEL:-"'THUDM/chatglm3-6b'"}
 
-EVAL_SCRIPT=${EVAL_SCRIPT:-"../../../../../../models/language_modeling/pytorch/chatglm/inference/cpu/run_llm.py"}
+EVAL_SCRIPT=${EVAL_SCRIPT:-"${PWD}/run_llm.py"}
 WORK_SPACE=${WORK_SPACE:-${OUTPUT_DIR}}
 rm -rf ${OUTPUT_DIR}/latency_log*
 python -m intel_extension_for_pytorch.cpu.launch --nodes-list 0 --memory-allocator tcmalloc --log_dir=${OUTPUT_DIR} --log_file_prefix="./latency_log_${precision}_${mode}" \
diff --git a/models_v2/pytorch/chatglm/inference/cpu/run_model.sh b/models_v2/pytorch/chatglm/inference/cpu/run_model.sh
@@ -150,7 +150,7 @@ else
             }
         }
     '))
-    first_token_latency=($(grep -i 'first-token-latency:' ${OUTPUT_DIR}/ChatGLM_${PRECISION}_${LOG_PREFIX}*  |sed -e 's/.first-token-latency: //;s/[^0-9.]//g;s/\.$//' |awk '
+    first_token_latency=($(grep -i 'first-token-latency:' ${OUTPUT_DIR}/ChatGLM_${PRECISION}_${LOG_PREFIX}*  |sed -e 's/.*first-token-latency: //;s/[^0-9.]//g;s/\.$//' |awk '
         BEGIN {
             num = 0;
             sum = 0;
diff --git a/models_v2/pytorch/gptj/inference/cpu/run_model.sh b/models_v2/pytorch/gptj/inference/cpu/run_model.sh
@@ -41,7 +41,7 @@ elif [[ "${TEST_MODE}" == "REALTIME" ]]; then
     export OMP_NUM_THREADS=${CORE_PER_INSTANCE}
     BATCH_SIZE=1
     NUM_ITER=${NUM_ITER:-20}
-    export KMP_BLOCKTIME=1
+    export KMP_BLOCKTIME=-1
     rm -rf ${OUTPUT_DIR}/latency_log*
     export USECASE=latency
     ARGS="$ARGS  --benchmark --num-warmup 10 --num-iter $NUM_ITER --token-latency"
diff --git a/models_v2/pytorch/llama/inference/cpu/run_model.sh b/models_v2/pytorch/llama/inference/cpu/run_model.sh
@@ -38,7 +38,7 @@ elif [[ "$TEST_MODE" == "REALTIME" ]]; then
     export LOG_PREFIX="latency_log"
     BATCH_SIZE=${BATCH_SIZE:-1}
     export OMP_NUM_THREADS=${CORE_PER_INSTANCE}
-    export KMP_BLOCKTIME=1
+    export KMP_BLOCKTIME=-1
     rm -rf ${OUTPUT_DIR}/latency_log*
     export usecase=latency
     NUM_WARMUP=${NUM_WARMUP:-10}
diff --git a/models_v2/pytorch/llama/training/cpu/run_model.sh b/models_v2/pytorch/llama/training/cpu/run_model.sh
@@ -60,7 +60,7 @@ if [[ "${DDP}" == "True" ]]; then
     done
 
     export CCL_WORKER_AFFINITY=`echo ${CCL_WORKER_AFFINITY} | tr " " ","`
-    EOF
+EOF
 
     #DDP settings
     export TORCH_CPP_LOG_LEVEL=INFO
diff --git a/models_v2/pytorch/torchrec_dlrm/inference/cpu/_calibration.sh b/models_v2/pytorch/torchrec_dlrm/inference/cpu/_calibration.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Runs the int8 calibration pass for torchrec DLRM: launches dlrm_main.py with
+# --calibration through the intel_extension_for_pytorch CPU launcher.
+#
+# Environment variables:
+#   MODEL_DIR    directory containing models/recommendation/pytorch/torchrec_dlrm
+#                (defaults to the current working directory)
+#   OUTPUT_DIR   required; validated here but not otherwise referenced below
+#   DATASET_DIR  required; passed as --synthetic_multi_hot_criteo_path
+#   WEIGHT_DIR   required; passed as --snapshot-dir
+
+MODEL_DIR=${MODEL_DIR:-$PWD}
+MODEL_SCRIPT="${MODEL_DIR}/models/recommendation/pytorch/torchrec_dlrm/dlrm_main.py"
+INT8_CONFIG="${MODEL_DIR}/models/recommendation/pytorch/torchrec_dlrm/int8_configure.json"
+if [ ! -e "${MODEL_SCRIPT}" ]; then
+    echo "Could not find the script of dlrm_main.py. Please set environment variable '\${MODEL_DIR}'."
+    echo "From which the dlrm_main.py exist at the: \${MODEL_DIR}/models/recommendation/pytorch/torchrec_dlrm/dlrm_main.py"
+    exit 1
+fi
+
+# Fail fast on missing inputs: an empty DATASET_DIR or WEIGHT_DIR would leave
+# its option without a value and let the following option be consumed instead.
+if [ -z "${OUTPUT_DIR}" ]; then
+    echo "The required environment variable OUTPUT_DIR has not been set"
+    exit 1
+fi
+if [ -z "${DATASET_DIR}" ]; then
+    echo "The required environment variable DATASET_DIR has not been set"
+    exit 1
+fi
+if [ -z "${WEIGHT_DIR}" ]; then
+    echo "The required environment variable WEIGHT_DIR has not been set"
+    exit 1
+fi
+
+# Fixed calibration batch size; exported, so it is also visible to the launched process.
+export BATCH_SIZE=32768
+python -m intel_extension_for_pytorch.cpu.launch --node_id 0 --enable_jemalloc "$MODEL_SCRIPT" \
+    --embedding_dim 128 \
+    --dense_arch_layer_sizes 512,256,128 \
+    --over_arch_layer_sizes 1024,1024,512,256,1 \
+    --num_embeddings_per_feature 40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36 \
+    --epochs 1 \
+    --pin_memory \
+    --mmap_mode \
+    --batch_size "$BATCH_SIZE" \
+    --interaction_type=dcn \
+    --dcn_num_layers=3 \
+    --dcn_low_rank_dim=512 \
+    --ipex-optimize \
+    --inference-only \
+    --dtype int8 \
+    --int8-configure-dir "${INT8_CONFIG}" \
+    --calibration \
+    --synthetic_multi_hot_criteo_path "$DATASET_DIR" \
+    --snapshot-dir "$WEIGHT_DIR" \
+    --ipex-merged-emb-cat
diff --git a/models_v2/pytorch/torchrec_dlrm/inference/cpu/prepare_int8.sh b/models_v2/pytorch/torchrec_dlrm/inference/cpu/prepare_int8.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Runs ${MODEL_SCRIPT} with --int8-prepare (plus --jit/--benchmark options).
+#
+# Expects the caller to export:
+#   MODEL_SCRIPT  path of the python script to run
+#   INT8_CONFIG   value passed to --int8-configure-dir
+#   BATCH_SIZE    value passed to --batch_size
+#   EXTRA_ARGS    optional additional options appended to the command line
+
+for required_var in MODEL_SCRIPT INT8_CONFIG BATCH_SIZE; do
+    if [ -z "${!required_var}" ]; then
+        echo "The required environment variable ${required_var} has not been set"
+        exit 1
+    fi
+done
+
+ARGS="--dtype int8 --int8-prepare --ipex-merged-emb-cat --int8-configure-dir ${INT8_CONFIG}"
+# ARGS and EXTRA_ARGS stay unquoted on purpose so they split into separate options.
+python "$MODEL_SCRIPT" \
+    --embedding_dim 128 \
+    --dense_arch_layer_sizes 512,256,128 \
+    --over_arch_layer_sizes 1024,1024,512,256,1 \
+    --num_embeddings_per_feature 40000000,39060,17295,7424,20265,3,7122,1543,63,40000000,3067956,405282,10,2209,11938,155,4,976,14,40000000,40000000,40000000,590152,12973,108,36 \
+    --epochs 1 \
+    --pin_memory \
+    --mmap_mode \
+    --batch_size "$BATCH_SIZE" \
+    --interaction_type=dcn \
+    --dcn_num_layers=3 \
+    --dcn_low_rank_dim=512 \
+    --limit_val_batches 1000 \
+    --ipex-optimize \
+    --log-freq 10 \
+    --jit \
+    --inference-only \
+    --benchmark \
+    $ARGS $EXTRA_ARGS
diff --git a/models_v2/pytorch/torchrec_dlrm/inference/cpu/run_model.sh b/models_v2/pytorch/torchrec_dlrm/inference/cpu/run_model.sh
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+set -x
 ARGS=""
 EXTRA_ARGS=""
 
@@ -25,10 +25,10 @@ if [[ "${TEST_MODE}" == "THROUGHPUT" ]]; then
     LOG_PREFIX=dlrm_inference_performance_log
     if [ -z "${DATASET_DIR}" ]; then
         echo "DATASET_DIR are not set, will use dummy generated dataset"
-        EXTRA_ARGS="$EXTRA_ARGS --multi_hot_distribution_type uniform "
-        EXTRA_ARGS="$EXTRA_ARGS --multi_hot_sizes 3,2,1,2,6,1,1,1,1,7,3,8,1,6,9,5,1,1,1,12,100,27,10,3,1,1 "
+        export EXTRA_ARGS="$EXTRA_ARGS --multi_hot_distribution_type uniform "
+        export EXTRA_ARGS="$EXTRA_ARGS --multi_hot_sizes 3,2,1,2,6,1,1,1,1,7,3,8,1,6,9,5,1,1,1,12,100,27,10,3,1,1 "
     else
-        EXTRA_ARGS="$EXTRA_ARGS --synthetic_multi_hot_criteo_path $DATASET_DIR "
+        export EXTRA_ARGS="$EXTRA_ARGS --synthetic_multi_hot_criteo_path $DATASET_DIR "
     fi
 elif [[ "${TEST_MODE}" == "ACCURACY" ]]; then
     echo "TEST_MODE set to ACCURACY"
@@ -42,7 +42,7 @@ elif [[ "${TEST_MODE}" == "ACCURACY" ]]; then
         echo "The required environment variable WEIGHT_DIR has not been set"
         exit 1
     fi
-    EXTRA_ARGS="$EXTRA_ARGS --synthetic_multi_hot_criteo_path $DATASET_DIR "
+    export EXTRA_ARGS="$EXTRA_ARGS --synthetic_multi_hot_criteo_path $DATASET_DIR "
 else
     echo "Please set TEST_MODE to THROUGHPUT or ACCURACY"
     exit 1
@@ -54,8 +54,8 @@ if [ ! -e "${MODEL_DIR}/dlrm_main.py"  ]; then
     exit 1
 fi
 
-MODEL_SCRIPT=${MODEL_DIR}/dlrm_main.py
-INT8_CONFIG=${MODEL_DIR}/int8_configure.json
+export MODEL_SCRIPT=${MODEL_DIR}/dlrm_main.py
+export INT8_CONFIG=${MODEL_DIR}/int8_configure.json
 
 echo "PRECISION: ${PRECISION}"
 echo "OUTPUT_DIR: ${OUTPUT_DIR}"
@@ -85,9 +85,9 @@ elif [[ $PRECISION == "fp16" ]]; then
     echo "running fp16 path"
     ARGS="$ARGS --dtype fp16"
 elif [[ $PRECISION == "int8" ]]; then
-    if [ ! -e "${MODEL_DIR}/int8_weight.json"  ]; then
-        echo "int8_weight.json not found in MODEL_DIR, will run weight conversion"
-        ARGS="$ARGS --int8-prepare"
+    if [[ "0" == ${TORCH_INDUCTOR} ]];then
+      echo "prepare int8 weight"
+      bash ${MODEL_DIR}/prepare_int8.sh
     fi
     echo "running int8 path"
     ARGS="$ARGS --dtype int8 --int8-configure-dir ${INT8_CONFIG}"

Original file line number	Diff line number	Diff line change
`@@ -150,7 +150,7 @@ else`
`150`	`150`	`}`
`151`	`151`	`}`
`152`	`152`	`'))`
`153`		`- first_token_latency=($(grep -i 'first-token-latency:' ${OUTPUT_DIR}/ChatGLM_${PRECISION}_${LOG_PREFIX}* \|sed -e 's/.first-token-latency: //;s/[^0-9.]//g;s/\.$//' \|awk '`
	`153`	`+ first_token_latency=($(grep -i 'first-token-latency:' ${OUTPUT_DIR}/ChatGLM_${PRECISION}_${LOG_PREFIX}* \|sed -e 's/.*first-token-latency: //;s/[^0-9.]//g;s/\.$//' \|awk '`
`154`	`154`	`BEGIN {`
`155`	`155`	`num = 0;`
`156`	`156`	`sum = 0;`