@@ -298,9 +298,17 @@ jobs:
298
298
python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda-32.json --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
299
299
python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
300
300
301
- fi
301
+ fi
302
+
303
+ for DEVICE in cpu; do # cuda
304
+ # cuda - fails because `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'`
305
+ # follow up with torchao as a separate PR
306
+ echo "saving snapshot for device ${DEVICE} and dtype bfloat16, and reloading as snapshot"
307
+ python3 torchchat.py export --device ${DEVICE} --output-snap model.tc --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
308
+ python3 torchchat.py generate --device ${DEVICE} --snap model.tc --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
309
+ done
302
310
echo "::endgroup::"
303
-
311
+
304
312
test-gpu-aoti-float32 :
305
313
permissions :
306
314
id-token : write
@@ -349,6 +357,11 @@ jobs:
349
357
fi
350
358
echo "::endgroup::"
351
359
360
+ # echo "::group::Run inference with quantize file"
361
+ # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
362
+ # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
363
+ # echo "::endgroup::"
364
+
352
365
test-gpu-aoti-float16 :
353
366
permissions :
354
367
id-token : write
@@ -394,6 +407,11 @@ jobs:
394
407
fi
395
408
echo "::endgroup::"
396
409
410
+ # echo "::group::Run inference with quantize file"
411
+ # python3 torchchat.py export --output-snap model.tc --dtype float16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
412
+ # python3 torchchat.py generate --snap model.tc --dtype float16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
413
+ # echo "::endgroup::"
414
+
397
415
test-gpu-eval-sanity-check :
398
416
permissions :
399
417
id-token : write
@@ -509,12 +527,12 @@ jobs:
509
527
python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
510
528
511
529
echo "******************************************"
512
- echo "*** --quantize torchchat/quant_config/mobile.json ***"
530
+ echo "*** [TEST DISABLED] Can't test --quantize torchchat/quant_config/mobile.json ***"
531
+ echo "*** Testing --quantize torchchat/quant_config/mobile-32.json instead ***"
513
532
echo "******************************************"
514
- # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
533
+ # python torchchat.py export --quantize torchchat/quant_config/mobile-32 .json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
515
534
# python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
516
535
517
-
518
536
echo "******************************************"
519
537
echo "******* Emb: channel-wise quantized ******"
520
538
echo "******************************************"
@@ -528,16 +546,16 @@ jobs:
528
546
python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
529
547
530
548
echo "******************************************"
531
- echo "**** Emb 4bit: channel-wise quantized ****"
549
+ echo "**** [TEST DISABLED] Emb 4bit: channel-wise quantized ****"
532
550
echo "******************************************"
533
- python torchchat.py export --quant '{"embedding" : {"bitwidth": 8 , "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
534
- python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
551
+ # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4 , "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
552
+ # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
535
553
536
554
echo "******************************************"
537
- echo "****** Emb 4bit: group-wise quantized ****"
555
+ echo "****** [TEST DISABLED] Emb 4bit: group-wise quantized ****"
538
556
echo "******************************************"
539
- python torchchat.py export --quant '{"embedding" : {"bitwidth": 8 , "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
540
- python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
557
+ # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4 , "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
558
+ # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
541
559
542
560
echo "******************************************"
543
561
echo "******* INT8 channel-wise quantized ******"
@@ -1069,7 +1087,59 @@ jobs:
1069
1087
./runner/build_android.sh
1070
1088
echo "Tests complete."
1071
1089
1072
- test-torchao-experimental :
1090
+ test-torchao-aoti-experimental :
1091
+ strategy :
1092
+ matrix :
1093
+ runner : [macos-14-xlarge]
1094
+ runs-on : ${{matrix.runner}}
1095
+ steps :
1096
+ - name : Checkout repo
1097
+ uses : actions/checkout@v3
1098
+ with :
1099
+ submodules : true
1100
+ - name : Setup Python
1101
+ uses : actions/setup-python@v2
1102
+ with :
1103
+ python-version : 3.10.11
1104
+ - name : Setup Xcode
1105
+ if : runner.os == 'macOS'
1106
+ uses : maxim-lobanov/setup-xcode@v1
1107
+ with :
1108
+ xcode-version : ' 15.3'
1109
+ - name : Print machine info
1110
+ run : |
1111
+ uname -a
1112
+ if [ $(uname -s) == Darwin ]; then
1113
+ sysctl machdep.cpu.brand_string
1114
+ sysctl machdep.cpu.core_count
1115
+ fi
1116
+ - name : Install torchchat
1117
+ run : |
1118
+ echo "Intalling pip3 packages"
1119
+ ./install/install_requirements.sh
1120
+ pip3 list
1121
+ python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
1122
+ - name : Install torchao-ops
1123
+ id : install-torchao-ops
1124
+ run : |
1125
+ bash torchchat/utils/scripts/build_torchao_ops.sh
1126
+ - name : Install runner AOTI
1127
+ id : install-runner-aoti
1128
+ run : |
1129
+ bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
1130
+ - name : Run inference
1131
+ run : |
1132
+ python torchchat.py download stories110M
1133
+ wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
1134
+ export PRMT="Once upon a time in a land far away"
1135
+ echo "Export and run AOTI (C++ runner)"
1136
+ python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
1137
+ ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
1138
+ echo "Generate AOTI"
1139
+ python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
1140
+ echo "Tests complete."
1141
+
1142
+ test-torchao-et-experimental :
1073
1143
strategy :
1074
1144
matrix :
1075
1145
runner : [macos-14-xlarge]
@@ -1114,10 +1184,6 @@ jobs:
1114
1184
run : |
1115
1185
echo "Installing runner"
1116
1186
bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
1117
- - name : Install runner AOTI
1118
- id : install-runner-aoti
1119
- run : |
1120
- bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
1121
1187
- name : Run inference
1122
1188
run : |
1123
1189
python torchchat.py download stories110M
@@ -1130,11 +1196,6 @@ jobs:
1130
1196
echo "Export and run ET (C++ runner)"
1131
1197
python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
1132
1198
./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
1133
- echo "Export and run AOTI (C++ runner)"
1134
- python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
1135
- ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
1136
- echo "Generate AOTI"
1137
- python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
1138
1199
echo "Tests complete."
1139
1200
1140
1201
test-torchao-experimental-mps :
0 commit comments