Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion third-party/ao
Submodule ao updated 79 files
+3 −3 .github/workflows/build_wheels_linux.yml
+2 −0 .github/workflows/regression_test_rocm.yml
+4 −8 .github/workflows/torchao_experimental_test.yml
+13 −5 README.md
+3 −3 benchmarks/float8/float8_roofline.py
+2 −2 benchmarks/float8/profile_lowp_training.py
+1 −0 benchmarks/float8/training/README.md
+2 −1 benchmarks/float8/training/float8_training_benchmark.sh
+27 −4 benchmarks/microbenchmarks/benchmark_inference.py
+63 −8 benchmarks/microbenchmarks/benchmark_runner.py
+6 −2 benchmarks/microbenchmarks/test/benchmark_config.yml
+66 −1 benchmarks/microbenchmarks/test/test_benchmark_inference.py
+104 −0 benchmarks/microbenchmarks/test/test_benchmark_runner.py
+57 −3 benchmarks/microbenchmarks/test/test_utils.py
+41 −4 benchmarks/microbenchmarks/utils.py
+46 −2 benchmarks/mx_formats/cast_bench.py
+3 −0 dev-requirements.txt
+22 −1 test/prototype/mx_formats/test_custom_cast.py
+101 −86 test/prototype/mx_formats/test_mx_linear.py
+224 −18 test/prototype/mx_formats/test_mx_tensor.py
+2 −0 test/quantization/test_galore_quant.py
+0 −1 torchao/_models/llama/generate.py
+38 −14 torchao/csrc/cuda/fp6_llm/fp6_linear.cu
+14 −5 torchao/csrc/cuda/fp6_llm/kernel_matmul.cuh
+6 −0 torchao/dtypes/__init__.py
+12 −8 torchao/dtypes/affine_quantized_tensor.py
+20 −0 torchao/dtypes/affine_quantized_tensor_ops.py
+10 −0 torchao/dtypes/uintx/__init__.py
+450 −0 torchao/dtypes/uintx/packed_linear_int8_dynamic_activation_intx_weight_layout.py
+52 −0 torchao/dtypes/uintx/q_dq_layout.py
+246 −160 torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
+15 −14 torchao/experimental/kernels/cpu/aarch64/embedding/embedding.h
+72 −23 torchao/experimental/kernels/cpu/aarch64/kleidi/kai_matmul_clamp_f32_qai8dxp_qsi4c32p.h
+297 −0 ...r/channelwise_8bit_activation_groupwise_lowbit_weight/channelwise_8bit_activation_groupwise_lowbit_weight.h
+7 −172 ...els/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x1x32_f32_neondot-impl.h
+7 −165 ...els/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x4x16_f32_neondot-impl.h
+47 −177 ...els/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/kernel_1x8x16_f32_neondot-impl.h
+8 −8 ...erimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_activations.h
+186 −11 .../experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight/pack_weights.h
+0 −193 torchao/experimental/kernels/cpu/aarch64/linear/linear.h
+384 −0 torchao/experimental/kernels/cpu/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x16x16_f32_smlal-impl.h
+336 −0 ...hao/experimental/kernels/cpu/aarch64/matmul/channelwise_8bit_a_channelwise_8bit_b_1x8x16_f32_neondot-impl.h
+275 −0 torchao/experimental/kernels/cpu/aarch64/matmul/fp32_a_input_channelwise_8bit_b_1x16x4_f32_impl.h
+95 −0 torchao/experimental/kernels/cpu/aarch64/matmul/matmul.h
+70 −0 torchao/experimental/kernels/cpu/aarch64/matmul/matmul_utils.h
+20 −3 torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
+9 −0 torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt
+1 −0 torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh
+89 −0 torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp
+23 −14 torchao/experimental/kernels/cpu/aarch64/tests/test_embedding.cpp
+428 −223 torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp
+512 −0 torchao/experimental/kernels/cpu/aarch64/tests/test_qmatmul.cpp
+201 −28 torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h
+235 −0 torchao/experimental/kernels/cpu/aarch64/tests/test_utils_quantized_attention.h
+37 −26 torchao/experimental/kernels/cpu/aarch64/tests/test_weight_packing.cpp
+5 −8 torchao/experimental/ops/embedding_xbit/op_embedding_xbit-impl.h
+238 −0 torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h
+111 −114 torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_selector.h
+118 −298 torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.cpp
+20 −124 torchao/experimental/ops/linear_8bit_act_xbit_weight/linear_8bit_act_xbit_weight.h
+46 −66 torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
+3 −3 torchao/experimental/ops/linear_8bit_act_xbit_weight/packed_weights_format.h
+137 −189 torchao/experimental/ops/tests/test_linear_8bit_act_xbit_weight.cpp
+15 −421 torchao/experimental/packed_linear_int8_dynamic_activation_intx_weight_layout.py
+8 −51 torchao/experimental/q_dq_layout.py
+16 −7 torchao/experimental/quant_api.py
+1 −0 torchao/experimental/tests/test_embedding_xbit_quantizer.py
+18 −18 torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py
+7 −18 torchao/float8/README.md
+13 −0 torchao/ops.py
+90 −68 torchao/prototype/mx_formats/README.md
+17 −0 torchao/prototype/mx_formats/__init__.py
+104 −45 torchao/prototype/mx_formats/config.py
+8 −0 torchao/prototype/mx_formats/constants.py
+315 −1 torchao/prototype/mx_formats/custom_cast.py
+99 −86 torchao/prototype/mx_formats/mx_linear.py
+3 −33 torchao/prototype/mx_formats/mx_ops.py
+164 −153 torchao/prototype/mx_formats/mx_tensor.py
+9 −20 torchao/testing/float8/roofline_utils.py
Loading