pytorch · facebook-github-bot · Oct 23, 2024 · Oct 22, 2024
diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp
@@ -16,6 +16,7 @@
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint6.h>
+#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint7.h>
 #include <torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h>
 #include <cassert>
 
@@ -601,6 +602,49 @@ void unpack_uint_values<6>(
   }
 }
 
+// Benchmark utility to compare variants of uint7 packing (nbit = 7 case).
+template <>
+void pack_uint_values<7>(
+    uint8_t* packed,
+    uint8_t* unpacked,
+    int packed_size,
+    int unpacked_size,
+    int variant) { // NOTE(review): presumably selects the kernel - confirm
+  constexpr int nbit = 7;
+  pack_uint_odd_bit_values( // every argument below is forwarded unchanged
+      torchao::bitpacking::internal::pack_8_uint7_values, // 8 values per call
+      torchao::bitpacking::internal::vec_pack_64_uint7_values, // 64 per call
+      torchao::bitpacking::internal::vec_pack_128_uint7_values, // 128 per call
+      nbit,
+      packed,
+      unpacked,
+      packed_size,
+      unpacked_size,
+      variant);
+}
+
+// Benchmark utility to compare variants of uint7 unpacking (nbit = 7 case).
+template <>
+void unpack_uint_values<7>(
+    uint8_t* unpacked,
+    uint8_t* packed,
+    int unpacked_size,
+    int packed_size,
+    int variant) { // NOTE(review): presumably selects the kernel - confirm
+  constexpr int nbit = 7;
+  unpack_uint_odd_bit_values( // every argument below is forwarded unchanged
+      torchao::bitpacking::internal::unpack_8_uint7_values, // 8 values per call
+      torchao::bitpacking::internal::vec_unpack_64_uint7_values, // 64 per call
+      torchao::bitpacking::internal::vec_unpack_128_uint7_values, // 128 per call
+      nbit,
+      unpacked,
+      packed,
+      unpacked_size,
+      packed_size,
+      variant);
+}
+
+
 } // namespace
 
 template <int nbit>
@@ -653,6 +697,8 @@ BENCHMARK(benchmark_pack_uint_values<5>)->ArgsProduct({{128}, {8, 64, 128}});
 BENCHMARK(benchmark_unpack_uint_values<5>)->ArgsProduct({{128}, {8, 64, 128}});
 BENCHMARK(benchmark_pack_uint_values<6>)->ArgsProduct({{128}, {8, 64, 128}});
 BENCHMARK(benchmark_unpack_uint_values<6>)->ArgsProduct({{128}, {4, 32, 64}});
+BENCHMARK(benchmark_pack_uint_values<7>)->ArgsProduct({{128}, {8, 64, 128}});
+BENCHMARK(benchmark_unpack_uint_values<7>)->ArgsProduct({{128}, {8, 64, 128}});
 
 // Run the benchmark
 BENCHMARK_MAIN();

diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_linear.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_linear.cpp
@@ -243,6 +243,8 @@ BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x1x32_F32_NEONDOT
     5);
 BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x1x32_F32_NEONDOT(
     6);
+BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x1x32_F32_NEONDOT(
+    7);
 BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x4x16_F32_NEONDOT(
     1);
 BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x4x16_F32_NEONDOT(
@@ -255,6 +257,8 @@ BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x4x16_F32_NEONDOT
     5);
 BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x4x16_F32_NEONDOT(
     6);
+BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x4x16_F32_NEONDOT(
+    7);
 BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x4x16_F32_NEONDOT(
     1);
 BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x8x16_F32_NEONDOT(
@@ -267,6 +271,8 @@ BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x8x16_F32_NEONDOT
     5);
 BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x8x16_F32_NEONDOT(
     6);
+BENCHMARK_CHANNELWISE_8BIT_ACTIVATION_GROUPWISE_LOWBIT_WEIGHT_1x8x16_F32_NEONDOT(
+    7);
 
 // Run the benchmark
 BENCHMARK_MAIN();
diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h
@@ -15,6 +15,7 @@
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint5.h>
 #include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint6.h>
+#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint7.h>
 #include <torchao/experimental/kernels/cpu/aarch64/macro.h>
 #include <cassert>
 
@@ -79,10 +80,6 @@ TORCHAO_ALWAYS_INLINE inline void vec_pack_32_lowbit_values(
   static_assert(nbit < 8);
   static_assert(nbit >= 1);
 
-  // Currently supported values
-  static_assert(nbit >= 1);
-  static_assert(nbit <= 6);
-
   // Shift unpacked values to nonnegative range
   int8x16_t shift = vdupq_n_s8(1 << (nbit - 1));
   uint8x16_t shifted0 = vreinterpretq_u8_s8(vaddq_s8(unpacked0, shift));
@@ -144,6 +141,16 @@ TORCHAO_ALWAYS_INLINE inline void vec_pack_32_lowbit_values(
       torchao::bitpacking::internal::vec_pack_32_uint6_values(
           packed, shifted0, shifted1);
       break;
+    case 7:
+      uint8_t buffer7[32];
+      vst1q_u8(buffer7, shifted0);
+      vst1q_u8(buffer7 + 16, shifted1);
+
+      torchao::bitpacking::internal::pack_8_uint7_values(packed, buffer7);
+      torchao::bitpacking::internal::pack_8_uint7_values(packed + 7, buffer7 + 8);
+      torchao::bitpacking::internal::pack_8_uint7_values(packed + 14, buffer7 + 16);
+      torchao::bitpacking::internal::pack_8_uint7_values(packed + 21, buffer7 + 24);
+      break;
     default:
       assert(false);
   }
@@ -157,10 +164,6 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_32_lowbit_values(
   static_assert(nbit < 8);
   static_assert(nbit >= 1);
 
-  // Currently supported values
-  static_assert(nbit >= 1);
-  static_assert(nbit <= 6);
-
   uint8x16_t shifted0;
   uint8x16_t shifted1;
 
@@ -219,6 +222,18 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_32_lowbit_values(
       torchao::bitpacking::internal::vec_unpack_32_uint6_values(
           shifted0, shifted1, packed);
       break;
+    case 7:
+      uint8_t buffer7[32];
+      torchao::bitpacking::internal::unpack_8_uint7_values(buffer7, packed);
+      torchao::bitpacking::internal::unpack_8_uint7_values(
+          buffer7 + 8, packed + 7);
+      torchao::bitpacking::internal::unpack_8_uint7_values(
+          buffer7 + 16, packed + 14);
+      torchao::bitpacking::internal::unpack_8_uint7_values(
+          buffer7 + 24, packed + 21);
+      shifted0 = vld1q_u8(buffer7);
+      shifted1 = vld1q_u8(buffer7 + 16);
+      break;
     default:
       assert(false);
   }
@@ -239,10 +254,6 @@ TORCHAO_ALWAYS_INLINE inline void vec_pack_64_lowbit_values(
   static_assert(nbit < 8);
   static_assert(nbit >= 1);
 
-  // Currently supported values
-  static_assert(nbit >= 1);
-  static_assert(nbit <= 6);
-
   // Shift unpacked values to nonnegative range
   int8x16_t shift = vdupq_n_s8(1 << (nbit - 1));
   uint8x16_t shifted0 = vreinterpretq_u8_s8(vaddq_s8(unpacked0, shift));
@@ -277,6 +288,10 @@ TORCHAO_ALWAYS_INLINE inline void vec_pack_64_lowbit_values(
       torchao::bitpacking::internal::vec_pack_64_uint6_values(
           packed, shifted0, shifted1, shifted2, shifted3);
       break;
+    case 7:
+      torchao::bitpacking::internal::vec_pack_64_uint7_values(
+          packed, shifted0, shifted1, shifted2, shifted3);
+      break;
     default:
       assert(false);
   }
@@ -292,10 +307,6 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_64_lowbit_values(
   static_assert(nbit < 8);
   static_assert(nbit >= 1);
 
-  // Currently supported values
-  static_assert(nbit >= 1);
-  static_assert(nbit <= 6);
-
   uint8x16_t shifted0;
   uint8x16_t shifted1;
   uint8x16_t shifted2;
@@ -328,6 +339,10 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_64_lowbit_values(
       torchao::bitpacking::internal::vec_unpack_64_uint6_values(
           shifted0, shifted1, shifted2, shifted3, packed);
       break;
+    case 7:
+      torchao::bitpacking::internal::vec_unpack_64_uint7_values(
+          shifted0, shifted1, shifted2, shifted3, packed);
+      break;
     default:
       assert(false);
   }
@@ -354,10 +369,6 @@ TORCHAO_ALWAYS_INLINE inline void vec_pack_128_lowbit_values(
   static_assert(nbit < 8);
   static_assert(nbit >= 1);
 
-  // Currently supported values
-  static_assert(nbit >= 1);
-  static_assert(nbit <= 6);
-
   // Shift unpacked values to nonnegative range
   int8x16_t shift = vdupq_n_s8(1 << (nbit - 1));
   uint8x16_t shifted0 = vreinterpretq_u8_s8(vaddq_s8(unpacked0, shift));
@@ -428,6 +439,18 @@ TORCHAO_ALWAYS_INLINE inline void vec_pack_128_lowbit_values(
       torchao::bitpacking::internal::vec_pack_64_uint6_values(
           packed + 48, shifted4, shifted5, shifted6, shifted7);
       break;
+    case 7:
+      torchao::bitpacking::internal::vec_pack_128_uint7_values(
+          packed,
+          shifted0,
+          shifted1,
+          shifted2,
+          shifted3,
+          shifted4,
+          shifted5,
+          shifted6,
+          shifted7);
+      break;
     default:
       assert(false);
   }
@@ -447,10 +470,6 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_lowbit_values(
   static_assert(nbit < 8);
   static_assert(nbit >= 1);
 
-  // Currently supported values
-  static_assert(nbit >= 1);
-  static_assert(nbit <= 6);
-
   uint8x16_t shifted0;
   uint8x16_t shifted1;
   uint8x16_t shifted2;
@@ -519,6 +538,18 @@ TORCHAO_ALWAYS_INLINE inline void vec_unpack_128_lowbit_values(
       torchao::bitpacking::internal::vec_unpack_64_uint6_values(
           shifted4, shifted5, shifted6, shifted7, packed + 48);
       break;
+    case 7:
+      torchao::bitpacking::internal::vec_unpack_128_uint7_values(
+          shifted0,
+          shifted1,
+          shifted2,
+          shifted3,
+          shifted4,
+          shifted5,
+          shifted6,
+          shifted7,
+          packed);
+      break;
     default:
       assert(false);
   }