From 2062dc704ddf458eee325acd8617f36749d911b0 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Tue, 5 Aug 2025 16:37:57 +0000
Subject: [PATCH 01/18] Make alphas_a a standard C array

---
 .../forced_align/cpu/compute.cpp              | 36 ++++++++++---------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 81f5f0a459..d4881318df 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -1,5 +1,9 @@
 #include <torch/script.h>
 #include <torch/torch.h>
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/tensor.h>
+#include <torch/csrc/stable/ops.h>
+#include <torch/csrc/inductor/aoti_torch/c/shim.h>
 
 using namespace std;
 
@@ -22,17 +26,17 @@ void forced_align_impl(
   const auto T = logProbs.size(1);
   const auto L = targets.size(1);
   const auto S = 2 * L + 1;
-  torch::Tensor alphas = torch::empty(
-                             {2, S},
-                             torch::TensorOptions()
-                                 .device(logProbs.device())
-                                 .dtype(logProbs.dtype()))
-                             .fill_(kNegInfinity);
+
+  auto alphas_a = new scalar_t[S][2];
+  for (int i = 0; i < S; i++) {
+    alphas_a[i][0] = kNegInfinity;
+    alphas_a[i][1] = kNegInfinity;
+  }
+
   torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1);
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
   auto paths_a = paths.accessor<target_t, 2>();
-  auto alphas_a = alphas.accessor<scalar_t, 2>();
   auto backPtr_a = backPtr.accessor<int8_t, 2>();
   auto R = 0;
   for (auto i = 1; i < L; i++) {
@@ -52,7 +56,7 @@ void forced_align_impl(
   auto end = (S == 1) ? 1 : 2;
   for (auto i = start; i < end; i++) {
     auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
-    alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx];
+    alphas_a[i][0] = logProbs_a[batchIndex][0][labelIdx];
   }
   for (auto t = 1; t < T; t++) {
     if (T - t <= L + R) {
@@ -75,18 +79,18 @@ void forced_align_impl(
     auto curIdxOffset = t % 2;
     auto prevIdxOffset = (t - 1) % 2;
     for (auto j = 0; j < S; ++j) {
-      alphas_a[curIdxOffset][j] = -std::numeric_limits<scalar_t>::infinity();
+      alphas_a[j][curIdxOffset] = -std::numeric_limits<scalar_t>::infinity();
     }
     if (start == 0) {
-      alphas_a[curIdxOffset][0] =
-          alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank];
+      alphas_a[0][curIdxOffset] =
+          alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank];
       backPtr_a[t][0] = 0;
       startloop += 1;
     }
 
     for (auto i = startloop; i < end; i++) {
-      auto x0 = alphas_a[prevIdxOffset][i];
-      auto x1 = alphas_a[prevIdxOffset][i - 1];
+      auto x0 = alphas_a[i][prevIdxOffset];
+      auto x1 = alphas_a[i - 1][prevIdxOffset];
       auto x2 = -std::numeric_limits<scalar_t>::infinity();
 
       auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
@@ -97,7 +101,7 @@ void forced_align_impl(
       // (i != 1) just ensures we don't access targets[i - 2] if its i < 2
       if (i % 2 != 0 && i != 1 &&
           targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) {
-        x2 = alphas_a[prevIdxOffset][i - 2];
+        x2 = alphas_a[i - 2][prevIdxOffset];
       }
       scalar_t result = 0.0;
       if (x2 > x1 && x2 > x0) {
@@ -110,11 +114,11 @@ void forced_align_impl(
         result = x0;
         backPtr_a[t][i] = 0;
       }
-      alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx];
+      alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx];
     }
   }
   auto idx1 = (T - 1) % 2;
-  auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2;
+  auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2;
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];

From e70113c879ee86502d5a49cb50a2a8093a8dd029 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Tue, 5 Aug 2025 17:59:19 +0000
Subject: [PATCH 02/18] Convert backPtr to a standard array

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index d4881318df..6dc8bb93ab 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -33,11 +33,14 @@ void forced_align_impl(
     alphas_a[i][1] = kNegInfinity;
   }
 
-  torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1);
+  auto backPtr_a = new int8_t[T * S];
+  for (int i = 0; i < T * S; i++) {
+    backPtr_a[i] = -1;
+  }
+
   auto logProbs_a = logProbs.accessor<scalar_t, 3>();
   auto targets_a = targets.accessor<target_t, 2>();
   auto paths_a = paths.accessor<target_t, 2>();
-  auto backPtr_a = backPtr.accessor<int8_t, 2>();
   auto R = 0;
   for (auto i = 1; i < L; i++) {
     if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) {
@@ -84,7 +87,7 @@ void forced_align_impl(
     if (start == 0) {
       alphas_a[0][curIdxOffset] =
           alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank];
-      backPtr_a[t][0] = 0;
+      backPtr_a[S * t] = 0;
       startloop += 1;
     }
 
@@ -106,13 +109,13 @@ void forced_align_impl(
       scalar_t result = 0.0;
       if (x2 > x1 && x2 > x0) {
         result = x2;
-        backPtr_a[t][i] = 2;
+        backPtr_a[t * S + i] = 2;
       } else if (x1 > x0 && x1 > x2) {
         result = x1;
-        backPtr_a[t][i] = 1;
+        backPtr_a[t * S + i] = 1;
       } else {
         result = x0;
-        backPtr_a[t][i] = 0;
+        backPtr_a[t * S + i] = 0;
       }
       alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx];
     }
@@ -123,7 +126,7 @@ void forced_align_impl(
   for (auto t = T - 1; t > -1; t--) {
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
-    ltrIdx -= backPtr_a[t][ltrIdx];
+    ltrIdx -= backPtr_a[t * S + ltrIdx];
   }
 }
 

From 4039399cca52ffdf30148e8e2bfac645dffc641b Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Tue, 5 Aug 2025 19:10:34 +0000
Subject: [PATCH 03/18] Create Accessor class

---
 .../forced_align/cpu/compute.cpp              | 74 +++++++++++++++----
 1 file changed, 60 insertions(+), 14 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 6dc8bb93ab..e641525ddd 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -4,12 +4,58 @@
 #include <torch/csrc/stable/tensor.h>
 #include <torch/csrc/stable/ops.h>
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+#include <torch/csrc/inductor/aoti_torch/utils.h>
+#include <cstdarg>
+
 
 using namespace std;
 
 namespace torchaudio {
 namespace alignment {
 namespace cpu {
+
+template<unsigned int k, typename T>
+class Accessor {
+  int64_t shape[k];
+  T *data;
+
+public:
+  Accessor(const torch::Tensor& tensor) {
+    data = tensor.data_ptr<T>();
+    for (int i = 0; i < k; i++) {
+      shape[i] = tensor.size(i);
+    }
+  }
+
+  T index(...) {
+    va_list args;
+    va_start(args, k);
+    int64_t ix = 0;
+    for (int i = 0; i < k; i++) {
+      if (i == k - 1)
+        ix += va_arg(args, int);
+      else
+        ix += shape[i+1] * va_arg(args, int);
+    }
+    va_end(args);
+    return data[ix];
+  }
+
+  // void set_index(T val,...) {
+  //   va_list args;
+  //   va_start(args, k);
+  //   int64_t ix = 0;
+  //   for (int i = 0; i < k; i++) {
+  //     if (i == k - 1)
+  //       ix += va_arg(args, int);
+  //     else
+  //       ix += shape[i+1] * va_arg(args, int);
+  //   }
+  //   va_end(args);
+  //   data[ix] = val;
+  // }
+};
+
 // Inspired from
 // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp
 template <typename scalar_t, at::ScalarType target_scalar_type>
@@ -38,12 +84,12 @@ void forced_align_impl(
     backPtr_a[i] = -1;
   }
 
-  auto logProbs_a = logProbs.accessor<scalar_t, 3>();
-  auto targets_a = targets.accessor<target_t, 2>();
+  auto logProbs_a = Accessor<3, scalar_t>(logProbs);
+  auto targets_a = Accessor<2, target_t>(targets);
   auto paths_a = paths.accessor<target_t, 2>();
   auto R = 0;
   for (auto i = 1; i < L; i++) {
-    if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) {
+    if (targets_a.index(batchIndex, i) == targets_a.index(batchIndex, i - 1)) {
       ++R;
     }
   }
@@ -58,22 +104,22 @@ void forced_align_impl(
   auto start = T - (L + R) > 0 ? 0 : 1;
   auto end = (S == 1) ? 1 : 2;
   for (auto i = start; i < end; i++) {
-    auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
-    alphas_a[i][0] = logProbs_a[batchIndex][0][labelIdx];
+    auto labelIdx = (i % 2 == 0) ? blank : targets_a.index(batchIndex, i / 2);
+    alphas_a[i][0] = logProbs_a.index(batchIndex,0,labelIdx);
   }
   for (auto t = 1; t < T; t++) {
     if (T - t <= L + R) {
       if ((start % 2 == 1) &&
-          targets_a[batchIndex][start / 2] !=
-              targets_a[batchIndex][start / 2 + 1]) {
+          targets_a.index(batchIndex, start / 2) !=
+              targets_a.index(batchIndex, start / 2 + 1)) {
         start = start + 1;
       }
       start = start + 1;
     }
     if (t <= L + R) {
       if (end % 2 == 0 && end < 2 * L &&
-          targets_a[batchIndex][end / 2 - 1] !=
-              targets_a[batchIndex][end / 2]) {
+          targets_a.index(batchIndex, end / 2 - 1) !=
+              targets_a.index(batchIndex, end / 2)) {
         end = end + 1;
       }
       end = end + 1;
@@ -86,7 +132,7 @@ void forced_align_impl(
     }
     if (start == 0) {
       alphas_a[0][curIdxOffset] =
-          alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank];
+          alphas_a[0][prevIdxOffset] + logProbs_a.index(batchIndex, t, blank);
       backPtr_a[S * t] = 0;
       startloop += 1;
     }
@@ -96,14 +142,14 @@ void forced_align_impl(
       auto x1 = alphas_a[i - 1][prevIdxOffset];
       auto x2 = -std::numeric_limits<scalar_t>::infinity();
 
-      auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
+      auto labelIdx = (i % 2 == 0) ? blank : targets_a.index(batchIndex, i / 2);
 
       // In CTC, the optimal path may optionally chose to skip a blank label.
       // x2 represents skipping a letter, and can only happen if we're not
       // currently on a blank_label, and we're not on a repeat letter
       // (i != 1) just ensures we don't access targets[i - 2] if its i < 2
       if (i % 2 != 0 && i != 1 &&
-          targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) {
+          targets_a.index(batchIndex, i / 2) != targets_a.index(batchIndex, i / 2 - 1)) {
         x2 = alphas_a[i - 2][prevIdxOffset];
       }
       scalar_t result = 0.0;
@@ -117,14 +163,14 @@ void forced_align_impl(
         result = x0;
         backPtr_a[t * S + i] = 0;
       }
-      alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx];
+      alphas_a[i][curIdxOffset] = result + logProbs_a.index(batchIndex, t, labelIdx);
     }
   }
   auto idx1 = (T - 1) % 2;
   auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2;
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {
-    auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
+    auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a.index(batchIndex, ltrIdx / 2);
     paths_a[batchIndex][t] = lbl_idx;
     ltrIdx -= backPtr_a[t * S + ltrIdx];
   }

From b733629b0990232a57991dded17e71ad395df009 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Tue, 5 Aug 2025 19:16:29 +0000
Subject: [PATCH 04/18] Add MutAccessor

---
 .../forced_align/cpu/compute.cpp              | 46 +++++++++++++------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index e641525ddd..b0d35c334b 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -40,22 +40,38 @@ class Accessor {
     va_end(args);
     return data[ix];
   }
+};
+
+
+template<unsigned int k, typename T>
+class MutAccessor {
+  int64_t shape[k];
+  T *data;
+
+public:
+ MutAccessor(torch::Tensor& tensor) {
+    data = tensor.data_ptr<T>();
+    for (int i = 0; i < k; i++) {
+      shape[i] = tensor.size(i);
+    }
+  }
 
-  // void set_index(T val,...) {
-  //   va_list args;
-  //   va_start(args, k);
-  //   int64_t ix = 0;
-  //   for (int i = 0; i < k; i++) {
-  //     if (i == k - 1)
-  //       ix += va_arg(args, int);
-  //     else
-  //       ix += shape[i+1] * va_arg(args, int);
-  //   }
-  //   va_end(args);
-  //   data[ix] = val;
-  // }
+  void set_index(T value,...) {
+    va_list args;
+    va_start(args, k);
+    int64_t ix = 0;
+    for (int i = 0; i < k; i++) {
+      if (i == k - 1)
+        ix += va_arg(args, int);
+      else
+        ix += shape[i+1] * va_arg(args, int);
+    }
+    va_end(args);
+    data[ix] = value;
+  }
 };
 
+
 // Inspired from
 // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp
 template <typename scalar_t, at::ScalarType target_scalar_type>
@@ -86,7 +102,7 @@ void forced_align_impl(
 
   auto logProbs_a = Accessor<3, scalar_t>(logProbs);
   auto targets_a = Accessor<2, target_t>(targets);
-  auto paths_a = paths.accessor<target_t, 2>();
+  auto paths_a = MutAccessor<2, target_t>(paths);
   auto R = 0;
   for (auto i = 1; i < L; i++) {
     if (targets_a.index(batchIndex, i) == targets_a.index(batchIndex, i - 1)) {
@@ -171,7 +187,7 @@ void forced_align_impl(
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a.index(batchIndex, ltrIdx / 2);
-    paths_a[batchIndex][t] = lbl_idx;
+    paths_a.set_index(lbl_idx, batchIndex, t);
     ltrIdx -= backPtr_a[t * S + ltrIdx];
   }
 }

From 9beb34a931defb7564e32f285753d36465ce8f3d Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Tue, 5 Aug 2025 21:28:27 +0000
Subject: [PATCH 05/18] Fix multidimensional indexing bug

---
 .../forced_align/cpu/compute.cpp              | 35 ++++++++++++-------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index b0d35c334b..0579b5cf8c 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -14,17 +14,27 @@ namespace torchaudio {
 namespace alignment {
 namespace cpu {
 
+// Compute strides for row-major indexing
+template<unsigned int k>
+void reverse_cumprod(int64_t (&strides)[k]) {
+  // Convert dimensions to strides: stride[i] = product of dimensions [i+1..k-1]
+  for (int i = k - 2; i >= 0; i--) {
+    strides[i] = strides[i] * strides[i + 1];
+  }
+}
+
 template<unsigned int k, typename T>
 class Accessor {
-  int64_t shape[k];
+  int64_t strides[k-1];
   T *data;
 
 public:
   Accessor(const torch::Tensor& tensor) {
     data = tensor.data_ptr<T>();
-    for (int i = 0; i < k; i++) {
-      shape[i] = tensor.size(i);
+    for (int i = 1; i < k; i++) {
+      strides[i-1] = tensor.size(i);
     }
+    reverse_cumprod<k-1>(strides);
   }
 
   T index(...) {
@@ -35,43 +45,42 @@ class Accessor {
       if (i == k - 1)
         ix += va_arg(args, int);
       else
-        ix += shape[i+1] * va_arg(args, int);
+        ix += strides[i] * va_arg(args, int);
     }
     va_end(args);
     return data[ix];
   }
 };
 
-
 template<unsigned int k, typename T>
 class MutAccessor {
-  int64_t shape[k];
+  int64_t strides[k-1];
   T *data;
 
 public:
- MutAccessor(torch::Tensor& tensor) {
+  MutAccessor(torch::Tensor& tensor) {
     data = tensor.data_ptr<T>();
-    for (int i = 0; i < k; i++) {
-      shape[i] = tensor.size(i);
+    for (int i = 1; i < k; i++) {
+      strides[i-1] = tensor.size(i);
     }
+    reverse_cumprod<k-1>(strides);
   }
 
-  void set_index(T value,...) {
+  void set_index(T value, ...) {
     va_list args;
-    va_start(args, k);
+    va_start(args, value);
     int64_t ix = 0;
     for (int i = 0; i < k; i++) {
       if (i == k - 1)
         ix += va_arg(args, int);
       else
-        ix += shape[i+1] * va_arg(args, int);
+        ix += strides[i] * va_arg(args, int);
     }
     va_end(args);
     data[ix] = value;
   }
 };
 
-
 // Inspired from
 // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp
 template <typename scalar_t, at::ScalarType target_scalar_type>

From 11d1e217f0fba3c70d0f6846d829c5d6bd8659ce Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Tue, 5 Aug 2025 22:21:33 +0000
Subject: [PATCH 06/18] Use strides rather than computing standard strides from
 dims

---
 .../forced_align/cpu/compute.cpp              | 29 ++++---------------
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 0579b5cf8c..3ba23c4797 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -14,27 +14,17 @@ namespace torchaudio {
 namespace alignment {
 namespace cpu {
 
-// Compute strides for row-major indexing
-template<unsigned int k>
-void reverse_cumprod(int64_t (&strides)[k]) {
-  // Convert dimensions to strides: stride[i] = product of dimensions [i+1..k-1]
-  for (int i = k - 2; i >= 0; i--) {
-    strides[i] = strides[i] * strides[i + 1];
-  }
-}
-
 template<unsigned int k, typename T>
 class Accessor {
-  int64_t strides[k-1];
+  int64_t strides[k];
   T *data;
 
 public:
   Accessor(const torch::Tensor& tensor) {
     data = tensor.data_ptr<T>();
-    for (int i = 1; i < k; i++) {
-      strides[i-1] = tensor.size(i);
+    for (int i = 0; i < k; i++) {
+      strides[i] = tensor.stride(i);
     }
-    reverse_cumprod<k-1>(strides);
   }
 
   T index(...) {
@@ -42,9 +32,6 @@ class Accessor {
     va_start(args, k);
     int64_t ix = 0;
     for (int i = 0; i < k; i++) {
-      if (i == k - 1)
-        ix += va_arg(args, int);
-      else
         ix += strides[i] * va_arg(args, int);
     }
     va_end(args);
@@ -54,16 +41,15 @@ class Accessor {
 
 template<unsigned int k, typename T>
 class MutAccessor {
-  int64_t strides[k-1];
+  int64_t strides[k];
   T *data;
 
 public:
   MutAccessor(torch::Tensor& tensor) {
     data = tensor.data_ptr<T>();
-    for (int i = 1; i < k; i++) {
-      strides[i-1] = tensor.size(i);
+    for (int i = 0; i < k; i++) {
+      strides[i] = tensor.stride(i);
     }
-    reverse_cumprod<k-1>(strides);
   }
 
   void set_index(T value, ...) {
@@ -71,9 +57,6 @@ class MutAccessor {
     va_start(args, value);
     int64_t ix = 0;
     for (int i = 0; i < k; i++) {
-      if (i == k - 1)
-        ix += va_arg(args, int);
-      else
         ix += strides[i] * va_arg(args, int);
     }
     va_end(args);

From b47c053919739ba5a48c214fceaf161f7e007115 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Wed, 6 Aug 2025 16:04:24 +0000
Subject: [PATCH 07/18] Merge Accessor and MutAccessor

---
 .../forced_align/cpu/compute.cpp              | 32 +++++++------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 3ba23c4797..8a5389bff5 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -6,6 +6,7 @@
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
 #include <torch/csrc/inductor/aoti_torch/utils.h>
 #include <cstdarg>
+#include <type_traits>
 
 
 using namespace std;
@@ -14,14 +15,16 @@ namespace torchaudio {
 namespace alignment {
 namespace cpu {
 
-template<unsigned int k, typename T>
+template<unsigned int k, typename T, bool IsConst = true>
 class Accessor {
   int64_t strides[k];
   T *data;
 
 public:
-  Accessor(const torch::Tensor& tensor) {
-    data = tensor.data_ptr<T>();
+  using tensor_type = typename std::conditional<IsConst, const torch::Tensor&, torch::Tensor&>::type;
+  
+  Accessor(tensor_type tensor) {
+    data = tensor.template data_ptr<T>();
     for (int i = 0; i < k; i++) {
       strides[i] = tensor.stride(i);
     }
@@ -37,22 +40,9 @@ class Accessor {
     va_end(args);
     return data[ix];
   }
-};
-
-template<unsigned int k, typename T>
-class MutAccessor {
-  int64_t strides[k];
-  T *data;
-
-public:
-  MutAccessor(torch::Tensor& tensor) {
-    data = tensor.data_ptr<T>();
-    for (int i = 0; i < k; i++) {
-      strides[i] = tensor.stride(i);
-    }
-  }
 
-  void set_index(T value, ...) {
+  template<bool C = IsConst>
+  typename std::enable_if<!C, void>::type set_index(T value, ...) {
     va_list args;
     va_start(args, value);
     int64_t ix = 0;
@@ -92,9 +82,9 @@ void forced_align_impl(
     backPtr_a[i] = -1;
   }
 
-  auto logProbs_a = Accessor<3, scalar_t>(logProbs);
-  auto targets_a = Accessor<2, target_t>(targets);
-  auto paths_a = MutAccessor<2, target_t>(paths);
+  auto logProbs_a = Accessor<3, scalar_t, true>(logProbs);
+  auto targets_a = Accessor<2, target_t, true>(targets);
+  auto paths_a = Accessor<2, target_t, false>(paths);
   auto R = 0;
   for (auto i = 1; i < L; i++) {
     if (targets_a.index(batchIndex, i) == targets_a.index(batchIndex, i - 1)) {

From 7a94b04e7c584a171934b62367bc2763ae1deace Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Wed, 6 Aug 2025 16:45:56 +0000
Subject: [PATCH 08/18] Move Accessor to its own file and add tests

---
 src/libtorchaudio/CMakeLists.txt              |  1 +
 src/libtorchaudio/accessor.h                  | 44 +++++++++++++++++++
 src/libtorchaudio/accessor_tests.cpp          | 25 +++++++++++
 .../forced_align/cpu/compute.cpp              | 40 +----------------
 test/torchaudio_unittest/accessor_test.py     |  7 +++
 5 files changed, 78 insertions(+), 39 deletions(-)
 create mode 100644 src/libtorchaudio/accessor.h
 create mode 100644 src/libtorchaudio/accessor_tests.cpp
 create mode 100644 test/torchaudio_unittest/accessor_test.py

diff --git a/src/libtorchaudio/CMakeLists.txt b/src/libtorchaudio/CMakeLists.txt
index 85bc227cd6..20ad792b32 100644
--- a/src/libtorchaudio/CMakeLists.txt
+++ b/src/libtorchaudio/CMakeLists.txt
@@ -6,6 +6,7 @@ set(
   lfilter.cpp
   overdrive.cpp
   utils.cpp
+  accessor_tests.cpp
   )
 
 set(
diff --git a/src/libtorchaudio/accessor.h b/src/libtorchaudio/accessor.h
new file mode 100644
index 0000000000..ed2b9f6257
--- /dev/null
+++ b/src/libtorchaudio/accessor.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <torch/torch.h>
+#include <type_traits>
+#include <cstdarg>
+
+template<unsigned int k, typename T, bool IsConst = true>
+class Accessor {
+  int64_t strides[k];
+  T *data;
+
+public:
+  using tensor_type = typename std::conditional<IsConst, const torch::Tensor&, torch::Tensor&>::type;
+
+  Accessor(tensor_type tensor) {
+    data = tensor.template data_ptr<T>();
+    for (int i = 0; i < k; i++) {
+      strides[i] = tensor.stride(i);
+    }
+  }
+
+  T index(...) {
+    va_list args;
+    va_start(args, k);
+    int64_t ix = 0;
+    for (int i = 0; i < k; i++) {
+        ix += strides[i] * va_arg(args, int);
+    }
+    va_end(args);
+    return data[ix];
+  }
+
+  template<bool C = IsConst>
+  typename std::enable_if<!C, void>::type set_index(T value, ...) {
+    va_list args;
+    va_start(args, value);
+    int64_t ix = 0;
+    for (int i = 0; i < k; i++) {
+        ix += strides[i] * va_arg(args, int);
+    }
+    va_end(args);
+    data[ix] = value;
+  }
+};
diff --git a/src/libtorchaudio/accessor_tests.cpp b/src/libtorchaudio/accessor_tests.cpp
new file mode 100644
index 0000000000..ca6e48b193
--- /dev/null
+++ b/src/libtorchaudio/accessor_tests.cpp
@@ -0,0 +1,25 @@
+#include <libtorchaudio/accessor.h>
+#include <cstdint>
+#include <torch/torch.h>
+
+using namespace std;
+
+bool test_accessor(const torch::Tensor& tensor) {
+  int64_t* data_ptr = tensor.template data_ptr<int64_t>();
+  auto accessor = Accessor<3, int64_t>(tensor);
+  for (int i = 0; i < tensor.size(0); i++) {
+    for (int j = 0; j < tensor.size(1); j++) {
+      for (int k = 0; k < tensor.size(2); k++) {
+        auto check = *(data_ptr++) ==  accessor.index(i, j, k);
+        if (!check) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
+  m.def("torchaudio::_test_accessor", &test_accessor);
+}
diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 8a5389bff5..c0380f25ad 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -5,8 +5,7 @@
 #include <torch/csrc/stable/ops.h>
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
 #include <torch/csrc/inductor/aoti_torch/utils.h>
-#include <cstdarg>
-#include <type_traits>
+#include <libtorchaudio/accessor.h>
 
 
 using namespace std;
@@ -15,44 +14,7 @@ namespace torchaudio {
 namespace alignment {
 namespace cpu {
 
-template<unsigned int k, typename T, bool IsConst = true>
-class Accessor {
-  int64_t strides[k];
-  T *data;
-
-public:
-  using tensor_type = typename std::conditional<IsConst, const torch::Tensor&, torch::Tensor&>::type;
-  
-  Accessor(tensor_type tensor) {
-    data = tensor.template data_ptr<T>();
-    for (int i = 0; i < k; i++) {
-      strides[i] = tensor.stride(i);
-    }
-  }
 
-  T index(...) {
-    va_list args;
-    va_start(args, k);
-    int64_t ix = 0;
-    for (int i = 0; i < k; i++) {
-        ix += strides[i] * va_arg(args, int);
-    }
-    va_end(args);
-    return data[ix];
-  }
-
-  template<bool C = IsConst>
-  typename std::enable_if<!C, void>::type set_index(T value, ...) {
-    va_list args;
-    va_start(args, value);
-    int64_t ix = 0;
-    for (int i = 0; i < k; i++) {
-        ix += strides[i] * va_arg(args, int);
-    }
-    va_end(args);
-    data[ix] = value;
-  }
-};
 
 // Inspired from
 // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp
diff --git a/test/torchaudio_unittest/accessor_test.py b/test/torchaudio_unittest/accessor_test.py
new file mode 100644
index 0000000000..db14258dc6
--- /dev/null
+++ b/test/torchaudio_unittest/accessor_test.py
@@ -0,0 +1,7 @@
+import torch
+from torchaudio._extension import _IS_TORCHAUDIO_EXT_AVAILABLE
+
+if _IS_TORCHAUDIO_EXT_AVAILABLE:
+    def test_accessor():
+        tensor = torch.randint(1000, (5,4,3))
+        assert torch.ops.torchaudio._test_accessor(tensor)

From 75d246a646a76bd85f9613d91df23a2e3db08752 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Wed, 6 Aug 2025 16:51:41 +0000
Subject: [PATCH 09/18] Add comment about original indexing

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 6dc8bb93ab..3b56aa000a 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -87,7 +87,7 @@ void forced_align_impl(
     if (start == 0) {
       alphas_a[0][curIdxOffset] =
           alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank];
-      backPtr_a[S * t] = 0;
+      backPtr_a[S * t] = 0; // backPtr_a[t][0] = 0
       startloop += 1;
     }
 
@@ -109,13 +109,13 @@ void forced_align_impl(
       scalar_t result = 0.0;
       if (x2 > x1 && x2 > x0) {
         result = x2;
-        backPtr_a[t * S + i] = 2;
+        backPtr_a[t * S + i] = 2; // backPtr_a[t][i] = 2
       } else if (x1 > x0 && x1 > x2) {
         result = x1;
-        backPtr_a[t * S + i] = 1;
+        backPtr_a[t * S + i] = 1; // backPtr_a[t][i] = 1
       } else {
         result = x0;
-        backPtr_a[t * S + i] = 0;
+        backPtr_a[t * S + i] = 0; // backPtr_a[t][i] = 0
       }
       alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx];
     }
@@ -126,7 +126,7 @@ void forced_align_impl(
   for (auto t = T - 1; t > -1; t--) {
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];
     paths_a[batchIndex][t] = lbl_idx;
-    ltrIdx -= backPtr_a[t * S + ltrIdx];
+    ltrIdx -= backPtr_a[t * S + ltrIdx]; // backPtr_a[t][ltrIdx]
   }
 }
 

From 30ed519b9af566c063c64e74603a9159e9728e26 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Wed, 6 Aug 2025 17:17:23 +0000
Subject: [PATCH 10/18] Add requested comment about scalar_t

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index d4881318df..f4b61e272f 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -27,7 +27,7 @@ void forced_align_impl(
   const auto L = targets.size(1);
   const auto S = 2 * L + 1;
 
-  auto alphas_a = new scalar_t[S][2];
+  auto alphas_a = new scalar_t[S][2]; // scalar_t is just logProbs.dtype()
   for (int i = 0; i < S; i++) {
     alphas_a[i][0] = kNegInfinity;
     alphas_a[i][1] = kNegInfinity;

From be13f647d7b492787fdbd71a0ffc92a4459fd013 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Wed, 6 Aug 2025 18:53:06 +0000
Subject: [PATCH 11/18] WIP

---
 src/libtorchaudio/accessor.h                  |  6 +-
 src/libtorchaudio/accessor_tests.cpp          | 21 +++--
 .../forced_align/cpu/compute.cpp              | 79 ++++++++++++-------
 3 files changed, 70 insertions(+), 36 deletions(-)

diff --git a/src/libtorchaudio/accessor.h b/src/libtorchaudio/accessor.h
index ed2b9f6257..2f763cf3c8 100644
--- a/src/libtorchaudio/accessor.h
+++ b/src/libtorchaudio/accessor.h
@@ -1,16 +1,18 @@
 #pragma once
 
-#include <torch/torch.h>
+#include <torch/csrc/stable/tensor.h>
 #include <type_traits>
 #include <cstdarg>
 
+using torch::stable::Tensor;
+
 template<unsigned int k, typename T, bool IsConst = true>
 class Accessor {
   int64_t strides[k];
   T *data;
 
 public:
-  using tensor_type = typename std::conditional<IsConst, const torch::Tensor&, torch::Tensor&>::type;
+  using tensor_type = typename std::conditional<IsConst, const Tensor&, Tensor&>::type;
 
   Accessor(tensor_type tensor) {
     data = tensor.template data_ptr<T>();
diff --git a/src/libtorchaudio/accessor_tests.cpp b/src/libtorchaudio/accessor_tests.cpp
index ca6e48b193..e371a6b1ad 100644
--- a/src/libtorchaudio/accessor_tests.cpp
+++ b/src/libtorchaudio/accessor_tests.cpp
@@ -1,15 +1,18 @@
 #include <libtorchaudio/accessor.h>
 #include <cstdint>
 #include <torch/torch.h>
+#include <torch/csrc/stable/tensor.h>
+#include <torch/csrc/stable/library.h>
 
 using namespace std;
+using torch::stable::Tensor;
 
-bool test_accessor(const torch::Tensor& tensor) {
+bool test_accessor(const Tensor tensor) {
   int64_t* data_ptr = tensor.template data_ptr<int64_t>();
   auto accessor = Accessor<3, int64_t>(tensor);
-  for (int i = 0; i < tensor.size(0); i++) {
-    for (int j = 0; j < tensor.size(1); j++) {
-      for (int k = 0; k < tensor.size(2); k++) {
+  for (unsigned int i = 0; i < tensor.size(0); i++) {
+    for (unsigned int j = 0; j < tensor.size(1); j++) {
+      for (unsigned int k = 0; k < tensor.size(2); k++) {
         auto check = *(data_ptr++) ==  accessor.index(i, j, k);
         if (!check) {
           return false;
@@ -20,6 +23,12 @@ bool test_accessor(const torch::Tensor& tensor) {
   return true;
 }
 
-TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
-  m.def("torchaudio::_test_accessor", &test_accessor);
+void boxed_test_accessor(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  Tensor t1(to<AtenTensorHandle>(stack[0]));
+  auto result = compute(std::move(t1));
+  stack[0] = from(result);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
+  m.def("torchaudio::_test_accessor", &boxed_test_accessor);
 }
diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index c0380f25ad..207af046a1 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -14,19 +14,17 @@ namespace torchaudio {
 namespace alignment {
 namespace cpu {
 
-
+using torch::stable::Tensor;
 
 // Inspired from
 // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp
-template <typename scalar_t, at::ScalarType target_scalar_type>
+template <typename scalar_t, typename target_t>
 void forced_align_impl(
-    const torch::Tensor& logProbs,
-    const torch::Tensor& targets,
-    const int64_t blank,
-    torch::Tensor& paths) {
+    const Tensor logProbs,
+    const Tensor targets,
+    const Tensor blank,
+    Tensor paths) {
   const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
-  using target_t = typename std::
-      conditional<target_scalar_type == torch::kInt, int, int64_t>::type;
   const auto batchIndex =
       0; // TODO: support batch version and use the real batch index
   const auto T = logProbs.size(1);
@@ -136,11 +134,11 @@ void forced_align_impl(
   }
 }
 
-std::tuple<torch::Tensor, torch::Tensor> compute(
-    const torch::Tensor& logProbs,
-    const torch::Tensor& targets,
-    const torch::Tensor& inputLengths,
-    const torch::Tensor& targetLengths,
+std::tuple<Tensor, Tensor> compute(
+    const Tensor& logProbs,
+    const Tensor& targets,
+    const Tensor& inputLengths,
+    const Tensor& targetLengths,
     const int64_t blank) {
   TORCH_CHECK(logProbs.is_cpu(), "log_probs must be a CPU tensor");
   TORCH_CHECK(targets.is_cpu(), "targets must be a CPU tensor");
@@ -185,19 +183,31 @@ std::tuple<torch::Tensor, torch::Tensor> compute(
 
   const auto B = logProbs.size(0);
   const auto T = logProbs.size(1);
-  auto paths = torch::zeros(
-      {B, T},
-      torch::TensorOptions().device(targets.device()).dtype(targets.dtype()));
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      logProbs.scalar_type(), "forced_align_impl", [&] {
-        if (targets.scalar_type() == torch::kInt64) {
-          forced_align_impl<scalar_t, torch::kInt64>(
-              logProbs, targets, blank, paths);
-        } else {
-          forced_align_impl<scalar_t, torch::kInt32>(
-              logProbs, targets, blank, paths);
-        }
-      });
+
+  int64_t paths_size[2] = {B, T};
+  int64_t paths_stride[2] = {T, 1};
+  AtenTensorHandle paths_h;
+  aoti_torch_empty_strided(1, paths_size, paths_stride, targets_dtype, targets_device, targets_device_index, &paths_h);
+  auto paths = Tensor(paths_h);
+
+
+  if (targets.scalar_type() == aoti_torch_dtype_int64()) {
+    if (logProbs.scalar_type() == aoti_torch_dtype_float64()) {
+      forced_align_impl<float64, int64>(logProbs, targets, blank, paths);
+    } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) {
+      forced_align_impl<float32, int64>(logProbs, targets, blank, paths);
+    } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) {
+      forced_align_impl<float16, int64>(logProbs, targets, blank, paths);
+    }
+  } else if (targets.scalar_type() == aoti_torch_dtype_int32()) {
+    if (logProbs.scalar_type() == aoti_torch_dtype_float64()) {
+      forced_align_impl<float64, int32>(logProbs, targets, blank, paths);
+    } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) {
+      forced_align_impl<float32, int32>(logProbs, targets, blank, paths);
+    } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) {
+      forced_align_impl<float16, int32>(logProbs, targets, blank, paths);
+    }
+  }
   return std::make_tuple(
       paths,
       logProbs.index(
@@ -207,8 +217,21 @@ std::tuple<torch::Tensor, torch::Tensor> compute(
            paths.index({0})}));
 }
 
-TORCH_LIBRARY_IMPL(torchaudio, CPU, m) {
-  m.impl("forced_align", &compute);
+
+void boxed_compute(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  Tensor t1(to<AtenTensorHandle>(stack[0]));
+  Tensor t2(to<AtenTensorHandle>(stack[1]));
+  Tensor t3(to<AtenTensorHandle>(stack[2]));
+  Tensor t4(to<AtenTensorHandle>(stack[3]));
+  int64_t blank = to<int64_t>(stack[4]);
+  auto result = compute(
+      std::move(t1), std::move(t2), std::move(t3), std::move(t4), blank);
+  stack[0] = from(std::get<0>(result));
+  stack[1] = from(std::get<1>(result));
+}
+
+STABLE_TORCH_LIBRARY_IMPL(torchaudio, CPU, m) {
+  m.impl("forced_align", &boxed_compute);
 }
 
 } // namespace cpu

From 77fd1ad5926d91cd48bc1099f71e3d86f510c8c7 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Wed, 6 Aug 2025 21:29:11 +0000
Subject: [PATCH 12/18] Use stable tensors throughout forced_align code

---
 src/libtorchaudio/accessor.h                  |  8 +--
 src/libtorchaudio/accessor_tests.cpp          | 20 +++++--
 .../forced_align/cpu/compute.cpp              | 56 ++++++++++---------
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/src/libtorchaudio/accessor.h b/src/libtorchaudio/accessor.h
index 2f763cf3c8..0fc23e978f 100644
--- a/src/libtorchaudio/accessor.h
+++ b/src/libtorchaudio/accessor.h
@@ -15,8 +15,8 @@ class Accessor {
   using tensor_type = typename std::conditional<IsConst, const Tensor&, Tensor&>::type;
 
   Accessor(tensor_type tensor) {
-    data = tensor.template data_ptr<T>();
-    for (int i = 0; i < k; i++) {
+    data = (T*)tensor.template data_ptr();
+    for (unsigned int i = 0; i < k; i++) {
       strides[i] = tensor.stride(i);
     }
   }
@@ -25,7 +25,7 @@ class Accessor {
     va_list args;
     va_start(args, k);
     int64_t ix = 0;
-    for (int i = 0; i < k; i++) {
+    for (unsigned int i = 0; i < k; i++) {
         ix += strides[i] * va_arg(args, int);
     }
     va_end(args);
@@ -37,7 +37,7 @@ class Accessor {
     va_list args;
     va_start(args, value);
     int64_t ix = 0;
-    for (int i = 0; i < k; i++) {
+    for (unsigned int i = 0; i < k; i++) {
         ix += strides[i] * va_arg(args, int);
     }
     va_end(args);
diff --git a/src/libtorchaudio/accessor_tests.cpp b/src/libtorchaudio/accessor_tests.cpp
index e371a6b1ad..62e9b23d5a 100644
--- a/src/libtorchaudio/accessor_tests.cpp
+++ b/src/libtorchaudio/accessor_tests.cpp
@@ -4,11 +4,15 @@
 #include <torch/csrc/stable/tensor.h>
 #include <torch/csrc/stable/library.h>
 
+namespace torchaudio {
+
+namespace accessor_tests {
+
 using namespace std;
 using torch::stable::Tensor;
 
 bool test_accessor(const Tensor tensor) {
-  int64_t* data_ptr = tensor.template data_ptr<int64_t>();
+  int64_t* data_ptr = (int64_t*)tensor.data_ptr();
   auto accessor = Accessor<3, int64_t>(tensor);
   for (unsigned int i = 0; i < tensor.size(0); i++) {
     for (unsigned int j = 0; j < tensor.size(1); j++) {
@@ -25,10 +29,18 @@ bool test_accessor(const Tensor tensor) {
 
 void boxed_test_accessor(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
   Tensor t1(to<AtenTensorHandle>(stack[0]));
-  auto result = compute(std::move(t1));
+  auto result = test_accessor(std::move(t1));
   stack[0] = from(result);
 }
 
-STABLE_TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
-  m.def("torchaudio::_test_accessor", &boxed_test_accessor);
+TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
+  m.def(
+      "_test_accessor(Tensor log_probs) -> bool");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(torchaudio, CPU, m) {
+  m.impl("torchaudio::_test_accessor", &boxed_test_accessor);
+}
+
+}
 }
diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 5ca95b2b3c..1320533ec9 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -6,6 +6,7 @@
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
 #include <torch/csrc/inductor/aoti_torch/utils.h>
 #include <libtorchaudio/accessor.h>
+#include <torch/headeronly/util/Half.h>
 
 
 using namespace std;
@@ -22,7 +23,7 @@ template <typename scalar_t, typename target_t>
 void forced_align_impl(
     const Tensor logProbs,
     const Tensor targets,
-    const Tensor blank,
+    target_t blank,
     Tensor paths) {
   const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
   const auto batchIndex =
@@ -143,15 +144,15 @@ std::tuple<Tensor, Tensor> compute(
   TORCH_CHECK(logProbs.is_cpu(), "log_probs must be a CPU tensor");
   TORCH_CHECK(targets.is_cpu(), "targets must be a CPU tensor");
   TORCH_CHECK(
-      logProbs.device() == targets.device(),
+      logProbs.get_device() == targets.get_device(),
       "log_probs and targets need to be on the same device");
   TORCH_CHECK(
-      logProbs.dtype() == torch::kFloat64 ||
-          logProbs.dtype() == torch::kFloat32 ||
-          logProbs.dtype() == torch::kFloat16,
+      logProbs.dtype() == aoti_torch_dtype_float64() ||
+          logProbs.dtype() == aoti_torch_dtype_float32() ||
+          logProbs.dtype() == aoti_torch_dtype_float16(),
       "log_probs must be float64, float32 or float16 (half) type");
   TORCH_CHECK(
-      targets.dtype() == torch::kInt32 || targets.dtype() == torch::kInt64,
+      targets.dtype() == aoti_torch_dtype_int32() || targets.dtype() == aoti_torch_dtype_int64(),
       "targets must be int32 or int64 type");
   TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous");
   TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous");
@@ -174,12 +175,13 @@ std::tuple<Tensor, Tensor> compute(
       blank >= 0 && blank < logProbs.size(-1),
       "blank must be within [0, num classes)");
 
-  TORCH_CHECK(
-      logProbs.size(1) == at::max(inputLengths).item().toInt(),
-      "input length mismatch");
-  TORCH_CHECK(
-      targets.size(1) == at::max(targetLengths).item().toInt(),
-      "target length mismatch");
+  // TODO: Requires port of `max` operator.
+  // TORCH_CHECK(
+  //     logProbs.size(1) == at::max(inputLengths).item().toInt(),
+  //     "input length mismatch");
+  // TORCH_CHECK(
+  //     targets.size(1) == at::max(targetLengths).item().toInt(),
+  //     "target length mismatch");
 
   const auto B = logProbs.size(0);
   const auto T = logProbs.size(1);
@@ -187,25 +189,27 @@ std::tuple<Tensor, Tensor> compute(
   int64_t paths_size[2] = {B, T};
   int64_t paths_stride[2] = {T, 1};
   AtenTensorHandle paths_h;
-  aoti_torch_empty_strided(1, paths_size, paths_stride, targets_dtype, targets_device, targets_device_index, &paths_h);
+  int32_t targets_device;
+  aoti_torch_get_device_type(targets.get(), &targets_device);
+  aoti_torch_empty_strided(1, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h);
   auto paths = Tensor(paths_h);
 
 
   if (targets.dtype() == aoti_torch_dtype_int64()) {
-    if (logProbs.scalar_type() == aoti_torch_dtype_float64()) {
-      forced_align_impl<float64, int64>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) {
-      forced_align_impl<float32, int64>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) {
-      forced_align_impl<float16, int64>(logProbs, targets, blank, paths);
+    if (logProbs.dtype() == aoti_torch_dtype_float64()) {
+      forced_align_impl<double, int64_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float32()) {
+      forced_align_impl<float, int64_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float16()) {
+      forced_align_impl<c10::Half, int64_t>(logProbs, targets, blank, paths);
     }
-  } else if (targets.scalar_type() == aoti_torch_dtype_int32()) {
-    if (logProbs.scalar_type() == aoti_torch_dtype_float64()) {
-      forced_align_impl<float64, int32>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) {
-      forced_align_impl<float32, int32>(logProbs, targets, blank, paths);
-    } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) {
-      forced_align_impl<float16, int32>(logProbs, targets, blank, paths);
+  } else if (targets.dtype() == aoti_torch_dtype_int32()) {
+    if (logProbs.dtype() == aoti_torch_dtype_float64()) {
+      forced_align_impl<double, int32_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float32()) {
+      forced_align_impl<float, int32_t>(logProbs, targets, blank, paths);
+    } else if (logProbs.dtype() == aoti_torch_dtype_float16()) {
+      forced_align_impl<c10::Half, int32_t>(logProbs, targets, blank, paths);
     }
   }
   return std::make_tuple(

From ced6124daa9b553f0c20768dde22e3a855961592 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Thu, 7 Aug 2025 14:54:57 +0000
Subject: [PATCH 13/18] Free alphas_a array

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index f4b61e272f..c2776948df 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -119,6 +119,7 @@ void forced_align_impl(
   }
   auto idx1 = (T - 1) % 2;
   auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2;
+  delete[] alphas_a;
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {
     auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2];

From 71ce212e98aef6b89299014180334de35de8e6c0 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Thu, 7 Aug 2025 14:58:30 +0000
Subject: [PATCH 14/18] Free backPtr_a

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 530d97abf9..5562500694 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -129,6 +129,7 @@ void forced_align_impl(
     paths_a[batchIndex][t] = lbl_idx;
     ltrIdx -= backPtr_a[t * S + ltrIdx]; // backPtr_a[t][ltrIdx]
   }
+  delete[] backPtr_a;
 }
 
 std::tuple<torch::Tensor, torch::Tensor> compute(

From 9629864f8b0fea3b74cbdff90ea3c8b38cb60ea5 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Thu, 7 Aug 2025 19:23:02 +0000
Subject: [PATCH 15/18] Fix merge conflict

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index e9b271ded9..3d4063da65 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -91,13 +91,8 @@ void forced_align_impl(
     }
     if (start == 0) {
       alphas_a[0][curIdxOffset] =
-<<<<<<< HEAD
           alphas_a[0][prevIdxOffset] + logProbs_a.index(batchIndex, t, blank);
-      backPtr_a[S * t] = 0;
-=======
-          alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank];
-      backPtr_a[S * t] = 0; // backPtr_a[t][0] = 0
->>>>>>> forced_align_backptr
+      backPtr_a[S * t] = 0;  // backPtr_a[t][0] = 0
       startloop += 1;
     }
 

From 847b72652f43062e766561f9ac7a2dcb9225fc91 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Thu, 7 Aug 2025 20:02:25 +0000
Subject: [PATCH 16/18] Correct dimensionality of path variable

---
 src/libtorchaudio/forced_align/cpu/compute.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 3d4063da65..105443f071 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -177,7 +177,7 @@ std::tuple<Tensor, Tensor> compute(
       blank >= 0 && blank < logProbs.size(-1),
       "blank must be within [0, num classes)");
 
-  // TODO: Requires port of `max` operator.
+  // TODO: Requires port of `max` and `item` operators.
   // TORCH_CHECK(
   //     logProbs.size(1) == at::max(inputLengths).item().toInt(),
   //     "input length mismatch");
@@ -193,7 +193,7 @@ std::tuple<Tensor, Tensor> compute(
   AtenTensorHandle paths_h;
   int32_t targets_device;
   aoti_torch_get_device_type(targets.get(), &targets_device);
-  aoti_torch_empty_strided(1, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h);
+  aoti_torch_empty_strided(2, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h);
   auto paths = Tensor(paths_h);
 
 

From 2663def7574802c262ef14bc4614458796cfa238 Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Fri, 8 Aug 2025 14:53:43 +0000
Subject: [PATCH 17/18] Use 1d indexing in original layout for alphas_a

---
 .../forced_align/cpu/compute.cpp              | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index c2776948df..0c08a346b0 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -27,10 +27,9 @@ void forced_align_impl(
   const auto L = targets.size(1);
   const auto S = 2 * L + 1;
 
-  auto alphas_a = new scalar_t[S][2]; // scalar_t is just logProbs.dtype()
-  for (int i = 0; i < S; i++) {
-    alphas_a[i][0] = kNegInfinity;
-    alphas_a[i][1] = kNegInfinity;
+  auto alphas_a = new scalar_t[2 * S]; // flattened [2][S] buffer; scalar_t is the C++ element type matching logProbs' dtype
+  for (int i = 0; i < 2 * S; i++) {
+    alphas_a[i] = kNegInfinity;
   }
 
   torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1);
@@ -56,7 +55,7 @@ void forced_align_impl(
   auto end = (S == 1) ? 1 : 2;
   for (auto i = start; i < end; i++) {
     auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
-    alphas_a[i][0] = logProbs_a[batchIndex][0][labelIdx];
+    alphas_a[i] = logProbs_a[batchIndex][0][labelIdx]; // alphas_a[0][i]
   }
   for (auto t = 1; t < T; t++) {
     if (T - t <= L + R) {
@@ -79,18 +78,18 @@ void forced_align_impl(
     auto curIdxOffset = t % 2;
     auto prevIdxOffset = (t - 1) % 2;
     for (auto j = 0; j < S; ++j) {
-      alphas_a[j][curIdxOffset] = -std::numeric_limits<scalar_t>::infinity();
+      alphas_a[curIdxOffset * S + j] = -std::numeric_limits<scalar_t>::infinity(); // alphas_a[curIdxOffset][j]
     }
     if (start == 0) {
-      alphas_a[0][curIdxOffset] =
-          alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank];
+      alphas_a[curIdxOffset * S] =
+          alphas_a[prevIdxOffset * S] + logProbs_a[batchIndex][t][blank];
       backPtr_a[t][0] = 0;
       startloop += 1;
     }
 
     for (auto i = startloop; i < end; i++) {
-      auto x0 = alphas_a[i][prevIdxOffset];
-      auto x1 = alphas_a[i - 1][prevIdxOffset];
+      auto x0 = alphas_a[prevIdxOffset * S + i]; // alphas_a[prevIdxOffset][i];
+      auto x1 = alphas_a[prevIdxOffset * S + i - 1]; // alphas_a[prevIdxOffset][i - 1];
       auto x2 = -std::numeric_limits<scalar_t>::infinity();
 
       auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2];
@@ -101,7 +100,7 @@ void forced_align_impl(
       // (i != 1) just ensures we don't access targets[i - 2] if its i < 2
       if (i % 2 != 0 && i != 1 &&
           targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) {
-        x2 = alphas_a[i - 2][prevIdxOffset];
+        x2 = alphas_a[prevIdxOffset * S + i - 2]; // alphas_a[prevIdxOffset][i - 2];
       }
       scalar_t result = 0.0;
       if (x2 > x1 && x2 > x0) {
@@ -114,11 +113,12 @@ void forced_align_impl(
         result = x0;
         backPtr_a[t][i] = 0;
       }
-      alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx];
+      alphas_a[curIdxOffset * S + i] = result + logProbs_a[batchIndex][t][labelIdx]; // alphas_a[curIdxOffset][i]
     }
   }
   auto idx1 = (T - 1) % 2;
-  auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2;
+  auto ltrIdx = alphas_a[S * idx1 + S - 1] >
+    alphas_a[S * idx1 + S - 2] ? S - 1 : S - 2; // alphas_a[idx1][S - 1], alphas_a[idx1][S - 2]
   delete[] alphas_a;
   // path stores the token index for each time step after force alignment.
   for (auto t = T - 1; t > -1; t--) {

From 86f75576d36c342140a46a589c65e48e52890a8e Mon Sep 17 00:00:00 2001
From: Sam Anklesaria <sanklesaria@openteams.com>
Date: Tue, 19 Aug 2025 21:42:28 +0000
Subject: [PATCH 18/18] Use c-style dtype API

---
 .../forced_align/cpu/compute.cpp              | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp
index 6314433786..2632941d69 100644
--- a/src/libtorchaudio/forced_align/cpu/compute.cpp
+++ b/src/libtorchaudio/forced_align/cpu/compute.cpp
@@ -150,13 +150,17 @@ std::tuple<Tensor, Tensor> compute(
   TORCH_CHECK(
       logProbs.get_device() == targets.get_device(),
       "log_probs and targets need to be on the same device");
+  int32_t logprobs_dtype;
+  aoti_torch_get_dtype(logProbs.get(), &logprobs_dtype);
   TORCH_CHECK(
-      logProbs.dtype() == aoti_torch_dtype_float64() ||
-          logProbs.dtype() == aoti_torch_dtype_float32() ||
-          logProbs.dtype() == aoti_torch_dtype_float16(),
+    logprobs_dtype == aoti_torch_dtype_float64() ||
+    logprobs_dtype == aoti_torch_dtype_float32() ||
+    logprobs_dtype == aoti_torch_dtype_float16(),
       "log_probs must be float64, float32 or float16 (half) type");
+  int32_t targets_dtype;
+  aoti_torch_get_dtype(targets.get(), &targets_dtype);
   TORCH_CHECK(
-      targets.dtype() == aoti_torch_dtype_int32() || targets.dtype() == aoti_torch_dtype_int64(),
+    targets_dtype == aoti_torch_dtype_int32() || targets_dtype == aoti_torch_dtype_int64(),
       "targets must be int32 or int64 type");
   TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous");
   TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous");
@@ -195,24 +199,24 @@ std::tuple<Tensor, Tensor> compute(
   AtenTensorHandle paths_h;
   int32_t targets_device;
   aoti_torch_get_device_type(targets.get(), &targets_device);
-  aoti_torch_empty_strided(2, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h);
+  aoti_torch_empty_strided(2, paths_size, paths_stride, targets_dtype, targets_device, targets.get_device(), &paths_h);
   auto paths = Tensor(paths_h);
 
 
-  if (targets.dtype() == aoti_torch_dtype_int64()) {
-    if (logProbs.dtype() == aoti_torch_dtype_float64()) {
+  if (targets_dtype == aoti_torch_dtype_int64()) {
+    if (logprobs_dtype == aoti_torch_dtype_float64()) {
       forced_align_impl<double, int64_t>(logProbs, targets, blank, paths);
-    } else if (logProbs.dtype() == aoti_torch_dtype_float32()) {
+    } else if (logprobs_dtype == aoti_torch_dtype_float32()) {
       forced_align_impl<float, int64_t>(logProbs, targets, blank, paths);
-    } else if (logProbs.dtype() == aoti_torch_dtype_float16()) {
+    } else if (logprobs_dtype == aoti_torch_dtype_float16()) {
       forced_align_impl<c10::Half, int64_t>(logProbs, targets, blank, paths);
     }
-  } else if (targets.dtype() == aoti_torch_dtype_int32()) {
-    if (logProbs.dtype() == aoti_torch_dtype_float64()) {
+  } else if (targets_dtype == aoti_torch_dtype_int32()) {
+    if (logprobs_dtype == aoti_torch_dtype_float64()) {
       forced_align_impl<double, int32_t>(logProbs, targets, blank, paths);
-    } else if (logProbs.dtype() == aoti_torch_dtype_float32()) {
+    } else if (logprobs_dtype == aoti_torch_dtype_float32()) {
       forced_align_impl<float, int32_t>(logProbs, targets, blank, paths);
-    } else if (logProbs.dtype() == aoti_torch_dtype_float16()) {
+    } else if (logprobs_dtype == aoti_torch_dtype_float16()) {
       forced_align_impl<c10::Half, int32_t>(logProbs, targets, blank, paths);
     }
   }