From 2062dc704ddf458eee325acd8617f36749d911b0 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 5 Aug 2025 16:37:57 +0000 Subject: [PATCH 01/18] Make alphas_a standard C array --- .../forced_align/cpu/compute.cpp | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 81f5f0a459..d4881318df 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -1,5 +1,9 @@ #include #include +#include +#include +#include +#include using namespace std; @@ -22,17 +26,17 @@ void forced_align_impl( const auto T = logProbs.size(1); const auto L = targets.size(1); const auto S = 2 * L + 1; - torch::Tensor alphas = torch::empty( - {2, S}, - torch::TensorOptions() - .device(logProbs.device()) - .dtype(logProbs.dtype())) - .fill_(kNegInfinity); + + auto alphas_a = new scalar_t[S][2]; + for (int i = 0; i < S; i++) { + alphas_a[i][0] = kNegInfinity; + alphas_a[i][1] = kNegInfinity; + } + torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1); auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); auto paths_a = paths.accessor(); - auto alphas_a = alphas.accessor(); auto backPtr_a = backPtr.accessor(); auto R = 0; for (auto i = 1; i < L; i++) { @@ -52,7 +56,7 @@ void forced_align_impl( auto end = (S == 1) ? 1 : 2; for (auto i = start; i < end; i++) { auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; - alphas_a[0][i] = logProbs_a[batchIndex][0][labelIdx]; + alphas_a[i][0] = logProbs_a[batchIndex][0][labelIdx]; } for (auto t = 1; t < T; t++) { if (T - t <= L + R) { @@ -75,18 +79,18 @@ void forced_align_impl( auto curIdxOffset = t % 2; auto prevIdxOffset = (t - 1) % 2; for (auto j = 0; j < S; ++j) { - alphas_a[curIdxOffset][j] = -std::numeric_limits::infinity(); + alphas_a[j][curIdxOffset] = -std::numeric_limits::infinity(); } if (start == 0) { - alphas_a[curIdxOffset][0] = - alphas_a[prevIdxOffset][0] + logProbs_a[batchIndex][t][blank]; + alphas_a[0][curIdxOffset] = + alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank]; backPtr_a[t][0] = 0; startloop += 1; } for (auto i = startloop; i < end; i++) { - auto x0 = alphas_a[prevIdxOffset][i]; - auto x1 = alphas_a[prevIdxOffset][i - 1]; + auto x0 = alphas_a[i][prevIdxOffset]; + auto x1 = alphas_a[i - 1][prevIdxOffset]; auto x2 = -std::numeric_limits::infinity(); auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; @@ -97,7 +101,7 @@ void forced_align_impl( // (i != 1) just ensures we don't access targets[i - 2] if its i < 2 if (i % 2 != 0 && i != 1 && targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) { - x2 = alphas_a[prevIdxOffset][i - 2]; + x2 = alphas_a[i - 2][prevIdxOffset]; } scalar_t result = 0.0; if (x2 > x1 && x2 > x0) { @@ -110,11 +114,11 @@ void forced_align_impl( result = x0; backPtr_a[t][i] = 0; } - alphas_a[curIdxOffset][i] = result + logProbs_a[batchIndex][t][labelIdx]; + alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx]; } } auto idx1 = (T - 1) % 2; - auto ltrIdx = alphas_a[idx1][S - 1] > alphas_a[idx1][S - 2] ? S - 1 : S - 2; + auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2; // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; From e70113c879ee86502d5a49cb50a2a8093a8dd029 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 5 Aug 2025 17:59:19 +0000 Subject: [PATCH 02/18] Convert backptr to standard array --- src/libtorchaudio/forced_align/cpu/compute.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index d4881318df..6dc8bb93ab 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -33,11 +33,14 @@ void forced_align_impl( alphas_a[i][1] = kNegInfinity; } - torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1); + auto backPtr_a = new int8_t[T * S]; + for (int i = 0; i < T * S; i++) { + backPtr_a[i] = -1; + } + auto logProbs_a = logProbs.accessor(); auto targets_a = targets.accessor(); auto paths_a = paths.accessor(); - auto backPtr_a = backPtr.accessor(); auto R = 0; for (auto i = 1; i < L; i++) { if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) { @@ -84,7 +87,7 @@ void forced_align_impl( if (start == 0) { alphas_a[0][curIdxOffset] = alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank]; - backPtr_a[t][0] = 0; + backPtr_a[S * t] = 0; startloop += 1; } @@ -106,13 +109,13 @@ void forced_align_impl( scalar_t result = 0.0; if (x2 > x1 && x2 > x0) { result = x2; - backPtr_a[t][i] = 2; + backPtr_a[t * S + i] = 2; } else if (x1 > x0 && x1 > x2) { result = x1; - backPtr_a[t][i] = 1; + backPtr_a[t * S + i] = 1; } else { result = x0; - backPtr_a[t][i] = 0; + backPtr_a[t * S + i] = 0; } alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx]; } @@ -123,7 +126,7 @@ void forced_align_impl( for (auto t = T - 1; t > -1; t--) { auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; - ltrIdx -= backPtr_a[t][ltrIdx]; + ltrIdx -= backPtr_a[t * S + ltrIdx]; } } From 4039399cca52ffdf30148e8e2bfac645dffc641b Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 5 Aug 2025 19:10:34 +0000 Subject: [PATCH 03/18] Create Accessor class --- .../forced_align/cpu/compute.cpp | 74 +++++++++++++++---- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 6dc8bb93ab..e641525ddd 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -4,12 +4,58 @@ #include #include #include +#include +#include + using namespace std; namespace torchaudio { namespace alignment { namespace cpu { + +template +class Accessor { + int64_t shape[k]; + T *data; + +public: + Accessor(const torch::Tensor& tensor) { + data = tensor.data_ptr(); + for (int i = 0; i < k; i++) { + shape[i] = tensor.size(i); + } + } + + T index(...) { + va_list args; + va_start(args, k); + int64_t ix = 0; + for (int i = 0; i < k; i++) { + if (i == k - 1) + ix += va_arg(args, int); + else + ix += shape[i+1] * va_arg(args, int); + } + va_end(args); + return data[ix]; + } + + // void set_index(T val,...) { + // va_list args; + // va_start(args, k); + // int64_t ix = 0; + // for (int i = 0; i < k; i++) { + // if (i == k - 1) + // ix += va_arg(args, int); + // else + // ix += shape[i+1] * va_arg(args, int); + // } + // va_end(args); + // data[ix] = val; + // } +}; + // Inspired from // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp template @@ -38,12 +84,12 @@ void forced_align_impl( backPtr_a[i] = -1; } - auto logProbs_a = logProbs.accessor(); - auto targets_a = targets.accessor(); + auto logProbs_a = Accessor<3, scalar_t>(logProbs); + auto targets_a = Accessor<2, target_t>(targets); auto paths_a = paths.accessor(); auto R = 0; for (auto i = 1; i < L; i++) { - if (targets_a[batchIndex][i] == targets_a[batchIndex][i - 1]) { + if (targets_a.index(batchIndex, i) == targets_a.index(batchIndex, i - 1)) { ++R; } } @@ -58,22 +104,22 @@ void forced_align_impl( auto start = T - (L + R) > 0 ? 0 : 1; auto end = (S == 1) ? 1 : 2; for (auto i = start; i < end; i++) { - auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; - alphas_a[i][0] = logProbs_a[batchIndex][0][labelIdx]; + auto labelIdx = (i % 2 == 0) ? blank : targets_a.index(batchIndex, i / 2); + alphas_a[i][0] = logProbs_a.index(batchIndex,0,labelIdx); } for (auto t = 1; t < T; t++) { if (T - t <= L + R) { if ((start % 2 == 1) && - targets_a[batchIndex][start / 2] != - targets_a[batchIndex][start / 2 + 1]) { + targets_a.index(batchIndex, start / 2) != + targets_a.index(batchIndex, start / 2 + 1)) { start = start + 1; } start = start + 1; } if (t <= L + R) { if (end % 2 == 0 && end < 2 * L && - targets_a[batchIndex][end / 2 - 1] != - targets_a[batchIndex][end / 2]) { + targets_a.index(batchIndex, end / 2 - 1) != + targets_a.index(batchIndex, end / 2)) { end = end + 1; } end = end + 1; @@ -86,7 +132,7 @@ void forced_align_impl( } if (start == 0) { alphas_a[0][curIdxOffset] = - alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank]; + alphas_a[0][prevIdxOffset] + logProbs_a.index(batchIndex, t, blank); backPtr_a[S * t] = 0; startloop += 1; } @@ -96,14 +142,14 @@ void forced_align_impl( auto x1 = alphas_a[i - 1][prevIdxOffset]; auto x2 = -std::numeric_limits::infinity(); - auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; + auto labelIdx = (i % 2 == 0) ? blank : targets_a.index(batchIndex, i / 2); // In CTC, the optimal path may optionally chose to skip a blank label. // x2 represents skipping a letter, and can only happen if we're not // currently on a blank_label, and we're not on a repeat letter // (i != 1) just ensures we don't access targets[i - 2] if its i < 2 if (i % 2 != 0 && i != 1 && - targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) { + targets_a.index(batchIndex, i / 2) != targets_a.index(batchIndex, i / 2 - 1)) { x2 = alphas_a[i - 2][prevIdxOffset]; } scalar_t result = 0.0; @@ -117,14 +163,14 @@ void forced_align_impl( result = x0; backPtr_a[t * S + i] = 0; } - alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx]; + alphas_a[i][curIdxOffset] = result + logProbs_a.index(batchIndex, t, labelIdx); } } auto idx1 = (T - 1) % 2; auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2; // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { - auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; + auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a.index(batchIndex, ltrIdx / 2); paths_a[batchIndex][t] = lbl_idx; ltrIdx -= backPtr_a[t * S + ltrIdx]; } From b733629b0990232a57991dded17e71ad395df009 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 5 Aug 2025 19:16:29 +0000 Subject: [PATCH 04/18] Add MutAccessor --- .../forced_align/cpu/compute.cpp | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index e641525ddd..b0d35c334b 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -40,22 +40,38 @@ class Accessor { va_end(args); return data[ix]; } +}; + + +template +class MutAccessor { + int64_t shape[k]; + T *data; + +public: + MutAccessor(torch::Tensor& tensor) { + data = tensor.data_ptr(); + for (int i = 0; i < k; i++) { + shape[i] = tensor.size(i); + } + } - // void set_index(T val,...) { - // va_list args; - // va_start(args, k); - // int64_t ix = 0; - // for (int i = 0; i < k; i++) { - // if (i == k - 1) - // ix += va_arg(args, int); - // else - // ix += shape[i+1] * va_arg(args, int); - // } - // va_end(args); - // data[ix] = val; - // } + void set_index(T value,...) { + va_list args; + va_start(args, k); + int64_t ix = 0; + for (int i = 0; i < k; i++) { + if (i == k - 1) + ix += va_arg(args, int); + else + ix += shape[i+1] * va_arg(args, int); + } + va_end(args); + data[ix] = value; + } }; + // Inspired from // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp template @@ -86,7 +102,7 @@ void forced_align_impl( auto logProbs_a = Accessor<3, scalar_t>(logProbs); auto targets_a = Accessor<2, target_t>(targets); - auto paths_a = paths.accessor(); + auto paths_a = MutAccessor<2, target_t>(paths); auto R = 0; for (auto i = 1; i < L; i++) { if (targets_a.index(batchIndex, i) == targets_a.index(batchIndex, i - 1)) { @@ -171,7 +187,7 @@ void forced_align_impl( // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a.index(batchIndex, ltrIdx / 2); - paths_a[batchIndex][t] = lbl_idx; + paths_a.set_index(lbl_idx, batchIndex, t); ltrIdx -= backPtr_a[t * S + ltrIdx]; } } From 9beb34a931defb7564e32f285753d36465ce8f3d Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 5 Aug 2025 21:28:27 +0000 Subject: [PATCH 05/18] Fix multidimensional indexing bug --- .../forced_align/cpu/compute.cpp | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index b0d35c334b..0579b5cf8c 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -14,17 +14,27 @@ namespace torchaudio { namespace alignment { namespace cpu { +// Compute strides for row-major indexing +template +void reverse_cumprod(int64_t (&strides)[k]) { + // Convert dimensions to strides: stride[i] = product of dimensions [i+1..k-1] + for (int i = k - 2; i >= 0; i--) { + strides[i] = strides[i] * strides[i + 1]; + } +} + template class Accessor { - int64_t shape[k]; + int64_t strides[k-1]; T *data; public: Accessor(const torch::Tensor& tensor) { data = tensor.data_ptr(); - for (int i = 0; i < k; i++) { - shape[i] = tensor.size(i); + for (int i = 1; i < k; i++) { + strides[i-1] = tensor.size(i); } + reverse_cumprod(strides); } T index(...) { @@ -35,43 +45,42 @@ class Accessor { if (i == k - 1) ix += va_arg(args, int); else - ix += shape[i+1] * va_arg(args, int); + ix += strides[i] * va_arg(args, int); } va_end(args); return data[ix]; } }; - template class MutAccessor { - int64_t shape[k]; + int64_t strides[k-1]; T *data; public: - MutAccessor(torch::Tensor& tensor) { + MutAccessor(torch::Tensor& tensor) { data = tensor.data_ptr(); - for (int i = 0; i < k; i++) { - shape[i] = tensor.size(i); + for (int i = 1; i < k; i++) { + strides[i-1] = tensor.size(i); } + reverse_cumprod(strides); } - void set_index(T value,...) { + void set_index(T value, ...) { va_list args; - va_start(args, k); + va_start(args, value); int64_t ix = 0; for (int i = 0; i < k; i++) { if (i == k - 1) ix += va_arg(args, int); else - ix += shape[i+1] * va_arg(args, int); + ix += strides[i] * va_arg(args, int); } va_end(args); data[ix] = value; } }; - // Inspired from // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp template From 11d1e217f0fba3c70d0f6846d829c5d6bd8659ce Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 5 Aug 2025 22:21:33 +0000 Subject: [PATCH 06/18] Use strides rather than computing standard strides from dims --- .../forced_align/cpu/compute.cpp | 29 ++++--------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 0579b5cf8c..3ba23c4797 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -14,27 +14,17 @@ namespace torchaudio { namespace alignment { namespace cpu { -// Compute strides for row-major indexing -template -void reverse_cumprod(int64_t (&strides)[k]) { - // Convert dimensions to strides: stride[i] = product of dimensions [i+1..k-1] - for (int i = k - 2; i >= 0; i--) { - strides[i] = strides[i] * strides[i + 1]; - } -} - template class Accessor { - int64_t strides[k-1]; + int64_t strides[k]; T *data; public: Accessor(const torch::Tensor& tensor) { data = tensor.data_ptr(); - for (int i = 1; i < k; i++) { - strides[i-1] = tensor.size(i); + for (int i = 0; i < k; i++) { + strides[i] = tensor.stride(i); } - reverse_cumprod(strides); } T index(...) { @@ -42,9 +32,6 @@ class Accessor { va_start(args, k); int64_t ix = 0; for (int i = 0; i < k; i++) { - if (i == k - 1) - ix += va_arg(args, int); - else ix += strides[i] * va_arg(args, int); } va_end(args); @@ -54,16 +41,15 @@ class Accessor { template class MutAccessor { - int64_t strides[k-1]; + int64_t strides[k]; T *data; public: MutAccessor(torch::Tensor& tensor) { data = tensor.data_ptr(); - for (int i = 1; i < k; i++) { - strides[i-1] = tensor.size(i); + for (int i = 0; i < k; i++) { + strides[i] = tensor.stride(i); } - reverse_cumprod(strides); } void set_index(T value, ...) { @@ -71,9 +57,6 @@ class MutAccessor { va_start(args, value); int64_t ix = 0; for (int i = 0; i < k; i++) { - if (i == k - 1) - ix += va_arg(args, int); - else ix += strides[i] * va_arg(args, int); } va_end(args); From b47c053919739ba5a48c214fceaf161f7e007115 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 6 Aug 2025 16:04:24 +0000 Subject: [PATCH 07/18] Merge Accessor and MutAccessor --- .../forced_align/cpu/compute.cpp | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 3ba23c4797..8a5389bff5 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -6,6 +6,7 @@ #include #include #include +#include using namespace std; @@ -14,14 +15,16 @@ namespace torchaudio { namespace alignment { namespace cpu { -template +template class Accessor { int64_t strides[k]; T *data; public: - Accessor(const torch::Tensor& tensor) { - data = tensor.data_ptr(); + using tensor_type = typename std::conditional::type; + + Accessor(tensor_type tensor) { + data = tensor.template data_ptr(); for (int i = 0; i < k; i++) { strides[i] = tensor.stride(i); } @@ -37,22 +40,9 @@ class Accessor { va_end(args); return data[ix]; } -}; - -template -class MutAccessor { - int64_t strides[k]; - T *data; - -public: - MutAccessor(torch::Tensor& tensor) { - data = tensor.data_ptr(); - for (int i = 0; i < k; i++) { - strides[i] = tensor.stride(i); - } - } - void set_index(T value, ...) { + template + typename std::enable_if::type set_index(T value, ...) { va_list args; va_start(args, value); int64_t ix = 0; @@ -92,9 +82,9 @@ void forced_align_impl( backPtr_a[i] = -1; } - auto logProbs_a = Accessor<3, scalar_t>(logProbs); - auto targets_a = Accessor<2, target_t>(targets); - auto paths_a = MutAccessor<2, target_t>(paths); + auto logProbs_a = Accessor<3, scalar_t, true>(logProbs); + auto targets_a = Accessor<2, target_t, true>(targets); + auto paths_a = Accessor<2, target_t, false>(paths); auto R = 0; for (auto i = 1; i < L; i++) { if (targets_a.index(batchIndex, i) == targets_a.index(batchIndex, i - 1)) { From 7a94b04e7c584a171934b62367bc2763ae1deace Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 6 Aug 2025 16:45:56 +0000 Subject: [PATCH 08/18] Move Accessor to its own file and add tests --- src/libtorchaudio/CMakeLists.txt | 1 + src/libtorchaudio/accessor.h | 44 +++++++++++++++++++ src/libtorchaudio/accessor_tests.cpp | 25 +++++++++++ .../forced_align/cpu/compute.cpp | 40 +---------------- test/torchaudio_unittest/accessor_test.py | 7 +++ 5 files changed, 78 insertions(+), 39 deletions(-) create mode 100644 src/libtorchaudio/accessor.h create mode 100644 src/libtorchaudio/accessor_tests.cpp create mode 100644 test/torchaudio_unittest/accessor_test.py diff --git a/src/libtorchaudio/CMakeLists.txt b/src/libtorchaudio/CMakeLists.txt index 85bc227cd6..20ad792b32 100644 --- a/src/libtorchaudio/CMakeLists.txt +++ b/src/libtorchaudio/CMakeLists.txt @@ -6,6 +6,7 @@ set( lfilter.cpp overdrive.cpp utils.cpp + accessor_tests.cpp ) set( diff --git a/src/libtorchaudio/accessor.h b/src/libtorchaudio/accessor.h new file mode 100644 index 0000000000..ed2b9f6257 --- /dev/null +++ b/src/libtorchaudio/accessor.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include + +template +class Accessor { + int64_t strides[k]; + T *data; + +public: + using tensor_type = typename std::conditional::type; + + Accessor(tensor_type tensor) { + data = tensor.template data_ptr(); + for (int i = 0; i < k; i++) { + strides[i] = tensor.stride(i); + } + } + + T index(...) { + va_list args; + va_start(args, k); + int64_t ix = 0; + for (int i = 0; i < k; i++) { + ix += strides[i] * va_arg(args, int); + } + va_end(args); + return data[ix]; + } + + template + typename std::enable_if::type set_index(T value, ...) { + va_list args; + va_start(args, value); + int64_t ix = 0; + for (int i = 0; i < k; i++) { + ix += strides[i] * va_arg(args, int); + } + va_end(args); + data[ix] = value; + } +}; diff --git a/src/libtorchaudio/accessor_tests.cpp b/src/libtorchaudio/accessor_tests.cpp new file mode 100644 index 0000000000..ca6e48b193 --- /dev/null +++ b/src/libtorchaudio/accessor_tests.cpp @@ -0,0 +1,25 @@ +#include +#include +#include + +using namespace std; + +bool test_accessor(const torch::Tensor& tensor) { + int64_t* data_ptr = tensor.template data_ptr(); + auto accessor = Accessor<3, int64_t>(tensor); + for (int i = 0; i < tensor.size(0); i++) { + for (int j = 0; j < tensor.size(1); j++) { + for (int k = 0; k < tensor.size(2); k++) { + auto check = *(data_ptr++) == accessor.index(i, j, k); + if (!check) { + return false; + } + } + } + } + return true; +} + +TORCH_LIBRARY_FRAGMENT(torchaudio, m) { + m.def("torchaudio::_test_accessor", &test_accessor); +} diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 8a5389bff5..c0380f25ad 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -5,8 +5,7 @@ #include #include #include -#include -#include +#include using namespace std; @@ -15,44 +14,7 @@ namespace torchaudio { namespace alignment { namespace cpu { -template -class Accessor { - int64_t strides[k]; - T *data; - -public: - using tensor_type = typename std::conditional::type; - - Accessor(tensor_type tensor) { - data = tensor.template data_ptr(); - for (int i = 0; i < k; i++) { - strides[i] = tensor.stride(i); - } - } - T index(...) { - va_list args; - va_start(args, k); - int64_t ix = 0; - for (int i = 0; i < k; i++) { - ix += strides[i] * va_arg(args, int); - } - va_end(args); - return data[ix]; - } - - template - typename std::enable_if::type set_index(T value, ...) { - va_list args; - va_start(args, value); - int64_t ix = 0; - for (int i = 0; i < k; i++) { - ix += strides[i] * va_arg(args, int); - } - va_end(args); - data[ix] = value; - } -}; // Inspired from // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp diff --git a/test/torchaudio_unittest/accessor_test.py b/test/torchaudio_unittest/accessor_test.py new file mode 100644 index 0000000000..db14258dc6 --- /dev/null +++ b/test/torchaudio_unittest/accessor_test.py @@ -0,0 +1,7 @@ +import torch +from torchaudio._extension import _IS_TORCHAUDIO_EXT_AVAILABLE + +if _IS_TORCHAUDIO_EXT_AVAILABLE: + def test_accessor(): + tensor = torch.randint(1000, (5,4,3)) + assert torch.ops.torchaudio._test_accessor(tensor) From 75d246a646a76bd85f9613d91df23a2e3db08752 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 6 Aug 2025 16:51:41 +0000 Subject: [PATCH 09/18] Add comment about original indexing --- src/libtorchaudio/forced_align/cpu/compute.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 6dc8bb93ab..3b56aa000a 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -87,7 +87,7 @@ void forced_align_impl( if (start == 0) { alphas_a[0][curIdxOffset] = alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank]; - backPtr_a[S * t] = 0; + backPtr_a[S * t] = 0; // backPtr_a[t][0] = 0 startloop += 1; } @@ -109,13 +109,13 @@ void forced_align_impl( scalar_t result = 0.0; if (x2 > x1 && x2 > x0) { result = x2; - backPtr_a[t * S + i] = 2; + backPtr_a[t * S + i] = 2; // backPtr_a[t][i] = 2 } else if (x1 > x0 && x1 > x2) { result = x1; - backPtr_a[t * S + i] = 1; + backPtr_a[t * S + i] = 1; // backPtr_a[t][i] = 1 } else { result = x0; - backPtr_a[t * S + i] = 0; + backPtr_a[t * S + i] = 0; // backPtr_a[t][i] = 0 } alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx]; } @@ -126,7 +126,7 @@ void forced_align_impl( for (auto t = T - 1; t > -1; t--) { auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; paths_a[batchIndex][t] = lbl_idx; - ltrIdx -= backPtr_a[t * S + ltrIdx]; + ltrIdx -= backPtr_a[t * S + ltrIdx]; // backPtr_a[t][ltrIdx] } } From 30ed519b9af566c063c64e74603a9159e9728e26 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 6 Aug 2025 17:17:23 +0000 Subject: [PATCH 10/18] Add requested comment about scalar_t --- src/libtorchaudio/forced_align/cpu/compute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index d4881318df..f4b61e272f 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -27,7 +27,7 @@ void forced_align_impl( const auto L = targets.size(1); const auto S = 2 * L + 1; - auto alphas_a = new scalar_t[S][2]; + auto alphas_a = new scalar_t[S][2]; // scalar_t is just logProbs.dtype() for (int i = 0; i < S; i++) { alphas_a[i][0] = kNegInfinity; alphas_a[i][1] = kNegInfinity; From be13f647d7b492787fdbd71a0ffc92a4459fd013 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 6 Aug 2025 18:53:06 +0000 Subject: [PATCH 11/18] WIP --- src/libtorchaudio/accessor.h | 6 +- src/libtorchaudio/accessor_tests.cpp | 21 +++-- .../forced_align/cpu/compute.cpp | 79 ++++++++++++------- 3 files changed, 70 insertions(+), 36 deletions(-) diff --git a/src/libtorchaudio/accessor.h b/src/libtorchaudio/accessor.h index ed2b9f6257..2f763cf3c8 100644 --- a/src/libtorchaudio/accessor.h +++ b/src/libtorchaudio/accessor.h @@ -1,16 +1,18 @@ #pragma once -#include +#include #include #include +using torch::stable::Tensor; + template class Accessor { int64_t strides[k]; T *data; public: - using tensor_type = typename std::conditional::type; + using tensor_type = typename std::conditional::type; Accessor(tensor_type tensor) { data = tensor.template data_ptr(); diff --git a/src/libtorchaudio/accessor_tests.cpp b/src/libtorchaudio/accessor_tests.cpp index ca6e48b193..e371a6b1ad 100644 --- a/src/libtorchaudio/accessor_tests.cpp +++ b/src/libtorchaudio/accessor_tests.cpp @@ -1,15 +1,18 @@ #include #include #include +#include +#include using namespace std; +using torch::stable::Tensor; -bool test_accessor(const torch::Tensor& tensor) { +bool test_accessor(const Tensor tensor) { int64_t* data_ptr = tensor.template data_ptr(); auto accessor = Accessor<3, int64_t>(tensor); - for (int i = 0; i < tensor.size(0); i++) { - for (int j = 0; j < tensor.size(1); j++) { - for (int k = 0; k < tensor.size(2); k++) { + for (unsigned int i = 0; i < tensor.size(0); i++) { + for (unsigned int j = 0; j < tensor.size(1); j++) { + for (unsigned int k = 0; k < tensor.size(2); k++) { auto check = *(data_ptr++) == accessor.index(i, j, k); if (!check) { return false; @@ -20,6 +23,12 @@ bool test_accessor(const torch::Tensor& tensor) { return true; } -TORCH_LIBRARY_FRAGMENT(torchaudio, m) { - m.def("torchaudio::_test_accessor", &test_accessor); +void boxed_test_accessor(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) { + Tensor t1(to(stack[0])); + auto result = compute(std::move(t1)); + stack[0] = from(result); +} + +STABLE_TORCH_LIBRARY_FRAGMENT(torchaudio, m) { + m.def("torchaudio::_test_accessor", &boxed_test_accessor); } diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index c0380f25ad..207af046a1 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -14,19 +14,17 @@ namespace torchaudio { namespace alignment { namespace cpu { - +using torch::stable::Tensor; // Inspired from // https://github.com/flashlight/sequence/blob/main/flashlight/lib/sequence/criterion/cpu/ConnectionistTemporalClassificationCriterion.cpp -template +template void forced_align_impl( - const torch::Tensor& logProbs, - const torch::Tensor& targets, - const int64_t blank, - torch::Tensor& paths) { + const Tensor logProbs, + const Tensor targets, + const Tensor blank, + Tensor paths) { const scalar_t kNegInfinity = -std::numeric_limits::infinity(); - using target_t = typename std:: - conditional::type; const auto batchIndex = 0; // TODO: support batch version and use the real batch index const auto T = logProbs.size(1); @@ -136,11 +134,11 @@ void forced_align_impl( } } -std::tuple compute( - const torch::Tensor& logProbs, - const torch::Tensor& targets, - const torch::Tensor& inputLengths, - const torch::Tensor& targetLengths, +std::tuple compute( + const Tensor& logProbs, + const Tensor& targets, + const Tensor& inputLengths, + const Tensor& targetLengths, const int64_t blank) { TORCH_CHECK(logProbs.is_cpu(), "log_probs must be a CPU tensor"); TORCH_CHECK(targets.is_cpu(), "targets must be a CPU tensor"); @@ -185,19 +183,31 @@ std::tuple compute( const auto B = logProbs.size(0); const auto T = logProbs.size(1); - auto paths = torch::zeros( - {B, T}, - torch::TensorOptions().device(targets.device()).dtype(targets.dtype())); - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - logProbs.scalar_type(), "forced_align_impl", [&] { - if (targets.scalar_type() == torch::kInt64) { - forced_align_impl( - logProbs, targets, blank, paths); - } else { - forced_align_impl( - logProbs, targets, blank, paths); - } - }); + + int64_t paths_size[2] = {B, T}; + int64_t paths_stride[2] = {T, 1}; + AtenTensorHandle paths_h; + aoti_torch_empty_strided(1, paths_size, paths_stride, targets_dtype, targets_device, targets_device_index, &paths_h); + auto paths = Tensor(paths_h); + + + if (targets.scalar_type() == aoti_torch_dtype_int64()) { + if (logProbs.scalar_type() == aoti_torch_dtype_float64()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) { + forced_align_impl(logProbs, targets, blank, paths); + } + } else if (targets.scalar_type() == aoti_torch_dtype_int32()) { + if (logProbs.scalar_type() == aoti_torch_dtype_float64()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) { + forced_align_impl(logProbs, targets, blank, paths); + } + } return std::make_tuple( paths, logProbs.index( @@ -207,8 +217,21 @@ std::tuple compute( paths.index({0})})); } -TORCH_LIBRARY_IMPL(torchaudio, CPU, m) { - m.impl("forced_align", &compute); + +void boxed_compute(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) { + Tensor t1(to(stack[0])); + Tensor t2(to(stack[1])); + Tensor t3(to(stack[2])); + Tensor t4(to(stack[3])); + int64_t blank = to(stack[4]); + auto result = compute( + std::move(t1), std::move(t2), std::move(t3), std::move(t4), blank); + stack[0] = from(std::get<0>(result)); + stack[1] = from(std::get<1>(result)); +} + +STABLE_TORCH_LIBRARY_IMPL(torchaudio, CPU, m) { + m.impl("forced_align", &boxed_compute); } } // namespace cpu From 77fd1ad5926d91cd48bc1099f71e3d86f510c8c7 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 6 Aug 2025 21:29:11 +0000 Subject: [PATCH 12/18] Use stable tensors throughout forced_align code --- src/libtorchaudio/accessor.h | 8 +-- src/libtorchaudio/accessor_tests.cpp | 20 +++++-- .../forced_align/cpu/compute.cpp | 56 ++++++++++--------- 3 files changed, 50 insertions(+), 34 deletions(-) diff --git a/src/libtorchaudio/accessor.h b/src/libtorchaudio/accessor.h index 2f763cf3c8..0fc23e978f 100644 --- a/src/libtorchaudio/accessor.h +++ b/src/libtorchaudio/accessor.h @@ -15,8 +15,8 @@ class Accessor { using tensor_type = typename std::conditional::type; Accessor(tensor_type tensor) { - data = tensor.template data_ptr(); - for (int i = 0; i < k; i++) { + data = (T*)tensor.template data_ptr(); + for (unsigned int i = 0; i < k; i++) { strides[i] = tensor.stride(i); } } @@ -25,7 +25,7 @@ class Accessor { va_list args; va_start(args, k); int64_t ix = 0; - for (int i = 0; i < k; i++) { + for (unsigned int i = 0; i < k; i++) { ix += strides[i] * va_arg(args, int); } va_end(args); @@ -37,7 +37,7 @@ class Accessor { va_list args; va_start(args, value); int64_t ix = 0; - for (int i = 0; i < k; i++) { + for (unsigned int i = 0; i < k; i++) { ix += strides[i] * va_arg(args, int); } va_end(args); diff --git a/src/libtorchaudio/accessor_tests.cpp b/src/libtorchaudio/accessor_tests.cpp index e371a6b1ad..62e9b23d5a 100644 --- a/src/libtorchaudio/accessor_tests.cpp +++ b/src/libtorchaudio/accessor_tests.cpp @@ -4,11 +4,15 @@ #include #include +namespace torchaudio { + +namespace accessor_tests { + using namespace std; using torch::stable::Tensor; bool test_accessor(const Tensor tensor) { - int64_t* data_ptr = tensor.template data_ptr(); + int64_t* data_ptr = (int64_t*)tensor.data_ptr(); auto accessor = Accessor<3, int64_t>(tensor); for (unsigned int i = 0; i < tensor.size(0); i++) { for (unsigned int j = 0; j < tensor.size(1); j++) { @@ -25,10 +29,18 @@ bool test_accessor(const Tensor tensor) { void boxed_test_accessor(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) { Tensor t1(to(stack[0])); - auto result = compute(std::move(t1)); + auto result = test_accessor(std::move(t1)); stack[0] = from(result); } -STABLE_TORCH_LIBRARY_FRAGMENT(torchaudio, m) { - m.def("torchaudio::_test_accessor", &boxed_test_accessor); +TORCH_LIBRARY_FRAGMENT(torchaudio, m) { + m.def( + "_test_accessor(Tensor log_probs) -> bool"); +} + +STABLE_TORCH_LIBRARY_IMPL(torchaudio, CPU, m) { + m.impl("torchaudio::_test_accessor", &boxed_test_accessor); +} + +} } diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 5ca95b2b3c..1320533ec9 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -6,6 +6,7 @@ #include #include #include +#include using namespace std; @@ -22,7 +23,7 @@ template void forced_align_impl( const Tensor logProbs, const Tensor targets, - const Tensor blank, + target_t blank, Tensor paths) { const scalar_t kNegInfinity = -std::numeric_limits::infinity(); const auto batchIndex = @@ -143,15 +144,15 @@ std::tuple compute( TORCH_CHECK(logProbs.is_cpu(), "log_probs must be a CPU tensor"); TORCH_CHECK(targets.is_cpu(), "targets must be a CPU tensor"); TORCH_CHECK( - logProbs.device() == targets.device(), + logProbs.get_device() == targets.get_device(), "log_probs and targets need to be on the same device"); TORCH_CHECK( - logProbs.dtype() == torch::kFloat64 || - logProbs.dtype() == torch::kFloat32 || - logProbs.dtype() == torch::kFloat16, + logProbs.dtype() == aoti_torch_dtype_float64() || + logProbs.dtype() == aoti_torch_dtype_float32() || + logProbs.dtype() == aoti_torch_dtype_float16(), "log_probs must be float64, float32 or float16 (half) type"); TORCH_CHECK( - targets.dtype() == torch::kInt32 || targets.dtype() == torch::kInt64, + targets.dtype() == aoti_torch_dtype_int32() || targets.dtype() == aoti_torch_dtype_int64(), "targets must be int32 or int64 type"); TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous"); TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous"); @@ -174,12 +175,13 @@ std::tuple compute( blank >= 0 && blank < logProbs.size(-1), "blank must be within [0, num classes)"); - TORCH_CHECK( - logProbs.size(1) == at::max(inputLengths).item().toInt(), - "input length mismatch"); - TORCH_CHECK( - targets.size(1) == at::max(targetLengths).item().toInt(), - "target length mismatch"); + // TODO: Requires port of `max` operator. + // TORCH_CHECK( + // logProbs.size(1) == at::max(inputLengths).item().toInt(), + // "input length mismatch"); + // TORCH_CHECK( + // targets.size(1) == at::max(targetLengths).item().toInt(), + // "target length mismatch"); const auto B = logProbs.size(0); const auto T = logProbs.size(1); @@ -187,25 +189,27 @@ std::tuple compute( int64_t paths_size[2] = {B, T}; int64_t paths_stride[2] = {T, 1}; AtenTensorHandle paths_h; - aoti_torch_empty_strided(1, paths_size, paths_stride, targets_dtype, targets_device, targets_device_index, &paths_h); + int32_t targets_device; + aoti_torch_get_device_type(targets.get(), &targets_device); + aoti_torch_empty_strided(1, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h); auto paths = Tensor(paths_h); if (targets.dtype() == aoti_torch_dtype_int64()) { - if (logProbs.scalar_type() == aoti_torch_dtype_float64()) { - forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) { - forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) { - forced_align_impl(logProbs, targets, blank, paths); + if (logProbs.dtype() == aoti_torch_dtype_float64()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.dtype() == aoti_torch_dtype_float32()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.dtype() == aoti_torch_dtype_float16()) { + forced_align_impl(logProbs, targets, blank, paths); } - } else if (targets.scalar_type() == aoti_torch_dtype_int32()) { - if (logProbs.scalar_type() == aoti_torch_dtype_float64()) { - forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.scalar_type() == aoti_torch_dtype_float32()) { - forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.scalar_type() == aoti_torch_dtype_float16()) { - forced_align_impl(logProbs, targets, blank, paths); + } else if (targets.dtype() == aoti_torch_dtype_int32()) { + if (logProbs.dtype() == aoti_torch_dtype_float64()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.dtype() == aoti_torch_dtype_float32()) { + forced_align_impl(logProbs, targets, blank, paths); + } else if (logProbs.dtype() == aoti_torch_dtype_float16()) { + forced_align_impl(logProbs, targets, blank, paths); } } return std::make_tuple( From ced6124daa9b553f0c20768dde22e3a855961592 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 7 Aug 2025 14:54:57 +0000 Subject: [PATCH 13/18] Free alphas_a array --- src/libtorchaudio/forced_align/cpu/compute.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index f4b61e272f..c2776948df 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -119,6 +119,7 @@ void forced_align_impl( } auto idx1 = (T - 1) % 2; auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2; + delete[] alphas_a; // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { auto lbl_idx = ltrIdx % 2 == 0 ? blank : targets_a[batchIndex][ltrIdx / 2]; From 71ce212e98aef6b89299014180334de35de8e6c0 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 7 Aug 2025 14:58:30 +0000 Subject: [PATCH 14/18] Free backPtr_a --- src/libtorchaudio/forced_align/cpu/compute.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 530d97abf9..5562500694 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -129,6 +129,7 @@ void forced_align_impl( paths_a[batchIndex][t] = lbl_idx; ltrIdx -= backPtr_a[t * S + ltrIdx]; // backPtr_a[t][ltrIdx] } + delete[] backPtr_a; } std::tuple compute( From 9629864f8b0fea3b74cbdff90ea3c8b38cb60ea5 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 7 Aug 2025 19:23:02 +0000 Subject: [PATCH 15/18] Fix merge conflict --- src/libtorchaudio/forced_align/cpu/compute.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index e9b271ded9..3d4063da65 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -91,13 +91,8 @@ void forced_align_impl( } if (start == 0) { alphas_a[0][curIdxOffset] = -<<<<<<< HEAD alphas_a[0][prevIdxOffset] + logProbs_a.index(batchIndex, t, blank); - backPtr_a[S * t] = 0; -======= - alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank]; - backPtr_a[S * t] = 0; // backPtr_a[t][0] = 0 ->>>>>>> forced_align_backptr + backPtr_a[S * t] = 0; // backPtr_a[t][0] = 0 startloop += 1; } From 847b72652f43062e766561f9ac7a2dcb9225fc91 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 7 Aug 2025 20:02:25 +0000 Subject: [PATCH 16/18] Correct dimensionality of path variable --- src/libtorchaudio/forced_align/cpu/compute.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 3d4063da65..105443f071 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -177,7 +177,7 @@ std::tuple compute( blank >= 0 && blank < logProbs.size(-1), "blank must be within [0, num classes)"); - // TODO: Requires port of `max` operator. + // TODO: Requires port of `max` and `item` operators. // TORCH_CHECK( // logProbs.size(1) == at::max(inputLengths).item().toInt(), // "input length mismatch"); @@ -193,7 +193,7 @@ std::tuple compute( AtenTensorHandle paths_h; int32_t targets_device; aoti_torch_get_device_type(targets.get(), &targets_device); - aoti_torch_empty_strided(1, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h); + aoti_torch_empty_strided(2, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h); auto paths = Tensor(paths_h); From 2663def7574802c262ef14bc4614458796cfa238 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 8 Aug 2025 14:53:43 +0000 Subject: [PATCH 17/18] Use 1d indexing in original layout for alphas_a --- .../forced_align/cpu/compute.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index c2776948df..0c08a346b0 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -27,10 +27,9 @@ void forced_align_impl( const auto L = targets.size(1); const auto S = 2 * L + 1; - auto alphas_a = new scalar_t[S][2]; // scalar_t is just logProbs.dtype() - for (int i = 0; i < S; i++) { - alphas_a[i][0] = kNegInfinity; - alphas_a[i][1] = kNegInfinity; + auto alphas_a = new scalar_t[2 * S]; // scalar_t is just logProbs.dtype() + for (int i = 0; i < 2 * S; i++) { + alphas_a[i] = kNegInfinity; } torch::Tensor backPtr = torch::empty({T, S}, torch::kInt8).fill_(-1); @@ -56,7 +55,7 @@ void forced_align_impl( auto end = (S == 1) ? 1 : 2; for (auto i = start; i < end; i++) { auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; - alphas_a[i][0] = logProbs_a[batchIndex][0][labelIdx]; + alphas_a[i] = logProbs_a[batchIndex][0][labelIdx]; // alphas_a[0, i] } for (auto t = 1; t < T; t++) { if (T - t <= L + R) { @@ -79,18 +78,18 @@ void forced_align_impl( auto curIdxOffset = t % 2; auto prevIdxOffset = (t - 1) % 2; for (auto j = 0; j < S; ++j) { - alphas_a[j][curIdxOffset] = -std::numeric_limits::infinity(); + alphas_a[curIdxOffset * S + j] = -std::numeric_limits::infinity(); // alphas_a[curIdxOffset][j] } if (start == 0) { - alphas_a[0][curIdxOffset] = - alphas_a[0][prevIdxOffset] + logProbs_a[batchIndex][t][blank]; + alphas_a[curIdxOffset * S] = + alphas_a[prevIdxOffset * S] + logProbs_a[batchIndex][t][blank]; backPtr_a[t][0] = 0; startloop += 1; } for (auto i = startloop; i < end; i++) { - auto x0 = alphas_a[i][prevIdxOffset]; - auto x1 = alphas_a[i - 1][prevIdxOffset]; + auto x0 = alphas_a[prevIdxOffset * S + i]; // alphas_a[prevIdxOffset][i]; + auto x1 = alphas_a[prevIdxOffset * S + i - 1]; // alphas_a[prevIdxOffset][i - 1]; auto x2 = -std::numeric_limits::infinity(); auto labelIdx = (i % 2 == 0) ? blank : targets_a[batchIndex][i / 2]; @@ -101,7 +100,7 @@ void forced_align_impl( // (i != 1) just ensures we don't access targets[i - 2] if its i < 2 if (i % 2 != 0 && i != 1 && targets_a[batchIndex][i / 2] != targets_a[batchIndex][i / 2 - 1]) { - x2 = alphas_a[i - 2][prevIdxOffset]; + x2 = alphas_a[prevIdxOffset * S + i - 2]; // alphas_a[prevIdxOffset][i - 2]; } scalar_t result = 0.0; if (x2 > x1 && x2 > x0) { @@ -114,11 +113,12 @@ void forced_align_impl( result = x0; backPtr_a[t][i] = 0; } - alphas_a[i][curIdxOffset] = result + logProbs_a[batchIndex][t][labelIdx]; + alphas_a[curIdxOffset * S + i] = result + logProbs_a[batchIndex][t][labelIdx]; // alphas_a[curIdxOffset][i] } } auto idx1 = (T - 1) % 2; - auto ltrIdx = alphas_a[S - 1][idx1] > alphas_a[S - 2][idx1] ? S - 1 : S - 2; + auto ltrIdx = alphas_a[S * idx1 + S - 1] > + alphas_a[S * idx1 + S - 2] ? S - 1 : S - 2; // alphas_a[idx1][S - 1], alphas_a[idx1][S - 2] delete[] alphas_a; // path stores the token index for each time step after force alignment. for (auto t = T - 1; t > -1; t--) { From 86f75576d36c342140a46a589c65e48e52890a8e Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 19 Aug 2025 21:42:28 +0000 Subject: [PATCH 18/18] Use c-style dtype API --- .../forced_align/cpu/compute.cpp | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/libtorchaudio/forced_align/cpu/compute.cpp b/src/libtorchaudio/forced_align/cpu/compute.cpp index 6314433786..2632941d69 100644 --- a/src/libtorchaudio/forced_align/cpu/compute.cpp +++ b/src/libtorchaudio/forced_align/cpu/compute.cpp @@ -150,13 +150,17 @@ std::tuple compute( TORCH_CHECK( logProbs.get_device() == targets.get_device(), "log_probs and targets need to be on the same device"); + int32_t logprobs_dtype; + aoti_torch_get_dtype(logProbs.get(), &logprobs_dtype); TORCH_CHECK( - logProbs.dtype() == aoti_torch_dtype_float64() || - logProbs.dtype() == aoti_torch_dtype_float32() || - logProbs.dtype() == aoti_torch_dtype_float16(), + logprobs_dtype == aoti_torch_dtype_float64() || + logprobs_dtype == aoti_torch_dtype_float32() || + logprobs_dtype == aoti_torch_dtype_float16(), "log_probs must be float64, float32 or float16 (half) type"); + int32_t targets_dtype; + aoti_torch_get_dtype(targets.get(), &targets_dtype); TORCH_CHECK( - targets.dtype() == aoti_torch_dtype_int32() || targets.dtype() == aoti_torch_dtype_int64(), + targets_dtype == aoti_torch_dtype_int32() || targets_dtype == aoti_torch_dtype_int64(), "targets must be int32 or int64 type"); TORCH_CHECK(logProbs.is_contiguous(), "log_probs must be contiguous"); TORCH_CHECK(targets.is_contiguous(), "targets must be contiguous"); @@ -195,24 +199,24 @@ std::tuple compute( AtenTensorHandle paths_h; int32_t targets_device; aoti_torch_get_device_type(targets.get(), &targets_device); - aoti_torch_empty_strided(2, paths_size, paths_stride, targets.dtype(), targets_device, targets.get_device(), &paths_h); + aoti_torch_empty_strided(2, paths_size, paths_stride, targets_dtype, targets_device, targets.get_device(), &paths_h); auto paths = Tensor(paths_h); - if (targets.dtype() == aoti_torch_dtype_int64()) { - if (logProbs.dtype() == aoti_torch_dtype_float64()) { + if (targets_dtype == aoti_torch_dtype_int64()) { + if (logprobs_dtype == aoti_torch_dtype_float64()) { forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.dtype() == aoti_torch_dtype_float32()) { + } else if (logprobs_dtype == aoti_torch_dtype_float32()) { forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.dtype() == aoti_torch_dtype_float16()) { + } else if (logprobs_dtype == aoti_torch_dtype_float16()) { forced_align_impl(logProbs, targets, blank, paths); } - } else if (targets.dtype() == aoti_torch_dtype_int32()) { - if (logProbs.dtype() == aoti_torch_dtype_float64()) { + } else if (targets_dtype == aoti_torch_dtype_int32()) { + if (logprobs_dtype == aoti_torch_dtype_float64()) { forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.dtype() == aoti_torch_dtype_float32()) { + } else if (logprobs_dtype == aoti_torch_dtype_float32()) { forced_align_impl(logProbs, targets, blank, paths); - } else if (logProbs.dtype() == aoti_torch_dtype_float16()) { + } else if (logprobs_dtype == aoti_torch_dtype_float16()) { forced_align_impl(logProbs, targets, blank, paths); } }