Skip to content

Commit b62bf8c

Browse files
Chao Liu authored
Merge pull request pytorch#14 from ROCmSoftwarePlatform/miopen_downstream_init_integration
MIOpen Downstream: Initial integration 2nd PR
2 parents ccc4a1d + 67ad47e commit b62bf8c

14 files changed

+63 -69 lines changed

composable_kernel/include/tensor_description/cluster_descriptor.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ namespace ck {
88

99
template <typename Lengths,
1010
typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
11-
__host__ __device__ constexpr auto make_cluster_descriptor_v2(
11+
__host__ __device__ constexpr auto make_cluster_descriptor(
1212
const Lengths& lengths,
1313
ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
1414
{

composable_kernel/include/tensor_description/multi_index_transform.hpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -481,11 +481,11 @@ struct Merge_v1_carry_check
481481
using LowerIndex = MultiIndex<NDimLow>;
482482
using UpperIndex = MultiIndex<1>;
483483

484-
using LowLengthsScan = decltype(
485-
container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
484+
using LowLengthsScan =
485+
decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
486486

487487
using UpLengths =
488-
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
488+
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
489489

490490
LowLengths low_lengths_;
491491
LowLengthsScan low_lengths_scan_;
@@ -496,8 +496,8 @@ struct Merge_v1_carry_check
496496
__host__ __device__ constexpr Merge_v1_carry_check(const LowLengths& low_lengths)
497497
: low_lengths_{low_lengths},
498498
low_lengths_scan_{
499-
container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
500-
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
499+
container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
500+
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
501501
{
502502
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
503503
}
@@ -1037,7 +1037,7 @@ struct Merge_v2_magic_division
10371037
using UpperIndex = MultiIndex<1>;
10381038

10391039
using UpLengths =
1040-
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
1040+
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
10411041

10421042
using LowLengthsMagicDivisorMultipiler = decltype(
10431043
generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
@@ -1062,7 +1062,7 @@ struct Merge_v2_magic_division
10621062
low_lengths_magic_divisor_shift_{generate_tuple(
10631063
[&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); },
10641064
Number<NDimLow>{})},
1065-
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
1065+
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
10661066
{
10671067
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
10681068
}
@@ -1188,11 +1188,11 @@ struct Merge_v2r2_magic_division
11881188
using LowerIndex = MultiIndex<NDimLow>;
11891189
using UpperIndex = MultiIndex<1>;
11901190

1191-
using LowLengthsScan = decltype(
1192-
container_reverse_exclusive_scan(LowLengths{}, math::multiplies_v2{}, Number<1>{}));
1191+
using LowLengthsScan =
1192+
decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{}));
11931193

11941194
using UpLengths =
1195-
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies_v2{}, Number<1>{})));
1195+
decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));
11961196

11971197
using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple(
11981198
lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
@@ -1213,14 +1213,14 @@ struct Merge_v2r2_magic_division
12131213
__host__ __device__ constexpr Merge_v2r2_magic_division(const LowLengths& low_lengths)
12141214
: low_lengths_{low_lengths},
12151215
low_lengths_scan_{
1216-
container_reverse_exclusive_scan(low_lengths, math::multiplies_v2{}, Number<1>{})},
1216+
container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})},
12171217
low_lengths_scan_magic_divisor_multiplier_{generate_tuple(
12181218
[&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths_scan_[i]); },
12191219
Number<NDimLow>{})},
12201220
low_lengths_scan_magic_divisor_shift_{generate_tuple(
12211221
[&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); },
12221222
Number<NDimLow>{})},
1223-
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies_v2{}, Number<1>{}))}
1223+
up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))}
12241224
{
12251225
static_assert(LowerIndex::Size() == NDimLow, "wrong!");
12261226
}
@@ -1336,7 +1336,7 @@ struct UnMerge
13361336
using UpperIndex = MultiIndex<NDimUp>;
13371337

13381338
using UpLengthsScan =
1339-
decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies_v2{}, Number<1>{}));
1339+
decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies{}, Number<1>{}));
13401340

13411341
UpLengths up_lengths_;
13421342
UpLengthsScan up_lengths_scan_;
@@ -1346,7 +1346,7 @@ struct UnMerge
13461346
__host__ __device__ constexpr UnMerge(const UpLengths& up_lengths)
13471347
: up_lengths_{up_lengths},
13481348
up_lengths_scan_{
1349-
container_reverse_exclusive_scan(up_lengths, math::multiplies_v2{}, Number<1>{})}
1349+
container_reverse_exclusive_scan(up_lengths, math::multiplies{}, Number<1>{})}
13501350
{
13511351
}
13521352

composable_kernel/include/tensor_description/tensor_adaptor.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ struct TensorAdaptor
6464
Number<ndim_top_>{});
6565

6666
// TODO: make container_reduce support tuple of Number and index_t
67-
return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
67+
return container_reduce(lengths, math::multiplies{}, Number<1>{});
6868
}
6969

7070
template <index_t IDim>

composable_kernel/include/tensor_description/tensor_descriptor.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ struct TensorDescriptor
6969
Number<ndim_visible_>{});
7070

7171
// TODO: make container_reduce support tuple of Number and index_t
72-
return container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
72+
return container_reduce(lengths, math::multiplies{}, Number<1>{});
7373
}
7474

7575
template <index_t IDim>

composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt
3838
template <typename... Lengths,
3939
typename... Strides,
4040
typename enable_if<sizeof...(Lengths) == sizeof...(Strides), bool>::type = false>
41-
__host__ __device__ constexpr auto make_naive_tensor_descriptor_v2(const Tuple<Lengths...>& lengths,
42-
const Tuple<Strides...>& strides)
41+
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple<Lengths...>& lengths,
42+
const Tuple<Strides...>& strides)
4343
{
4444
constexpr index_t N = sizeof...(Lengths);
4545

@@ -100,7 +100,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
100100

101101
constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{};
102102

103-
const auto element_space_size = container_reduce(lengths, math::multiplies_v2{}, Number<1>{});
103+
const auto element_space_size = container_reduce(lengths, math::multiplies{}, Number<1>{});
104104

105105
return TensorDescriptor<remove_cv_t<decltype(transforms)>,
106106
remove_cv_t<decltype(low_dim_hidden_idss)>,
@@ -112,7 +112,7 @@ make_naive_tensor_descriptor_packed(const Tuple<Lengths...>& lengths)
112112

113113
template <typename... Lengths, typename Align>
114114
__host__ __device__ constexpr auto
115-
make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align align)
115+
make_naive_tensor_descriptor_aligned(const Tuple<Lengths...>& lengths, Align align)
116116
{
117117
constexpr auto I1 = Number<1>{};
118118

@@ -133,7 +133,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
133133
else
134134
{
135135
return container_reduce(lengths,
136-
math::multiplies_v2{},
136+
math::multiplies{},
137137
Number<stride_n_minus_2>{},
138138
i + I1,
139139
Number<N - 1>{},
@@ -142,7 +142,7 @@ make_naive_tensor_descriptor_aligned_v2(const Tuple<Lengths...>& lengths, Align
142142
},
143143
Number<N>{});
144144

145-
return make_naive_tensor_descriptor_v2(lengths, strides);
145+
return make_naive_tensor_descriptor(lengths, strides);
146146
}
147147

148148
} // namespace ck

composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ struct BlockwiseTensorSliceTransfer_v4
143143

144144
private:
145145
static constexpr auto thread_cluster_desc_ =
146-
make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
146+
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
147147

148148
using ThreadwiseTransfer =
149149
ThreadwiseTensorSliceTransfer_v3<ThreadSliceLengths,

composable_kernel/include/tensor_operation/blockwise_tensor_slice_transfer_v2.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ struct BlockwiseTensorSliceTransfer_v4r1
131131

132132
private:
133133
static constexpr auto thread_cluster_desc_ =
134-
make_cluster_descriptor_v2(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
134+
make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
135135

136136
using ThreadwiseTransfer =
137137
ThreadwiseTensorSliceTransfer_v3r1<ThreadSliceLengths,

composable_kernel/include/tensor_operation/gridwise_contraction_dlops_v1r2.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,13 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
110110

111111
// A matrix in LDS memory, dst of blockwise copy
112112
// be careful of LDS alignment
113-
constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
113+
constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
114114
make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
115115
max_lds_align);
116116

117117
// B matrix in LDS memory, dst of blockwise copy
118118
// be careful of LDS alignment
119-
constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
119+
constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
120120
make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
121121
max_lds_align);
122122

@@ -248,10 +248,10 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
248248
constexpr auto BN = GN0 * GN11;
249249

250250
constexpr auto BM1 =
251-
Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies_v2{}, I1) *
251+
Number<container_reduce(BM10BN10ThreadClusterBM10Xs{}, math::multiplies{}, I1) *
252252
BM1PerThreadBM11>{};
253253
constexpr auto BN1 =
254-
Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies_v2{}, I1) *
254+
Number<container_reduce(BM10BN10ThreadClusterBN10Xs{}, math::multiplies{}, I1) *
255255
BN1PerThreadBN11>{};
256256

257257
constexpr auto BM0 = BM / BM1;
@@ -354,24 +354,24 @@ struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN
354354

355355
// A matrix in LDS memory, dst of blockwise copy
356356
// be careful of LDS alignment
357-
constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned_v2(
357+
constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned(
358358
make_tuple(Number<GK0PerBlock>{}, GM0, I1, Number<GM1PerBlockGM11>{}, GK1),
359359
max_lds_align);
360360

361361
// B matrix in LDS memory, dst of blockwise copy
362362
// be careful of LDS alignment
363-
constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned_v2(
363+
constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned(
364364
make_tuple(Number<GK0PerBlock>{}, GN0, I1, Number<GN1PerBlockGN11>{}, GK1),
365365
max_lds_align);
366366

367367
// A matrix in LDS memory for blockwise GEMM
368368
// be careful of LDS alignment
369-
constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned_v2(
369+
constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned(
370370
make_tuple(Number<GK0PerBlock>{}, GM0 * Number<GM1PerBlockGM11>{}, GK1), max_lds_align);
371371

372372
// B matrix in LDS memory for blockwise GEMM
373373
// be careful of LDS alignment
374-
constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned_v2(
374+
constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned(
375375
make_tuple(Number<GK0PerBlock>{}, GN0 * Number<GN1PerBlockGN11>{}, GK1), max_lds_align);
376376

377377
static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() ==

composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r2.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -166,12 +166,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
166166

167167
// A matrix in LDS memory, dst of blockwise copy
168168
// be careful of LDS alignment
169-
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
169+
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
170170
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
171171

172172
// B matrix in LDS memory, dst of blockwise copy
173173
// be careful of LDS alignment
174-
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
174+
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
175175
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
176176

177177
// LDS allocation for A and B: be careful of alignment
@@ -351,22 +351,22 @@ struct GridwiseGemmDlops_km_kn_mn_v1r2
351351

352352
// A matrix in LDS memory, dst of blockwise copy
353353
// be careful of LDS alignment
354-
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
354+
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
355355
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}), max_lds_align);
356356

357357
// B matrix in LDS memory, dst of blockwise copy
358358
// be careful of LDS alignment
359-
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
359+
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
360360
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}), max_lds_align);
361361

362362
// A matrix in LDS memory, dst of blockwise copy
363363
// be careful of LDS alignment
364-
constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned_v2(
364+
constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned(
365365
make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}), max_lds_align);
366366

367367
// B matrix in LDS memory, dst of blockwise copy
368368
// be careful of LDS alignment
369-
constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned_v2(
369+
constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned(
370370
make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}), max_lds_align);
371371

372372
// A matrix blockwise copy

composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v1r3.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,12 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
163163

164164
// TODO: check alignment
165165
// A matrix in LDS memory, dst of blockwise copy
166-
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned_v2(
166+
constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned(
167167
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
168168

169169
// TODO: check alignment
170170
// B matrix in LDS memory, dst of blockwise copy
171-
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned_v2(
171+
constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned(
172172
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
173173

174174
// TODO: check alignment
@@ -274,10 +274,10 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
274274
const auto N0 = N / N1;
275275

276276
constexpr auto M11 =
277-
Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies_v2{}, I1) *
277+
Number<container_reduce(M11N11ThreadClusterM110Xs{}, math::multiplies{}, I1) *
278278
M1PerThreadM111>{};
279279
constexpr auto N11 =
280-
Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies_v2{}, I1) *
280+
Number<container_reduce(M11N11ThreadClusterN110Xs{}, math::multiplies{}, I1) *
281281
N1PerThreadN111>{};
282282

283283
constexpr auto M10 = M1 / M11;
@@ -354,23 +354,23 @@ struct GridwiseGemmDlops_km_kn_mn_v1r3
354354
// TODO: check alignment
355355
// A matrix in LDS memory, dst of blockwise copy
356356
// be careful of LDS alignment
357-
constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
357+
constexpr auto a_k0_m0_m1_k1_block_desc = make_naive_tensor_descriptor_aligned(
358358
make_tuple(Number<KPerBlock>{}, I1, Number<MPerBlockM1>{}, K1), max_lds_align);
359359

360360
// TODO: check alignment
361361
// B matrix in LDS memory, dst of blockwise copy
362362
// be careful of LDS alignment
363-
constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
363+
constexpr auto b_k0_n0_n1_k1_block_desc = make_naive_tensor_descriptor_aligned(
364364
make_tuple(Number<KPerBlock>{}, I1, Number<NPerBlockN1>{}, K1), max_lds_align);
365365

366366
// TODO: check alignment
367367
// A matrix in LDS memory, for blockwise GEMM
368-
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
368+
constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned(
369369
make_tuple(Number<KPerBlock>{}, Number<MPerBlockM1>{}, K1), max_lds_align);
370370

371371
// TODO: check alignment
372372
// B matrix in LDS memory, for blockwise GEMM
373-
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned_v2(
373+
constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned(
374374
make_tuple(Number<KPerBlock>{}, Number<NPerBlockN1>{}, K1), max_lds_align);
375375

376376
static_assert(a_k0_m0_m1_k1_block_desc.GetElementSpaceSize() ==

composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
5858

5959
// A matrix in LDS memory, dst of blockwise copy
6060
// be careful of LDS alignment
61-
constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
61+
constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
6262
make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
6363

6464
// LDS allocation for A and B: be careful of alignment
@@ -132,10 +132,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
132132

133133
// A matrix in LDS memory, dst of blockwise copy
134134
// be careful of LDS alignment
135-
constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned_v2(
135+
constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned(
136136
make_tuple(Number<EPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
137137

138-
constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned_v2(
138+
constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned(
139139
make_tuple(Number<E>{}, Number<KPerBlock>{}), max_lds_align);
140140

141141
// B matrix in LDS memory, dst of blockwise copy

0 commit comments

Comments
 (0)