
Commit 8c704f7

Jiong Gong authored and pytorchmergebot committed
[inductor cpp] fix argmax with >1 reduction dims (pytorch#113168)
Fix pytorch#113013. The argmax (and argmin) implementation doesn't compute the accumulator index properly when there is more than one reduction dim; it wrongly assumed a single reduction dim. With the given reproducer, the generated code before the change:

```c++
#include "/tmp/torchinductor_jgong5/tb/ctbgktuhgnnlel6ipqkfk76lfztr5pledachdkcq3asdqtlxpzt6.h"
extern "C" void kernel(const double* in_ptr0, long* out_ptr0)
{
    {
        {
            struct IndexValue_1 {size_t index; double value;};
            IndexValue_1 tmp_acc0{0, -std::numeric_limits<double>::infinity()};
            #if !defined(__clang_major__) || __clang_major__ > 9
            #pragma omp declare reduction(argmax : IndexValue_1 :\
                omp_out.value = omp_in.value < omp_out.value ? omp_out.value : omp_in.value,\
                omp_out.index = omp_in.value < omp_out.value ? omp_out.index : omp_in.index)\
                initializer(omp_priv = {0, -std::numeric_limits<double>::infinity()})
            #endif
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
            {
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(2L); x1+=static_cast<long>(1L))
                {
                    auto tmp0 = c10::convert<long>(0);
                    auto tmp1 = c10::convert<long>(1);
                    auto tmp2 = tmp0 < tmp1;
                    auto tmp3 = c10::convert<long>(at::native::div_floor_integer((3L*x1), 2L));
                    auto tmp4 = c10::convert<long>(2L + (at::native::div_floor_integer((3L*x1), 2L)));
                    auto tmp5 = tmp3 < tmp4;
                    auto tmp6 = tmp2 & tmp5;
                    auto tmp7 = [&]
                    {
                        auto tmp8 = in_ptr0[static_cast<long>((3L*x0) + (at::native::div_floor_integer((3L*x1), 2L)))];
                        return tmp8;
                    }
                    ;
                    auto tmp9 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
                    auto tmp10 = c10::convert<long>(1L + (at::native::div_floor_integer((3L*x1), 2L)));
                    auto tmp11 = tmp10 < tmp4;
                    auto tmp12 = tmp2 & tmp11;
                    auto tmp13 = [&]
                    {
                        auto tmp14 = in_ptr0[static_cast<long>(1L + (3L*x0) + (at::native::div_floor_integer((3L*x1), 2L)))];
                        return tmp14;
                    }
                    ;
                    auto tmp15 = tmp12 ? tmp13() : static_cast<decltype(tmp13())>(0.0);
                    auto tmp16 = tmp15 + tmp9;
                    auto tmp17 = [&]
                    {
                        auto tmp18 = c10::convert<double>(1.0);
                        return tmp18;
                    }
                    ;
                    auto tmp19 = tmp6 ? tmp17() : static_cast<decltype(tmp17())>(0.0);
                    auto tmp20 = [&]
                    {
                        auto tmp21 = c10::convert<double>(1.0);
                        return tmp21;
                    }
                    ;
                    auto tmp22 = tmp12 ? tmp20() : static_cast<decltype(tmp20())>(0.0);
                    auto tmp23 = tmp22 + tmp19;
                    auto tmp24 = tmp16 / tmp23;
                    if (tmp_acc0.value < tmp24) {
                        tmp_acc0.index = x1; tmp_acc0.value = tmp24; // both x0 and x1 are reduction vars while only x1 is assigned to tmp_acc0.index
                    }
                }
            }
            out_ptr0[static_cast<long>(0L)] = tmp_acc0.index;
        }
    }
}
```

After the fix:

```c++
#include "/tmp/torchinductor_jgong5/tb/ctbgktuhgnnlel6ipqkfk76lfztr5pledachdkcq3asdqtlxpzt6.h"
extern "C" void kernel(const double* in_ptr0, long* out_ptr0)
{
    {
        {
            struct IndexValue_1 {size_t index; double value;};
            IndexValue_1 tmp_acc0{0, -std::numeric_limits<double>::infinity()};
            #if !defined(__clang_major__) || __clang_major__ > 9
            #pragma omp declare reduction(argmax : IndexValue_1 :\
                omp_out.value = omp_in.value < omp_out.value ? omp_out.value : omp_in.value,\
                omp_out.index = omp_in.value < omp_out.value ? omp_out.index : omp_in.index)\
                initializer(omp_priv = {0, -std::numeric_limits<double>::infinity()})
            #endif
            for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
            {
                for(long x1=static_cast<long>(0L); x1<static_cast<long>(2L); x1+=static_cast<long>(1L))
                {
                    auto tmp0 = c10::convert<long>(0);
                    auto tmp1 = c10::convert<long>(1);
                    auto tmp2 = tmp0 < tmp1;
                    auto tmp3 = c10::convert<long>(at::native::div_floor_integer((3L*x1), 2L));
                    auto tmp4 = c10::convert<long>(2L + (at::native::div_floor_integer((3L*x1), 2L)));
                    auto tmp5 = tmp3 < tmp4;
                    auto tmp6 = tmp2 & tmp5;
                    auto tmp7 = [&]
                    {
                        auto tmp8 = in_ptr0[static_cast<long>((3L*x0) + (at::native::div_floor_integer((3L*x1), 2L)))];
                        return tmp8;
                    }
                    ;
                    auto tmp9 = tmp6 ? tmp7() : static_cast<decltype(tmp7())>(0.0);
                    auto tmp10 = c10::convert<long>(1L + (at::native::div_floor_integer((3L*x1), 2L)));
                    auto tmp11 = tmp10 < tmp4;
                    auto tmp12 = tmp2 & tmp11;
                    auto tmp13 = [&]
                    {
                        auto tmp14 = in_ptr0[static_cast<long>(1L + (3L*x0) + (at::native::div_floor_integer((3L*x1), 2L)))];
                        return tmp14;
                    }
                    ;
                    auto tmp15 = tmp12 ? tmp13() : static_cast<decltype(tmp13())>(0.0);
                    auto tmp16 = tmp15 + tmp9;
                    auto tmp17 = [&]
                    {
                        auto tmp18 = c10::convert<double>(1.0);
                        return tmp18;
                    }
                    ;
                    auto tmp19 = tmp6 ? tmp17() : static_cast<decltype(tmp17())>(0.0);
                    auto tmp20 = [&]
                    {
                        auto tmp21 = c10::convert<double>(1.0);
                        return tmp21;
                    }
                    ;
                    auto tmp22 = tmp12 ? tmp20() : static_cast<decltype(tmp20())>(0.0);
                    auto tmp23 = tmp22 + tmp19;
                    auto tmp24 = tmp16 / tmp23;
                    if (tmp_acc0.value < tmp24) {
                        tmp_acc0.index = static_cast<long>(x1 + (2L*x0)); tmp_acc0.value = tmp24;
                    }
                }
            }
            out_ptr0[static_cast<long>(0L)] = tmp_acc0.index;
        }
    }
}
```

Pull Request resolved: pytorch#113168
Approved by: https://github.com/lezcano, https://github.com/jansel
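For context, here is a minimal reproducer sketch distilled from the new test added below. The exact script from the issue may differ, and the `torch.compile` wrapper is an assumption about how the CPU C++ kernel gets exercised:

```python
import torch

def fn(x):
    # adaptive_avg_pool1d followed by a global argmax gives inductor an
    # argmax whose reduction spans more than one dim on CPU
    x = torch.adaptive_avg_pool1d(input=x, output_size=2)
    return torch.argmax(input=x)

x = torch.rand([3, 3, 3], dtype=torch.float64)
eager = fn(x)
compiled = torch.compile(fn, backend="inductor")(x)
print(eager.item(), compiled.item())  # should agree; before the fix the flat index could be wrong
```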
1 parent be66d5e · commit 8c704f7

File tree

3 files changed: +16 −1 lines changed


test/inductor/test_torchinductor.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -7649,6 +7649,16 @@ def fn(x, y):
         b = torch.randn(65, 2**24, device=self.device)
         fn(a, b)
 
+    def test_adaptive_avg_pool1d_argmax(self):
+        # https://github.com/pytorch/pytorch/issues/113013
+        def fn(x):
+            x = torch.adaptive_avg_pool1d(input=x, output_size=2)
+            x = torch.argmax(input=x)
+            return x
+
+        x = torch.rand([3, 3, 3], dtype=torch.float64)
+        self.common(fn, (x,))
+
 
 @dataclasses.dataclass
 class TestFailure:
```
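For readers unfamiliar with the harness, `self.common(fn, (x,))` runs `fn` both eagerly and through inductor and compares the results. A rough standalone stand-in (a sketch, not the harness's actual implementation; `check_against_eager` is a made-up name):

```python
import torch

def check_against_eager(fn, args):
    # Approximation of what the test harness's common() helper checks:
    # the inductor-compiled result must match the eager result.
    expected = fn(*args)
    actual = torch.compile(fn, backend="inductor")(*args)
    torch.testing.assert_close(actual, expected)

check_against_eager(
    lambda x: torch.argmax(torch.adaptive_avg_pool1d(x, output_size=2)),
    (torch.rand([3, 3, 3], dtype=torch.float64),),
)
```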

test/inductor/test_torchinductor_codegen_dynamic_shapes.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -136,6 +136,7 @@ def run(*ex, **kwargs):
     "test_zeros_dynamic_shapes": TestFailure(("cpu",)),
     "test_uint_dynamic_shapes": TestFailure(("cpu",)),
     "test_issue102546_dynamic_shapes": TestFailure(("cpu",)),
+    "test_adaptive_avg_pool1d_argmax_dynamic_shapes": TestFailure(("cpu",)),
     #
     # Failed to find for loop/triton kernel:
     #
```

torch/_inductor/codegen/cpp.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -1269,10 +1269,14 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
                 argmax_argmin_prefix(reduction_type, src_dtype, acc)
             )
             compare_op = "<" if reduction_type == "argmax" else ">"
+            assert self.reduction_depth is not None
+            index = self.itervars[self.reduction_depth]
+            for i in range(self.reduction_depth + 1, len(self.itervars)):
+                index = index * self.ranges[i] + self.itervars[i]
             self.stores.writelines(
                 [
                     f"if ({acc}.value {compare_op} {value}) {{",
-                    f" {acc}.index = {self.itervars[-1]}; {acc}.value = {value};",
+                    f" {acc}.index = {cexpr_index(index)}; {acc}.value = {value};",
                     "}",
                 ],
             )
```
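For intuition, the added lines build a row-major flattening of all reduction itervars instead of using only the innermost one. A plain-Python sketch with concrete integers standing in for the sympy symbols the kernel actually manipulates (`flat_reduction_index` is a made-up helper name):

```python
def flat_reduction_index(itervar_values, ranges, reduction_depth=0):
    # Mirrors the new loop above: fold every reduction itervar after
    # reduction_depth into one flat (row-major) index.
    index = itervar_values[reduction_depth]
    for i in range(reduction_depth + 1, len(itervar_values)):
        index = index * ranges[i] + itervar_values[i]
    return index

# Loops from the generated kernel in the commit message: x0 in [0, 9),
# x1 in [0, 2). The flat index is x1 + 2*x0, which is exactly what the
# fixed C++ stores into tmp_acc0.index.
assert flat_reduction_index([3, 1], [9, 2]) == 1 + 2 * 3
assert flat_reduction_index([8, 1], [9, 2]) == 17  # last element of a 9x2 reduction
```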
