Commit 40e2703

Reduction rand like patch (#2031)
*_like operations were not filtering out reduction domains on their inputs. As a result, the output shape differed from the logical (non-reduction) shape of the input. We ran into this issue on a Hugging Face benchmark through the Python stack. 1. Updated the operations to filter the input domain with TensorDomain::noReductions; 2. Added a test case to verify the breakage and the fix.
1 parent bc77266 commit 40e2703
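
For context, a minimal sketch of the failing pattern, mirroring the test added below (tensor names are illustrative):

// tv1 carries a reduction IterDomain from sum(); before this patch,
// rand_like(tv1) built its output shape from the full rFactor domain,
// so the random tensor kept the reduced axis instead of matching tv1's
// logical (non-reduction) shape.
TensorView* tv0 = makeSymbolicTensor(2, aten_to_data_type(kFloat));
fusion->addInput(tv0);
auto tv1 = sum(tv0, {0});   // logical shape drops the reduced axis
auto tv2 = rand_like(tv1);  // must match tv1's logical shape
auto tv3 = add(tv1, tv2);
fusion->addOutput(tv3);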

File tree: 2 files changed (+40, -8 lines)


torch/csrc/jit/codegen/cuda/arith.cpp

Lines changed: 10 additions & 8 deletions
@@ -471,17 +471,18 @@ TensorView* uniform(
   return out;
 }
 
-TensorView* rand_like(TensorView* v) {
+TensorView* rand_like(TensorView* tv) {
   TORCH_CHECK(
-      isFloatingPointType(v->dtype()),
+      isFloatingPointType(tv->dtype()),
       "input must have floating point type, but got ",
-      v->dtype());
+      tv->dtype());
   std::vector<Val*> shape;
-  shape.reserve(v->getMaybeRFactorDomain().size());
-  for (auto id : v->getMaybeRFactorDomain()) {
+  auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
+  shape.reserve(dom.size());
+  for (auto id : dom) {
     shape.emplace_back(id->getMaybeExpandedExtent());
   }
-  return rand(shape, v->dtype());
+  return rand(shape, tv->dtype());
 }
 
 Val* rand_like(Val* v) {
@@ -505,8 +506,9 @@ TensorView* full(
 
 TensorView* full_like(TensorView* tv, Val* fill_value) {
   std::vector<Val*> shape;
-  shape.reserve(tv->getMaybeRFactorDomain().size());
-  for (auto id : tv->getMaybeRFactorDomain()) {
+  auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
+  shape.reserve(dom.size());
+  for (auto id : dom) {
     shape.emplace_back(id->getMaybeExpandedExtent());
   }
   return full(shape, fill_value, tv->dtype());
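
The same pattern now appears in both rand_like and full_like: build the output shape only from the non-reduction iteration domains of the input. A rough, framework-agnostic sketch of the idea follows; the IterDomain struct and isReduction() predicate here are illustrative stand-ins, not the exact nvFuser API:

#include <vector>

// Minimal stand-in for an iteration domain: an extent plus a flag that
// marks domains produced by a reduction.
struct IterDomain {
  long extent;
  bool reduction;
  bool isReduction() const { return reduction; }
};

// Collect extents of the non-reduction domains only, i.e. the tensor's
// logical shape as seen by *_like operations after this patch.
std::vector<long> logicalShape(const std::vector<IterDomain>& rfactor) {
  std::vector<long> shape;
  shape.reserve(rfactor.size());
  for (const auto& id : rfactor) {
    if (!id.isReduction()) {
      shape.push_back(id.extent);
    }
  }
  return shape;
}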

torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu

Lines changed: 30 additions & 0 deletions
@@ -365,5 +365,35 @@ TEST_F(NVFuserTest, FusionUniform_CUDA) {
   }
 }
 
+TEST_F(NVFuserTest, FusionRandLikeReduction_CUDA) {
+  auto dtype = kFloat;
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2, aten_to_data_type(dtype));
+  fusion->addInput(tv0);
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = rand_like(tv1);
+  auto tv3 = add(tv1, tv2);
+  fusion->addOutput(tv3);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+
+  auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+  at::Tensor t0 = at::zeros({2, 3}, options);
+
+  at::manual_seed(0);
+  auto cg_outputs = fec.runFusionWithInputs({t0});
+  auto out = cg_outputs[0];
+
+  at::manual_seed(0);
+  auto t1 = t0.sum(0);
+  auto t2 = generate_uniform(3, dtype).expand_as(t1);
+  auto t3 = t1.add(t2);
+
+  testValidate(fec.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
