Patch dropout fix (#1898)

shmsong · web-flow · commit 83dbf56a9554 · 2022-08-09T20:38:03.000-07:00
diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -708,7 +708,7 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     if (!print_inline_) {
       if (op_type == UnaryOpType::RandLike) {
         auto out_tv = uop->out()->as<kir::TensorIndex>()->view();
-        auto index = genTensorIndex(uop->out()->as<kir::TensorIndex>());
+        auto index = genTensorIndex(uop->in()->as<kir::TensorIndex>());
         int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
         indent() << "nvfuser_index_t subseq" << uop->name() << " = (" << index
                  << ") / " << multiple << ";\n";
diff --git a/torch/csrc/jit/codegen/cuda/lower_index.cpp b/torch/csrc/jit/codegen/cuda/lower_index.cpp
@@ -94,20 +94,19 @@ void IndexLowering::handle(const kir::ForLoop* for_loop) {
 
 // TODO: use a separate IR node to represent rand like
 void IndexLowering::lowerRandLike(const UnaryOp* uop) {
-  // TODO: not using this input any more, remove
-  //  when making RandLike a no-input op.
-  const auto in = lowerSrcIndex(uop->in(), uop->out());
-
-  // Default path for scalar output.
-  Val* out = uop->out();
-
   // Write random tensor indices into the consumer
   //  tensor index if the output is a tensor.
   auto out_tv = dynamic_cast<TensorView*>(uop->out());
-  if (out_tv != nullptr) {
-    out = SimplifyingIrBuilder::create<kir::TensorIndex>(
-        out_tv, Index::getRandomTensorStridedIndices(out_tv, for_loops_));
-  }
+  TORCH_INTERNAL_ASSERT(out_tv != nullptr, "rand scalar not yet supported");
+
+  // TODO: `in` is used here as a placeholder for the random tensor index.
+  //  This slot will need to be kept on the new rand op when separating
+  //  randlike from the unary op.
+  auto in = SimplifyingIrBuilder::create<kir::TensorIndex>(
+      out_tv, Index::getRandomTensorStridedIndices(out_tv, for_loops_));
+
+  // TensorIndex for writing randlike output.
+  const auto out = lowerDstIndex(uop->out());
 
   pushBack(IrBuilder::create<UnaryOp>(
       UnaryOpType::RandLike, out, in, uop->getRNGOffset()));
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu b/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
@@ -133,6 +133,40 @@ TEST_F(NVFuserTest, FusionRNGValidateWithCURand_CUDA) {
   }
 }
 
+TEST_F(NVFuserTest, FusionRNGSimpleValidateWithCURand_CUDA) {
+  int64_t size = 128;
+  auto dtype = kFloat;
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1, aten_to_data_type(dtype));
+  fusion->addInput(tv0);
+  auto tv1 = randlike(tv0);
+  auto tv2 = set(tv1);
+  fusion->addOutput(tv2);
+
+  tv2->split(0, 8);
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+
+  tv0->computeAt(tv2, 1);
+
+  auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+  at::Tensor t0 = at::zeros({size}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {t0});
+
+  at::manual_seed(0);
+  auto cg_outputs = fe.runFusion({t0});
+  auto out = cg_outputs[0];
+
+  at::manual_seed(0);
+  auto ref = generate_uniform(size, dtype);
+
+  testValidate(fusion, {out}, {t0}, {ref}, __LINE__, __FILE__);
+}
+
 TEST_F(NVFuserTest, FusionBroadcastingRNG_CUDA) {
   for (auto dtype : {kFloat, kDouble}) {
     std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();