csarofeen
diff --git a/benchmarks/cpp/nvfuser/bert.cpp b/benchmarks/cpp/nvfuser/bert.cpp
Lines changed: 23 additions & 5 deletions
diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
Lines changed: 4 additions & 27 deletions
diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp
Lines changed: 30 additions & 8 deletions
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
Lines changed: 3 additions & 7 deletions
diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp
Lines changed: 30 additions & 0 deletions
diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h
Lines changed: 9 additions & 0 deletions
diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h
Lines changed: 17 additions & 1 deletion
diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
Lines changed: 10 additions & 3 deletions
diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.cpp b/torch/csrc/jit/codegen/cuda/ir_utils.cpp
Lines changed: 7 additions & 2 deletions
@@ -342,11 +342,6 @@ static void MagicScheduler_BiasDropoutAddLayernormFwd(
       bytes * int64_t(benchmark_state.iterations()));
 }
 
-static void MagicScheduler_fp32_BiasDropoutAddLayernormFwd(
-    benchmark::State& benchmark_state) {
-  MagicScheduler_BiasDropoutAddLayernormFwd(benchmark_state, DataType::Float);
-}
-
 static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {
   FusionGuard fg(fusion);
 
@@ -677,6 +672,16 @@ static void DivMaxSoftDropBwd_fp16(benchmark::State& benchmark_state) {
   MagicScheduler_DivMaxSoftDropBwd(benchmark_state, DataType::Half);
 }
 
+static void BiasDropoutAddLayernormFwd_fp32(
+    benchmark::State& benchmark_state) { // fp32 entry point: forwards to the forward benchmark with DataType::Float
+  MagicScheduler_BiasDropoutAddLayernormFwd(benchmark_state, DataType::Float);
+}
+
+static void BiasDropoutAddLayernormFwd_tf32(
+    benchmark::State& benchmark_state) { // NOTE(review): passes the same DataType::Float as the _fp32 wrapper -- presumably intended for tf32; confirm
+  MagicScheduler_BiasDropoutAddLayernormFwd(benchmark_state, DataType::Float);
+}
+
 static void BiasDropoutAddLayernormBwd1_fp32(
     benchmark::State& benchmark_state) {
   MagicScheduler_BiasDropoutAddLayernormBwd1(benchmark_state, DataType::Float);
@@ -724,6 +729,19 @@ BENCHMARK(DivMaxSoftDropBwd_fp16)
     ->Unit(benchmark::kMicrosecond)
     ->UseManualTime();
 
+BENCHMARK(BiasDropoutAddLayernormFwd_fp32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
+// Use a full Ampere wave here (presumably why the last range is 864 rather than 1024).
+BENCHMARK(BiasDropoutAddLayernormFwd_tf32)
+    // ->RangeMultiplier(2)
+    ->Ranges({{32, 1024}, {128, 128}, {864, 864}})
+    ->Unit(benchmark::kMicrosecond)
+    ->UseManualTime();
+
 BENCHMARK(BiasDropoutAddLayernormBwd1_fp32)
     // ->RangeMultiplier(2)
     ->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
 
@@ -101,6 +101,7 @@ if(USE_CUDA)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_view.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_rng.cu)
 endif()
 
 add_executable(test_jit
 
@@ -2357,10 +2357,13 @@ def t(x: torch.Tensor):
         self.assertEqual(o, jit_o)
         self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
 
+    @unittest.skip("Skipped due to rand_like behavior change")
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
                      "Requires fusion optimization pass to be effective")
     def test_profiling_node(self):
+        # TODO: should we change this test to not use rand_like, or just
+        # remove this test?
         dtype = torch.float
         device = "cuda"
         x = torch.randn(4, 8, 8, 8, dtype=dtype, device=device)
@@ -2372,26 +2375,6 @@ def repro(x: torch.Tensor, alpha: float):
         repro_jit = torch.jit.script(repro)
         self._run_helper(repro_jit, repro, x, 0.6)
 
-    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
-    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
-                     "Requires fusion optimization pass to be effective")
-    def test_rand_like(self):
-        dtype = torch.float
-        device = "cuda"
-
-        def t(x: torch.Tensor, alpha: float):
-            o = torch.rand_like(x)
-            o = torch.add(o, alpha)
-            return o
-
-        # disabling cache so new inputs would generate new graph
-        t.__disable_jit_function_caching__ = True
-
-        for m_format in [torch.contiguous_format, torch.channels_last]:
-            x = torch.randn(4, 5, 6, 7, dtype=dtype, device=device).to(memory_format=m_format)
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x, 0.6, check_stride=True)
-
     @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
@@ -4864,19 +4847,13 @@ def clamp_min(x):
     def test_device_constant(self):
         x = torch.randn(4, 2, device="cuda")
 
-        def t(x):
-            return torch.rand_like(x, device=torch.device(type='cuda'))
-
         # cpu tensor shouldn't be fused
         def t_cpu(x):
             return torch.rand_like(x, device=torch.device(type='cpu'))
 
         with nvfuser_singleton_fusion(True):
-            t_jit = torch.jit.script(t)
-            self._run_helper(t_jit, t, x)
-
             t_cpu_jit = torch.jit.script(t_cpu)
-            for i in range(5):
+            for _ in range(5):
                 t_cpu_jit(x)
 
             self.assertGraphContainsExactly(t_cpu_jit.graph_for(x), FUSION_GUARD, 0)
 
@@ -247,7 +247,7 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     }
 
     // Kernels generating random numbers take extra (seed, offset) arguments
-    if (kernel_summary.is_stochastic) {
+    if (kernel_summary.max_rng_offsets >= 0) {
       code_ << ", at::PhiloxCudaState philox_args";
     }
 
@@ -259,14 +259,14 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     const auto& kernel_summary = kernel_->summary();
 
     // Random number generator (optional)
-    if (kernel_summary.is_stochastic) {
-      indent()
-          << "const auto idx = ((((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;";
+    if (kernel_summary.max_rng_offsets >= 0) {
       indent() << "auto offset = philox_args.captured_ ?\n";
       indent()
           << "  static_cast<uint64_t>(*(philox_args.offset_.ptr) + philox_args.offset_intragraph_) :\n";
       indent() << "  philox_args.offset_.val;\n";
-      indent() << "Philox rnd(philox_args.seed_, idx, offset);\n";
+      indent() << "uint4 rng_result;\n";
+      indent() << "nvfuser_index_t rng_subseq = -1;\n";
+      indent() << "nvfuser_index_t rng_offset = -1;\n";
     }
 
     // Do we have any dynamic shared memory buffers?
@@ -695,8 +695,9 @@ class CudaKernelGenerator : private OptOutConstDispatch {
       }
     }
 
+    const auto op_type = uop->getUnaryOpType();
+
     if (uop->out()->isA<NamedScalar>()) {
-      const auto op_type = uop->getUnaryOpType();
       if (auto op = inline_op_str(op_type)) {
         indent() << gen(uop->out()) << " = " << *op << genInline(uop->in())
                  << ";\n";
@@ -705,15 +706,36 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     }
 
     if (!print_inline_) {
+      if (op_type == UnaryOpType::RandLike) {
+        auto out_tv = uop->out()->as<kir::TensorIndex>()->view();
+        auto index = genTensorIndex(uop->out()->as<kir::TensorIndex>());
+        int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
+        indent() << "nvfuser_index_t subseq" << uop->name() << " = (" << index
+                 << ") / " << multiple << ";\n";
+        indent() << "nvfuser_index_t component" << uop->name() << " = ("
+                 << index << ") % " << multiple << ";\n";
+        indent() << "nvfuser_index_t offset" << uop->name() << " = "
+                 << uop->getRNGOffset() << ";\n";
+        indent() << "if (rng_subseq != subseq" << uop->name()
+                 << " || rng_offset != offset" << uop->name() << ") {\n";
+        indent() << "  rng_result = philox(philox_args.seed_, subseq"
+                 << uop->name() << ", offset / 4 + offset" << uop->name()
+                 << ");\n";
+        indent() << "  rng_subseq = subseq" << uop->name() << ";\n";
+        indent() << "  rng_offset = offset" << uop->name() << ";\n";
+        indent() << "}\n";
+      }
+
       indent() << gen(uop->out());
       if (!uop->out()->isScalar() && !uop->in()->isScalar()) {
         code_ << "\n";
         indent() << kTab;
       }
       code_ << " = ";
+    } else {
+      TORCH_INTERNAL_ASSERT(op_type != UnaryOpType::RandLike);
     }
 
-    const auto op_type = uop->getUnaryOpType();
     if (auto op = inline_op_str(op_type)) {
       if (alsoBooleanOperator(op_type) &&
           uop->out()->dtype() == DataType::Bool) {
@@ -742,7 +764,7 @@ class CudaKernelGenerator : private OptOutConstDispatch {
 
       code_ << "(";
       if (op_type == UnaryOpType::RandLike) {
-        code_ << "rnd";
+        code_ << "rng_result, component" << uop->name();
       } else {
         code_ << gen(uop->in());
       }
 
@@ -916,18 +916,14 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
     global_buffers = allocGlobalVals(expr_eval);
 
-    if (kernel()->summary().is_stochastic) {
+    if (kernel()->summary().max_rng_offsets >= 0) {
       // NOTE: this is how we map offset to PW kernels in order to have
       // identical random number generator to match native PyTorch results.
       // But it doesn't really work as it makes assumptions about how threads
       // are bound, which is not generally how we handle that in the scheduler.
       // Refer to `Philox` in generated kernel to understand how the mapping
       // works.
-      rand_offset = 4 *
-          (std::ceil(
-               allocated_outputs[0].numel() /
-               (4.0 * 128 * launch_params_.gdimx())) + // NOLINT
-           1);
+      rand_offset = (kernel()->summary().max_rng_offsets + 1) * 4;
     }
 
     // This is the entry when we have provided `opt_code` but the entry has not
@@ -961,7 +957,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     kernel_arguments.push(inputs);
     kernel_arguments.push(allocated_outputs);
     kernel_arguments.push(global_buffers.buffers);
-    if (lowered_->kernel()->summary().is_stochastic) {
+    if (lowered_->kernel()->summary().max_rng_offsets >= 0) {
       kernel_arguments.appendPhiloxRNGSeed(rand_offset);
     }
   }
 
@@ -1866,6 +1866,36 @@ std::vector<Val*> Index::getNonGlobalProducerStridedIndices(
   return strided_inds;
 }
 
+std::vector<Val*> Index::getRandomTensorStridedIndices(
+    TensorView* consumer_tv,
+    const std::vector<kir::ForLoop*>& loops) {
+  // Swap in (via TVDomainGuard) a replacement domain whose contiguity
+  //  flags are all true, ignoring consumer_tv's own contiguity.
+  TensorDomain* consumer_tv_no_contiguity_domain = nullptr;
+  auto contiguity_vector =
+      std::vector<bool>(consumer_tv->getMaybeRFactorDomain().size(), true);
+  if (consumer_tv->hasRFactor()) {
+    consumer_tv_no_contiguity_domain = IrBuilder::create<TensorDomain>(
+        consumer_tv->getRootDomain(),
+        consumer_tv->getRFactorDomain(),
+        consumer_tv->domain()->domain(),
+        contiguity_vector);
+  } else {
+    consumer_tv_no_contiguity_domain = IrBuilder::create<TensorDomain>(
+        consumer_tv->getRootDomain(),
+        consumer_tv->domain()->domain(),
+        contiguity_vector);
+  }
+
+  ir_utils::TVDomainGuard domain_guard(
+      consumer_tv, consumer_tv_no_contiguity_domain);
+
+  // TODO:
+  //  More optimization on the underlying tensor layout will be done in
+  //   a follow-up; for now this reuses the global consumer indexing.
+  return getGlobalConsumerStridedIndices(consumer_tv, loops);
+}
+
 std::vector<Val*> Index::getGlobalConsumerStridedIndices(
     const TensorView* consumer_tv,
     const std::vector<kir::ForLoop*>& loops) {
 
@@ -341,6 +341,15 @@ class Index {
       const TensorView* consumer,
       const std::vector<kir::ForLoop*>& loops);
 
+  //! Returns a vector of strided indices mapped onto the (rfactor)
+  //! root domain of a consumer tensor. The returned indices are intended
+  //! to index into Philox pseudo-random sequences so that inlined,
+  //! repeated visits to the same element of a random tensor return
+  //! consistent values.
+  static std::vector<Val*> getRandomTensorStridedIndices(
+      TensorView* consumer_tv,
+      const std::vector<kir::ForLoop*>& loops);
+
   //! Take a consumer tensorview and loop nest and generates predicates
   //! associated with the concrete roots of the loop nest. Returns a list of
   //! predicates, and a list of concrete roots they're associated with. It is
 
@@ -37,7 +37,12 @@ bool areEqualScalars(Val* v1, Val* v2);
 //!   4) split/merge
 class TORCH_CUDA_CU_API UnaryOp : public Expr {
  public:
-  UnaryOp(IrBuilderPasskey, UnaryOpType type, Val* out, Val* in);
+  UnaryOp(
+      IrBuilderPasskey,
+      UnaryOpType type,
+      Val* out,
+      Val* in,
+      int rng_offset = -1);
 
   UnaryOp(const UnaryOp* src, IrCloner* ir_cloner);
 
@@ -52,12 +57,23 @@ class TORCH_CUDA_CU_API UnaryOp : public Expr {
     return unary_op_type_;
   }
 
+  int getRNGOffset() const { // -1 is the default when no RNG offset was supplied
+    return rng_offset_;
+  }
+
+  void setRNGOffset(int val) {
+    rng_offset_ = val;
+  }
+
   bool sameAs(const Statement* other) const override;
 
  private:
   const UnaryOpType unary_op_type_;
   Val* const out_ = nullptr;
   Val* const in_ = nullptr;
+  // TODO: pull RNG op out of Unary ops
+  // https://github.com/csarofeen/pytorch/pull/1892
+  int rng_offset_ = -1;
 };
 
 //! A specialization for Binary operations. Binary operations take in two inputs
 
@@ -182,11 +182,17 @@ bool ComplexDouble::sameAs(const Statement* other) const {
   return false;
 }
 
-UnaryOp::UnaryOp(IrBuilderPasskey passkey, UnaryOpType type, Val* out, Val* in)
+UnaryOp::UnaryOp(
+    IrBuilderPasskey passkey,
+    UnaryOpType type,
+    Val* out,
+    Val* in,
+    int rng_offset)
     : Expr(passkey, ExprType::UnaryOp),
       unary_op_type_{type},
       out_{out},
-      in_{in} {
+      in_{in},
+      rng_offset_(rng_offset) {
   addOutput(out);
   addInput(in);
 }
@@ -195,7 +201,8 @@ UnaryOp::UnaryOp(const UnaryOp* src, IrCloner* ir_cloner)
     : Expr(src, ir_cloner),
       unary_op_type_(src->unary_op_type_),
       out_(ir_cloner->clone(src->out_)),
-      in_(ir_cloner->clone(src->in_)) {}
+      in_(ir_cloner->clone(src->in_)),
+      rng_offset_(src->rng_offset_) {}
 
 bool UnaryOp::sameAs(const Statement* other) const {
   if (this == other) {
 
@@ -186,7 +186,11 @@ struct SubstituteInExpr : public OptInDispatch {
     auto out =
         reference_->sameAs(unary_expr->out()) ? substitute_ : unary_expr->out();
     expr_ = IrBuilder::create<UnaryOp>(
-        unary_expr->container(), unary_expr->getUnaryOpType(), out, in);
+        unary_expr->container(),
+        unary_expr->getUnaryOpType(),
+        out,
+        in,
+        unary_expr->getRNGOffset());
   }
 
   void handle(BinaryOp* binary_expr) final {
@@ -887,7 +891,8 @@ struct ReplaceValInIndexVal : public OptInDispatch {
     auto inp = last_visited_val_;
     TORCH_INTERNAL_ASSERT(uop->out()->isA<Int>());
     auto out = IrBuilder::create<Int>(c10::nullopt);
-    IrBuilder::create<UnaryOp>(uop->getUnaryOpType(), out, inp);
+    IrBuilder::create<UnaryOp>(
+        uop->getUnaryOpType(), out, inp, uop->getRNGOffset());
     last_visited_val_ = out;
   }