Commit 14a53e6: Nullary RNGOp (#1892)
1 parent 3c3c89e

36 files changed: +374, -141 lines
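
In short: this commit retires the UnaryOpType::RandLike path and introduces a dedicated nullary RNGOp IR node. rand(shape, dtype) becomes a true tensor factory with no inputs, randlike() is reimplemented on top of it, the Philox code generation moves out of the UnaryOp handler into its own RNGOp handler, and the new node is registered with the dispatch, cloning, and graph-visualization machinery. The first 13 of the 36 changed files are shown below.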

torch/csrc/jit/codegen/cuda/arith.cpp
Lines changed: 22 additions & 27 deletions

@@ -358,6 +358,19 @@ Val* getMaximumValue(DataType v) {
 
 } // namespace
 
+// TENSOR FACTORIES
+TensorView* rand(const std::vector<Val*>& shape, DataType dtype) {
+  auto n = shape.size();
+  auto out = TensorViewBuilder()
+                 .ndims(n)
+                 .dtype(dtype)
+                 .contiguity(std::vector<bool>(n, true))
+                 .shape(shape)
+                 .build();
+  IrBuilder::create<RNGOp>(RNGOpType::Uniform, out);
+  return out;
+}
+
 Val* castOp(DataType dtype, Val* v1) {
   if (v1->getDataType().value() == dtype) {
     return set(v1);
@@ -404,17 +417,6 @@ Val* unaryOp(UnaryOpType type, Val* v1) {
   TORCH_INTERNAL_ASSERT(
       type != UnaryOpType::Address,
       "The reference operator & is not accessible in the Fusion IR");
-
-  // TODO: We should add the following, but we need to go through schedulers
-  // and make sure all calls to "fusion->inputs" includes the output of RandLike
-  //
-  // If rand like, there isn't a real dependency on the input value, so map it
-  // to a dummy scalar. if
-  //
-  // (type == UnaryOpType::RandLike) {
-  //   v1 = new NamedScalar("__rnd", v1->getDataType().value());
-  // }
-
   Val* out = newValLike(v1, v1->getDataType().value());
   IrBuilder::create<UnaryOp>(type, out, v1);
   return out;
@@ -469,28 +471,21 @@ NVFUSER_DEFINE_UNARY_OP(trunc, Trunc)
 NVFUSER_DEFINE_UNARY_OP(print, Print)
 #undef NVFUSER_DEFINE_UNARY_OP
 
-Val* randlike(Val* v) {
+TensorView* randlike(TensorView* v) {
   TORCH_CHECK(
       isFloatingPointType(v->dtype()),
       "input must have floating point type, but got ",
       v->dtype());
-  auto rand_vals = unaryOp(UnaryOpType::RandLike, v);
-  return where(
-      eq(rand_vals, IrBuilder::create<Double>(1.0)),
-      IrBuilder::create<Double>(0.0),
-      rand_vals);
+  std::vector<Val*> shape;
+  shape.reserve(v->getMaybeRFactorDomain().size());
+  for (auto id : v->getMaybeRFactorDomain()) {
+    shape.emplace_back(id->getMaybeExpandedExtent());
+  }
+  return rand(shape, v->dtype());
 }
 
-TensorView* randlike(TensorView* v) {
-  TORCH_CHECK(
-      isFloatingPointType(v->dtype()),
-      "input must have floating point type, but got ",
-      v->dtype());
-  auto rand_vals = unaryOp(UnaryOpType::RandLike, v);
-  return where(
-      eq(rand_vals, IrBuilder::create<Double>(1.0)),
-      IrBuilder::create<Double>(0.0),
-      rand_vals);
+Val* randlike(Val* v) {
+  return randlike(v->as<TensorView>());
 }
 
 Val* bitwise_not(Val* v) {
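
With the factory in place, a random tensor no longer needs a donor value, and the old where(eq(rand_vals, 1.0), 0.0, rand_vals) clamp is gone entirely; producing a uniform value from raw Philox bits is now the generated code's job. A minimal usage sketch follows; it is not code from this commit, and it assumes the standard nvFuser test harness (Fusion, FusionGuard, makeSymbolicTensor) plus arbitrary sizes:

    // Hypothetical fusion using the new nullary factory and the rewritten
    // randlike(). rand() creates a contiguous uniform-random tensor that
    // depends on no fusion input.
    Fusion fusion;
    FusionGuard fg(&fusion);

    TensorView* tv0 = makeSymbolicTensor(2); // 2D float input
    fusion.addInput(tv0);

    // Nullary factory: the shape is passed explicitly as scalar Vals.
    TensorView* tv_rand = rand(
        {IrBuilder::create<Int>(16), IrBuilder::create<Int>(32)},
        DataType::Float);

    // randlike() now just collects tv0's rfactor-domain extents and forwards
    // them to rand().
    TensorView* tv_like = randlike(tv0);

    fusion.addOutput(add(tv_rand, tv_like));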

torch/csrc/jit/codegen/cuda/arith.h
Lines changed: 5 additions & 0 deletions

@@ -121,6 +121,11 @@ TORCH_CUDA_CU_API WelfordResult Welford(
     // import IrBuilder just for this one interface.
     Int* init_N = nullptr);
 
+// TENSOR FACTORIES
+TORCH_CUDA_CU_API TensorView* rand(
+    const std::vector<Val*>& shape,
+    DataType dtype);
+
 // UNARY OPERATIONS
 // abs
 TORCH_CUDA_CU_API Val* abs(Val*);

torch/csrc/jit/codegen/cuda/codegen.cpp
Lines changed: 30 additions & 29 deletions

@@ -706,34 +706,12 @@ class CudaKernelGenerator : private OptOutConstDispatch {
   }
 
   if (!print_inline_) {
-    if (op_type == UnaryOpType::RandLike) {
-      auto out_tv = uop->out()->as<kir::TensorIndex>()->view();
-      auto index = genTensorIndex(uop->in()->as<kir::TensorIndex>());
-      int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
-      indent() << "nvfuser_index_t rng_subseq" << uop->name() << " = ("
-               << index << ") / " << multiple << ";\n";
-      indent() << "nvfuser_index_t rng_component" << uop->name() << " = ("
-               << index << ") % " << multiple << ";\n";
-      indent() << "nvfuser_index_t rng_offset" << uop->name() << " = "
-               << uop->getRNGOffset() << ";\n";
-      indent() << "if (rng_subseq != rng_subseq" << uop->name()
-               << " || rng_offset != rng_offset" << uop->name() << ") {\n";
-      indent() << "  rng_result = philox(philox_args.seed_, rng_subseq"
-               << uop->name() << ", philox_offset / 4 + rng_offset"
-               << uop->name() << ");\n";
-      indent() << "  rng_subseq = rng_subseq" << uop->name() << ";\n";
-      indent() << "  rng_offset = rng_offset" << uop->name() << ";\n";
-      indent() << "}\n";
-    }
-
     indent() << gen(uop->out());
     if (!uop->out()->isScalar() && !uop->in()->isScalar()) {
       code_ << "\n";
       indent() << kTab;
     }
     code_ << " = ";
-  } else {
-    TORCH_INTERNAL_ASSERT(op_type != UnaryOpType::RandLike);
   }
 
   if (auto op = inline_op_str(op_type)) {
@@ -762,20 +740,43 @@ class CudaKernelGenerator : private OptOutConstDispatch {
       }
     }
 
-    code_ << "(";
-    if (op_type == UnaryOpType::RandLike) {
-      code_ << "rng_result, rng_component" << uop->name();
-    } else {
-      code_ << gen(uop->in());
-    }
-    code_ << ")";
+    code_ << "(" << gen(uop->in()) << ")";
   }
 
   if (!print_inline_) {
     code_ << ";\n";
   }
 }
 
+void handle(const RNGOp* rop) final {
+  // TODO: TORCH_INTERNAL_ASSERT that the scheduler correctly creates an
+  // innermost ID of size 4 (float) or size 2 (double)?
+  auto out_tv = rop->output(0)->as<kir::TensorIndex>()->view();
+  auto index = genTensorIndex(rop->getPhiloxIndex()->as<kir::TensorIndex>());
+  int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
+  indent() << "nvfuser_index_t rng_subseq" << rop->name() << " = (" << index
+           << ") / " << multiple << ";\n";
+  indent() << "nvfuser_index_t rng_component" << rop->name() << " = ("
+           << index << ") % " << multiple << ";\n";
+  indent() << "nvfuser_index_t rng_offset" << rop->name() << " = "
+           << rop->getRNGOffset() << ";\n";
+  indent() << "if (rng_subseq != rng_subseq" << rop->name()
+           << " || rng_offset != rng_offset" << rop->name() << ") {\n";
+  indent() << "  rng_result = philox(philox_args.seed_, rng_subseq"
+           << rop->name() << ", philox_offset / 4 + rng_offset" << rop->name()
+           << ");\n";
+  indent() << "  rng_subseq = rng_subseq" << rop->name() << ";\n";
+  indent() << "  rng_offset = rng_offset" << rop->name() << ";\n";
+  indent() << "}\n";
+  auto op_type = rop->getRNGOpType();
+  indent() << gen(rop->output(0)) << " = " << op_type;
+  if (needFloatSuffix(op_type) &&
+      rop->output(0)->dtype() == DataType::Float) {
+    code_ << "f";
+  }
+  code_ << "(rng_result, rng_component" << rop->name() << ");\n";
+}
+
 std::string genBinaryOp(
     BinaryOpType op_type,
     DataType data_type,
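
To see what this handler emits, take a float output: multiple == 4 because one Philox call yields four 32-bit values, so four float elements share a call (doubles consume two 32-bit words each, hence multiple == 2). The generated index is split into a subsequence (which Philox call) and a component (which value within it), and the cached rng_result is refreshed only when the subsequence or offset changes. Below is a hand-written sketch of the resulting kernel code, with illustrative names (idx for the generated index expression, 3 for the op name, T0 for the output) and assuming RNGOpType::Uniform streams as "uniform" before the float suffix is appended:

    // Hypothetical generated code for one float RNGOp (not actual output).
    nvfuser_index_t rng_subseq3 = (idx) / 4;
    nvfuser_index_t rng_component3 = (idx) % 4;
    nvfuser_index_t rng_offset3 = 0;
    if (rng_subseq != rng_subseq3 || rng_offset != rng_offset3) {
      rng_result = philox(
          philox_args.seed_, rng_subseq3, philox_offset / 4 + rng_offset3);
      rng_subseq = rng_subseq3;
      rng_offset = rng_offset3;
    }
    T0[idx] = uniformf(rng_result, rng_component3);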

torch/csrc/jit/codegen/cuda/dispatch.cpp
Lines changed: 15 additions & 0 deletions

@@ -104,6 +104,9 @@ void Expr::dispatch(T handler, Expr* expr) {
     case ExprType::TernaryOp:
       ptr(handler)->handle(expr->as<TernaryOp>());
       return;
+    case ExprType::RNGOp:
+      ptr(handler)->handle(expr->as<RNGOp>());
+      return;
     case ExprType::ReductionOp:
       ptr(handler)->handle(expr->as<ReductionOp>());
       return;
@@ -284,6 +287,9 @@ void Expr::constDispatch(T handler, const Expr* expr) {
     case ExprType::TernaryOp:
       ptr(handler)->handle(expr->as<TernaryOp>());
       return;
+    case ExprType::RNGOp:
+      ptr(handler)->handle(expr->as<RNGOp>());
+      return;
     case ExprType::ReductionOp:
       ptr(handler)->handle(expr->as<ReductionOp>());
       return;
@@ -472,6 +478,9 @@ void Expr::mutatorDispatch(T mutator, Expr* expr) {
     case ExprType::TernaryOp:
       ptr(mutator)->mutate(expr->as<TernaryOp>());
       return;
+    case ExprType::RNGOp:
+      ptr(mutator)->mutate(expr->as<RNGOp>());
+      return;
     case ExprType::ReductionOp:
       ptr(mutator)->mutate(expr->as<ReductionOp>());
       return;
@@ -725,6 +734,9 @@ void OptOutConstDispatch::handle(const BinaryOp* stmt) {
 void OptOutConstDispatch::handle(const TernaryOp* stmt) {
   unhandled(stmt);
 }
+void OptOutConstDispatch::handle(const RNGOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutConstDispatch::handle(const ReductionOp* stmt) {
   unhandled(stmt);
 }
@@ -875,6 +887,9 @@ void OptOutDispatch::handle(BinaryOp* stmt) {
 void OptOutDispatch::handle(TernaryOp* stmt) {
   unhandled(stmt);
 }
+void OptOutDispatch::handle(RNGOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutDispatch::handle(ReductionOp* stmt) {
   unhandled(stmt);
 }
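
These entries follow nvFuser's double-dispatch pattern: Expr::dispatch (and its const and mutator variants) switches on ExprType and calls the matching virtual overload, whose default implementation is unhandled(), a no-op for the OptOut dispatchers. Any visitor can therefore opt in to the new node by overriding a single method. A minimal hypothetical example, assuming only the interfaces shown in this diff:

    // Hypothetical visitor that counts RNGOp expressions. Every other Expr
    // falls through to OptOutConstDispatch::unhandled().
    class CountRNGOps : public OptOutConstDispatch {
     public:
      using OptOutConstDispatch::handle;
      int count = 0;
      void handle(const RNGOp* op) override {
        ++count;
      }
    };

Driving it over fusion->exprs() routes each ExprType::RNGOp expression through Expr::constDispatch into the override above.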

torch/csrc/jit/codegen/cuda/dispatch.h
Lines changed: 4 additions & 0 deletions

@@ -71,6 +71,7 @@ class NamedScalar;
 class UnaryOp;
 class BinaryOp;
 class TernaryOp;
+class RNGOp;
 class ReductionOp;
 class GroupedReductionOp;
 class WelfordOp;
@@ -145,6 +146,7 @@ class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase {
   virtual void handle(const UnaryOp* stmt);
   virtual void handle(const BinaryOp* stmt);
   virtual void handle(const TernaryOp* stmt);
+  virtual void handle(const RNGOp* stmt);
   virtual void handle(const ReductionOp* stmt);
   virtual void handle(const GroupedReductionOp* stmt);
   virtual void handle(const WelfordOp* stmt);
@@ -210,6 +212,7 @@ class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase {
   virtual void handle(UnaryOp* stmt);
   virtual void handle(BinaryOp* stmt);
   virtual void handle(TernaryOp* stmt);
+  virtual void handle(RNGOp* stmt);
   virtual void handle(ReductionOp* stmt);
   virtual void handle(GroupedReductionOp* stmt);
   virtual void handle(WelfordOp* stmt);
@@ -316,6 +319,7 @@ class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase {
   virtual void mutate(UnaryOp*);
   virtual void mutate(BinaryOp*);
   virtual void mutate(TernaryOp*);
+  virtual void mutate(RNGOp*);
   virtual void mutate(ReductionOp*);
   virtual void mutate(GroupedReductionOp*);
   virtual void mutate(WelfordOp*);

torch/csrc/jit/codegen/cuda/fusion.cpp
Lines changed: 18 additions & 5 deletions

@@ -373,6 +373,18 @@ void Fusion::printMath(bool from_outputs_only) {
   std::cout << "}\n\n";
 }
 
+std::vector<Val*> Fusion::inputsAndCreated() {
+  auto result = inputs_;
+  for (auto expr : exprs()) {
+    if (expr->inputs().empty()) {
+      for (auto v : expr->outputs()) {
+        result.emplace_back(v);
+      }
+    }
+  }
+  return result;
+}
+
 void Fusion::printTransforms() {
   FUSER_PERF_SCOPE("Fusion::printTransforms");
 
@@ -531,14 +543,15 @@ Expr* Fusion::definition(const Val* val) const {
 
 // Indicate to kernel to set itself up to generate random numbers
 bool Fusion::isStochastic() {
-  for (auto expr : exprs())
-    if (expr->getExprType() == ExprType::UnaryOp)
-      if (expr->as<UnaryOp>()->getUnaryOpType() == UnaryOpType::RandLike)
-        return true;
+  for (auto expr : exprs()) {
+    if (expr->getExprType() == ExprType::RNGOp) {
+      return true;
+    }
+  }
   return false;
 }
 
-std::vector<Val*> Fusion::getTerminatingOutputs() {
+std::vector<Val*> Fusion::getTerminatingOutputs() const {
   FUSER_PERF_SCOPE("getTerminatingOutputs");
 
   auto is_reachable_to_output = [](Val* val) {
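
inputsAndCreated() exists because a nullary op breaks an old invariant: previously every tensor was reachable from fusion->inputs(), but an RNGOp output is created mid-fusion with no producers (this is exactly the concern in the TODO comment deleted from arith.cpp above). Traversals that seed themselves from the inputs can switch to the new accessor to pick up such values; a sketch of the intended substitution (illustrative, not from this diff):

    // Before: traversal seeded only from external inputs.
    //   for (auto val : fusion->inputs()) { ... }
    // After: also covers outputs of input-free exprs such as RNGOp.
    for (auto val : fusion->inputsAndCreated()) {
      // visit val ...
    }

isStochastic() is updated in the same spirit: detecting RNG is now a check for ExprType::RNGOp rather than pattern-matching a UnaryOp's op type.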

torch/csrc/jit/codegen/cuda/fusion.h
Lines changed: 3 additions & 1 deletion

@@ -175,11 +175,13 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
     return inputs_;
   }
 
+  std::vector<Val*> inputsAndCreated();
+
   const auto& outputs() const {
     return outputs_;
   }
 
-  std::vector<Val*> getTerminatingOutputs();
+  std::vector<Val*> getTerminatingOutputs() const;
 
   // Aliasing output to input value, this is a WAR to allow inplace update on
   // input tensor.

torch/csrc/jit/codegen/cuda/ir_base_nodes.h
Lines changed: 1 addition & 0 deletions

@@ -48,6 +48,7 @@ class Expr;
 class Val;
 class UnaryOp;
 class BinaryOp;
+class RNGOp;
 class IterDomain;
 class IrCloner;
 class IrContainer;

torch/csrc/jit/codegen/cuda/ir_builder.cpp
Lines changed: 1 addition & 0 deletions

@@ -63,6 +63,7 @@ IR_BUILDER_INSTANTIATE(ViewOp)
 IR_BUILDER_INSTANTIATE(UnaryOp)
 IR_BUILDER_INSTANTIATE(BinaryOp)
 IR_BUILDER_INSTANTIATE(TernaryOp)
+IR_BUILDER_INSTANTIATE(RNGOp)
 IR_BUILDER_INSTANTIATE(ReductionOp)
 IR_BUILDER_INSTANTIATE(GroupedReductionOp)
 IR_BUILDER_INSTANTIATE(WelfordOp)

torch/csrc/jit/codegen/cuda/ir_cloner.cpp
Lines changed: 4 additions & 0 deletions

@@ -100,6 +100,10 @@ void IrCloner::handle(const TernaryOp* op) {
   clone_ = IrBuilder::clone(op, this);
 }
 
+void IrCloner::handle(const RNGOp* op) {
+  clone_ = IrBuilder::clone(op, this);
+}
+
 void IrCloner::handle(const BroadcastOp* op) {
   clone_ = IrBuilder::clone(op, this);
 }

torch/csrc/jit/codegen/cuda/ir_cloner.h
Lines changed: 1 addition & 0 deletions

@@ -71,6 +71,7 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch {
   void handle(const UnaryOp*) override;
   void handle(const BinaryOp*) override;
   void handle(const TernaryOp*) override;
+  void handle(const RNGOp*) override;
   void handle(const BroadcastOp*) override;
   void handle(const ReductionOp*) override;
   void handle(const GroupedReductionOp*) override;

torch/csrc/jit/codegen/cuda/ir_graphviz.cpp
Lines changed: 10 additions & 0 deletions

@@ -443,6 +443,16 @@ void IrGraphGenerator::handle(const TernaryOp* op) {
   addArc(op, op->out());
 }
 
+void IrGraphGenerator::handle(const RNGOp* op) {
+  // node
+  std::stringstream label;
+  label << op->getRNGOpType();
+  printExpr(op, label.str());
+
+  // inputs & outputs
+  addArc(op, op->output(0));
+}
+
 void IrGraphGenerator::handle(const BroadcastOp* op) {
   printExpr(op, "Broadcast");
   addArc(op->in(), op);

torch/csrc/jit/codegen/cuda/ir_graphviz.h
Lines changed: 1 addition & 0 deletions

@@ -85,6 +85,7 @@ class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch {
   void handle(const UnaryOp*) override;
   void handle(const BinaryOp*) override;
   void handle(const TernaryOp*) override;
+  void handle(const RNGOp*) override;
   void handle(const BroadcastOp*) override;
   void handle(const ReductionOp*) override;
