28 commits
a598d85  Add test_gpu_match_frontend.cpp (jacobhinkle, Dec 21, 2022)
8290482  Properly parallelize SuperBasic and Basic tests (jacobhinkle, Dec 28, 2022)
a10fe6a  Add FP16 Basic and SuperBasic frontend tests (jacobhinkle, Dec 28, 2022)
d7cc442  Add parallelize tv4 in FusionFrontendBasicFP16_CUDA (jacobhinkle, Dec 29, 2022)
aa6b378  Add tv1->computeAt() to BasicFP16 test (jacobhinkle, Dec 29, 2022)
2e614f9  Add test: FusionFrontendCastDoubleToHalf_CUDA (jacobhinkle, Dec 29, 2022)
fb4fa85  Remove griddim split in CastDoubleToHalf test (jacobhinkle, Dec 29, 2022)
c0f211c  Add test FusionFrontendPromoteToDouble_CUDA (jacobhinkle, Dec 29, 2022)
9175568  Replace computeAt(..MostInlined) with inlineMost() (jacobhinkle, Jan 3, 2023)
5c43ccc  Add stream argument to printMath, add ir_math_check (jacobhinkle, Jan 3, 2023)
6d3d34f  Make SuperBasic manual schedule match automatic (jacobhinkle, Jan 3, 2023)
b1c51c9  Enable dumping kernel to string, fix FP16 SuperBasic test (jacobhinkle, Jan 3, 2023)
5ffd5a5  Add python defs to docstrings of frontend tests (jacobhinkle, Jan 4, 2023)
cd5fc9f  Add stream arg to Fusion::print{,Transforms}() (jacobhinkle, Jan 4, 2023)
e041b9f  Add compare_ir() which compares math, fusions & kernels (jacobhinkle, Jan 4, 2023)
acbdbc9  Add pointwise Add() example to frontend and C++ tests (jacobhinkle, Jan 4, 2023)
30ed719  Add FusionFrontendImplicitBroadcastInput_CUDA test (jacobhinkle, Jan 4, 2023)
a75c509  Update {super,}basic tests to closer match auto sched (jacobhinkle, Jan 4, 2023)
cff9b5e  Make ImplicitBroadcastInput manual IR match auto (jacobhinkle, Jan 4, 2023)
95c0e08  Add IR comparisons to Cast and Promote tests (jacobhinkle, Jan 4, 2023)
35b872d  Modify CMakeLists.txt and headers following rebase. (jacobhinkle, Jan 5, 2023)
bb1d5ab  Fix Frontend{Add,SuperBasic,SuperBasicFP16} tests (jacobhinkle, Jan 17, 2023)
dc6e5e0  Make FrontendBasic test match auto schedule (jacobhinkle, Jan 17, 2023)
dd44075  Make FrontendBasicFP16 test match auto schedule (jacobhinkle, Jan 17, 2023)
4519bd1  Make FrontendCastDoubleToHalf match auto schedule (jacobhinkle, Jan 18, 2023)
4697c30  Make FrontendPromoteToDouble test match auto sched (jacobhinkle, Jan 18, 2023)
98e5259  Make FrontendImplicitBroadcastInput match auto sched (jacobhinkle, Jan 18, 2023)
aaf5713  Add two broadcasting frontend tests (jacobhinkle, Jan 19, 2023)
102 changes: 98 additions & 4 deletions test/test_nvfuser_frontend.py
@@ -61,13 +61,107 @@ def exec_nvfuser(self, fusion_func, inputs, new_fusion_expected=True) :
         self.assertEqual(fc.num_fusions() - before_fusions, int(new_fusion_expected))
         return out, fs

-    def test_basic(self) :
+    def test_add(self):
         inputs = [
-            torch.ones(2, 4, 8, device='cuda'),
-            torch.ones(2, 4, 8, device='cuda'),
+            torch.ones(2, 4, 8, device="cuda"),
+            torch.ones(2, 4, 8, device="cuda"),
         ]

-        def fusion_func(fd: FusionDefinition) :
+        def fusion_func(fd: FusionDefinition):
             t0 = fd.define_tensor(3)
             t1 = fd.define_tensor(3)
+
+            t2 = fd.ops.add(t0, t1)
+
+            fd.add_output(t2)
+
+        # Expected Output is a tensor of 2's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = inputs[0] + inputs[1]
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_super_basic(self):
+        inputs = [
+            torch.ones(4, 8, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(2)
+            c0 = fd.define_constant(3.0)
+
+            t1 = fd.ops.mul(t0, c0)
+            t2 = fd.ops.sum(t1, [-1], False, DataType.Float)
+
+            fd.add_output(t2)
+
+        # Expected Output is a tensor of 24's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = torch.sum(inputs[0] * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_super_basic_fp16(self):
+        inputs = [
+            torch.ones(4, 8, device="cuda", dtype=torch.float16),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(2, DataType.Half)
+            c0 = fd.define_constant(3.0)
+
+            t1 = fd.ops.mul(t0, c0)
+            t2 = fd.ops.sum(t1, [-1], False, DataType.Float)
+
+            t3 = fd.ops.cast(t2, DataType.Half)
+            fd.add_output(t3)
+
+        # Expected Output is a tensor of 24's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = torch.sum(inputs[0] * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_basic(self):
+        inputs = [
+            torch.ones(2, 4, 8, device="cuda"),
+            torch.ones(2, 4, 8, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(3)
+            t1 = fd.define_tensor(3)
             c0 = fd.define_constant(3.0)
1 change: 1 addition & 0 deletions third_party/nvfuser/CMakeLists.txt
@@ -332,6 +332,7 @@ if(BUILD_TEST)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_indexing_ops.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_indexing.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_match_frontend.cpp)

   set(JIT_TEST_CU_SRCS)
   list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
38 changes: 20 additions & 18 deletions third_party/nvfuser/csrc/fusion.cpp
@@ -13,6 +13,8 @@
 #include <lower2device.h>
 #include <lower_bank_conflict.h>

+#include <ostream>
+
 namespace torch {
 namespace jit {
 namespace fuser {
@@ -344,26 +346,26 @@ void Fusion::validateInputs() {
   }
 }

-void Fusion::print() {
+void Fusion::print(std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::print");

   FusionGuard fg(this);
-  std::cout << "\n%kernel {\n";
-  IrMathPrinter op_exprs(std::cout);
+  stream << "\n%kernel {\n";
+  IrMathPrinter op_exprs(stream);
   op_exprs.handle(this);
-  std::cout << "\nTransformPrinter : \n";
-  IrTransformPrinter t_exprs(std::cout);
+  stream << "\nTransformPrinter : \n";
+  IrTransformPrinter t_exprs(stream);
   t_exprs.handle(this);
-  std::cout << "}\n\n";
+  stream << "}\n\n";
 }

-void Fusion::printKernel(DataType index_type) {
+void Fusion::printKernel(DataType index_type, std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::printKernel");
   TORCH_INTERNAL_ASSERT(
       !this->isA<kir::Kernel>(),
       "Cannot \"print kernel\" of a kernel container. ",
       "This would require lowering during lowering.");
-  std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel());
+  stream << codegen::generateCudaKernel(GpuLower(this, index_type).kernel());
 }

 std::unordered_map<std::string, std::pair<int, int>> Fusion::bankConflictInfo(
@@ -380,19 +382,19 @@ std::unordered_map<std::string, std::pair<int, int>> Fusion::bankConflictInfo(
   return result;
 }

-void Fusion::printMath(bool from_outputs_only) {
+void Fusion::printMath(bool from_outputs_only, std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::printMath");

   FusionGuard fg(this);
   auto exprs_for_print = exprs();
-  std::cout << "Inputs:" << std::endl;
+  stream << "Inputs:" << std::endl;
   for (auto inp : inputs()) {
-    std::cout << " " << inp << ", " << inp->getDataType().value() << std::endl;
+    stream << " " << inp << ", " << inp->getDataType().value() << std::endl;
   }

-  std::cout << "Outputs:" << std::endl;
+  stream << "Outputs:" << std::endl;
   for (auto out : outputs()) {
-    std::cout << " " << out << ", " << out->getDataType().value() << std::endl;
+    stream << " " << out << ", " << out->getDataType().value() << std::endl;
   }

   // If we want everything in the fusion, grab all values without uses to
@@ -407,11 +409,11 @@ void Fusion::printMath(bool from_outputs_only) {
     exprs_for_print = StmtSort::getExprs(this, leaf_vals);
   }

-  std::cout << "\n%kernel_math {\n";
+  stream << "\n%kernel_math {\n";
   for (auto expr : exprs_for_print) {
-    std::cout << expr;
+    stream << expr;
   }
-  std::cout << "}\n\n";
+  stream << "}\n\n";
 }

 std::vector<Val*> Fusion::inputsAndCreated() {
@@ -427,11 +429,11 @@ std::vector<Val*> Fusion::inputsAndCreated() {
   return result;
 }

-void Fusion::printTransforms() {
+void Fusion::printTransforms(std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::printTransforms");

   FusionGuard fg(this);
-  IrTransformPrinter t_exprs(std::cout);
+  IrTransformPrinter t_exprs(stream);
   t_exprs.handle(this);
 }

13 changes: 9 additions & 4 deletions third_party/nvfuser/csrc/fusion.h
@@ -124,17 +124,22 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
   void validateInputs();

   //! Print this fusion to the console
-  void print();
+  void print(std::ostream& stream = std::cout);

   //! Print Arith exprs
   //! \param from_outputs_only Only print exprs reachable from outputs
-  void printMath(bool from_outputs_only = true);
+  //! \param stream Where to print output (defaults to std::cout)
+  void printMath(
+      bool from_outputs_only = true,
+      std::ostream& stream = std::cout);

   //! Print transformations used in fusion (can be very verbose)
-  void printTransforms();
+  void printTransforms(std::ostream& stream = std::cout);

   //! Lower the fusion and print a kernel
-  void printKernel(DataType index_type = DataType::Int);
+  void printKernel(
+      DataType index_type = DataType::Int,
+      std::ostream& stream = std::cout);

   //! Returns if this fusion is noop, for example, trivially forwarding inputs,
   //! or all outputs are size-0 tensors, etc.
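
The std::ostream& parameters threaded through print(), printMath(), printTransforms(), and printKernel() let callers capture the fusion IR and the generated CUDA kernel as strings instead of writing to std::cout; since the stream arguments default to std::cout, existing call sites are unaffected. This is the mechanism behind the compare_ir() helper added in e041b9f and the kernel-to-string dumping from b1c51c9. A minimal sketch of the capture pattern, assuming a fully defined Fusion object named `fusion` inside a gtest test body; `expected_math` is a hypothetical reference string, not code from this PR:

#include <sstream>

// Capture the IR math and the lowered CUDA kernel into strings
// instead of printing them to stdout.
std::stringstream math_ss;
std::stringstream kernel_ss;
fusion.printMath(/*from_outputs_only=*/true, math_ss);
fusion.printKernel(DataType::Int, kernel_ss);

// The captured text can then be compared, e.g. a manually scheduled
// fusion against the automatically scheduled one, as the new
// frontend-matching tests do.
ASSERT_EQ(math_ss.str(), expected_math);  // expected_math is illustrative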