Skip to content

Commit 6df7b77

Browse files
authored
Mma operator and volta mma integration (#1439)
* initial volta support * mma parallel type && cleanup * cleanup * alignment * comment * change request * fix same parallel type * move validation pass * comment and cleanup * lint * comment and cleanup * comment and format
1 parent 5ba9343 commit 6df7b77

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+2820
-28
lines changed

benchmarks/cpp/nvfuser/layer_norm_backward.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,8 +64,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
6464
if (dtype != DataType::Float) {
6565
layer_norm_results.grad_input =
6666
castOp(dtype, layer_norm_results.grad_input);
67-
layer_norm_results.grad_bias =
68-
castOp(dtype, layer_norm_results.grad_bias);
67+
layer_norm_results.grad_bias = castOp(dtype, layer_norm_results.grad_bias);
6968
layer_norm_results.grad_weight =
7069
castOp(dtype, layer_norm_results.grad_weight);
7170
}

benchmarks/cpp/nvfuser/rms_norm.cpp

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,9 @@ using namespace torch::jit::fuser::cuda;
1818
//------------------------------------------------------------------------------
1919

2020
static void setupRMSNorm(Fusion* fusion, DataType dtype) {
21-
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half || dtype == DataType::BFloat16);
21+
TORCH_INTERNAL_ASSERT(
22+
dtype == DataType::Float || dtype == DataType::Half ||
23+
dtype == DataType::BFloat16);
2224

2325
FusionGuard fg(fusion);
2426

@@ -54,10 +56,11 @@ static void NvFuserScheduler_RMSNorm(
5456
benchmark::State& benchmark_state,
5557
FusionExecutorCache* fusion_executor_cache,
5658
DataType dtype) {
57-
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half || dtype == DataType::BFloat16);
59+
TORCH_INTERNAL_ASSERT(
60+
dtype == DataType::Float || dtype == DataType::Half ||
61+
dtype == DataType::BFloat16);
5862

59-
std::vector<int64_t> input_shape{
60-
8, benchmark_state.range(0), 1024};
63+
std::vector<int64_t> input_shape{8, benchmark_state.range(0), 1024};
6164
const float kEps = 1e-6;
6265

6366
// inputs
@@ -73,8 +76,7 @@ static void NvFuserScheduler_RMSNorm(
7376

7477
benchmark_state.SetBytesProcessed(
7578
int64_t(benchmark_state.iterations()) *
76-
(2 * input.numel() + weight.numel()) *
77-
int64_t(dataTypeSize(dtype)));
79+
(2 * input.numel() + weight.numel()) * int64_t(dataTypeSize(dtype)));
7880
}
7981

8082
//------------------------------------------------------------------------------

benchmarks/cpp/nvfuser/rms_norm_backward.cpp

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,9 @@ using namespace torch::jit::fuser::cuda;
2020
static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) {
2121
FusionGuard fg(fusion);
2222

23-
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half || dtype == DataType::BFloat16);
23+
TORCH_INTERNAL_ASSERT(
24+
dtype == DataType::Float || dtype == DataType::Half ||
25+
dtype == DataType::BFloat16);
2426

2527
const int kReductionAxis = 2;
2628
Double* eps_ptr = IrBuilder::create<Double>(1e-6);
@@ -47,14 +49,12 @@ static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) {
4749
rstd = castOp(DataType::Float, rstd);
4850
}
4951

50-
auto rms_norm_results = rms_norm_backward(
51-
grad_out, input, {1}, rstd, weight, {true, true, true});
52+
auto rms_norm_results =
53+
rms_norm_backward(grad_out, input, {1}, rstd, weight, {true, true, true});
5254

53-
if (dtype != DataType::Float ) {
54-
rms_norm_results.grad_input =
55-
castOp(dtype, rms_norm_results.grad_input);
56-
rms_norm_results.grad_weight =
57-
castOp(dtype, rms_norm_results.grad_weight);
55+
if (dtype != DataType::Float) {
56+
rms_norm_results.grad_input = castOp(dtype, rms_norm_results.grad_input);
57+
rms_norm_results.grad_weight = castOp(dtype, rms_norm_results.grad_weight);
5858
}
5959

6060
fusion->addOutput(rms_norm_results.grad_input);
@@ -65,10 +65,11 @@ static void NvFuserScheduler_RMSNorm_BWD(
6565
benchmark::State& benchmark_state,
6666
FusionExecutorCache* fusion_executor_cache,
6767
DataType dtype) {
68-
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half || dtype == DataType::BFloat16);
68+
TORCH_INTERNAL_ASSERT(
69+
dtype == DataType::Float || dtype == DataType::Half ||
70+
dtype == DataType::BFloat16);
6971

70-
std::vector<int64_t> input_shape{
71-
8, benchmark_state.range(0), 1024};
72+
std::vector<int64_t> input_shape{8, benchmark_state.range(0), 1024};
7273

7374
// inputs
7475
at::manual_seed(0);
@@ -79,15 +80,13 @@ static void NvFuserScheduler_RMSNorm_BWD(
7980
at::Tensor weight = at::randn({input_shape[2]}, options);
8081
at::Tensor rstd = at::randn({input_shape[0], input_shape[1], 1}, options);
8182

82-
std::vector<c10::IValue> aten_inputs(
83-
{grad_out, input, weight, rstd});
83+
std::vector<c10::IValue> aten_inputs({grad_out, input, weight, rstd});
8484

8585
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
8686

8787
benchmark_state.SetBytesProcessed(
8888
int64_t(benchmark_state.iterations()) *
89-
(3 * input.numel() + weight.numel() +
90-
rstd.numel()) *
89+
(3 * input.numel() + weight.numel() + rstd.numel()) *
9190
int64_t(dataTypeSize(dtype)));
9291
}
9392

caffe2/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -943,6 +943,7 @@ if(USE_CUDA OR USE_ROCM)
943943
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/type_traits.cu
944944
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/welford.cu
945945
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/warp.cu
946+
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/tensorcore.cu
946947
${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
947948
${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/detail/UnpackRaw.cuh
948949
)

test/cpp/jit/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,7 @@ if(USE_CUDA)
9797
list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu.cpp)
9898
list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu_fused_reduction.cpp)
9999
list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu_shift.cpp)
100+
list(APPEND JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_gpu_tensorcore.cpp)
100101
endif()
101102

102103
add_executable(test_jit

test/cpp/jit/test_gpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21082,7 +21082,7 @@ TEST_F(NVFuserTest, FusionBroadcastConcretization4_CUDA) {
2108221082
}
2108321083
#endif
2108421084

21085-
TEST_F(NVFuserTest, FusionIssue1430) {
21085+
TEST_F(NVFuserTest, FusionIssue1430_CUDA) {
2108621086
// Derived from an expression sorting issue when using loop map, now expr
2108721087
// sorting uses parallel map.
2108821088
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();

0 commit comments

Comments (0)