
Commit 51589d3

Some cleanups on tests and heuristics params (#1866)
Refactor heuristics params to make them more extensible
1 parent a6b3e70 commit 51589d3

25 files changed: +458 −581 lines

benchmarks/cpp/nvfuser/bert.cpp

Lines changed: 18 additions & 18 deletions
@@ -133,8 +133,8 @@ static void MagicScheduler_DivMaxSoftDropFwd(
   std::vector<at::Tensor> cg_outputs;

   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);

   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -143,7 +143,7 @@ static void MagicScheduler_DivMaxSoftDropFwd(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion({t0, t1}, norm_params.value().lparams);
+    cg_outputs = fe.runFusion({t0, t1}, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -193,8 +193,8 @@ static void MagicScheduler_DivMaxSoftDropBwd(
   std::vector<at::Tensor> cg_outputs;

   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);

   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -203,7 +203,7 @@ static void MagicScheduler_DivMaxSoftDropBwd(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion({t0, t1, t2, t3}, norm_params.value().lparams);
+    cg_outputs = fe.runFusion({t0, t1, t2, t3}, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -308,8 +308,8 @@ static void MagicScheduler_BiasDropoutAddLayernormFwd(
   std::vector<at::Tensor> cg_outputs;

   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);

   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -319,7 +319,7 @@ static void MagicScheduler_BiasDropoutAddLayernormFwd(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -423,8 +423,8 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd1(
   std::vector<at::Tensor> cg_outputs;

   auto norm_params = getReductionHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  scheduleReduction(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  scheduleReduction(&fusion, *norm_params);

   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -434,7 +434,7 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd1(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     clearL2Cache();
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -534,8 +534,8 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd2(
   std::vector<at::Tensor> cg_outputs;

   auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  schedulePersistentKernel(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  schedulePersistentKernel(&fusion, *norm_params);

   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -545,7 +545,7 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd2(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
@@ -625,8 +625,8 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd3(
   std::vector<at::Tensor> cg_outputs;

   auto norm_params = getReductionHeuristics(&fusion, at_inputs);
-  TORCH_CHECK(norm_params.has_value(), "Norm scheduler can't be used!");
-  scheduleReduction(&fusion, norm_params.value());
+  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
+  scheduleReduction(&fusion, *norm_params);

   FusionExecutor fe;
   fe.compileFusion(&fusion);
@@ -636,7 +636,7 @@ static void MagicScheduler_BiasDropoutAddLayernormBwd3(
   cudaDeviceSynchronize();
   for (auto _ : benchmark_state) {
     CudaKernelTimer timer;
-    cg_outputs = fe.runFusion(at_inputs, norm_params.value().lparams);
+    cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
     benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
   }
   // Sync everything up before we're finished, don't want to run ahead on the
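
Note: every hunk in this file is the same mechanical change. getPersistentHeuristics and getReductionHeuristics now return a nullable pointer (a shared-pointer-style handle, judging by the nullptr check and dereference) instead of a c10::optional, so call sites test against nullptr, dereference to schedule, and reach launch parameters through ->. A condensed sketch of the updated pattern, with the benchmark setup elided:

  // Sketch only; assumes the heuristics call returns a null handle when the
  // persistent scheduler cannot take this fusion.
  auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
  TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
  schedulePersistentKernel(&fusion, *norm_params);

  FusionExecutor fe;
  fe.compileFusion(&fusion);
  // Launch parameters now live behind the pointer:
  auto cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);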

benchmarks/cpp/nvfuser/broadcast.cpp

Lines changed: 1 addition & 2 deletions
@@ -69,8 +69,7 @@ static void NvFuserScheduler_Broadcast(

   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
-  auto params = toString(compile_log.pointwise_params.value());
+  auto params = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());

   benchmark_state.SetLabel(params + lparams);

benchmarks/cpp/nvfuser/reduction.cpp

Lines changed: 1 addition & 2 deletions
@@ -65,8 +65,7 @@ static void NvFuserScheduler_Reduction(

   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.reduction_params.has_value());
-  auto rparams = toString(compile_log.reduction_params.value());
+  auto rparams = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());

   benchmark_state.SetLabel(rparams + lparams);

benchmarks/cpp/nvfuser/scale_bias_relu.cpp

Lines changed: 2 additions & 4 deletions
@@ -135,8 +135,7 @@ static void NvFuserScheduler_SBR(

   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
-  auto params = toString(compile_log.pointwise_params.value());
+  auto params = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());

   benchmark_state.SetLabel(params + lparams);
@@ -238,8 +237,7 @@ static void NvFuserScheduler_SBR_Norm(

   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;
-  TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
-  auto params = toString(compile_log.pointwise_params.value());
+  auto params = toString(compile_log.params);
   auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());

   benchmark_state.SetLabel(params + lparams);

benchmarks/cpp/nvfuser/utils.cpp

Lines changed: 18 additions & 12 deletions
@@ -89,6 +89,20 @@ std::string toString(PointwiseParams params) {
   return ss.str();
 }

+std::string toString(const std::shared_ptr<HeuristicParams>& params) {
+  auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
+  if (rparams) {
+    return toString(*rparams);
+  }
+  auto pparams = std::dynamic_pointer_cast<PointwiseParams>(params);
+  if (pparams) {
+    return toString(*pparams);
+  }
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "Unknown heuristic parameter type. Did you add a new heuristic parameter type but forget to update this function?");
+}
+
 std::string toString(LaunchParams lparams) {
   std::stringstream ss;
   lparams.toString();
@@ -123,9 +137,7 @@ TensorView* makeContigTensor(size_t ndims, DataType dtype) {
       .build();
 }

-TensorView* makeConcreteTensor(
-    std::vector<int64_t> shape,
-    DataType dtype) {
+TensorView* makeConcreteTensor(std::vector<int64_t> shape, DataType dtype) {
   return TensorViewBuilder().shape(shape).dtype(dtype).build();
 }

@@ -157,15 +169,9 @@ void runBenchmarkIterations(
   auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
   auto executor_instance = compile_log.fusion_executor;

-  if (compile_log.reduction_params.has_value()) {
-    auto rparams = toString(compile_log.reduction_params.value());
-    auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
-    benchmark_state.SetLabel(rparams + lparams);
-  } else if (compile_log.pointwise_params.has_value()) {
-    auto pparams = toString(compile_log.pointwise_params.value());
-    auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
-    benchmark_state.SetLabel(pparams + lparams);
-  }
+  auto params = toString(compile_log.params);
+  auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
+  benchmark_state.SetLabel(params + lparams);

   executor_instance->setMeasureKernelTimeFlag(true);
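
Note on the new toString overload above: it dispatches on the dynamic type with std::dynamic_pointer_cast instead of calling a virtual printer, because these benchmark-side toString helpers take the concrete parameter types. The cost is that every future HeuristicParams subclass needs its own branch, which is exactly the omission the closing TORCH_INTERNAL_ASSERT is there to catch. A hypothetical extension would look like the following (MatmulParams is an invented placeholder, not a type in this commit):

  // Hypothetical only: one more branch for a new heuristic type.
  auto mparams = std::dynamic_pointer_cast<MatmulParams>(params);
  if (mparams) {
    return toString(*mparams);
  }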

benchmarks/cpp/nvfuser/utils.h

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ TensorView* makeContigConcreteTensor(

 std::string toString(ReductionParams rparams);
 std::string toString(PointwiseParams params);
+std::string toString(const std::shared_ptr<HeuristicParams>& params);
 std::string toString(LaunchParams lparams);

 // Run benchmark iterations with provided inputs. If not segmented, report

torch/csrc/jit/codegen/cuda/kernel_cache.cpp

Lines changed: 6 additions & 31 deletions
@@ -341,32 +341,16 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
     options.index_mode = scheduler_entry->indexMode();
     FusionGuard fg(fusion_to_run.get());
     scheduler_entry->schedule(fusion_to_run.get());
-    // Load launch params for reduction and normalization kernels
-    if (scheduler_entry->hasReductionParam()) {
-      launch_params = scheduler_entry->reductionParams().lparams;
-    } else {
-      launch_params = scheduler_entry->pointwiseParams().lparams;
-    }
+    launch_params = scheduler_entry->params()->lparams;
     executors_[group_id].compileFusion(
         fusion_to_run.get(), inputs, launch_params, options);
   } else {
-    // Load launch params for reduction and normalization kernels
-    if (scheduler_entry->hasReductionParam()) {
-      launch_params = scheduler_entry->reductionParams().lparams;
-    } else {
-      launch_params = scheduler_entry->pointwiseParams().lparams;
-    }
+    launch_params = scheduler_entry->params()->lparams;
   }

   if (profiling_) {
     most_recent_executor_log_.fusion_executor = &executors_[group_id];
-    if (scheduler_entry->hasReductionParam()) {
-      most_recent_executor_log_.reduction_params =
-          scheduler_entry->reductionParams();
-    } else {
-      most_recent_executor_log_.pointwise_params =
-          scheduler_entry->pointwiseParams();
-    }
+    most_recent_executor_log_.params = scheduler_entry->params()->clone();
   }

   auto& executor = executors_[group_id];
@@ -395,11 +379,7 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
       }
     }
     std::cout << "Compiler log: " << executor.compilerLog() << "\n";
-    if (scheduler_entry->hasReductionParam()) {
-      std::cout << scheduler_entry->reductionParams().toString() << "\n";
-    } else {
-      std::cout << scheduler_entry->pointwiseParams().toString() << "\n";
-    }
+    std::cout << scheduler_entry->params()->toString() << "\n";
     std::cout << "With arguments: " << executor.lastLaunchParams().toString();
     std::cout << executor.kernelName() << " " << executor.bytesProcessed()
               << " bytes/ " << std::setprecision(3) << executor.kernelTimeMs()
@@ -604,13 +584,8 @@ void FusionKernelRuntime::updateHeuristicsLaunchParams(
       update_heuristics->heuristicsList().size() == scheduler_list_length);
   for (const auto i : c10::irange(scheduler_list_length)) {
     auto& schedulerPtr = heuristics_->heuristicsList()[i];
-    if (schedulerPtr->hasReductionParam()) {
-      schedulerPtr->updateLaunchConstraint(
-          update_heuristics->heuristicsList()[i]->reductionParams().lparams);
-    } else {
-      schedulerPtr->updateLaunchConstraint(
-          update_heuristics->heuristicsList()[i]->pointwiseParams().lparams);
-    }
+    schedulerPtr->updateLaunchConstraint(
+        update_heuristics->heuristicsList()[i]->params()->lparams);
   }
 }
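
Note on the profiling hunk above: the log stores scheduler_entry->params()->clone() rather than the live pointer, so ExecutorLog owns an independent copy instead of aliasing params that the runtime may later adjust (e.g. via updateLaunchConstraint in the last hunk). Condensed, the runtime logic is now:

  // Condensed from the hunks above: one polymorphic access path replaces
  // the per-type hasReductionParam() branching.
  launch_params = scheduler_entry->params()->lparams;
  if (profiling_) {
    most_recent_executor_log_.fusion_executor = &executors_[group_id];
    // clone() gives the log its own copy of the heuristics.
    most_recent_executor_log_.params = scheduler_entry->params()->clone();
  }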

torch/csrc/jit/codegen/cuda/kernel_cache.h

Lines changed: 1 addition & 2 deletions
@@ -25,8 +25,7 @@ class SchedulerRuntimeInfo;

 // Utilities for benchmarking and profiling
 struct ExecutorLog {
-  c10::optional<ReductionParams> reduction_params = c10::nullopt;
-  c10::optional<PointwiseParams> pointwise_params = c10::nullopt;
+  std::shared_ptr<HeuristicParams> params = nullptr;
   FusionExecutor* fusion_executor = nullptr;
 };

New file (path not shown in this view) · Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+
+#include <string>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+class HeuristicParams {
+ public:
+  std::string tag = "";
+
+  LaunchParams lparams;
+
+  virtual std::string toString() const {
+    return "Undefined Heuristic Params";
+  }
+
+  virtual size_t hash() const = 0;
+
+  virtual ~HeuristicParams() = default;
+
+  virtual bool sameAs(const std::shared_ptr<HeuristicParams>& other) const = 0;
+
+  virtual std::shared_ptr<HeuristicParams> clone() const = 0;
+
+  HeuristicParams() = default;
+  HeuristicParams(const std::string& tag) : tag(tag) {}
+};
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
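
This class is the extension point of the refactor: a scheduler's parameter struct can flow through ExecutorLog, the kernel runtime, and the benchmarks as long as it implements these virtuals. Purely as illustration, a minimal subclass might look like the sketch below; DummyParams and its field are invented for this example (the commit's real parameter types are the existing ReductionParams and PointwiseParams):

  // Illustrative sketch only; DummyParams is not a type in this commit.
  #include <functional> // std::hash
  #include <memory>     // std::shared_ptr, std::dynamic_pointer_cast

  class DummyParams : public HeuristicParams {
   public:
    int unroll_factor = 1;

    using HeuristicParams::HeuristicParams; // inherit the tag constructor

    std::string toString() const override {
      return "DummyParams: unroll_factor = " + std::to_string(unroll_factor);
    }

    size_t hash() const override {
      return std::hash<int>{}(unroll_factor);
    }

    bool sameAs(const std::shared_ptr<HeuristicParams>& other) const override {
      auto o = std::dynamic_pointer_cast<DummyParams>(other);
      return o != nullptr && o->unroll_factor == unroll_factor;
    }

    std::shared_ptr<HeuristicParams> clone() const override {
      return std::make_shared<DummyParams>(*this);
    }
  };

Callers then only ever touch the base interface: clone() for ownership, toString() for labels, and lparams for launch configuration.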
