Add variance_mean composite function using Welford #1907

Merged (1 commit, Aug 19, 2022)
58 changes: 58 additions & 0 deletions torch/csrc/jit/codegen/cuda/ops/normalization.cpp
@@ -69,6 +69,64 @@ TensorView* variance(
return y;
}

TORCH_CUDA_CU_API VarMeanResult variance_mean(
TensorView* x,
const std::vector<int>& dims,
int64_t correction,
bool keepdim) {
TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");

TORCH_CHECK(
correction >= 0, "correction must be non-negative, but got ", correction);

// There are compilation errors for half precision
Collaborator:

Should we consider just promoting the math to float for reduced precision, then? That seems to be the universal approach used by everything else that computes variance.

Collaborator Author:

I handle type promotion on the Python side.
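
For reference, a minimal sketch of the in-kernel alternative discussed above (promote reduced-precision inputs to float, then cast the results back). It is not part of this PR; the wrapper name is hypothetical and it assumes nvfuser's castOp(DataType, TensorView*) helper:

// Hypothetical wrapper (not in this PR): upcast Half/BFloat16 inputs to
// float, compute variance_mean in float, then cast the outputs back.
VarMeanResult variance_mean_promoted(
    TensorView* x,
    const std::vector<int>& dims,
    int64_t correction,
    bool keepdim) {
  auto in_dtype = x->getDataType().value();
  const bool reduced =
      in_dtype == DataType::Half || in_dtype == DataType::BFloat16;
  // Promote the math to float for reduced-precision inputs.
  auto x_f = reduced ? castOp(DataType::Float, x) : x;
  auto out = variance_mean(x_f, dims, correction, keepdim);
  if (reduced) {
    // Match the input precision on the way out.
    out.var = castOp(in_dtype, out.var);
    out.mean = castOp(in_dtype, out.mean);
  }
  return out;
}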

auto dtype = x->getDataType().value();
TORCH_CHECK(
!(dtype == DataType::Half || dtype == DataType::BFloat16),
"variance_mean is not supported for ",
dtype,
" please upcast to float");

if (isComplexType(x->getDataType().value())) {
// There are compilation errors:
// __tmp_kernel1.cu(6727): error: namespace "CudaCodeGen::std" has no member
Collaborator:

Doesn't sound very right to me. cc'ing @zasdfgbnm

Collaborator:

This is definitely not the correct behavior, but I am not surprised at all 😜. I got moved to more important tasks before finishing complex support, so currently complex support in nvfuser is very broken. Please consider complex as not supported when doing other work.
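
A standard identity (stated here for clarity, not taken from the thread) underlies the disabled complex branch below: for a complex value $z = a + bi$ with real $a$ and $b$,

$$\operatorname{Var}(z) = \mathbb{E}\left[\lvert z - \mathbb{E}[z]\rvert^{2}\right] = \operatorname{Var}(a) + \operatorname{Var}(b), \qquad \mathbb{E}[z] = \mathbb{E}[a] + i\,\mathbb{E}[b],$$

so the variance of a complex tensor is real-valued while its mean is complex, which is what the code comment below describes.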

// "imagf"
// __tmp_kernel1.cu(6753): error: namespace "CudaCodeGen::std" has no member
// "realf"
TORCH_CHECK(false, "var_mean is not supported for complex types.");
auto out_real = variance_mean(real(x), dims, correction, keepdim);
auto out_imag = variance_mean(imag(x), dims, correction, keepdim);
// The variance of a complex tensor is the sum of the real and imaginary
// variances and is real. The mean of a complex tensor is complex:
// complex(out_real.mean, out_imag.mean). It seems construction of a complex
// tensor from two real tensors is not supported yet.
return {add(out_real.var, out_imag.var), nullptr};
}

const int kNumberOfDims =
TensorDomain::noReductions(x->getMaybeRFactorDomain()).size();
auto num_features = numFeatures(x, dims, kNumberOfDims);
if (correction > 0) {
num_features =
sub(num_features, IrBuilder::create<Int>(x->container(), correction));
}

auto welford_out = Welford(x, dims);
Collaborator:

I would hope that we can have variance also use Welford, so we can unify the interface for variance and variance_mean.
But WelfordOp appears to be a single op in codegen; does this mean the generated kernel would still write the mean to a register somewhere, even though it is not used at all? That is probably something we do not want.

cc'ing @naoyam @shmsong on this.

Collaborator Author:

We can of course try using

TensorView* variance(
    TensorView* x,
    const std::vector<int>& dims,
    int64_t correction,
    bool keepdim) {
  auto var_mean = variance_mean(x, dims, correction, keepdim);
  return var_mean.var;
}

if it positively impacts performance. That is how ATen implements this function, and it would justify var being a prim in PrimTorch.
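
For context, the standard Welford recurrence (restated here, not quoted from the thread) shows why a single pass yields both outputs: with running mean $\mu_k$ and running sum of squared deviations $M_k$ (nvfuser's var_sum),

$$\mu_k = \mu_{k-1} + \frac{x_k - \mu_{k-1}}{k}, \qquad M_k = M_{k-1} + (x_k - \mu_{k-1})(x_k - \mu_k),$$

and after all $N$ elements the mean is $\mu_N$ while the corrected variance is $M_N / (N - \text{correction})$, which is exactly the mul(welford_out.var_sum, reciprocal(num_features)) conversion in the diff below.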

auto mean = welford_out.avg;
auto var = mul(welford_out.var_sum, reciprocal(num_features));

if (keepdim) {
std::vector<bool> is_broadcast(kNumberOfDims, false);
for (auto dim : dims) {
is_broadcast[dim] = true;
}
var = broadcast(var, is_broadcast);
mean = broadcast(mean, is_broadcast);
}

return {var, mean};
}

TensorView* standard_deviation(
TensorView* x,
const std::vector<int>& dims,
11 changes: 11 additions & 0 deletions torch/csrc/jit/codegen/cuda/ops/normalization.h
@@ -38,6 +38,11 @@ struct BackwardRMSNormResult {
TensorView* grad_weight = nullptr;
};

struct VarMeanResult {
TensorView* var = nullptr;
TensorView* mean = nullptr;
};

TORCH_CUDA_CU_API TensorView* mean(
TensorView* x,
const std::vector<int>& dims,
@@ -55,6 +60,12 @@ TORCH_CUDA_CU_API TensorView* variance(
int64_t correction,
bool keepdim);

TORCH_CUDA_CU_API VarMeanResult variance_mean(
TensorView* x,
const std::vector<int>& dims,
int64_t correction,
bool keepdim);

TORCH_CUDA_CU_API TensorView* standard_deviation(
TensorView* x,
const std::vector<int>& dims,
44 changes: 44 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -13150,6 +13150,50 @@ TEST_F(NVFuserTest, FusionWelfordShmoo_CUDA) {
}
}

namespace {
void testVarMean(at::ScalarType dtype, int correction, bool keepdim) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

int M = 64, N = 128;

auto tv0 = makeSymbolicTensor(2, aten_to_data_type(dtype));
fusion->addInput(tv0);
auto tvs = variance_mean(tv0, {1}, correction, keepdim);
auto tv_mean = tvs.mean;
auto tv_var = tvs.var;
fusion->addOutput(tv_var);
fusion->addOutput(tv_mean);

auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
at::manual_seed(0);
at::Tensor t0 = at::randn({M, N}, options);

FusionExecutorCache executor_cache(std::move(fusion));
auto outputs = executor_cache.runFusionWithInputs({t0});

auto at_var_mean = at::var_mean(t0, {1}, correction, keepdim);
std::vector<at::Tensor> aten_outputs = {
std::get<0>(at_var_mean), std::get<1>(at_var_mean)};

testValidate(
executor_cache.fusion(), outputs, {t0}, aten_outputs, __LINE__, __FILE__);
}
} // namespace

TEST_F(NVFuserTest, FusionVarMean_CUDA) {
std::vector<at::ScalarType> dtypes = {at::kFloat, at::kDouble};
std::vector<int> corrections = {0, 1};
std::vector<bool> keepdims = {false, true};
for (auto correction : corrections) {
for (auto keepdim : keepdims) {
for (auto dtype : dtypes) {
testVarMean(dtype, correction, keepdim);
}
}
}
}

TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
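
For readers unfamiliar with the algorithm named in the PR title, a self-contained serial sketch of Welford's method that produces the same var/mean pair per reduced row. It is a standalone illustration with hypothetical names (welford_var_mean, VarMean), not nvfuser code:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

struct VarMean {
  double var;
  double mean;
};

// One-pass (Welford) variance and mean: mean is the running average,
// var is the accumulated sum of squared deviations divided by (N - correction).
VarMean welford_var_mean(const std::vector<double>& xs, int64_t correction) {
  double mean = 0.0;
  double m2 = 0.0;  // running sum of squared deviations from the current mean
  int64_t n = 0;
  for (double x : xs) {
    ++n;
    double delta = x - mean;
    mean += delta / static_cast<double>(n);
    m2 += delta * (x - mean);  // old deviation times new deviation
  }
  assert(n - correction > 0);
  return {m2 / static_cast<double>(n - correction), mean};
}

int main() {
  std::vector<double> xs = {1.0, 2.0, 3.0, 4.0};
  auto out = welford_var_mean(xs, /*correction=*/1);
  // Expected: var = 5.0 / 3 ≈ 1.6667, mean = 2.5 (matches at::var_mean with correction=1).
  std::printf("var=%f mean=%f\n", out.var, out.mean);
  return 0;
}

With correction=1 this gives the sample variance that at::var_mean returns by default; correction=0 gives the population variance.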