Improve divisible split detection #1970

csarofeen · 2022-09-13T17:51:48Z

Adds propagation of divisible split information, and also adds divisible splits that come from view-based transformations.

Concretely, the kernel generated for NVFuserTest.FusionNonDivisibleSplitVectorize2_CUDA before this change:

__global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 0> T3) {
  alignas(16) extern __shared__ char array[];
  void* shared_mem = array;
  NVFUSER_DEFINE_MAGIC_ZERO
  Array<float, (8 * 4), 4> T1;
  #pragma unroll
  for(nvfuser_index_t i21 = 0; i21 < 8; ++i21) {
    if (((((i21 + nvfuser_zero) * (ceilDiv(T0.size[0], 8))) + ((((nvfuser_index_t)threadIdx.x) * 4) + 3)) < T0.size[0])) {
      loadGlobalToLocal<float, 4, false>(&T1[(i21 * 4)],  &T0[(((i21 + nvfuser_zero) * (ceilDiv(T0.size[0], 8))) + (((nvfuser_index_t)threadIdx.x) * 4))]);
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO
  // Alias Allocation - register
  auto& T2 = T1;
  #pragma unroll
  for(nvfuser_index_t i23 = 0; i23 < 8; ++i23) {
    #pragma unroll
    for(nvfuser_index_t i24 = 0; i24 < 4; ++i24) {
      if ((((i23 * (ceilDiv(T0.size[0], 8))) + ((((nvfuser_index_t)threadIdx.x) * 4) + (i24 + nvfuser_zero))) < T0.size[0])) {
        T2[((i23 * 4) + i24)]
          = T1[((i23 * 4) + i24)]
          + (float) 1.00000000000000000e+00;
      }
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO
  T3[0] = 0.00000000000000000e+00;
  #pragma unroll
  for(nvfuser_index_t i25 = 0; i25 < 8; ++i25) {
    #pragma unroll
    for(nvfuser_index_t i26 = 0; i26 < 4; ++i26) {
      blockReduce<true, false, false>(
        T3[0],
        T2[((i25 * 4) + i26)],
        [](float &a, float b) { a = a + b; },
        threadIdx,
        blockDim,
        static_cast<float*>(shared_mem),
        (((i25 * (ceilDiv(T0.size[0], 8))) + ((((nvfuser_index_t)threadIdx.x) * 4) + (i26 + nvfuser_zero))) < T0.size[0]),
        float(0.00000000000000000e+00));
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO
}

After:

__global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 0> T3) {
  alignas(16) extern __shared__ char array[];
  void* shared_mem = array;
  NVFUSER_DEFINE_MAGIC_ZERO
  Array<float, (8 * 4), 4> T1;
  #pragma unroll
  for(nvfuser_index_t i21 = 0; i21 < 8; ++i21) {
    T1.set(0);
  }
  NVFUSER_UPDATE_MAGIC_ZERO
  #pragma unroll
  for(nvfuser_index_t i21 = 0; i21 < 8; ++i21) {
    if (((((i21 + nvfuser_zero) * (ceilDiv(T0.size[0], 8))) + ((((nvfuser_index_t)threadIdx.x) * 4) + 3)) < T0.size[0])) {
      loadGlobalToLocal<float, 4, false>(&T1[(i21 * 4)],  &T0[(((i21 + nvfuser_zero) * (ceilDiv(T0.size[0], 8))) + (((nvfuser_index_t)threadIdx.x) * 4))]);
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO
  // Alias Allocation - register
  auto& T2 = T1;
  #pragma unroll
  for(nvfuser_index_t i23 = 0; i23 < 8; ++i23) {
    #pragma unroll
    for(nvfuser_index_t i24 = 0; i24 < 4; ++i24) {
      T2[((i23 * 4) + i24)]
        = T1[((i23 * 4) + i24)]
        + (float) 1.00000000000000000e+00;
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO
  T3[0] = 0.00000000000000000e+00;
  #pragma unroll
  for(nvfuser_index_t i25 = 0; i25 < 8; ++i25) {
    #pragma unroll
    for(nvfuser_index_t i26 = 0; i26 < 4; ++i26) {
      blockReduce<true, false, false>(
        T3[0],
        T2[((i25 * 4) + i26)],
        [](float &a, float b) { a = a + b; },
        threadIdx,
        blockDim,
        static_cast<float*>(shared_mem),
        (((i25 * (ceilDiv(T0.size[0], 8))) + ((((nvfuser_index_t)threadIdx.x) * 4) + (i26 + nvfuser_zero))) < T0.size[0]),
        float(0.00000000000000000e+00));
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO
}

Specifically, the predicate guarding this pointwise add is successfully removed:

      if ((((i23 * (ceilDiv(T0.size[0], 8))) + ((((nvfuser_index_t)threadIdx.x) * 4) + (i24 + nvfuser_zero))) < T0.size[0])) {
        T2[((i23 * 4) + i24)]
          = T1[((i23 * 4) + i24)]
          + (float) 1.00000000000000000e+00;
      }

csarofeen · 2022-09-26T14:33:47Z

@naoyam or @zasdfgbnm this should be ready for review. There is one known bug here: rfactor domains that result from non-divisible splits (which only happens with reductions) might not be identified as such, even though they are indexed as such. I was thinking @samnordmann might be able to help here, since this would need to be fixed if we wanted multi-GPU support for rfactor stages.

naoyam

Added some comments. Nothing blocking.

naoyam · 2022-09-27T01:52:41Z

torch/csrc/jit/codegen/cuda/lower_divisible_split.h

+    Fusion* fusion);
+
+// Same as above but will use provided ComputeAtMap instead of building its own.
+std::unordered_set<Split*> getAllDivisibleSplits(


nit: is there any specific reason not to have TORCH_CUDA_CU_API for this version?

Not really; I typically only add TORCH_CUDA_CU_API when I build a test on an interface. I can add it for symmetry.

naoyam · 2022-09-27T01:59:37Z