Vectorized welford #2204

Merged · 16 commits into devel · Nov 21, 2022

Conversation

naoyam (Collaborator) commented Nov 19, 2022

Apply loop-invariant code hoisting to serial WelfordOps. For example, when the innermost loop looks like:

for () {
  if (pred) {
    welfordCombine(...);
  }
}

The count input should be invariant when the loop is not a reduction loop; if the predicate is also loop invariant, the loop can be transformed to:

nvfuser_index_t new_count = outN()[0] + 1;
float reciprocal = pred ? 1.f / (float)new_count : 0.f;
for () {
  welfordVectorized(..., new_count, reciprocal);
}
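
For reference, the per-element arithmetic can be sketched as below. This is a minimal sketch with illustrative names and argument order (and an explicit pred argument, as in the inline-predicate variant discussed in the review below); the actual runtime helper may differ:

template <typename T>
__device__ void welfordVectorizedSketch(
    T& avg,
    T& M2,
    nvfuser_index_t& N,
    const T new_value,
    const T reciprocal,              // precomputed 1 / new_count
    const nvfuser_index_t new_count, // precomputed count after this update
    const bool pred) {
  if (pred) {
    T delta = new_value - avg;
    // delta / new_count, with the division hoisted out of the loop and
    // replaced by a multiply
    avg += delta * reciprocal;
    M2 += delta * (new_value - avg);
    N = new_count;
  }
}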

This is meant to optimize outer welford reductions. With this, the generated code for FusionComputeWith6 looks like:

 for(nvfuser_index_t i186 = 0; i186 < (ceilDiv(16, 4)); ++i186) {
      if ((((((((((i186 + nvfuser_zero) * 4) + 3) * (ceilDiv((ceilDiv(((T0.size[0] * T0.size[1]) * T0.size[2]), 16)), 16))) + ((nvfuser_index_t)blockIdx.y)) * 16) + ((nvfuser_index_t)threadIdx.y)) < ((T0.size[0] * T0.size[1]) * T0.size[2])) && (((((((i192 * 2) + ((nvfuser_index_t)blockIdx.x)) * 8) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]))) {
        #pragma unroll
        for(nvfuser_index_t i187 = 0; i187 < 4; ++i187) {
          int i284;
          i284 = (i186 * 4) + i187;
          loadGlobalToLocal<__half, 2, false>(&T1[(i284 * 2)],  &T0[((((((((i186 + nvfuser_zero) * 4) + i187) * (ceilDiv((ceilDiv(((T0.size[0] * T0.size[1]) * T0.size[2]), 16)), 16))) + ((nvfuser_index_t)blockIdx.y)) * 16) + ((nvfuser_index_t)threadIdx.y)) * T0.size[3]) + i271]);
          int i1102;
          i1102 = T12[0] + 1;
          float f1103;
          f1103 = (float)(i1102);
          float f1104;
          f1104 = 1 / f1103;
          #pragma unroll
          for(nvfuser_index_t i189 = 0; i189 < 2; ++i189) {
            float T2[1];
            T2[0]
               = __half2float(T1[(i284 * 2) + i189]);
            welfordVectorized<float>(T13[i189], T11[i189], T12[i189], T2[0], f1104, i1102);
          }
        }
      } else {
        #pragma unroll
        for(nvfuser_index_t i187 = 0; i187 < 4; ++i187) {
          int i915;
          i915 = (((((i186 * 4) + (i187 + nvfuser_zero)) * (ceilDiv((ceilDiv(((T0.size[0] * T0.size[1]) * T0.size[2]), 16)), 16))) + ((nvfuser_index_t)blockIdx.y)) * 16) + ((nvfuser_index_t)threadIdx.y);
          int i391;
          i391 = (i186 * 4) + i187;
          if (((i915 < ((T0.size[0] * T0.size[1]) * T0.size[2])) && (((((((i192 * 2) + ((nvfuser_index_t)blockIdx.x)) * 8) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]))) {
            loadGlobalToLocal<__half, 2, false>(&T1[(i391 * 2)],  &T0[((((((((i186 + nvfuser_zero) * 4) + i187) * (ceilDiv((ceilDiv(((T0.size[0] * T0.size[1]) * T0.size[2]), 16)), 16))) + ((nvfuser_index_t)blockIdx.y)) * 16) + ((nvfuser_index_t)threadIdx.y)) * T0.size[3]) + i271]);
          }
          if (((i915 < ((T0.size[0] * T0.size[1]) * T0.size[2])) && (((((((i192 * 2) + ((nvfuser_index_t)blockIdx.x)) * 8) + ((nvfuser_index_t)threadIdx.x)) * 2) + 1) < T0.size[3]))) {
            int i1105;
            i1105 = T12[0] + 1;
            float f1106;
            f1106 = (float)(i1105);
            float f1107;
            f1107 = 1 / f1106;
            #pragma unroll
            for(nvfuser_index_t i189 = 0; i189 < 2; ++i189) {
              float T2[1];
              T2[0]
                 = __half2float(T1[(i391 * 2) + i189]);
              welfordVectorized<float>(T13[i189], T11[i189], T12[i189], T2[0], f1107, i1105);
            }
          }
        }
      }
    }

Commit messages include:

- Lift the predicated count division outside of the innermost loop if that loop is exactly mapped with vectorized IDs and is not a reduction domain. Targeted to address outer-reduction grid welford tuning.
- Represents the 32-bit floating-point scalar value. Not supported in PyTorch, so it can't be used as an input to fusions.

naoyam marked this pull request as ready for review on Nov 20, 2022.
naoyam changed the title from "[WIP] Vectorized welford" to "Vectorized welford" on Nov 20, 2022.

csarofeen (Owner) left a comment:

Minor questions, but overall looks fine, marking as approved.

case DataType::Float:
  return IrBuilder::create<Double>(DataType::Float);
csarofeen (Owner):

I thought you added float scalar support; why shouldn't there be a float entry here? Is it because this can be user facing?

naoyam (Collaborator, Author):

I did, but then I felt it was sometimes inconvenient to have both Double and Float, so I dropped Float. The dtype field of Double is used to specify its actual type (#2203). The same applies to Int, which can represent int64_t as Int(DataType::Int) and int as Int(DataType::Int32).
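
A hypothetical usage sketch of this pattern (illustrative only, not code from the PR; it assumes DataType::Int32 is the 32-bit integer variant):

auto d   = IrBuilder::create<Double>(DataType::Double); // double scalar
auto f   = IrBuilder::create<Double>(DataType::Float);  // float scalar, same class
auto i64 = IrBuilder::create<Int>(DataType::Int);       // int64_t scalar
auto i32 = IrBuilder::create<Int>(DataType::Int32);     // int scalar, same class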

// welfordVectorized(..., new_count, reciprocal, p);
// }
void vectorizeWithInlinePredicate(WelfordOp* wop) {
  kir::IfThenElse* wop_ite = scope_exprs_.back()->as<kir::IfThenElse>();
csarofeen (Owner):

Should you also double check the ite only has one expression in it?

naoyam (Collaborator, Author):

I added the check in isVectorizableWelford.

if (!(wop_ite->thenBody().size() == 1 && wop_ite->thenBody().at(0) == wop &&
      wop_ite->elseBody().empty())) {
  return false;
}

// Predicated case when the innermost loop has no externally
// visible effect except for the welford outputs
//
// Before:
csarofeen (Owner):

Is the before supposed to be the same in this case as the inline predicate?

naoyam (Collaborator, Author):

Yes, it's checked in isVectorizableWelford:

// The predicate should be either Manual or Inline
if (wop_ite->predicate()->predicate_type() != PredicateType::Manual &&
    wop_ite->predicate()->predicate_type() != PredicateType::Inline) {
  return false;
}

  registerInsertBeforeInnerMostLoop(reciprocal_expr);
} else {
  // Initialize reciprocal as 0
  registerInsertBeforeInnerMostLoop(IrBuilder::create<UnaryOp>(
csarofeen (Owner):

Could this just be where(pred, 1/new_count_float, 0)?

naoyam (Collaborator, Author) commented Nov 21, 2022:

Actually, it doesn't seem to work. where only accepts Val*, so we would need to have 1/new_count_float as a separate expression and use its output as an input to where, which means 1/new_count_float would be executed unconditionally. Since new_count_float can be zero when the predicate is false, this would result in unsafe code. I'm going to leave it as is.
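
To make the concern concrete, a sketch of what the where() formulation would require (illustrative names; one and zero stand for constant Vals):

// The division has to be materialized as its own expression before it can
// feed where(), so it executes even when pred is false and
// new_count_float is zero:
Val* recip = div(one, new_count_float);  // unconditional division
Val* guarded = where(pred, recip, zero); // selection happens too late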

// it is invariant within the loop. The loop index can be replaced
// with whatever value within the loop range since it is
// independent of the loop index.
kir::TensorIndex* hoistCount(kir::TensorIndex* out_N) {
csarofeen (Owner):

Wonder if it would be convenient to add a utility to detect which loops a Val is dependent on so we could assert the invariance this assumes (I know it's really just detected earlier). @zasdfgbnm

naoyam (Collaborator, Author):

Yes, that would be useful. We could probably just do a search for a specific Val in an index math expression.
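
A minimal sketch of such a utility (hypothetical helper, not code from the PR; it assumes DependencyCheck::isDependencyOf and kir::ForLoop::index() can be combined this way):

std::vector<kir::ForLoop*> dependentLoops(
    Val* val,
    const std::vector<kir::ForLoop*>& loops) {
  std::vector<kir::ForLoop*> dependent;
  for (auto loop : loops) {
    // True if the loop index feeds into the index math of val
    if (DependencyCheck::isDependencyOf(loop->index(), val)) {
      dependent.push_back(loop);
    }
  }
  return dependent;
}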

// Hoist the count TensorIndex out of the innermost loop, assuming
// it is invariant within the loop. The loop index can be replaced
// with whatever value within the loop range since it is
// independent of the loop index.
csarofeen (Owner):

Why does the loop index have to be replaced? Is it not already just 0?

naoyam (Collaborator, Author):

No, at this point the loop index is still used to index the count Val, since indexing doesn't know that the value of the loop index doesn't matter.
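
Concretely, this is why the hoisted read in the generated code above appears as T12[0] + 1. The rewrite can be sketched as:

// Inside the loop, indexing produces a read of the count through the loop
// index (e.g. T12[i189]) even though the count is invariant there.
// Hoisting substitutes an arbitrary in-range value, such as 0:
//
//   before (inside the loop):  i1102 = T12[i189] + 1;
//   after  (before the loop):  i1102 = T12[0] + 1;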

//
// The pattern matching here only works with a simple sequence of
// expressions but should be sufficient as it is likely the most
// common situation.
csarofeen (Owner):

Seems like this could be a nice generic pass like (a future) index hoisting. Seems to me it's a cleanup of unrolling loops, i.e. trying to simplify the else of the unrolled ite. Noting as future work.

naoyam (Collaborator, Author):

Agreed. This particular implementation is really simplistic, as I don't expect anything complex to appear in the loop body with a serial WelfordOp; to generalize it, we would need to consider a wider variety of patterns.

// guaranteed that its predicate can be safely replaced with the
// predicate of the WelfordOp. If not, that doesn't necessarily
// mean the expr predicate is different, but likely not
// worthwhile to consider.
csarofeen (Owner):

If two expressions are loop mapped, it doesn't necessarily mean they have the same predicate; it just means their predicates are safe to join. This is because of broadcast resolution: the full predicate is a composite of all the root domains covered by the roots of the expressions. I don't see any specific errors in the code; I'm just noting that this is how we do the unrolled predicate: accumulate the predicates from the various expressions to make sure we cover all the root IterDomains.

naoyam merged commit 2057f37 into devel on Nov 21, 2022.
naoyam (Collaborator, Author) commented Nov 21, 2022:

@zasdfgbnm Merged the PR, but let me know if you have any comments.
