Commit 7106d21

swolchok authored and facebook-github-bot committed
[PyTorch] Add native fast path for transformer encoder inference (pytorch#76333)
Summary:
Pull Request resolved: pytorch#76333

The current PyTorch multi-head attention and transformer implementations are slow. This should speed them up for inference.

ghstack-source-id: 154737857

(Note: this ignores all push blocking failures!)

Test Plan: CI

Reviewed By: cpuhrsch

Differential Revision: D35239925

fbshipit-source-id: 5a7eb8ff79bc6afb4b7d45075ddb2a24a6e2df28
1 parent b941d10 commit 7106d21
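
For orientation, below is a rough Python sketch of how the new _transformer_encoder_layer_fwd op declared in native_functions.yaml (further down in this diff) could be driven directly, with weights pulled from an existing nn.TransformerEncoderLayer. This sketch is not part of the commit: the batch-first (B, S, E) input layout and the module-attribute-to-argument mapping are assumptions for illustration; only the argument order follows the schema added here, and it is not guaranteed to run unchanged at this exact commit.

# Hypothetical usage sketch (not part of this commit).
import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True).eval()
src = torch.randn(2, 16, 256)  # assumed (batch, seq, embed) layout

with torch.inference_mode():
    out = torch.ops.aten._transformer_encoder_layer_fwd(
        src,
        256,                              # embed_dim
        8,                                # num_heads
        layer.self_attn.in_proj_weight,   # qkv_weight, shape (3E, E)
        layer.self_attn.in_proj_bias,     # qkv_bias, shape (3E,)
        layer.self_attn.out_proj.weight,  # proj_weight
        layer.self_attn.out_proj.bias,    # proj_bias
        False,                            # use_gelu (this layer uses relu)
        layer.norm_first,                 # norm_first
        layer.norm1.eps,                  # eps
        layer.norm1.weight, layer.norm1.bias,
        layer.norm2.weight, layer.norm2.bias,
        layer.linear1.weight, layer.linear1.bias,
        layer.linear2.weight, layer.linear2.bias,
        None,                             # mask
    )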

File tree

16 files changed: +1815 −30 lines

BUILD.bazel

Lines changed: 8 additions & 0 deletions
@@ -228,6 +228,11 @@ filegroup(
     ),
 )
 
+filegroup(
+    name = "aten_native_transformers_cpp",
+    srcs = glob(["aten/src/ATen/native/transformers/*.cpp"]),
+)
+
 filegroup(
     name = "aten_native_mkl_cpp",
     srcs = glob(["aten/src/ATen/native/mkl/*.cpp", "aten/src/ATen/mkl/*.cpp"]),
@@ -278,6 +283,7 @@ filegroup(
             "aten/src/ATen/native/miopen/*.cpp",
             "aten/src/ATen/native/nested/cuda/*.cpp",
             "aten/src/ATen/native/sparse/cuda/*.cpp",
+            "aten/src/ATen/native/transformers/cuda/*.cpp",
             "aten/src/THC/*.cpp",
         ],
     ),
@@ -292,6 +298,7 @@ filegroup(
         "aten/src/ATen/native/nested/cuda/*.cu",
         "aten/src/ATen/native/quantized/cuda/*.cu",
         "aten/src/ATen/native/sparse/cuda/*.cu",
+        "aten/src/ATen/native/transformers/cuda/*.cu",
     ]) + aten_ufunc_generated_cuda_sources("aten/src/ATen/{}"),
     # It's a bit puzzling to me why it's not necessary to declare the
     # target that generates these sources...
@@ -393,6 +400,7 @@ cc_library(
         ":aten_native_quantized_cpp",
         ":aten_native_sparse_cpp",
         ":aten_native_nested_cpp",
+        ":aten_native_transformers_cpp",
        ":aten_native_xnnpack",
        ":aten_src_ATen_config",
    ] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"),

aten/src/ATen/CMakeLists.txt

Lines changed: 10 additions & 2 deletions
@@ -105,6 +105,7 @@ file(GLOB native_quantized_cpp
   "native/quantized/*.cpp"
   "native/quantized/cpu/*.cpp")
 file(GLOB native_nested_cpp "native/nested/*.cpp")
+file(GLOB native_transformers_cpp "native/transformers/*.cpp")
 
 file(GLOB native_h "native/*.h")
 file(GLOB native_ao_sparse_h
@@ -128,6 +129,8 @@ file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp")
 file(GLOB native_quantized_cuda_cu "native/quantized/cuda/*.cu")
 file(GLOB native_quantized_cuda_cpp "native/quantized/cuda/*.cpp")
 file(GLOB native_quantized_cudnn_cpp "native/quantized/cudnn/*.cpp")
+file(GLOB native_transformers_cuda_cu "native/transformers/cuda/*.cu")
+file(GLOB native_transformers_cuda_cpp "native/transformers/cuda/*.cpp")
 
 file(GLOB native_hip_hip "native/hip/*.hip")
 file(GLOB native_hip_cpp "native/hip/*.cpp")
@@ -140,6 +143,8 @@ file(GLOB native_sparse_hip_hip "native/sparse/hip/*.hip")
 file(GLOB native_sparse_hip_cpp "native/sparse/hip/*.cpp")
 file(GLOB native_quantized_hip_hip "native/quantized/hip/*.hip")
 file(GLOB native_quantized_hip_cpp "native/quantized/hip/*.cpp")
+file(GLOB native_transformers_hip_hip "native/transformers/hip/*.hip")
+file(GLOB native_transformers_hip_cpp "native/transformers/hip/*.cpp")
 file(GLOB native_utils_cpp "native/utils/*.cpp")
 
 # XNNPACK
@@ -162,6 +167,7 @@ else()
     all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp}
     ${native_ao_sparse_cpp} ${native_sparse_cpp} ${native_nested_cpp}
     ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp}
+    ${native_transformers_cpp}
     ${native_utils_cpp} ${native_xnnpack} ${generated_sources} ${core_generated_sources}
     ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${ATen_NNAPI_SRCS} ${cpu_kernel_cpp}
   )
@@ -205,6 +211,7 @@ if(USE_CUDA)
     ${native_nested_cuda_cu}
     ${native_sparse_cuda_cu}
     ${native_quantized_cuda_cu}
+    ${native_transformers_cuda_cu}
     ${cuda_generated_sources}
   )
   list(APPEND ATen_CUDA_CPP_SRCS
@@ -216,6 +223,7 @@ if(USE_CUDA)
     ${native_quantized_cuda_cpp}
     ${native_quantized_cudnn_cpp}
     ${native_sparse_cuda_cpp}
+    ${native_transformers_cuda_cpp}
   )
   set(ATen_CUDA_LINALG_SRCS ${native_cuda_linalg_cpp})
   if(NOT BUILD_LAZY_CUDA_LINALG)
@@ -238,9 +246,9 @@ endif()
 
 if(USE_ROCM)
   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
-  set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_nested_hip_hip} ${native_sparse_hip_hip} ${native_quantized_hip_hip})
+  set(ATen_HIP_SRCS ${ATen_HIP_SRCS} ${hip_hip} ${native_hip_hip} ${native_nested_hip_hip} ${native_sparse_hip_hip} ${native_quantized_hip_hip} ${native_transformers_hip_hip})
   # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources)
-  set(all_hip_cpp ${native_nested_hip_cpp} ${native_sparse_hip_cpp} ${native_quantized_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${native_hip_linalg_cpp} ${cuda_generated_sources} ${ATen_HIP_SRCS})
+  set(all_hip_cpp ${native_nested_hip_cpp} ${native_sparse_hip_cpp} ${native_quantized_hip_cpp} ${native_transformers_hip_cpp} ${hip_cpp} ${native_hip_cpp} ${native_hip_linalg_cpp} ${cuda_generated_sources} ${ATen_HIP_SRCS})
   set(all_hip_cpp ${native_miopen_cpp} ${native_cudnn_hip_cpp} ${miopen_cpp} ${all_hip_cpp})
 endif()
 

aten/src/ATen/native/native_functions.yaml

Lines changed: 17 additions & 0 deletions
@@ -4662,6 +4662,12 @@
 
 - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor
 
+# Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads).
+- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU, NestedTensorCPU: transform_bias_rescale_qkv_cpu
+    CUDA, NestedTensorCUDA: transform_bias_rescale_qkv_cuda
+
 - func: _nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor
   device_check: NoCheck  # cpu_nested_shape_example will always be on CPU
   dispatch:
@@ -11602,3 +11608,14 @@
   variants: method
   dispatch:
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_layer_norm
+
+# Apparently, putting "forward" in the name will cause Python bindings to be skipped, so "fwd" it is.
+- func: _transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: transformer_encoder_layer_forward
+
+- func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_multi_head_attention
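
The comment on _transform_bias_rescale_qkv above describes the fusion: add the in-projection bias to the packed QKV tensor and divide Q by sqrt(D/num_heads). Below is a minimal un-fused reference sketch of that description; it is not part of the commit, and the per-head (B, num_heads, T, head_dim) output layout is an assumption for illustration rather than a guarantee about the fused kernel's exact output.

# Un-fused sketch: add qkv_bias to the packed QKV and rescale Q by 1/sqrt(head_dim).
# qkv: (B, T, 3*E), qkv_bias: (3*E,). Output layout here is assumed, not taken from the diff.
import math
import torch

def transform_bias_rescale_qkv_reference(qkv, qkv_bias, num_heads):
    B, T, three_E = qkv.shape
    E = three_E // 3
    head_dim = E // num_heads
    q, k, v = (qkv + qkv_bias).chunk(3, dim=-1)   # each (B, T, E)
    # split heads: (B, T, E) -> (B, num_heads, T, head_dim)
    q = q.reshape(B, T, num_heads, head_dim).transpose(1, 2)
    k = k.reshape(B, T, num_heads, head_dim).transpose(1, 2)
    v = v.reshape(B, T, num_heads, head_dim).transpose(1, 2)
    q = q / math.sqrt(head_dim)                   # divide Q by sqrt(D / num_heads)
    return q, k, v

qkv = torch.randn(2, 5, 3 * 64)
qkv_bias = torch.randn(3 * 64)
q, k, v = transform_bias_rescale_qkv_reference(qkv, qkv_bias, num_heads=8)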

aten/src/ATen/native/nested/NestedTensorMath.cpp

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+#include <ATen/native/nested/NestedTensorMath.h>
+
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/NamedTensorUtils.h>
Lines changed: 5 additions & 1 deletion
@@ -1,13 +1,17 @@
 #pragma once
 
+#include <c10/macros/Macros.h>
+
+#include <vector>
+
 namespace at {
 namespace native {
 struct NestedTensorImpl;
 
 // TODO: cache this and only do it once per NestedTensor
 int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt);
 
-std::vector<int64_t> NestedTensor_get_max_size(const NestedTensorImpl& nt);
+TORCH_API std::vector<int64_t> NestedTensor_get_max_size(const NestedTensorImpl& nt);
 
 } // namespace native
 } // namespace at

aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp

Lines changed: 6 additions & 0 deletions
@@ -35,6 +35,12 @@ Tensor nested_from_padded_cuda(
     const Tensor& sizes,
     bool do_transform_0213) {
   if (padded.dim() > 1 && padded.dim() < 5) {
+    if (padded.dtype() != kFloat && padded.dtype() != kHalf) {
+      TORCH_WARN_ONCE(
+          "nested_from_padded CUDA kernels only support fp32/fp16; falling "
+          "back to slower generic kernel");
+      return at::native::nested_from_padded_generic(padded, sizes, do_transform_0213);
+    }
     TORCH_CHECK(
         (padded.dim() == 4 && do_transform_0213) ||
         (padded.dim() == 3 && !do_transform_0213),
