From c1652be3a7f97d0d355f4ba7f6fddf4b53cad399 Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Tue, 17 Jul 2018 14:43:34 -0500
Subject: [PATCH 1/8] Replace hcRNG with rocRAND.

hcRNG is not supported any longer, rocRAND is the library replacing it.

This requires us to disable one function, add some include directories
to setup.py and add an include to THCTensorRandom if we are in the ROCm
context.
---
 aten/src/ATen/CMakeLists.txt                  |   4 +-
 aten/src/THC/THCTensorRandom.h                |   3 +
 cmake/Dependencies.cmake                      |   5 +-
 cmake/public/LoadHIP.cmake                    |   8 -
 setup.py                                      |   4 +
 tools/amd_build/disabled_features.yaml        |  32 ++
 .../pyHIPIFY/cuda_to_hip_mappings.py          | 319 +++++++++---------
 7 files changed, 204 insertions(+), 171 deletions(-)

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index 28c4658f5b2aa3..1741820281e054 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -256,9 +256,9 @@ ENDIF()
 IF(USE_ROCM)
  ### Link in the ROCm libraries BLAS / RNG.
  FIND_LIBRARY(HIPBLAS_LIBRARY hipblas HINTS ${HIPBLAS_PATH}/lib)
- FIND_LIBRARY(HIPRNG_LIBRARY hcrng HINTS ${HIPRNG_PATH}/lib)
+ FIND_LIBRARY(HIPRAND_LIBRARY hiprand HINTS ${HIPRAND_PATH}/lib)
 
- list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRNG_LIBRARY})
+ list(APPEND ATen_CUDA_DEPENDENCY_LIBS ${HIPBLAS_LIBRARY} ${HIPRAND_LIBRARY})
 ENDIF()
 
 # Include CPU paths for CUDA as well
diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h
index 5203df28c78e68..1ee539e52c9cd6 100644
--- a/aten/src/THC/THCTensorRandom.h
+++ b/aten/src/THC/THCTensorRandom.h
@@ -5,6 +5,9 @@
 
 #include "generic/THCTensorRandom.h"
 #include "THCGenerateAllTypes.h"
+#ifdef __HIP_PLATFORM_HCC__
+#include <hiprand_kernel.h>
+#endif
 
 typedef struct THCGenerator THCGenerator;
 
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 1cbcf6486e9b43..9991082800a250 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -540,7 +540,7 @@ if(BUILD_CAFFE2 OR BUILD_ATEN)
       ${rocrand_LIBRARIES} ${hiprand_LIBRARIES} ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipblas_LIBRARIES})
     # Additional libraries required by PyTorch AMD that aren't used by Caffe2 (not in Caffe2's docker image)
     if(BUILD_ATEN)
-      set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES} ${hiprng_LIBRARIES})
+      set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} ${hipsparse_LIBRARIES})
     endif()
     # TODO: There is a bug in rocblas's cmake files that exports the wrong targets name in ${rocblas_LIBRARIES}
     list(APPEND Caffe2_HIP_DEPENDENCY_LIBS
@@ -555,7 +555,8 @@ if(USE_ROCM AND NOT BUILD_CAFFE2)
  include_directories(SYSTEM ${HIP_PATH}/include)
  include_directories(SYSTEM ${HIPBLAS_PATH}/include)
  include_directories(SYSTEM ${HIPSPARSE_PATH}/include)
- include_directories(SYSTEM ${HIPRNG_PATH}/include)
+ include_directories(SYSTEM ${HIPRAND_PATH}/include)
+ include_directories(SYSTEM ${ROCRAND_PATH}/include)
  include_directories(SYSTEM ${THRUST_PATH})
 
  # load HIP cmake module and load platform id
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index f70a5d171e3dfb..7e115af6d441e1 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -45,13 +45,6 @@ ELSE()
   SET(ROCBLAS_PATH $ENV{ROCBLAS_PATH})
 ENDIF()
 
-# HIPRNG_PATH
-IF(NOT DEFINED ENV{HIPRNG_PATH})
-  SET(HIPRNG_PATH ${ROCM_PATH}/hcrng)
-ELSE()
-  SET(HIPRNG_PATH $ENV{HIPRNG_PATH})
-ENDIF()
-
 # HIPSPARSE_PATH
 IF(NOT DEFINED ENV{HIPSPARSE_PATH})
   SET(HIPSPARSE_PATH ${ROCM_PATH}/hcsparse)
@@ -133,7 +126,6 @@ IF(HIP_FOUND)
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
   FIND_LIBRARY(hiprand_LIBRARIES hiprand HINTS ${HIPRAND_PATH}/lib)
-  FIND_LIBRARY(hiprng_LIBRARIES hcrng HINTS ${HIPRNG_PATH}/lib)
   FIND_LIBRARY(hipblas_LIBRARIES hipblas HINTS ${HIPBLAS_PATH}/lib)
   FIND_LIBRARY(hipsparse_LIBRARIES hipsparse HINTS ${HIPSPARSE_PATH}/lib)
 
diff --git a/setup.py b/setup.py
index 85f2f5bca63969..86765baf176b51 100644
--- a/setup.py
+++ b/setup.py
@@ -910,12 +910,16 @@ def run(self):
     hcc_include_path = '/opt/rocm/hcc/include'
     hipblas_include_path = '/opt/rocm/hipblas/include'
     hipsparse_include_path = '/opt/rocm/hcsparse/include'
+    hiprand_include_path = '/opt/rocm/hiprand/include'
+    rocrand_include_path = '/opt/rocm/rocrand/include'
     hip_lib_path = '/opt/rocm/hip/lib'
     hcc_lib_path = '/opt/rocm/hcc/lib'
     include_dirs.append(rocm_include_path)
     include_dirs.append(hcc_include_path)
     include_dirs.append(hipblas_include_path)
     include_dirs.append(hipsparse_include_path)
+    include_dirs.append(hiprand_include_path)
+    include_dirs.append(rocrand_include_path)
     include_dirs.append(tmp_install_path + "/include/THCUNN")
     extra_link_args.append('-L' + hip_lib_path)
     extra_link_args.append('-Wl,-rpath,' + hip_lib_path)
diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml
index a41c4595139786..05ed8e959e7b64 100644
--- a/tools/amd_build/disabled_features.yaml
+++ b/tools/amd_build/disabled_features.yaml
@@ -91,6 +91,31 @@
           "struct curandStateMtgp32*": "curandStateMtgp32*"
         }
       },
+      {
+        "path": "aten/src/THC/THCTensorRandom.cu",
+        "s_constants": {
+          "struct curandStateMtgp32*": "curandStateMtgp32*"
+        }
+      },
+      {
+        "path": "aten/src/THC/THCTensorRandom.h",
+        "s_constants": {
+          "struct curandStateMtgp32*": "curandStateMtgp32*"
+        }
+      },
+      {
+        "path": "aten/src/THCUNN/generic/RReLU.cu",
+        "s_constants": {
+          "struct curandStateMtgp32*": "curandStateMtgp32*"
+        }
+      },
+      {
+        "path": "aten/src/THC/THCGenerator.hpp",
+        "s_constants": {
+          "struct curandStateMtgp32*": "curandStateMtgp32*",
+          "struct mtgp32_kernel_params": "mtgp32_kernel_params"
+        }
+      },
       {
         "path": "aten/src/ATen/native/cuda/CuFFTUtils.h",
         "s_constants": {
@@ -200,6 +225,13 @@
       "functions": [
         "THCTensor_(getTextureObject)"
       ]
+    },
+    {
+      "path": "aten/src/THC/THCTensorRandom.cu",
+      "functions": [
+        "THCRandom_setRNGState",
+        "set_rngstate_kernel"
+      ]
     }
   ]
 }
diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index c03562861692a1..b011a1c13fecf3 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -185,45 +185,45 @@
     "cublasPointerMode_t": ("hipblasPointerMode_t", CONV_TYPE, API_BLAS),
     "cublasAtomicsMode_t": ("hipblasAtomicsMode_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED),
     "cublasDataType_t": ("hipblasDataType_t", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED),
-    "curandStatus": ("hiprngStatus_t", CONV_TYPE, API_RAND),
-    "curandStatus_t": ("hiprngStatus_t", CONV_TYPE, API_RAND),
-    "curandRngType": ("hiprngRngType_t", CONV_TYPE, API_RAND),
-    "curandRngType_t": ("hiprngRngType_t", CONV_TYPE, API_RAND),
-    "curandGenerator_st": ("hiprngGenerator_st", CONV_TYPE, API_RAND),
-    "curandGenerator_t": ("hiprngGenerator_t", CONV_TYPE, API_RAND),
-    "curandDirectionVectorSet": ("hiprngDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDirectionVectorSet_t": ("hiprngDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandOrdering": ("hiprngOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandOrdering_t": ("hiprngOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDistribution_st": ("hiprngDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandHistogramM2V_st": ("hiprngDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDistribution_t": ("hiprngDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandHistogramM2V_t": ("hiprngDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDistributionShift_st": ("hiprngDistributionShift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDistributionShift_t": ("hiprngDistributionShift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDistributionM2Shift_st": ("hiprngDistributionM2Shift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDistributionM2Shift_t": ("hiprngDistributionM2Shift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandHistogramM2_st": ("hiprngHistogramM2_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandHistogramM2_t": ("hiprngHistogramM2_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandHistogramM2K_st": ("hiprngHistogramM2K_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandHistogramM2K_t": ("hiprngHistogramM2K_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDiscreteDistribution_st": ("hiprngDiscreteDistribution_st", CONV_TYPE, API_RAND),
-    "curandDiscreteDistribution_t": ("hiprngDiscreteDistribution_t", CONV_TYPE, API_RAND),
-    "curandMethod": ("hiprngMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandMethod_t": ("hiprngMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandDirectionVectors32_t": ("hiprngDirectionVectors32_t", CONV_TYPE, API_RAND),
-    "curandDirectionVectors64_t": ("hiprngDirectionVectors64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandStateMtgp32_t": ("hiprngStateMtgp32_t", CONV_TYPE, API_RAND),
-    "curandStateMtgp32": ("hcrngStateMtgp32", CONV_TYPE, API_RAND),
-    "curandStateScrambledSobol64_t": ("hiprngStateScrambledSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandStateSobol64_t": ("hiprngStateSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandStateScrambledSobol32_t": ("hiprngStateScrambledSobol32_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
-    "curandStateSobol32_t": ("hiprngStateSobol32_t", CONV_TYPE, API_RAND),
-    "curandStateMRG32k3a_t": ("hiprngStateMRG32k3a_t", CONV_TYPE, API_RAND),
-    "curandStatePhilox4_32_10_t": ("hiprngStatePhilox4_32_10_t", CONV_TYPE, API_RAND),
-    "curandStateXORWOW_t": ("hiprngStateXORWOW_t", CONV_TYPE, API_RAND),
-    "curandState_t": ("hiprngState_t", CONV_TYPE, API_RAND),
-    "curandState": ("hiprngState_t", CONV_TYPE, API_RAND)
+    "curandStatus": ("hiprandStatus_t", CONV_TYPE, API_RAND),
+    "curandStatus_t": ("hiprandStatus_t", CONV_TYPE, API_RAND),
+    "curandRngType": ("hiprandRngType_t", CONV_TYPE, API_RAND),
+    "curandRngType_t": ("hiprandRngType_t", CONV_TYPE, API_RAND),
+    "curandGenerator_st": ("hiprandGenerator_st", CONV_TYPE, API_RAND),
+    "curandGenerator_t": ("hiprandGenerator_t", CONV_TYPE, API_RAND),
+    "curandDirectionVectorSet": ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDirectionVectorSet_t": ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandOrdering": ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandOrdering_t": ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDistribution_st": ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandHistogramM2V_st": ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDistribution_t": ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandHistogramM2V_t": ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDistributionShift_st": ("hiprandDistributionShift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDistributionShift_t": ("hiprandDistributionShift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDistributionM2Shift_st": ("hiprandDistributionM2Shift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDistributionM2Shift_t": ("hiprandDistributionM2Shift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandHistogramM2_st": ("hiprandHistogramM2_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandHistogramM2_t": ("hiprandHistogramM2_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandHistogramM2K_st": ("hiprandHistogramM2K_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandHistogramM2K_t": ("hiprandHistogramM2K_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDiscreteDistribution_st": ("hiprandDiscreteDistribution_st", CONV_TYPE, API_RAND),
+    "curandDiscreteDistribution_t": ("hiprandDiscreteDistribution_t", CONV_TYPE, API_RAND),
+    "curandMethod": ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandMethod_t": ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandDirectionVectors32_t": ("hiprandDirectionVectors32_t", CONV_TYPE, API_RAND),
+    "curandDirectionVectors64_t": ("hiprandDirectionVectors64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandStateMtgp32_t": ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND),
+    "curandStateMtgp32": ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND),
+    "curandStateScrambledSobol64_t": ("hiprandStateScrambledSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandStateSobol64_t": ("hiprandStateSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandStateScrambledSobol32_t": ("hiprandStateScrambledSobol32_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+    "curandStateSobol32_t": ("hiprandStateSobol32_t", CONV_TYPE, API_RAND),
+    "curandStateMRG32k3a_t": ("hiprandStateMRG32k3a_t", CONV_TYPE, API_RAND),
+    "curandStatePhilox4_32_10_t": ("hiprandStatePhilox4_32_10_t", CONV_TYPE, API_RAND),
+    "curandStateXORWOW_t": ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND),
+    "curandState_t": ("hiprandState_t", CONV_TYPE, API_RAND),
+    "curandState": ("hiprandState_t", CONV_TYPE, API_RAND)
 }
 
 CUDA_INCLUDE_MAP = {
@@ -239,23 +239,23 @@
     "vector_types.h": ("hip/hip_vector_types.h", CONV_INCLUDE, API_RUNTIME),
     "cublas.h": ("hipblas.h", CONV_INCLUDE_CUDA_MAIN_H, API_BLAS),
     "cublas_v2.h": ("hipblas.h", CONV_INCLUDE_CUDA_MAIN_H, API_BLAS),
-    "curand.h": ("hiprng.h", CONV_INCLUDE_CUDA_MAIN_H, API_RAND),
-    "curand_kernel.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_discrete.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_discrete2.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_globals.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_lognormal.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_mrg32k3a.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_mtgp32.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_mtgp32_host.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_mtgp32_kernel.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_mtgp32dc_p_11213.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_normal.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_normal_static.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_philox4x32_x.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_poisson.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_precalc.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
-    "curand_uniform.h": ("hiprng_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand.h": ("hiprand.h", CONV_INCLUDE_CUDA_MAIN_H, API_RAND),
+    "curand_kernel.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_discrete.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_discrete2.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_globals.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_lognormal.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_mrg32k3a.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_mtgp32.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_mtgp32_host.h": ("hiprand_mtgp32_host.h", CONV_INCLUDE, API_RAND),
+    "curand_mtgp32_kernel.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_mtgp32dc_p_11213.h": ("rocrand_mtgp32_11213.h", CONV_INCLUDE, API_RAND),
+    "curand_normal.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_normal_static.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_philox4x32_x.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_poisson.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_precalc.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
+    "curand_uniform.h": ("hiprand_kernel.h", CONV_INCLUDE, API_RAND),
     "cusparse.h": ("hipsparse.h", CONV_INCLUDE, API_RAND),
     "#include <cufft.h>": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED),
     "#include <cufftXt.h>": ("", CONV_INCLUDE, API_RAND, HIP_UNSUPPORTED),
@@ -1977,109 +1977,110 @@
     "cublasDrotm_v2": ("hipblasDrotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
     "cublasSrotmg_v2": ("hipblasSrotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
     "cublasDrotmg_v2": ("hipblasDrotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
-    "CURAND_STATUS_SUCCESS": ("HIPRNG_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_VERSION_MISMATCH": ("hiprng_STATUS_VERSION_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_NOT_INITIALIZED": ("hiprng_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_ALLOCATION_FAILED": ("hiprng_STATUS_ALLOCATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_TYPE_ERROR": ("hiprng_STATUS_TYPE_ERROR", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_OUT_OF_RANGE": ("hiprng_STATUS_OUT_OF_RANGE", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_LENGTH_NOT_MULTIPLE": ("hiprng_STATUS_LENGTH_NOT_MULTIPLE", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_DOUBLE_PRECISION_REQUIRED": ("hiprng_STATUS_DOUBLE_PRECISION_REQUIRED", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_LAUNCH_FAILURE": ("hiprng_STATUS_LAUNCH_FAILURE", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_PREEXISTING_FAILURE": ("hiprng_STATUS_PREEXISTING_FAILURE", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_INITIALIZATION_FAILED": ("hiprng_STATUS_INITIALIZATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_ARCH_MISMATCH": ("hiprng_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_STATUS_INTERNAL_ERROR": ("hiprng_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_TEST": ("hiprng_RNG_TEST", CONV_NUMERIC_LITERAL, API_RAND),
-    "mtgp32dc_params_fast_11213": ("mtgp32_params_fast_11213", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_PSEUDO_DEFAULT": ("hiprng_RNG_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_PSEUDO_XORWOW": ("hiprng_RNG_PSEUDO_XORWOW", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_PSEUDO_MRG32K3A": ("hiprng_RNG_PSEUDO_MRG32K3A", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_PSEUDO_MTGP32": ("hiprng_RNG_PSEUDO_MTGP32", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_PSEUDO_MT19937": ("hiprng_RNG_PSEUDO_MT19937", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_PSEUDO_PHILOX4_32_10": ("hiprng_RNG_PSEUDO_PHILOX4_32_10", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_QUASI_DEFAULT": ("hiprng_RNG_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_QUASI_SOBOL32": ("hiprng_RNG_QUASI_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_QUASI_SCRAMBLED_SOBOL32": ("hiprng_RNG_QUASI_SCRAMBLED_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_QUASI_SOBOL64": ("hiprng_RNG_QUASI_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_RNG_QUASI_SCRAMBLED_SOBOL64": ("hiprng_RNG_QUASI_SCRAMBLED_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND),
-    "curand_ORDERING_PSEUDO_BEST": ("hiprng_ORDERING_PSEUDO_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_ORDERING_PSEUDO_DEFAULT": ("hiprng_ORDERING_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_ORDERING_PSEUDO_SEEDED": ("hiprng_ORDERING_PSEUDO_SEEDED", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_ORDERING_QUASI_DEFAULT": ("hiprng_ORDERING_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_DIRECTION_VECTORS_32_JOEKUO6": ("hiprng_DIRECTION_VECTORS_32_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6": ("hiprng_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_DIRECTION_VECTORS_64_JOEKUO6": ("hiprng_DIRECTION_VECTORS_64_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6": ("hiprng_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_CHOOSE_BEST": ("hiprng_CHOOSE_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_ITR": ("hiprng_ITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_KNUTH": ("hiprng_KNUTH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_HITR": ("hiprng_HITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_M1": ("hiprng_M1", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_M2": ("hiprng_M2", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_BINARY_SEARCH": ("hiprng_BINARY_SEARCH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_DISCRETE_GAUSS": ("hiprng_DISCRETE_GAUSS", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_REJECTION": ("hiprng_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_DEVICE_API": ("hiprng_DEVICE_API", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_FAST_REJECTION": ("hiprng_FAST_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_3RD": ("hiprng_3RD", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_DEFINITION": ("hiprng_DEFINITION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curand_POISSON": ("hiprng_POISSON", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
-    "curandCreateGenerator": ("hiprngCreateGenerator", CONV_MATH_FUNC, API_RAND),
-    "curandCreateGeneratorHost": ("hiprngCreateGeneratorHost", CONV_MATH_FUNC, API_RAND),
-    "curandCreatePoissonDistribution": ("hiprngCreatePoissonDistribution", CONV_MATH_FUNC, API_RAND),
-    "curandDestroyDistribution": ("hiprngDestroyDistribution", CONV_MATH_FUNC, API_RAND),
-    "curandDestroyGenerator": ("hiprngDestroyGenerator", CONV_MATH_FUNC, API_RAND),
-    "curandGenerate": ("hiprngGenerate", CONV_MATH_FUNC, API_RAND),
-    "curandGenerateLogNormal": ("hiprngGenerateLogNormal", CONV_MATH_FUNC, API_RAND),
-    "curandGenerateLogNormalDouble": ("hiprngGenerateLogNormalDouble", CONV_MATH_FUNC, API_RAND),
-    "curandGenerateLongLong": ("hiprngGenerateLongLong", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curandGenerateNormal": ("hiprngGenerateNormal", CONV_MATH_FUNC, API_RAND),
-    "curandGenerateNormalDouble": ("hiprngGenerateNormalDouble", CONV_MATH_FUNC, API_RAND),
-    "curandGeneratePoisson": ("hiprngGeneratePoisson", CONV_MATH_FUNC, API_RAND),
-    "curandGenerateSeeds": ("hiprngGenerateSeeds", CONV_MATH_FUNC, API_RAND),
-    "curandGenerateUniform": ("hiprngGenerateUniform", CONV_MATH_FUNC, API_RAND),
-    "curandGenerateUniformDouble": ("hiprngGenerateUniformDouble", CONV_MATH_FUNC, API_RAND),
-    "curandGetDirectionVectors32": ("hiprngGetDirectionVectors32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curandGetDirectionVectors64": ("hiprngGetDirectionVectors64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curandGetProperty": ("hiprngGetProperty", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curandGetScrambleConstants32": ("hiprngGetScrambleConstants32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curandGetScrambleConstants64": ("hiprngGetScrambleConstants64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curandGetVersion": ("hiprngGetVersion", CONV_MATH_FUNC, API_RAND),
-    "curandSetGeneratorOffset": ("hiprngSetGeneratorOffset", CONV_MATH_FUNC, API_RAND),
-    "curandSetGeneratorOrdering": ("hiprngSetGeneratorOrdering", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curandSetPseudoRandomGeneratorSeed": ("hiprngSetPseudoRandomGeneratorSeed", CONV_MATH_FUNC, API_RAND),
-    "curandSetQuasiRandomGeneratorDimensions": ("hiprngSetQuasiRandomGeneratorDimensions", CONV_MATH_FUNC, API_RAND),
-    "curandSetStream": ("hiprngSetStream", CONV_MATH_FUNC, API_RAND),
-    "curand": ("hiprng", CONV_DEVICE_FUNC, API_RAND),
-    "curand_init": ("hiprng_init", CONV_DEVICE_FUNC, API_RAND),
-    "curand_log_normal": ("hiprng_log_normal", CONV_DEVICE_FUNC, API_RAND),
-    "curand_log_normal_double": ("hiprng_log_normal_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_log_normal2": ("hiprng_log_normal2", CONV_DEVICE_FUNC, API_RAND),
-    "curand_log_normal2_double": ("hiprng_log_normal2_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_log_normal4": ("hiprng_log_normal4", CONV_DEVICE_FUNC, API_RAND),
-    "curand_log_normal4_double": ("hiprng_log_normal4_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_mtgp32_single": ("hiprng_mtgp32_single", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curand_mtgp32_single_specific": ("hiprng_mtgp32_single_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curand_mtgp32_specific": ("hiprng_mtgp32_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED),
-    "curand_normal": ("hiprng_normal", CONV_DEVICE_FUNC, API_RAND),
-    "curandMakeMTGP32Constants": ("hiprngMakeMTGP32Constants", CONV_DEVICE_FUNC, API_RAND),
-    "curandMakeMTGP32KernelState": ("hiprngMakeMTGP32KernelState", CONV_DEVICE_FUNC, API_RAND),
-    "curand_normal_double": ("hiprng_normal_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_normal2": ("hiprng_normal2", CONV_DEVICE_FUNC, API_RAND),
-    "curand_normal2_double": ("hiprng_normal2_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_normal4": ("hiprng_normal4", CONV_DEVICE_FUNC, API_RAND),
-    "curand_normal4_double": ("hiprng_normal4_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_uniform": ("hiprng_uniform", CONV_DEVICE_FUNC, API_RAND),
-    "curand_uniform_double": ("hiprng_uniform_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_uniform2_double": ("hiprng_uniform2_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_uniform4": ("hiprng_uniform4", CONV_DEVICE_FUNC, API_RAND),
-    "curand_uniform4_double": ("hiprng_uniform4_double", CONV_DEVICE_FUNC, API_RAND),
-    "curand_discrete": ("hiprng_discrete", CONV_DEVICE_FUNC, API_RAND),
-    "curand_discrete4": ("hiprng_discrete4", CONV_DEVICE_FUNC, API_RAND),
-    "curand_poisson": ("hiprng_poisson", CONV_DEVICE_FUNC, API_RAND),
-    "curand_poisson4": ("hiprng_poisson4", CONV_DEVICE_FUNC, API_RAND),
-    "curand_Philox4x32_10": ("hiprng_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED)
+    "CURAND_STATUS_SUCCESS": ("HIPRAND_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_VERSION_MISMATCH": ("HIPRAND_STATUS_VERSION_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_NOT_INITIALIZED": ("HIPRAND_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_ALLOCATION_FAILED": ("HIPRAND_STATUS_ALLOCATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_TYPE_ERROR": ("HIPRAND_STATUS_TYPE_ERROR", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_OUT_OF_RANGE": ("HIPRAND_STATUS_OUT_OF_RANGE", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_LENGTH_NOT_MULTIPLE": ("HIPRAND_STATUS_LENGTH_NOT_MULTIPLE", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_DOUBLE_PRECISION_REQUIRED": ("HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_LAUNCH_FAILURE": ("HIPRAND_STATUS_LAUNCH_FAILURE", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_PREEXISTING_FAILURE": ("HIPRAND_STATUS_PREEXISTING_FAILURE", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_INITIALIZATION_FAILED": ("HIPRAND_STATUS_INITIALIZATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_ARCH_MISMATCH": ("HIPRAND_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_STATUS_INTERNAL_ERROR": ("HIPRAND_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_TEST": ("HIPRAND_RNG_TEST", CONV_NUMERIC_LITERAL, API_RAND),
+    "mtgp32dc_params_fast_11213": ("mtgp32dc_params_fast_11213", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_PSEUDO_DEFAULT": ("HIPRAND_RNG_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_PSEUDO_XORWOW": ("HIPRAND_RNG_PSEUDO_XORWOW", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_PSEUDO_MRG32K3A": ("HIPRAND_RNG_PSEUDO_MRG32K3A", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_PSEUDO_MTGP32": ("HIPRAND_RNG_PSEUDO_MTGP32", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_PSEUDO_MT19937": ("HIPRAND_RNG_PSEUDO_MT19937", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_PSEUDO_PHILOX4_32_10": ("HIPRAND_RNG_PSEUDO_PHILOX4_32_10", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_QUASI_DEFAULT": ("HIPRAND_RNG_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_QUASI_SOBOL32": ("HIPRAND_RNG_QUASI_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_QUASI_SCRAMBLED_SOBOL32": ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_QUASI_SOBOL64": ("HIPRAND_RNG_QUASI_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_RNG_QUASI_SCRAMBLED_SOBOL64": ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND),
+    "curand_ORDERING_PSEUDO_BEST": ("HIPRAND_ORDERING_PSEUDO_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_ORDERING_PSEUDO_DEFAULT": ("HIPRAND_ORDERING_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_ORDERING_PSEUDO_SEEDED": ("HIPRAND_ORDERING_PSEUDO_SEEDED", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_ORDERING_QUASI_DEFAULT": ("HIPRAND_ORDERING_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_DIRECTION_VECTORS_32_JOEKUO6": ("HIPRAND_DIRECTION_VECTORS_32_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6": ("HIPRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_DIRECTION_VECTORS_64_JOEKUO6": ("HIPRAND_DIRECTION_VECTORS_64_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6": ("HIPRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_CHOOSE_BEST": ("HIPRAND_CHOOSE_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_ITR": ("HIPRAND_ITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_KNUTH": ("HIPRAND_KNUTH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_HITR": ("HIPRAND_HITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_M1": ("HIPRAND_M1", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_M2": ("HIPRAND_M2", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_BINARY_SEARCH": ("HIPRAND_BINARY_SEARCH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_DISCRETE_GAUSS": ("HIPRAND_DISCRETE_GAUSS", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_REJECTION": ("HIPRAND_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_DEVICE_API": ("HIPRAND_DEVICE_API", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_FAST_REJECTION": ("HIPRAND_FAST_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_3RD": ("HIPRAND_3RD", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_DEFINITION": ("HIPRAND_DEFINITION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curand_POISSON": ("HIPRAND_POISSON", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED),
+    "curandCreateGenerator": ("hiprandCreateGenerator", CONV_MATH_FUNC, API_RAND),
+    "curandCreateGeneratorHost": ("hiprandCreateGeneratorHost", CONV_MATH_FUNC, API_RAND),
+    "curandCreatePoissonDistribution": ("hiprandCreatePoissonDistribution", CONV_MATH_FUNC, API_RAND),
+    "curandDestroyDistribution": ("hiprandDestroyDistribution", CONV_MATH_FUNC, API_RAND),
+    "curandDestroyGenerator": ("hiprandDestroyGenerator", CONV_MATH_FUNC, API_RAND),
+    "curandGenerate": ("hiprandGenerate", CONV_MATH_FUNC, API_RAND),
+    "curandGenerateLogNormal": ("hiprandGenerateLogNormal", CONV_MATH_FUNC, API_RAND),
+    "curandGenerateLogNormalDouble": ("hiprandGenerateLogNormalDouble", CONV_MATH_FUNC, API_RAND),
+    "curandGenerateLongLong": ("hiprandGenerateLongLong", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curandGenerateNormal": ("hiprandGenerateNormal", CONV_MATH_FUNC, API_RAND),
+    "curandGenerateNormalDouble": ("hiprandGenerateNormalDouble", CONV_MATH_FUNC, API_RAND),
+    "curandGeneratePoisson": ("hiprandGeneratePoisson", CONV_MATH_FUNC, API_RAND),
+    "curandGenerateSeeds": ("hiprandGenerateSeeds", CONV_MATH_FUNC, API_RAND),
+    "curandGenerateUniform": ("hiprandGenerateUniform", CONV_MATH_FUNC, API_RAND),
+    "curandGenerateUniformDouble": ("hiprandGenerateUniformDouble", CONV_MATH_FUNC, API_RAND),
+    "curandGetDirectionVectors32": ("hiprandGetDirectionVectors32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curandGetDirectionVectors64": ("hiprandGetDirectionVectors64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curandGetProperty": ("hiprandGetProperty", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curandGetScrambleConstants32": ("hiprandGetScrambleConstants32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curandGetScrambleConstants64": ("hiprandGetScrambleConstants64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curandGetVersion": ("hiprandGetVersion", CONV_MATH_FUNC, API_RAND),
+    "curandSetGeneratorOffset": ("hiprandSetGeneratorOffset", CONV_MATH_FUNC, API_RAND),
+    "curandSetGeneratorOrdering": ("hiprandSetGeneratorOrdering", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curandSetPseudoRandomGeneratorSeed": ("hiprandSetPseudoRandomGeneratorSeed", CONV_MATH_FUNC, API_RAND),
+    "curandSetQuasiRandomGeneratorDimensions": ("hiprandSetQuasiRandomGeneratorDimensions", CONV_MATH_FUNC, API_RAND),
+    "curandSetStream": ("hiprandSetStream", CONV_MATH_FUNC, API_RAND),
+    "curand": ("hiprand", CONV_DEVICE_FUNC, API_RAND),
+    "curand_init": ("hiprand_init", CONV_DEVICE_FUNC, API_RAND),
+    "curand_log_normal": ("hiprand_log_normal", CONV_DEVICE_FUNC, API_RAND),
+    "curand_log_normal_double": ("hiprand_log_normal_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_log_normal2": ("hiprand_log_normal2", CONV_DEVICE_FUNC, API_RAND),
+    "curand_log_normal2_double": ("hiprand_log_normal2_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_log_normal4": ("hiprand_log_normal4", CONV_DEVICE_FUNC, API_RAND),
+    "curand_log_normal4_double": ("hiprand_log_normal4_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_mtgp32_single": ("hiprand_mtgp32_single", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curand_mtgp32_single_specific": ("hiprand_mtgp32_single_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curand_mtgp32_specific": ("hiprand_mtgp32_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "curand_normal": ("hiprand_normal", CONV_DEVICE_FUNC, API_RAND),
+    "curandMakeMTGP32Constants": ("hiprandMakeMTGP32Constants", CONV_DEVICE_FUNC, API_RAND),
+    "curandMakeMTGP32KernelState": ("hiprandMakeMTGP32KernelState", CONV_DEVICE_FUNC, API_RAND),
+    "curand_normal_double": ("hiprand_normal_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_normal2": ("hiprand_normal2", CONV_DEVICE_FUNC, API_RAND),
+    "curand_normal2_double": ("hiprand_normal2_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_normal4": ("hiprand_normal4", CONV_DEVICE_FUNC, API_RAND),
+    "curand_normal4_double": ("hiprand_normal4_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_uniform": ("hiprand_uniform", CONV_DEVICE_FUNC, API_RAND),
+    "curand_uniform_double": ("hiprand_uniform_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_uniform2_double": ("hiprand_uniform2_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_uniform4": ("hiprand_uniform4", CONV_DEVICE_FUNC, API_RAND),
+    "curand_uniform4_double": ("hiprand_uniform4_double", CONV_DEVICE_FUNC, API_RAND),
+    "curand_discrete": ("hiprand_discrete", CONV_DEVICE_FUNC, API_RAND),
+    "curand_discrete4": ("hiprand_discrete4", CONV_DEVICE_FUNC, API_RAND),
+    "curand_poisson": ("hiprand_poisson", CONV_DEVICE_FUNC, API_RAND),
+    "curand_poisson4": ("hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND),
+    "curand_Philox4x32_10": ("hiprand_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED),
+    "mtgp32_kernel_params": ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND)
 }
 
 CUDA_SPARSE_MAP = {

From 87a3cba585c8384883aadacfef188de7a7dd65c6 Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Tue, 24 Jul 2018 13:55:29 -0500
Subject: [PATCH 2/8] Remove hcRNG dependency and get rocRAND v.1.0

---
 docker/caffe2/jenkins/common/install_rocm.sh | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh
index 0d0e71a21bba33..9b864656e37784 100644
--- a/docker/caffe2/jenkins/common/install_rocm.sh
+++ b/docker/caffe2/jenkins/common/install_rocm.sh
@@ -40,7 +40,6 @@ install_ubuntu() {
                    miopengemm \
                    rocblas \
                    hipblas \
-                   rocrand \
                    rocm-profiler \
                    cxlactivitylogger
 }
@@ -57,10 +56,14 @@ install_hip_thrust() {
     git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip
 }
 
-install_hcrng() {
-    mkdir -p /opt/rocm/debians
-    curl https://s3.amazonaws.com/ossci-linux/hcrng-master-a8c6a0b-Linux.deb -o /opt/rocm/debians/hcrng.deb 
-    dpkg -i /opt/rocm/debians/hcrng.deb
+install_rocrand() {
+    mkdir -p /opt/rocm/tmp/
+    cd /opt/rocm/tmp/
+    git clone -b v1.8.0 https://github.com/ROCmSoftwarePlatform/rocRAND.git
+    mkdir build && cd build
+    cmake ..
+    make
+    make install
 }
 
 install_hcsparse() {
@@ -87,5 +90,5 @@ if [ "$ROCM_VERSION" = "nightly" ]; then
 fi
 
 install_hip_thrust
-install_hcrng
+install_rocrand
 install_hcsparse

From d1fd62c2078142c556b9edaa66e1c5a46043f922 Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Thu, 26 Jul 2018 13:14:40 -0500
Subject: [PATCH 3/8] Do not install the rocRAND release version, switch to
 master in anticipation of more work in rocRAND upstream for PyTorch.

---
 docker/caffe2/jenkins/common/install_rocm.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docker/caffe2/jenkins/common/install_rocm.sh b/docker/caffe2/jenkins/common/install_rocm.sh
index c03be2b6954a93..b0a215296763b6 100644
--- a/docker/caffe2/jenkins/common/install_rocm.sh
+++ b/docker/caffe2/jenkins/common/install_rocm.sh
@@ -21,7 +21,6 @@ install_ubuntu() {
                    miopengemm \
                    rocblas \
                    hipblas \
-                   rocrand \
                    hcsparse \
                    rocm-profiler \
                    cxlactivitylogger
@@ -55,7 +54,7 @@ install_hip_thrust() {
 install_rocrand() {
     mkdir -p /opt/rocm/tmp/
     cd /opt/rocm/tmp/
-    git clone -b v1.8.0 https://github.com/ROCmSoftwarePlatform/rocRAND.git
+    git clone https://github.com/ROCmSoftwarePlatform/rocRAND.git
     mkdir build && cd build
     cmake ..
     make

From 789cf1a0874d0d58795c3e0b10e838c50aee1d82 Mon Sep 17 00:00:00 2001
From: jithunnair-amd <jithun.nair@amd.com>
Date: Mon, 6 Aug 2018 13:53:17 -0500
Subject: [PATCH 4/8] Minor changes to pass flake8 tests

---
 test/common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/common.py b/test/common.py
index bd43ef7ec6b0d0..9cd0a3c8a773e5 100644
--- a/test/common.py
+++ b/test/common.py
@@ -97,6 +97,7 @@ def _check_module_exists(name):
 if TEST_NUMPY:
     import numpy
 
+
 def skipIfRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
@@ -106,6 +107,7 @@ def wrapper(*args, **kwargs):
             fn(*args, **kwargs)
     return wrapper
 
+
 def skipIfNoLapack(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):

From f1c36e67a2eee3878fddfcca3969443fcb853dae Mon Sep 17 00:00:00 2001
From: Johannes M Dieterich <Johannes.Dieterich@amd.com>
Date: Tue, 7 Aug 2018 09:49:35 -0500
Subject: [PATCH 5/8] Do not set SHARED flag unconditionally here.

Reported by: bddqqp
---
 caffe2/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index a24a8389d22836..7e5dd07b62c388 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -364,7 +364,7 @@ if(USE_ROCM)
   ENDIF()
 
   # FindHIP.CMake checks if the SHARED flag is set and adds extra logic accordingly.
-  hip_add_library(caffe2_hip SHARED ${Caffe2_HIP_SRCS})
+  hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS})
 
   # Since PyTorch files contain HIP headers, these flags are required for the necessary definitions to be added.
   set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${HIP_HIPCC_FLAGS})

From 74979495f0483ee65a1fe2abab2aa31b9ae571c6 Mon Sep 17 00:00:00 2001
From: Viswanath Sivakumar <viswanath@fb.com>
Date: Tue, 7 Aug 2018 13:16:42 -0700
Subject: [PATCH 6/8] Optional input lengths in CTC op (#10228)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10228

Sometimes, for all items in the minibatch in test mode, input length will be
equal to max time steps. This avoids having to pass in an external tensor.

Differential Revision: D9174378

fbshipit-source-id: 22f7d5c311c855d9c3ac59f2a5e773279bd69974
---
 caffe2/contrib/warpctc/ctc_op.cpp      | 11 ++++++++-
 caffe2/contrib/warpctc/ctc_op.h        | 33 ++++++++++++++++++++++----
 caffe2/contrib/warpctc/ctc_ops_test.py | 27 ++++++++++++++++++---
 3 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/caffe2/contrib/warpctc/ctc_op.cpp b/caffe2/contrib/warpctc/ctc_op.cpp
index 0d0176623a5934..9d06e4041b69fa 100644
--- a/caffe2/contrib/warpctc/ctc_op.cpp
+++ b/caffe2/contrib/warpctc/ctc_op.cpp
@@ -2,6 +2,11 @@
 #include "caffe2/core/context_gpu.h"
 #include "caffe2/core/operator.h"
 
+#ifdef CAFFE2_USE_IDEEP
+#include <caffe2/ideep/operators/operator_fallback_ideep.h>
+#include <caffe2/ideep/utils/ideep_operator.h>
+#endif
+
 namespace caffe2 {
 
 namespace detail {
@@ -17,9 +22,13 @@ ctcComputeInfo workspaceInfo<CPUContext>(const CPUContext& /*context*/) {
 }
 
 REGISTER_CPU_OPERATOR(CTC, CTCOp<float, CPUContext>);
-OPERATOR_SCHEMA(CTC).NumInputs(4).NumOutputs(2, 3);
+OPERATOR_SCHEMA(CTC).NumInputs(3, 4).NumOutputs(2, 3);
 //    .EnforceInputOutputGradient({{0, 0}});
 
+#ifdef CAFFE2_USE_IDEEP
+REGISTER_IDEEP_OPERATOR(CTC, IDEEPFallbackOp<CTCOp<float, CPUContext>>);
+#endif
+
 namespace {
 class GetCTCGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
diff --git a/caffe2/contrib/warpctc/ctc_op.h b/caffe2/contrib/warpctc/ctc_op.h
index 6c27c907726b82..f94fa177ae01e9 100644
--- a/caffe2/contrib/warpctc/ctc_op.h
+++ b/caffe2/contrib/warpctc/ctc_op.h
@@ -45,13 +45,24 @@ class CTCOp final : public Operator<Context> {
   bool RunOnDevice() override {
     // inputs
     const auto& inputs = Input(INPUTS);
+    const auto maxTimeSteps = inputs.dim(0);
     const auto minibatchSize = inputs.dim(1);
     const auto alphabetSize = inputs.dim(2);
     const auto& labels = OperatorBase::template Input<Tensor>(LABELS, CPU);
     const auto& labelLengths =
         OperatorBase::template Input<Tensor>(LABEL_LENGTHS, CPU);
-    const auto& inputLengths =
-        OperatorBase::template Input<Tensor>(INPUT_LENGTHS, CPU);
+
+    const int* inputLengthsData = nullptr;
+    if (InputSize() == 4) {
+      const auto& inputLengths =
+          OperatorBase::template Input<Tensor>(INPUT_LENGTHS, CPU);
+      inputLengthsData = inputLengths.template data<int>();
+    } else {
+      // Input lengths not passed in. Default to max timesteps for
+      // each item in minibatch.
+      default_input_lengths_.resize(minibatchSize, maxTimeSteps);
+      inputLengthsData = default_input_lengths_.data();
+    }
 
     // outputs
     Tensor* gradients = nullptr;
@@ -74,28 +85,40 @@ class CTCOp final : public Operator<Context> {
     size_t workspaceSizeBytes;
     CTC_CHECK(get_workspace_size(
         labelLengths.template data<int>(),
-        inputLengths.template data<int>(),
+        inputLengthsData,
         alphabetSize,
         minibatchSize,
         detail::workspaceInfo(context_),
         &workspaceSizeBytes));
     workspace->Resize(workspaceSizeBytes);
+    auto* workspaceData = workspace->template mutable_data<uint8_t>();
+
+    if (is_test_ && labels.dim(0) == 0) {
+      // compute_ctc_loss doesn't handle empty labels well
+      T* costsData = costs->template mutable_data<T>();
+      for (int i = 0; i < costs->size(); ++i) {
+        costsData[i] = 0;
+      }
+      return true;
+    }
+
     CTC_CHECK(compute_ctc_loss(
         inputs.template data<T>(),
         gradients ? gradients->template mutable_data<T>() : nullptr,
         labels.template data<int>(),
         labelLengths.template data<int>(),
-        inputLengths.template data<int>(),
+        inputLengthsData,
         alphabetSize,
         minibatchSize,
         costs->template mutable_data<T>(),
-        workspace->template mutable_data<uint8_t>(),
+        workspaceData,
         detail::workspaceInfo(context_)));
     return true;
   }
 
 private:
  bool is_test_;
+ std::vector<int> default_input_lengths_;
 
  INPUT_TAGS(INPUTS, LABELS, LABEL_LENGTHS, INPUT_LENGTHS);
 };
diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py
index 602ab285f3bcb4..25bb0a39e3a965 100644
--- a/caffe2/contrib/warpctc/ctc_ops_test.py
+++ b/caffe2/contrib/warpctc/ctc_ops_test.py
@@ -19,7 +19,7 @@ def softmax(w):
 
 
 class CTCOpsTest(test_util.TestCase):
-    def verify_cost(self, device_option, is_test):
+    def verify_cost(self, device_option, is_test, skip_input_lengths=False):
         alphabet_size = 5
         N = 1
         T = 2
@@ -36,9 +36,12 @@ def verify_cost(self, device_option, is_test):
         input_lengths = np.asarray([T]).astype(np.int32)
 
         net = core.Net("test-net")
+        input_blobs = ["inputs", "labels", "label_lengths"]
+        if not skip_input_lengths:
+            input_blobs.append("input_lengths")
         output_blobs = ["costs", "workspace"] if is_test \
                 else ["inputs_grad_to_be_copied", "costs", "workspace"]
-        net.CTC(["inputs", "labels", "label_lengths", "input_lengths"],
+        net.CTC(input_blobs,
                 output_blobs,
                 is_test=is_test,
                 device_option=device_option)
@@ -47,7 +50,8 @@ def verify_cost(self, device_option, is_test):
         self.ws.create_blob("inputs").feed(inputs, device_option=device_option)
         self.ws.create_blob("labels").feed(labels)
         self.ws.create_blob("label_lengths").feed(label_lengths)
-        self.ws.create_blob("input_lengths").feed(input_lengths)
+        if not skip_input_lengths:
+            self.ws.create_blob("input_lengths").feed(input_lengths)
         self.ws.run(net)
         probs = softmax(inputs)
         expected = probs[0, 0, 1] * probs[1, 0, 2]
@@ -68,20 +72,37 @@ def test_ctc_cost_cpu(self):
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU),
             is_test=False)
+        self.verify_cost(
+            caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU),
+            is_test=False, skip_input_lengths=True)
 
     def test_ctc_cost_gpu(self):
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
                                     cuda_gpu_id=0),
             is_test=False)
+        self.verify_cost(
+            caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
+                                    cuda_gpu_id=0),
+            is_test=False,
+            skip_input_lengths=True)
 
     def test_ctc_forward_only_cpu(self):
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU),
             is_test=True)
+        self.verify_cost(
+            caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU),
+            is_test=True,
+            skip_input_lengths=True)
 
     def test_ctc_forward_only_gpu(self):
         self.verify_cost(
             caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
                                     cuda_gpu_id=0),
             is_test=True)
+        self.verify_cost(
+            caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA,
+                                    cuda_gpu_id=0),
+            is_test=True,
+            skip_input_lengths=True)

From 5bb21493fd070c9c2ac767d80f2b15b768234442 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimelshein@nvidia.com>
Date: Tue, 7 Aug 2018 13:24:31 -0700
Subject: [PATCH 7/8] add fused dropout kernels (#9666)

Summary:
While waiting for dropout to be fully ported to ATen, here's performance fix for the most common dropout case. Dropout is still in python function, I just added efficient path to it. I could not make inplace work, because generator always generates `return self` for inplace function, and I need to return both original tensor and mask, so inplace goes on the existing pass. Even with non-inplace version, since mask is now a ByteTensor, memory used is just a little larger than for inplace dropout, due to savings on mask.
Once dropout is moved to aten, these kernels still can be used for efficient implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9666

Reviewed By: SsnL

Differential Revision: D8948077

Pulled By: ezyang

fbshipit-source-id: 52990ef769471d957e464af635e5f9b4e519567a
---
 aten/src/ATen/cuda/CUDAApplyUtils.cuh      |   2 +-
 aten/src/ATen/native/cuda/Dropout.cu       | 162 +++++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml |   9 ++
 test/test_nn.py                            |  35 ++++-
 tools/amd_build/disabled_features.yaml     |   3 +-
 torch/nn/_functions/dropout.py             |  18 ++-
 6 files changed, 221 insertions(+), 8 deletions(-)
 create mode 100644 aten/src/ATen/native/cuda/Dropout.cu

diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh
index a30bed575d3d3c..0673a88b051998 100644
--- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh
+++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "detail/IndexUtils.cuh"
+#include "ATen/cuda/detail/IndexUtils.cuh"
 #include "ATen/TensorUtils.h"
 #include "THC/THCAtomics.cuh"
 #include "ATen/cuda/CUDAContext.h"
diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu
new file mode 100644
index 00000000000000..2d133a70dc23b6
--- /dev/null
+++ b/aten/src/ATen/native/cuda/Dropout.cu
@@ -0,0 +1,162 @@
+#include "ATen/ATen.h"
+#include "ATen/AccumulateType.h"
+#include "ATen/cuda/CUDAApplyUtils.cuh"
+#include "ATen/cuda/detail/IndexUtils.cuh"
+#include "ATen/cuda/detail/TensorInfo.cuh"
+#include "curand_kernel.h"
+
+#include <THC/THCGeneral.h>
+#include <THC/THCTensorRandom.h>
+#include <THC/THCGenerator.hpp>
+
+
+THCGenerator* THCRandom_getGenerator(THCState* state);
+
+namespace at{
+namespace native{
+
+namespace {
+
+// philox generates 128 bits of randomness at a time. Kernel uses this explicitly by putting suitably transformed result into float4
+// for all members of float4 to be consumed UNROLL has to be 4. Don't change!
+const int UNROLL = 4;
+
+std::pair<uint64_t, uint64_t> next_philox_seed(at::Generator* gen, uint64_t increment) {
+  auto gen_ = THCRandom_getGenerator(at::globalContext().getTHCState());
+  uint64_t offset = gen_->state.philox_seed_offset.fetch_add(increment);
+  return std::make_pair(gen_->state.initial_seed, offset);
+}
+
+
+template <
+          typename scalar_t,
+          typename accscalar_t,
+          typename IndexType,
+          int ADims>
+#if __CUDA_ARCH__ >= 350
+__launch_bounds__(256,8)
+#endif
+__global__ void
+fused_dropout_kernel(cuda::detail::TensorInfo<scalar_t, IndexType> a,
+                      cuda::detail::TensorInfo<scalar_t, IndexType> b,
+                      cuda::detail::TensorInfo<uint8_t, IndexType> c,
+                      IndexType totalElements, accscalar_t p, std::pair<uint64_t, uint64_t> seeds
+                      ) {
+
+  accscalar_t pinv = accscalar_t(1)/p;
+  IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
+  curandStatePhilox4_32_10_t state;
+    curand_init(
+        seeds.first,
+        idx,
+        seeds.second,
+        &state);
+  IndexType rounded_size = ((totalElements - 1)/(blockDim.x * gridDim.x * UNROLL)+1) * 
+        blockDim.x * gridDim.x * UNROLL;
+  for (IndexType linearIndex = idx;
+       linearIndex < rounded_size;
+       linearIndex += gridDim.x * blockDim.x*UNROLL) {
+//curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything
+       float4 rand = curand_uniform4(&state);
+       scalar_t src[UNROLL];
+       rand.x = rand.x < p;
+       rand.y = rand.y < p;
+       rand.z = rand.z < p;
+       rand.w = rand.w < p;
+       for (int ii = 0; ii < UNROLL; ii++) {
+           IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
+           if (li < totalElements) {
+    // Convert `linearIndex` into an offset of `a`
+               const IndexType aOffset =
+                   cuda::detail::IndexToOffset<scalar_t, IndexType, ADims>::get(li, a);
+               src[ii] = a.data[aOffset];
+           }
+       }
+       for (int ii = 0; ii < UNROLL; ii++) {
+           IndexType li = linearIndex + blockDim.x * gridDim.x * ii;
+           if (li < totalElements) {
+    // Convert `linearIndex` into an offset of `b`
+               const IndexType bOffset =
+                   cuda::detail::IndexToOffset<scalar_t, IndexType, 1>::get(li, b);
+               b.data[bOffset] = src[ii]*(&rand.x)[ii]*pinv;
+               c.data[bOffset] = (uint8_t)(&rand.x)[ii];
+           }
+       }
+       __syncthreads();
+  }
+}
+
+template<typename scalar_t, typename accscalar_t>
+void masked_scale_kernel(at::Tensor& ret, const at::Tensor src, const at::Tensor mask, accscalar_t scale){
+   at::cuda::CUDA_tensor_apply3<scalar_t, scalar_t, uint8_t>(ret, src, mask, [scale]__device__(scalar_t& ret_val, const scalar_t& src_val, const uint8_t mask_val){
+       ret_val = (float)mask_val * src_val * scale;
+  });
+}
+} //anonymous namespace
+
+std::tuple<Tensor,Tensor>
+fused_dropout_cuda(const Tensor& self, double p, Generator * gen){
+  Tensor ret = at::empty_like(self);
+  Tensor mask = self.type().toScalarType(kByte).tensor(self.sizes());
+  const int64_t nelem = self.numel();
+  const int64_t block_size = 256;
+  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor/block_size;
+  dim3 dim_block(block_size);
+  dim3 grid((nelem + block_size -1)/block_size);
+  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);
+//number of times random will be generated per thread, to offset philox counter in thc random state
+  int64_t counter_offset = ((nelem - 1)/(block_size*grid.x*UNROLL)+1)*UNROLL;
+  if (cuda::detail::canUse32BitIndexMath(self)){
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "fused_dropout", [&] {
+      using accscalar_t = acc_type<scalar_t, true>;
+      accscalar_t pa = (accscalar_t)(p);
+      auto self_info = cuda::detail::getTensorInfo<scalar_t, unsigned int>(self);
+      auto ret_info = cuda::detail::getTensorInfo<scalar_t, unsigned int>(ret);
+      auto mask_info = cuda::detail::getTensorInfo<uint8_t, unsigned int>(mask);
+      self_info.collapseDims();
+      ret_info.collapseDims();
+      mask_info.collapseDims(); //ret and mask are collapsed to 1d contiguous tensor
+      switch (self_info.dims) {
+        case 1:
+            fused_dropout_kernel<scalar_t, accscalar_t, unsigned int, 1><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen,counter_offset));
+            break;
+        default:
+            fused_dropout_kernel<scalar_t, accscalar_t, unsigned int, -1><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen,counter_offset));
+      }
+   });
+  } else {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.type(), "fused_dropout", [&] {
+      using accscalar_t = acc_type<scalar_t, true>;
+      accscalar_t pa = (accscalar_t)(p);
+      auto self_info = cuda::detail::getTensorInfo<scalar_t, uint64_t>(self);
+      auto ret_info = cuda::detail::getTensorInfo<scalar_t, uint64_t>(ret);
+      auto mask_info = cuda::detail::getTensorInfo<uint8_t, uint64_t>(mask);
+      self_info.collapseDims();
+      ret_info.collapseDims();
+      mask_info.collapseDims(); //ret and mask are collapsed to 1d contiguous tensor
+      switch (self_info.dims) {
+        case 1:
+            fused_dropout_kernel<scalar_t, accscalar_t, uint64_t, 1><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen,counter_offset));
+            break;
+        default:
+            fused_dropout_kernel<scalar_t, accscalar_t, uint64_t, -1><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(self_info, ret_info, mask_info, nelem, pa, next_philox_seed(gen,counter_offset));
+      }
+   });
+  }
+  THCudaCheck(cudaGetLastError());
+  return std::tuple<Tensor,Tensor>(ret, mask);
+}
+
+Tensor masked_scale_cuda(const Tensor& self, const Tensor& mask, double scale){
+   Tensor ret = at::empty_like(self);
+   AT_CHECK(mask.type().scalarType() == at::ScalarType::Byte, "mask should be torch.uint8 dtype");
+   AT_DISPATCH_FLOATING_TYPES_AND_HALF(ret.type(), "masked_scale", [&] {
+      using accscalar_t = acc_type<scalar_t, true>;
+      accscalar_t pa = (accscalar_t)(scale);
+    masked_scale_kernel<scalar_t>(ret, self, mask, pa);
+  });
+  return ret;
+}
+
+}
+}
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index a86ab6cb41ba52..1361848c832fd5 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -54,6 +54,14 @@
   dispatch:
     CUDA: _cudnn_init_dropout_state
 
+- func: _fused_dropout(Tensor self, double p, Generator* generator=nullptr) -> (Tensor, Tensor)
+  dispatch:
+     CUDA: fused_dropout_cuda
+
+- func: _masked_scale(Tensor self, Tensor mask, double scale) -> Tensor
+  dispatch:
+     CUDA: masked_scale_cuda
+
 - func: abs(Tensor self) -> Tensor
 
 - func: abs_(Tensor self) -> Tensor
@@ -1003,6 +1011,7 @@
 - func: logsumexp_out(Tensor result, Tensor self, int64_t dim, bool keepdim=False) -> Tensor
   variants: function
 
+
 - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, double margin=0.0, int64_t reduction=Reduction::ElementwiseMean) -> Tensor
   variants: function
 
diff --git a/test/test_nn.py b/test/test_nn.py
index c8d54c58b8e5e5..5bd646661760a5 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -732,9 +732,10 @@ def test_invalid_conv3d(self):
         input = torch.empty(1, 1, 4, 4, 4)
         self.assertRaises(RuntimeError, lambda: module(input))
 
-    def _test_dropout(self, cls, input):
+    def _test_dropout(self, cls, cuda, input):
         p = 0.2
-        input.fill_(1 - p)
+        device = torch.device("cuda") if cuda else torch.device("cpu")
+        input = input.to(device).fill_(1 - p)
 
         module = cls(p)
         input_var = torch.tensor(input, requires_grad=True)
@@ -2098,7 +2099,7 @@ def func(x):
 
     def test_Dropout(self):
         input = torch.Tensor(1000)
-        self._test_dropout(nn.Dropout, input)
+        self._test_dropout(nn.Dropout, False, input)
 
     def test_Dropout2d(self):
         b = random.randint(1, 5)
@@ -2106,7 +2107,7 @@ def test_Dropout2d(self):
         h = random.randint(1, 5)
         num_features = 1000
         input = torch.Tensor(num_features, b, w, h)
-        self._test_dropout(nn.Dropout2d, input)
+        self._test_dropout(nn.Dropout2d, False, input)
 
     def test_Dropout3d(self):
         b = random.randint(1, 5)
@@ -2115,7 +2116,31 @@ def test_Dropout3d(self):
         d = random.randint(1, 2)
         num_features = 1000
         input = torch.Tensor(num_features, b, d, w, h)
-        self._test_dropout(nn.Dropout3d, input)
+        self._test_dropout(nn.Dropout3d, False, input)
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_Dropout_cuda(self):
+        input = torch.Tensor(1000)
+        self._test_dropout(nn.Dropout, True, input)
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_Dropout2d_cuda(self):
+        b = random.randint(1, 5)
+        w = random.randint(1, 5)
+        h = random.randint(1, 5)
+        num_features = 1000
+        input = torch.Tensor(num_features, b, w, h)
+        self._test_dropout(nn.Dropout2d, True, input)
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_Dropout3d_cuda(self):
+        b = random.randint(1, 5)
+        w = random.randint(1, 5)
+        h = random.randint(1, 5)
+        d = random.randint(1, 2)
+        num_features = 1000
+        input = torch.Tensor(num_features, b, d, w, h)
+        self._test_dropout(nn.Dropout3d, True, input)
 
     def test_AlphaDropout(self):
         # generate random tensor with zero mean and unit std
diff --git a/tools/amd_build/disabled_features.yaml b/tools/amd_build/disabled_features.yaml
index 1554c839ca7e2b..a74abbd2a95843 100644
--- a/tools/amd_build/disabled_features.yaml
+++ b/tools/amd_build/disabled_features.yaml
@@ -126,7 +126,8 @@
     "aten/src/ATen/native/cuda/CuFFTUtils.h",
     "aten/src/ATen/native/cuda/CuFFTPlanCache.h",
     "aten/src/ATen/native/cuda/SpectralOps.cu",
-    "aten/src/ATen/native/cuda/Distributions.cu"
+    "aten/src/ATen/native/cuda/Distributions.cu",
+    "aten/src/ATen/native/cuda/Dropout.cu"
   ],
   "disabled_functions": [
     {
diff --git a/torch/nn/_functions/dropout.py b/torch/nn/_functions/dropout.py
index 19b589a321f714..24647c0f83900d 100644
--- a/torch/nn/_functions/dropout.py
+++ b/torch/nn/_functions/dropout.py
@@ -15,6 +15,10 @@ def symbolic(g, input, p=0.5, train=False, inplace=False):
         r, _ = g.op("Dropout", input, ratio_f=p, outputs=2)
         return r
 
+    @staticmethod
+    def _fused_kernel_acceptable(input, p, cls_name, inplace):
+        return input.is_cuda and p > 0 and p < 1 and not inplace and cls_name == 'Dropout'
+
     @classmethod
     def forward(cls, ctx, input, p=0.5, train=False, inplace=False):
         if p < 0 or p > 1:
@@ -23,10 +27,15 @@ def forward(cls, ctx, input, p=0.5, train=False, inplace=False):
         ctx.p = p
         ctx.train = train
         ctx.inplace = inplace
+        ctx.use_fused_kernel = Dropout._fused_kernel_acceptable(input, ctx.p, cls.__name__, ctx.inplace)
 
         if ctx.p == 0 or not ctx.train:
             return input
 
+        if ctx.use_fused_kernel:
+            output, ctx.noise = input._fused_dropout(1 - ctx.p)
+            return output
+
         if ctx.inplace:
             ctx.mark_dirty(input)
             output = input
@@ -45,7 +54,13 @@ def forward(cls, ctx, input, p=0.5, train=False, inplace=False):
 
     @staticmethod
     def backward(ctx, grad_output):
-        if ctx.p > 0 and ctx.train:
+        if ctx.use_fused_kernel:
+            if not grad_output.requires_grad:
+                return grad_output._masked_scale(ctx.noise, 1. / (1 - ctx.p)), None, None, None
+            else:
+                # use autograd-friendly backward if double backward is required
+                return grad_output * (ctx.noise.type_as(grad_output) * (1. / (1 - ctx.p))), None, None, None
+        elif ctx.p > 0 and ctx.train:
             return grad_output * ctx.noise, None, None, None
         else:
             return grad_output, None, None, None
@@ -84,6 +99,7 @@ def forward(cls, ctx, input, p=0.5, train=False, inplace=False):
         if p < 0 or p > 1:
             raise ValueError("dropout probability has to be between 0 and 1, "
                              "but got {}".format(p))
+        ctx.use_fused_kernel = False
         ctx.p = p
         ctx.train = train
         ctx.inplace = inplace

From 53904762977ec0db343fe33e9af2a4c52bfaf80e Mon Sep 17 00:00:00 2001
From: Peter Goldsborough <psag@fb.com>
Date: Tue, 7 Aug 2018 13:48:18 -0700
Subject: [PATCH 8/8] Add tracing to custom op and simplify tracer overall
 (#10212)

Summary:
This PR adds tracing infrastructure for custom operators. It also simplifies the tracer overall, and changes the codegen to do more metaprogramming there instead of via C++ (which was necessary for the custom op tracing).

To give an example of the tracer/metaprogramming change, what used to look like this in `VariableType.cpp`:

```
jit::tracer::PreTraceInfo trace_info;
  if (jit::tracer::isTracing()) {
    trace_info = jit::tracer::preRecordTrace(jit::aten::index_select, "self", self, "dim", dim, "index", index);
  }
```

is now simply the inlined version of `preRecordTrace`, minus C++ metaprogramming:

```
torch::jit::Node* node = nullptr;
  if (jit::tracer::isTracing()) {
    auto& graph = jit::tracer::getTracingState()->graph;
    node = graph->create(jit::aten::index_select_out, /*outputs=*/0);
    jit::tracer::recordSourceLocation(node);
    jit::tracer::addInputs(node, "result", result);
    jit::tracer::addInputs(node, "self", self);
    jit::tracer::addInputs(node, "dim", dim);
    jit::tracer::addInputs(node, "index", index);
    graph->appendNode(node);
  }
```

zdevito apaszke
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10212

Differential Revision: D9199615

Pulled By: goldsborough

fbshipit-source-id: cd4b603c1dc01340ead407228e109c99bdba2cfc
---
 tools/autograd/gen_variable_type.py       | 18 +++++--
 tools/autograd/templates/VariableType.cpp |  1 +
 torch/csrc/autograd/python_function.cpp   | 20 ++++----
 torch/csrc/jit/custom_operator.h          | 38 +++++++++++++-
 torch/csrc/jit/ir.h                       |  2 +-
 torch/csrc/jit/python_tracer.cpp          | 14 ++----
 torch/csrc/jit/python_tracer.h            |  2 +-
 torch/csrc/jit/test_jit.cpp               | 56 +++++++++++++++++++--
 torch/csrc/jit/tracer.cpp                 | 27 +++++-----
 torch/csrc/jit/tracer.h                   | 60 +++++++++--------------
 10 files changed, 157 insertions(+), 81 deletions(-)

diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 2bee61b024317e..af35dc9a8a7d67 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -126,15 +126,21 @@
 profiler::RecordFunction profiler("${name}");""")
 
 PRE_RECORD_TRACE = CodeTemplate("""\
-jit::tracer::PreTraceInfo trace_info;
+torch::jit::Node* node = nullptr;
 if (jit::tracer::isTracing()) {
-  trace_info = jit::tracer::preRecordTrace(jit::aten::${trace_name}, ${trace_inputs});
+  auto& graph = jit::tracer::getTracingState()->graph;
+  node = graph->create(jit::aten::${trace_name}, /*outputs=*/0);
+  jit::tracer::recordSourceLocation(node);
+  ${add_trace_inputs}
+  graph->appendNode(node);
 }
 """)
 
+ADD_TRACE_INPUT = CodeTemplate("""jit::tracer::addInputs(node, "${input}", ${input});""")
+
 POST_RECORD_TRACE = CodeTemplate("""\
 if (jit::tracer::isTracing()) {
-  jit::tracer::postRecordTrace( trace_info,  ${trace_outputs} );
+  jit::tracer::postRecordTrace(node, ArrayRef<Variable>(${trace_outputs}) );
 }
 """)
 
@@ -372,7 +378,11 @@ def emit_record_trace(env):
             return ('', '')
 
         local = {}
-        local['trace_inputs'] = sum([['"{}"'.format(arg['name']), arg['name']] for arg in declaration['arguments']], [])
+
+        add_trace_inputs = []
+        for argument in declaration['arguments']:
+            add_trace_inputs.append(ADD_TRACE_INPUT.substitute(input=argument['name']))
+        local['add_trace_inputs'] = '\n'.join(add_trace_inputs)
 
         # Record inplace operations as out-of-place operations (e.g.,
         # not add_ but add)
diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp
index fd8a960b6f2fa1..da5c5456780495 100644
--- a/tools/autograd/templates/VariableType.cpp
+++ b/tools/autograd/templates/VariableType.cpp
@@ -13,6 +13,7 @@
 #include "torch/csrc/jit/tracer.h"
 #include "torch/csrc/jit/constants.h"
 #include "torch/csrc/jit/symbolic_variable.h"
+#include "torch/csrc/jit/ir.h"
 
 #include "torch/csrc/utils/variadic.h"
 #include "torch/csrc/autograd/functions/utils.h"
diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp
index a1dca1e2eed9da..b1a835edf1fb98 100644
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@@ -558,12 +558,12 @@ static void _assert_not_tracing(const char* name, const variable_list& input_var
   }
 }
 
-static jit::tracer::PreTraceInfo _trace_pre_record(
+static Node* _trace_pre_record(
     PyObject* op_obj,
     PyObject *input_objects,
     const variable_list& input_vars) {
   if (!jit::tracer::isTracing()) {
-    return jit::tracer::PreTraceInfo();
+    return nullptr;
   }
 
   // Save scalar args and the calling convention
@@ -593,7 +593,7 @@ static jit::tracer::PreTraceInfo _trace_pre_record(
 }
 
 static void _trace_post_record(
-    const jit::tracer::PreTraceInfo& trace_info,
+    Node* node,
     PyObject* op_obj,
     const variable_list& input_vars,
     PyObject *output_objects,
@@ -610,15 +610,15 @@ static void _trace_post_record(
     output_vars[i] = var->cdata;
   }
 
-  jit::tracer::postRecordTrace(trace_info, output_vars);
+  jit::tracer::postRecordTrace(node, output_vars);
 
-  trace_info.n->i_(attr::inplace, is_inplace);
+  node->i_(attr::inplace, is_inplace);
 
 }
 
 PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const UnpackedInput& unpacked,
                           PyObject *inputs, THPObjectPtr&& raw_output, bool is_executable,
-                          const jit::tracer::PreTraceInfo& trace_info) {
+                          Node* node) {
   bool unpack_output = ensure_tuple(raw_output);
 
   auto num_outputs = PyTuple_GET_SIZE(raw_output.get());
@@ -639,7 +639,7 @@ PyObject* process_outputs(PyObject *op_obj, THPFunction* grad_fn, const Unpacked
 
   bool is_inplace = static_cast<bool>(grad_fn->dirty_tensors);
   _wrap_outputs(grad_fn, inputs, raw_output, outputs, is_executable);
-  _trace_post_record(trace_info, op_obj, unpacked.input_vars, outputs, is_inplace);
+  _trace_post_record(node, op_obj, unpacked.input_vars, outputs, is_inplace);
   if (is_executable) {
     _save_variables(grad_fn);
   } else {
@@ -687,7 +687,7 @@ PyObject *THPFunction_do_forward(THPFunction *self, PyObject *_inputs)
   }
 
   return process_outputs(nullptr, self, unpacked_input, _inputs, std::move(raw_output),
-                         is_executable, jit::tracer::PreTraceInfo());
+                         is_executable, nullptr);
   END_HANDLE_TH_ERRORS
 }
 
@@ -708,7 +708,7 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs)
   InputFlags& input_info = info_pair.second;
 
   // Record input nodes if tracing
-  auto trace_info = _trace_pre_record(cls, inputs, unpacked_input.input_vars);
+  auto* node = _trace_pre_record(cls, inputs, unpacked_input.input_vars);
 
   // Initialize backward function (and ctx)
   bool is_executable = input_info.is_executable;
@@ -737,7 +737,7 @@ PyObject *THPFunction_apply(PyObject *cls, PyObject *inputs)
   }
 
   return process_outputs(cls, ctx, unpacked_input, inputs, std::move(tensor_outputs),
-                         is_executable, trace_info);
+                         is_executable, node);
   END_HANDLE_TH_ERRORS
 }
 
diff --git a/torch/csrc/jit/custom_operator.h b/torch/csrc/jit/custom_operator.h
index 63e9901964aacd..f0cc29db346963 100644
--- a/torch/csrc/jit/custom_operator.h
+++ b/torch/csrc/jit/custom_operator.h
@@ -4,6 +4,7 @@
 #include <torch/csrc/jit/ir.h>
 #include <torch/csrc/jit/operator.h>
 #include <torch/csrc/jit/stack.h>
+#include <torch/csrc/jit/tracer.h>
 #include <torch/csrc/utils/variadic.h>
 
 #include <caffe2/utils/Metaprogramming.h>
@@ -56,6 +57,25 @@ FunctionSchema createFunctionSchemaFromTraits(const std::string& name) {
   return {name, arguments, returns};
 }
 
+template <size_t... Is, typename... Types>
+Node* getTracedNode(
+    const FunctionSchema& schema,
+    const std::tuple<Types...>& tuple) {
+  auto symbol = Symbol::fromQualString(schema.name);
+  const auto& graph = tracer::getTracingState()->graph;
+  Node* node = graph->create(std::move(symbol), /*outputs=*/0);
+  tracer::recordSourceLocation(node);
+
+  // Hack to call addInputs for the parameter pack in a sequenced fashion.
+  // https://stackoverflow.com/questions/12030538/calling-a-function-for-each-variadic-template-argument-and-an-array
+  int _[] = {(tracer::addInputs(node, schema.arguments[Is].name.c_str(), std::get<Is>(tuple)), 0)...};
+  (void)_;
+
+  graph->appendNode(node);
+
+  return node;
+}
+
 /// Does two things for an operator implementation and a tuple of arguments:
 /// 1. Pops all necessary arguments off the stack into the tuple's elements,
 /// 2. Unpacks the tuple and calls the operator implementation.
@@ -66,12 +86,25 @@ template <
     typename... Types,
     size_t... Is>
 ReturnType callOperatorWithTuple(
+    const FunctionSchema& schema,
     Implementation&& implementation,
     Stack& stack,
     std::tuple<Types...>& tuple,
     Indices<Is...>) {
+  Node* node = nullptr;
+  if (jit::tracer::isTracing()) {
+    node = getTracedNode<Is...>(schema, tuple);
+  }
+
   pop(stack, std::get<Is>(tuple)...);
-  return std::forward<Implementation>(implementation)(std::get<Is>(tuple)...);
+  auto result =
+      std::forward<Implementation>(implementation)(std::get<Is>(tuple)...);
+
+  if (jit::tracer::isTracing()) {
+    jit::tracer::postRecordTrace(node, result);
+  }
+
+  return result;
 }
 
 void checkArgumentVector(
@@ -175,9 +208,10 @@ Operator createOperator(
 
   auto schema = torch::jit::detail::inferAndCheckSchema<Traits>(schemaOrName);
 
-  return Operator(schema, [implementation](Stack& stack) {
+  return Operator(schema, [implementation, schema](Stack& stack) {
     ArgumentTuple tuple;
     auto result = torch::jit::detail::callOperatorWithTuple<ReturnType>(
+        schema,
         std::move(implementation),
         stack,
         tuple,
diff --git a/torch/csrc/jit/ir.h b/torch/csrc/jit/ir.h
index 94f1cbd0fac446..447d5eb3219049 100644
--- a/torch/csrc/jit/ir.h
+++ b/torch/csrc/jit/ir.h
@@ -1131,7 +1131,7 @@ friend struct Block;
     return oss.str();
   }
 
-  friend std::ostream& operator<<(std::ostream & out, const Graph & g);
+  friend TORCH_API std::ostream& operator<<(std::ostream & out, const Graph & g);
   TORCH_API std::shared_ptr<Graph> copy();
 
 private:
diff --git a/torch/csrc/jit/python_tracer.cpp b/torch/csrc/jit/python_tracer.cpp
index 494ea4da7a36a2..8b68da694f596f 100644
--- a/torch/csrc/jit/python_tracer.cpp
+++ b/torch/csrc/jit/python_tracer.cpp
@@ -69,7 +69,7 @@ std::shared_ptr<torch::jit::Graph> createGraphByTracing(
   }
 }
 
-PreTraceInfo preRecordPythonTrace(THPObjectPtr pyobj,
+Node* preRecordPythonTrace(THPObjectPtr pyobj,
                                   std::string arg_types,
                                   at::ArrayRef<Variable> inputs,
                                   pyobj_list scalar_args) {
@@ -78,14 +78,10 @@ PreTraceInfo preRecordPythonTrace(THPObjectPtr pyobj,
     throw python_error();
   }
 
-  PreTraceInfo info;
-  auto & state = getTracingState();
-  auto & graph = state->graph;
+  auto & graph = getTracingState()->graph;
 
-  Node *n = info.n = graph->createPythonOp(
-                        std::move(apply),
-                        arg_types,
-                        std::move(scalar_args));
+  Node* n = graph->createPythonOp(
+      std::move(apply), arg_types, std::move(scalar_args));
   recordSourceLocation(n);
 
   for (const Variable & input : inputs) {
@@ -95,7 +91,7 @@ PreTraceInfo preRecordPythonTrace(THPObjectPtr pyobj,
   // NB: Order matters. This must append after inputs but before outputs.
   graph->appendNode(n);
 
-  return info;
+  return n;
 }
 
 void pythonRecordSourceLocation(Node* n) {
diff --git a/torch/csrc/jit/python_tracer.h b/torch/csrc/jit/python_tracer.h
index 960f48516dd4e1..ae3a63d5b4e4e2 100644
--- a/torch/csrc/jit/python_tracer.h
+++ b/torch/csrc/jit/python_tracer.h
@@ -10,7 +10,7 @@ void initPythonTracerBindings(PyObject *module);
 
 
 std::string getPythonInterpreterStackTrace();
-tracer::PreTraceInfo preRecordPythonTrace(
+Node* preRecordPythonTrace(
     THPObjectPtr pyobj, std::string arg_types, at::ArrayRef<autograd::Variable> inputs,
     pyobj_list scalar_args);
 
diff --git a/torch/csrc/jit/test_jit.cpp b/torch/csrc/jit/test_jit.cpp
index dd523c8d741892..e4e9781bc78217 100644
--- a/torch/csrc/jit/test_jit.cpp
+++ b/torch/csrc/jit/test_jit.cpp
@@ -985,12 +985,12 @@ void testCustomOperators() {
     REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType);
 
     Stack stack;
-    push(stack, 2.0f, at::ones(5));
+    push(stack, 2.0f, autograd::make_variable(at::ones(5)));
     op->getOperation()(stack);
     at::Tensor output;
     pop(stack, output);
 
-    REQUIRE(output.allclose(at::full(5, 3.0f)));
+    REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f))));
   }
   {
     RegisterOperators reg({createOperator(
@@ -1014,12 +1014,12 @@ void testCustomOperators() {
     REQUIRE(op->schema().returns[0].type->kind() == TypeKind::DynamicType);
 
     Stack stack;
-    push(stack, 2.0f, at::ones(5));
+    push(stack, 2.0f, autograd::make_variable(at::ones(5)));
     op->getOperation()(stack);
     at::Tensor output;
     pop(stack, output);
 
-    REQUIRE(output.allclose(at::full(5, 3.0f)));
+    REQUIRE(output.allclose(autograd::make_variable(at::full(5, 3.0f))));
   }
   {
     // Check that lists work well.
@@ -1050,7 +1050,7 @@ void testCustomOperators() {
     Stack stack;
     push(stack, std::vector<int64_t>{1, 2});
     push(stack, std::vector<double>{1.0, 2.0});
-    push(stack, std::vector<at::Tensor>{at::ones(5)});
+    push(stack, std::vector<at::Tensor>{autograd::make_variable(at::ones(5))});
     op->getOperation()(stack);
     std::vector<double> output;
     pop(stack, output);
@@ -1089,6 +1089,52 @@ void testCustomOperators() {
                    "for the return value in that position"));
 #endif // USE_CATCH
   }
+  {
+    auto op = createOperator(
+        "traced::op(float a, Tensor b) -> Tensor",
+        [](double a, at::Tensor b) { return a + b; });
+
+    std::shared_ptr<tracer::TracingState> state;
+    variable_list trace_vars_in;
+    std::tie(state, trace_vars_in) = tracer::enter({});
+
+    Stack stack;
+    push(stack, 2.0f, autograd::make_variable(at::ones(5)));
+    op.getOperation()(stack);
+    at::Tensor output = autograd::make_variable(at::empty({}));
+    pop(stack, output);
+
+    tracer::exit({output});
+
+    std::string op_name("traced::op");
+    bool contains_traced_op = false;
+    for (const auto& node : state->graph->nodes()) {
+      if (std::string(node->kind().toQualString()) == op_name) {
+        contains_traced_op = true;
+        break;
+      }
+    }
+    REQUIRE(contains_traced_op);
+  }
+  {
+#ifdef USE_CATCH
+    // vector<double> is not supported yet.
+    auto op = createOperator(
+        "traced::op(float[] f) -> int",
+        [](const std::vector<double>& f) -> int64_t { return f.size(); });
+
+    std::shared_ptr<tracer::TracingState> state;
+    variable_list trace_vars_in;
+    std::tie(state, trace_vars_in) = tracer::enter({});
+
+    Stack stack;
+    push(stack, std::vector<double>{1.0});
+
+    REQUIRE_THROWS_WITH(
+        op.getOperation()(stack),
+        StartsWith("Tracing float lists currently not supported!"));
+#endif
+  }
 }
 
 TORCH_API std::string runJITCPPTests() {
diff --git a/torch/csrc/jit/tracer.cpp b/torch/csrc/jit/tracer.cpp
index d6219685c1acff..b261636a6e5c30 100644
--- a/torch/csrc/jit/tracer.cpp
+++ b/torch/csrc/jit/tracer.cpp
@@ -25,17 +25,20 @@ void genericAddInput(Node *n, T value) {
 }
 
 void badArgType() {
-  throw std::runtime_error("Found an unsupported argument type in the JIT tracer. File a bug report.");
+  AT_ERROR("Found an unsupported argument type in the JIT tracer. File a bug report.");
 }
 
+thread_local std::shared_ptr<TracingState> tracing_state;
+
+} // namespace detail
 
-void addInputs(Node *n, const char * name, int64_t value)            { genericAddInput(n, value); }
-void addInputs(Node *n, const char * name, bool value)               { genericAddInput(n, value); }
-void addInputs(Node *n, const char * name, double value)             { genericAddInput(n, value); }
-void addInputs(Node *n, const char * name, const at::Scalar& value)  { genericAddInput(n, value); }
+void addInputs(Node *n, const char * name, int64_t value)            { detail::genericAddInput(n, value); }
+void addInputs(Node *n, const char * name, bool value)               { detail::genericAddInput(n, value); }
+void addInputs(Node *n, const char * name, double value)             { detail::genericAddInput(n, value); }
+void addInputs(Node *n, const char * name, const at::Scalar& value)  { detail::genericAddInput(n, value); }
 void addInputs(Node *n, const char * name, const at::Tensor& value)  { n->addInput(getValueTrace(value)); }
-void addInputs(Node *n, const char * name, const std::string& value)         { badArgType(); }
-void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { badArgType(); }
+void addInputs(Node *n, const char * name, const std::string& value)         { detail::badArgType(); }
+void addInputs(Node *n, const char * name, const at::SparseTensorRef& value) { detail::badArgType(); }
 
 void addInputs(Node *n, const char * name, at::TensorList value) {
   Graph *g = n->owningGraph();
@@ -64,9 +67,9 @@ void addInputs(Node *n, const char * name, at::IntList value) {
   n->addInput(g->insertNode(g->createList(jit::IntType::get(), info))->output());
 }
 
-thread_local std::shared_ptr<TracingState> tracing_state;
-
-} // namespace detail
+void addInputs(Node *n, const char * name, const ArrayRef<double>& value) {
+  AT_ERROR("Tracing float lists currently not supported!");
+}
 
 const std::shared_ptr<TracingState>& getTracingState() {
   return detail::tracing_state;
@@ -81,11 +84,11 @@ TracingState::TracingState()
 
 TracingState::~TracingState() = default;
 
-void postRecordTrace(const PreTraceInfo& info,
+void postRecordTrace(Node* node,
                      at::ArrayRef<Variable> outputs) {
   for (size_t i = 0; i < outputs.size(); i++) {
     auto & output = outputs[i];
-    Value * value = info.n->addOutput();
+    Value * value = node->addOutput();
     if (output.defined()) {
       value->inferTypeFrom(output.data());
       setValueTrace(output, value);
diff --git a/torch/csrc/jit/tracer.h b/torch/csrc/jit/tracer.h
index fe34b12df30fd6..337e9d892fb826 100644
--- a/torch/csrc/jit/tracer.h
+++ b/torch/csrc/jit/tracer.h
@@ -1,13 +1,17 @@
 #pragma once
 
+#include "torch/csrc/autograd/function_hook.h"
+#include "torch/csrc/autograd/variable.h"
 #include "torch/csrc/jit/assertions.h"
-#include "torch/csrc/jit/ir.h"
 #include "torch/csrc/jit/constants.h"
-#include "torch/csrc/WindowsTorchApiMacro.h"
+#include "torch/csrc/jit/ir.h"
+#include "torch/csrc/utils/functional.h"
 #include "torch/csrc/utils/functional.h"
 #include "torch/csrc/utils/variadic.h"
-#include "torch/csrc/autograd/function_hook.h"
-#include "torch/csrc/autograd/variable.h"
+#include "torch/csrc/utils/variadic.h"
+#include "torch/csrc/WindowsTorchApiMacro.h"
+
+#include <ATen/Backtrace.h>
 
 #include <memory>
 #include <mutex>
@@ -187,18 +191,10 @@ inline void abandon() {
   setTracingState(nullptr);
 }
 
-// Pre-recorded information about the trace before we actually carry
-// out the trace
-struct PreTraceInfo {
-  Node *n;
-};
-
 
 TORCH_API void recordSourceLocation(Node* n);
 TORCH_API void setRecordSourceLocation(void (*v)(Node*));
 
-namespace detail {
-
 // NB: those serve both as an intermediate steps in addInputs below,
 // as well as the overloads that terminate template recursion
 void addInputs(Node *n, const char * name, int64_t value);
@@ -208,6 +204,7 @@ void addInputs(Node *n, const char * name, const at::Scalar& value);
 void addInputs(Node *n, const char * name, const at::Tensor& value);
 void addInputs(Node *n, const char * name, at::IntList value);
 void addInputs(Node *n, const char * name, at::TensorList value);
+void addInputs(Node *n, const char * name, const ArrayRef<double>& value);
 void addInputs(Node *n, const char * name, const std::string& value);
 void addInputs(Node *n, const char * name, const at::SparseTensorRef& value);
 
@@ -216,34 +213,23 @@ void addInputs(Node *n, const char * name, std::array<bool, N> value) {
   throw std::runtime_error("Found an unsupported argument type in the JIT tracer. File a bug report.");
 }
 
-template<typename T, typename... Args>
-void addInputs(Node *n, const char * arg_name, T arg, const char * next_arg_name, Args... args) {
-  addInputs(n, arg_name, arg);
-  addInputs(n, next_arg_name, args...);
-}
-
-} // namespace detail
+TORCH_API void postRecordTrace(Node* node, at::ArrayRef<Variable> outputs);
 
-// NB: if you change this function, you might want to take a look at
-// preRecordPythonTrace from python_tracer.cpp
-template<typename... Args>
-PreTraceInfo preRecordTrace(Symbol op, Args... inputs) {
-  PreTraceInfo info;
-  auto & state = getTracingState();
-  auto & graph = state->graph;
-
-  Node * n = info.n = graph->create(op, /*outputs=*/0);
-  recordSourceLocation(n);
-
-  detail::addInputs(n, inputs...);
-
-  // NB: Order matters. This must append after inputs but before outputs.
-  graph->appendNode(n);
-
-  return info;
+inline void postRecordTrace(Node* node, at::ArrayRef<at::Tensor> tensors) {
+  postRecordTrace(node, fmap<Variable>(tensors));
 }
 
-TORCH_API void postRecordTrace(const PreTraceInfo& info, at::ArrayRef<Variable> outputs);
+template <
+    typename T,
+    typename = torch::enable_if_t<
+        (!std::is_convertible<torch::decay_t<T>, ArrayRef<Variable>>::value &&
+         !std::is_convertible<torch::decay_t<T>, ArrayRef<at::Tensor>>::value &&
+         !std::is_convertible<torch::decay_t<T>, Variable>::value)>>
+void postRecordTrace(Node* node, T&&) {
+  AT_ERROR(
+      "Found an unsupported argument type ", at::demangle_type<T>(),
+      " in the JIT tracer. File a bug report.");
+}
 
 TORCH_API autograd::Variable getSizeOf(const autograd::Variable& var, int64_t dim);