Commit ccea031

jeffdaily authored and pragupta committed
cublaslt/hipblaslt persistent workspace (pytorch#156495)
Similar to cublas/hipblas, LT now allocates one workspace per handle+stream combo.

- fixes hipblaslt issue where memory use increased during graph capture
- preserves CUDA env var TORCH_CUBLASLT_UNIFIED_WORKSPACE
- moves LT workspace and size from CUDABlas.cpp into CublasHandlePool.cpp, new APIs
  - size_t getCUDABlasLtWorkspaceSize()
  - void* getCUDABlasLtWorkspace()

Fixes ROCm#2286.

Pull Request resolved: pytorch#156495
Approved by: https://github.com/eqy
(cherry picked from commit 996206e)
1 parent 38abb1a commit ccea031

File tree

4 files changed: +102, -109 lines changed


aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 6 additions & 79 deletions
@@ -188,82 +188,11 @@ uint32_t _getAlignment(uintptr_t address) {
 }
 #endif
 
-static size_t _parseChosenWorkspaceSize() {
-  auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE");
-#ifdef USE_ROCM
-  if (!val.has_value()) {
-    // accept either env var
-    val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE");
-  }
-  size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */
-#else
-  size_t workspace_size = 1024; /* default size in KiB according to #73328 */
-#endif
-
-  if (val.has_value()) {
-    try {
-      workspace_size = std::stoi(val.value());
-    } catch (std::invalid_argument const&) {
-      TORCH_WARN(
-          "invalid CUBLASLT_WORKSPACE_SIZE,",
-          " using default workspace size of ",
-          workspace_size,
-          " KiB.");
-    } catch (std::out_of_range const&) {
-      TORCH_WARN(
-          "CUBLASLT_WORKSPACE_SIZE out of range,",
-          " using default workspace size of ",
-          workspace_size,
-          " KiB.");
-    }
-  }
-  return workspace_size * 1024;
-}
-
-static size_t _getWorkspaceSize() {
-  static size_t workspace_size = _parseChosenWorkspaceSize();
-  return workspace_size;
-}
-
-void* _getUnifiedWorkspaceWithoutHandle() {
-  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
-  auto stream = c10::cuda::getCurrentCUDAStream();
-  cudaStream_t _stream = stream;
-  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-  auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
-  TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
-  return workspace_it->second.mutable_get();
-}
-
 struct CublasLtWorkspace {
   CublasLtWorkspace() {
-    size = _getWorkspaceSize();
-#ifndef USE_ROCM
-    static bool unified = c10::utils::check_env("TORCH_CUBLASLT_UNIFIED_WORKSPACE") == true;
-    if (unified) {
-      auto cublasWorkspaceSize = at::cuda::getChosenWorkspaceSize();
-      if (cublasWorkspaceSize < size) {
-        TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", size,
-                        " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize,
-                        " bytes. Please increase CUBLAS workspace size",
-                        " via CUBLAS_WORKSPACE_CONFIG or decrease requested"
-                        " CUBLASLT_WORKSPACE_SIZE. Otherwise CUBLASLT workspace"
-                        " size will be limited to the CUBLAS workspace size.");
-        size = cublasWorkspaceSize;
-      }
-      ptr = _getUnifiedWorkspaceWithoutHandle();
-    } else {
-      auto allocator = c10::cuda::CUDACachingAllocator::get();
-      stashed_ptr_ = allocator->allocate(size);
-      ptr = stashed_ptr_.mutable_get();
-    }
-#else
-    auto allocator = c10::cuda::CUDACachingAllocator::get();
-    stashed_ptr_ = allocator->allocate(size);
-    ptr = stashed_ptr_.mutable_get();
-#endif
+    size = at::cuda::getCUDABlasLtWorkspaceSize();
+    ptr = at::cuda::getCUDABlasLtWorkspace();
   }
-  at::DataPtr stashed_ptr_;
   void * ptr;
   size_t size;
 };

@@ -2111,10 +2040,8 @@ void int8_gemm(
 
 #ifdef USE_ROCM
   CuBlasLtMatmulPreference preference;
-  size_t workspaceSize = _getWorkspaceSize();
-  preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize);
-  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
-  auto workspace = allocator.allocate(workspaceSize);
+  auto ltworkspace = CublasLtWorkspace();
+  preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size);
   cublasLtMatmulHeuristicResult_t heuristicResult = {};
   int returnedResult = 0;
   TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(

@@ -2152,12 +2079,12 @@ void int8_gemm(
       nullptr, // Heuristics don't seem to work for int8
 #endif
 #ifdef USE_ROCM
-      workspace.mutable_get(),
+      ltworkspace.ptr,
 #else
       nullptr, // Non-zero workspace doesn't seem to work.
 #endif
 #ifdef USE_ROCM
-      workspaceSize,
+      ltworkspace.size,
 #else
       0,
 #endif
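The helper removed here interpreted the workspace-size environment variable in KiB (defaulting to 1024 KiB for cuBLASLt and 76*1024 KiB for hipBLASLt) and fell back to the default on malformed values; the same rules now live in CublasHandlePool.cpp as parseCUDABlasLtWorkspaceSize(). The standalone sketch below mirrors only those parsing rules; it uses std::getenv in place of c10::utils::get_env, and its function names are illustrative, not part of this change.

#include <cstdlib>
#include <iostream>
#include <optional>
#include <string>

// Stand-in for c10::utils::get_env.
static std::optional<std::string> get_env(const char* name) {
  const char* v = std::getenv(name);
  return v ? std::optional<std::string>(v) : std::nullopt;
}

// Mirrors the parsing rules in the diff: the value is a size in KiB, ROCm
// accepts either env var, and a malformed value falls back to the default.
static size_t lt_workspace_bytes() {
  auto val = get_env("CUBLASLT_WORKSPACE_SIZE");
#ifdef USE_ROCM
  if (!val.has_value()) {
    val = get_env("HIPBLASLT_WORKSPACE_SIZE");  // either env var is accepted
  }
  size_t workspace_size = 76 * 1024;            // 76 MiB default for hipBLASLt
#else
  size_t workspace_size = 1024;                 // 1 MiB default for cuBLASLt
#endif
  if (val.has_value()) {
    try {
      workspace_size = std::stoul(val.value());
    } catch (const std::exception&) {
      std::cerr << "invalid size, using default of " << workspace_size << " KiB\n";
    }
  }
  return workspace_size * 1024;                 // KiB -> bytes
}

int main() {
  std::cout << lt_workspace_bytes() << " bytes\n";
}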

aten/src/ATen/cuda/CUDAContextLight.h

Lines changed: 3 additions & 0 deletions
@@ -89,7 +89,10 @@ TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
 
 TORCH_CUDA_CPP_API void clearCublasWorkspaces();
 TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
 TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
+TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
+TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
 
 #if defined(CUDART_VERSION) || defined(USE_ROCM)
 TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
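A minimal sketch (assuming an ATen/CUDA build with a current device and stream) of how code outside CUDABlas.cpp might consume the two new accessors declared above. The helper name is illustrative and not part of this change; the returned pointer is owned by the handle pool, so callers never free it.

#include <utility>
#include <ATen/cuda/CUDAContextLight.h>

// Illustrative helper: query the pooled cuBLASLt workspace for the current
// handle + stream combination.
std::pair<void*, size_t> currentLtWorkspace() {
  size_t bytes = at::cuda::getCUDABlasLtWorkspaceSize(); // honors CUBLASLT_WORKSPACE_SIZE / HIPBLASLT_WORKSPACE_SIZE
  void* ptr = at::cuda::getCUDABlasLtWorkspace();        // allocated once per handle+stream, then reused
  return {ptr, bytes};                                   // pool retains ownership; do not free ptr
}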

aten/src/ATen/cuda/CublasHandlePool.cpp

Lines changed: 91 additions & 0 deletions
@@ -23,6 +23,9 @@
  * To work around this difference in behavior, a separate handle pool is available for ROCm builds.
  * For CUDA builds, getCurrentCUDABlasLtHandle will alias for getCurrentCUDABlasHandle,
  * whereas for ROCm builds, it is a distinct function.
+ *
+ * The workspace pools are separate for ROCm. On CUDA, the env var
+ * TORCH_CUBLASLT_UNIFIED_WORKSPACE can be used to opt-in to unifying the workspace pools.
  */
 
 namespace at::cuda {

@@ -109,8 +112,14 @@ std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
   return instance;
 }
 
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
+  return instance;
+}
+
 void clearCublasWorkspaces() {
   cublas_handle_stream_to_workspace().clear();
+  cublaslt_handle_stream_to_workspace().clear();
 }
 
 size_t parseChosenWorkspaceSize() {

@@ -157,15 +166,97 @@ size_t parseChosenWorkspaceSize() {
   }
 }
 
+size_t parseCUDABlasLtWorkspaceSize() {
+  auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE");
+#ifdef USE_ROCM
+  if (!val.has_value()) {
+    // accept either env var
+    val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE");
+  }
+  size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */
+#else
+  size_t workspace_size = 1024; /* default size in KiB according to #73328 */
+#endif
+
+  if (val.has_value()) {
+    try {
+      workspace_size = std::stoi(val.value());
+    } catch (std::invalid_argument const&) {
+      TORCH_WARN(
+          "invalid CUBLASLT_WORKSPACE_SIZE,",
+          " using default workspace size of ",
+          workspace_size,
+          " KiB.");
+    } catch (std::out_of_range const&) {
+      TORCH_WARN(
+          "CUBLASLT_WORKSPACE_SIZE out of range,",
+          " using default workspace size of ",
+          workspace_size,
+          " KiB.");
+    }
+  }
+  return workspace_size * 1024;
+}
+
 size_t getChosenWorkspaceSize() {
   size_t pool_size = parseChosenWorkspaceSize();
   return pool_size;
 }
 
+#define TORCH_CUBLASLT_UNIFIED_WORKSPACE "TORCH_CUBLASLT_UNIFIED_WORKSPACE"
+
+size_t getCUDABlasLtWorkspaceSize() {
+  size_t pool_size = parseCUDABlasLtWorkspaceSize();
+#ifndef USE_ROCM
+  static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
+  if (unified) {
+    auto cublasWorkspaceSize = getChosenWorkspaceSize();
+    if (cublasWorkspaceSize < pool_size) {
+      TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", pool_size,
+                      " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize,
+                      " bytes. Please increase CUBLAS workspace size",
+                      " via CUBLAS_WORKSPACE_CONFIG or decrease requested"
+                      " CUBLASLT_WORKSPACE_SIZE. Otherwise CUBLASLT workspace"
+                      " size will be limited to the CUBLAS workspace size.");
+      pool_size = cublasWorkspaceSize;
+    }
+  }
+#endif
+  return pool_size;
+}
+
 at::DataPtr getNewWorkspace() {
   return c10::cuda::CUDACachingAllocator::get()->allocate(getChosenWorkspaceSize());
 }
 
+at::DataPtr getNewCUDABlasLtWorkspace() {
+  return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
+}
+
+void* getCUDABlasLtWorkspace() {
+#ifndef USE_ROCM
+  static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
+  if (unified) {
+    cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
+    auto stream = c10::cuda::getCurrentCUDAStream();
+    cudaStream_t _stream = stream;
+    auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+    auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
+    TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
+    return workspace_it->second.mutable_get();
+  }
+#endif
+  cublasLtHandle_t handle = getCurrentCUDABlasLtHandle();
+  auto stream = c10::cuda::getCurrentCUDAStream();
+  cudaStream_t _stream = stream;
+  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+  auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
+    workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
+  }
+  return workspace_it->second.mutable_get();
+}
+
 cublasHandle_t getCurrentCUDABlasHandle() {
   c10::DeviceIndex device = 0;
   AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
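The heart of the fix is the lazy per-(handle, stream) cache added above: the first request for a given handle+stream pair allocates one workspace through the caching allocator, and every later request, including work replayed during CUDA graph capture, returns the same buffer instead of allocating a new one. The standalone sketch below reproduces that caching pattern with plain C++ types; the real code keys on the cublasLt handle and CUDA stream and stores at::DataPtr values.

#include <cstdio>
#include <map>
#include <memory>
#include <tuple>
#include <vector>

// Stand-ins for a BLAS handle, a CUDA stream, and an allocated workspace.
using Key = std::tuple<void*, void*>;                 // (handle, stream)
using Workspace = std::unique_ptr<std::vector<char>>; // owned buffer

std::map<Key, Workspace>& handle_stream_to_workspace() {
  static auto& instance = *new std::map<Key, Workspace>; // leaked on purpose, like the real pool
  return instance;
}

void* get_workspace(void* handle, void* stream, size_t bytes) {
  auto key = std::make_tuple(handle, stream);
  auto it = handle_stream_to_workspace().find(key);
  if (it == handle_stream_to_workspace().end()) {
    // First use of this handle+stream pair: allocate once and cache it.
    it = handle_stream_to_workspace().emplace_hint(
        it, key, std::make_unique<std::vector<char>>(bytes));
  }
  // Every later call for the same pair reuses the cached buffer.
  return it->second->data();
}

int main() {
  int handle = 0, stream = 0;
  void* a = get_workspace(&handle, &stream, 1 << 20);
  void* b = get_workspace(&handle, &stream, 1 << 20);
  std::printf("same buffer reused: %s\n", a == b ? "yes" : "no"); // prints "yes"
}

Because the map outlives individual calls, graph capture no longer drives repeated allocations; clearCublasWorkspaces() is the explicit release point for both the cuBLAS and cuBLASLt pools.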

aten/src/ATen/cuda/tunable/GemmHipblaslt.h

Lines changed: 2 additions & 30 deletions
@@ -381,28 +381,6 @@ static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) {
   return HIPBLAS_OP_T;
 }
 
-static size_t GetHipblasltWorkspaceSize() {
-  static const auto env = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE");
-  // 256MB is max workspace size allowed for hipblaslt
-  // hipblaslt-bench uses 32MB
-  // recommendation from hipblaslt author was 76MB
-  // TunableOp hipBLASLt workspace size is aligned with
-  // PyTorch's default in CUDABlas.cpp (_parseChosenWorkspaceSize)
-  size_t workspace_size = 76*1024;
-  if (env) {
-    try {
-      workspace_size = std::stoi(env.value());
-    } catch(std::invalid_argument const& e) {
-      TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,",
-                 " using default workspace size of ", workspace_size, " KiB.");
-    } catch(std::out_of_range const& e) {
-      TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,",
-                 " using default workspace size of ", workspace_size, " KiB.");
-    }
-  }
-  return workspace_size * 1024;
-}
-
 template <typename T, cublasStatus_t (*destructor)(T*)>
 struct HipBlasLtDeleter {
   void operator()(T* x) {

@@ -550,7 +528,7 @@ class HipblasltGemmOp : public Callable<ParamsT> {
      }
    }
 
-    size_t workspace_size = GetHipblasltWorkspaceSize();
+    size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize();
 
     auto op_handle = at::cuda::getCurrentCUDABlasLtHandle();
 

@@ -575,10 +553,7 @@ class HipblasltGemmOp : public Callable<ParamsT> {
      return FAIL;
    }
 
-    void* workspace_buffer = nullptr;
-    if (workspace_size > 0) {
-      workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size);
-    }
+    void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace();
 
    TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle,
                                          matmul.descriptor(),

@@ -601,9 +576,6 @@ class HipblasltGemmOp : public Callable<ParamsT> {
    TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a));
    TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b));
    TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c));
-    if (workspace_size > 0) {
-      c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer);
-    }
    return OK;
  }
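Net effect on the TunableOp hipBLASLt path: the workspace is no longer raw_alloc'd and raw_delete'd around every hipblasLtMatmul call; instead it is borrowed from the persistent pool. A sketch of the resulting calling convention (illustrative; the matmul setup and remaining arguments are elided):

// Size and buffer both come from the shared pool; nothing is allocated or
// freed per GEMM invocation.
size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize();
void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace();
// ... hipblasLtMatmul(op_handle, ..., workspace_buffer, workspace_size, stream) ...
// Do not free workspace_buffer: the pool owns it, and clearCublasWorkspaces()
// releases it.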
