@@ -9413,7 +9413,7 @@ TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalizationBackward_CUDA) {
9413
9413
"");
9414
9414
}
9415
9415
9416
- TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA ) {
9416
+ TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalShared_CUDA ) {
9417
9417
Fusion fusion;
9418
9418
FusionGuard fg(&fusion);
9419
9419
@@ -9519,10 +9519,11 @@ TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) {
9519
9519
const int64_t dimy = 16384;
9520
9520
9521
9521
auto properties = at::cuda::getDeviceProperties(0);
9522
- // Require 70KB of smem to run test
9523
- const size_t required_smem_size = 70 << 10 ;
9522
+ const size_t required_smem_size =
9523
+ (dimy - static_size) * sizeof(float) + TIDX * sizeof(float) ;
9524
9524
if (properties->sharedMemPerBlockOptin < required_smem_size) {
9525
- GTEST_SKIP() << "not enough shared memory space on device to run test";
9525
+ GTEST_SKIP() << "not enough shared memory space on device to run test: "
9526
+ << properties->sharedMemPerBlock;
9526
9527
}
9527
9528
9528
9529
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
@@ -9708,6 +9709,14 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) {
9708
9709
const float kEps = 1e-5;
9709
9710
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
9710
9711
9712
+ auto properties = at::cuda::getDeviceProperties(0);
9713
+ const size_t required_smem_size =
9714
+ (dimy - static_size) * sizeof(float) + TIDX * sizeof(float);
9715
+ if (properties->sharedMemPerBlockOptin < required_smem_size) {
9716
+ GTEST_SKIP() << "not enough shared memory space on device to run test: "
9717
+ << properties->sharedMemPerBlock;
9718
+ }
9719
+
9711
9720
at::Tensor aten_input = at::randn({dimx, dimy}, options);
9712
9721
at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
9713
9722
at::Tensor aten_dynamic_in =
@@ -9723,13 +9732,6 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) {
9723
9732
torch::jit::fuser::cuda::FusionExecutor fe;
9724
9733
fe.compileFusion(&fusion, aten_inputs);
9725
9734
9726
- auto properties = at::cuda::getDeviceProperties(0);
9727
- // Require 70KB of smem to run test
9728
- const size_t required_smem_size = 70 << 10;
9729
- if (properties->sharedMemPerBlockOptin < required_smem_size) {
9730
- GTEST_SKIP() << "not enough shared memory space on device to run test";
9731
- }
9732
-
9733
9735
fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out});
9734
9736
9735
9737
auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
0 commit comments