@@ -9432,6 +9432,13 @@ TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) {
9432
9432
const int64_t dimx = 1024;
9433
9433
const int64_t dimy = 16384;
9434
9434
9435
+ auto properties = at::cuda::getDeviceProperties(0);
9436
+ // Require 70KB of smem to run test
9437
+ const size_t required_smem_size = 70 << 10;
9438
+ if (properties->sharedMemPerBlockOptin < required_smem_size) {
9439
+ GTEST_SKIP() << "not enough shared memory space on device to run test";
9440
+ }
9441
+
9435
9442
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
9436
9443
at::Tensor aten_input = at::randn({dimx, dimy}, options);
9437
9444
at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
@@ -9631,6 +9638,13 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) {
9631
9638
fe.compileFusion(&fusion, aten_inputs);
9632
9639
fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out});
9633
9640
9641
+ auto properties = at::cuda::getDeviceProperties(0);
9642
+ // Require 70KB of smem to run test
9643
+ const size_t required_smem_size = 70 << 10;
9644
+ if (properties->sharedMemPerBlockOptin < required_smem_size) {
9645
+ GTEST_SKIP() << "not enough shared memory space on device to run test";
9646
+ }
9647
+
9634
9648
auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
9635
9649
auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1);
9636
9650
auto at_rvar = at::rsqrt(at::add(at_var, kEps));
0 commit comments