diff --git a/benchmarks/cpp/nvfuser/utils.cpp b/benchmarks/cpp/nvfuser/utils.cpp
index 3915f7d65298..227195740d56 100644
--- a/benchmarks/cpp/nvfuser/utils.cpp
+++ b/benchmarks/cpp/nvfuser/utils.cpp
@@ -6,7 +6,7 @@
 
 using namespace torch::jit::fuser::cuda;
 
-std::string toString(ReductionParams rparams) {
+std::string toString(const ReductionParams& rparams) {
   std::stringstream ss;
   ss << (rparams.fastest_dim ? "Red On Fastest Dim // " : "Red On Slow Dim // ")
      << (rparams.persistent_kernel ? "Persistent Kernel // " : "")
@@ -65,7 +65,7 @@ std::string toString(ReductionParams rparams) {
   return ss.str();
 }
 
-std::string toString(PointwiseParams params) {
+std::string toString(const PointwiseParams& params) {
   std::stringstream ss;
   if (params.break_point) {
     ss << "2D Schedule at " << params.break_point << "/";
@@ -89,6 +89,15 @@ std::string toString(PointwiseParams params) {
   return ss.str();
 }
 
+std::string toString(const TransposeParams& params) {
+  std::stringstream ss;
+  ss << "Tile size: (" << params.tile_size1 << "," << params.tile_size2
+     << ")/";
+  ss << "Vectorize size: (" << params.vectorize_factor1 << ","
+     << params.vectorize_factor2 << ")";
+  return ss.str();
+}
+
 std::string toString(const std::shared_ptr<HeuristicParams>& params) {
   auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
   if (rparams) {
@@ -98,6 +107,10 @@ std::string toString(const std::shared_ptr<HeuristicParams>& params) {
   if (pparams) {
     return toString(*pparams);
   }
+  auto tparams = std::dynamic_pointer_cast<TransposeParams>(params);
+  if (tparams) {
+    return toString(*tparams);
+  }
   TORCH_INTERNAL_ASSERT(
       false,
       "Unknown heuristic parameter type. Did you just added a new heuristic parameter type but forget to update here?");
diff --git a/benchmarks/cpp/nvfuser/utils.h b/benchmarks/cpp/nvfuser/utils.h
index e24fdfb127da..67c5329d9668 100644
--- a/benchmarks/cpp/nvfuser/utils.h
+++ b/benchmarks/cpp/nvfuser/utils.h
@@ -36,8 +36,9 @@ TensorView* makeContigConcreteTensor(
     std::vector<int64_t> shape,
     DataType dtype = DataType::Float);
 
-std::string toString(ReductionParams rparams);
-std::string toString(PointwiseParams params);
+std::string toString(const ReductionParams& rparams);
+std::string toString(const PointwiseParams& params);
+std::string toString(const TransposeParams& params);
 std::string toString(const std::shared_ptr<HeuristicParams>& params);
 std::string toString(LaunchParams lparams);
 
diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index 5df55ee45dfd..d8f2589f81c2 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -41,8 +41,12 @@
 if RUN_NVFUSER and torch.version.cuda is not None:
     CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2])
 
-os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition'
-os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma'
+if 'PYTORCH_NVFUSER_ENABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_ENABLE'] = ""
+os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition,' + os.environ['PYTORCH_NVFUSER_ENABLE']
+if 'PYTORCH_NVFUSER_DISABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_DISABLE'] = ""
+os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,' + os.environ['PYTORCH_NVFUSER_DISABLE']
 os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0'
 # TODO: enable complex when we fixes the extremal cases in OpInfo
 # see issue https://github.com/csarofeen/pytorch/issues/1730"
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index 2fd96e1313c7..86834a853e60 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -974,14 +974,16 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
       const auto& input_tensor = input.toTensor();
       std::cout << " " << input_tensor.scalar_type() << " "
                 << input.toTensor().sizes()
-                << " (strides = " << input.toTensor().strides() << ")"
+                << " (strides = " << input.toTensor().strides()
+                << ", address = " << input.toTensor().data_ptr() << ")"
                 << std::endl;
     }
   }
   std::cout << "Outputs:" << std::endl;
   for (const auto& output : allocated_outputs) {
     std::cout << " " << output.scalar_type() << " " << output.sizes()
-              << " (strides = " << output.strides() << ")" << std::endl;
+              << " (strides = " << output.strides()
+              << ", address = " << output.data_ptr() << ")" << std::endl;
   }
   std::cout << "Reduction and semaphore buffers:" << std::endl;
   TORCH_INTERNAL_ASSERT(
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
index 55f08882c1f0..7b3edfd74cdb 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -1256,15 +1256,18 @@ class TransposeScheduler : public SchedulerEntry {
   }
 
   static bool canScheduleCompileTime(Fusion* fusion) {
-    // Not enabling this yet. Needs more validation.
-    return false;
-#if 0
+    if (!isOptionEnabled(EnableOption::TransposeScheduler)) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose, "not enabled");
+      return false;
+    }
+
     // Temporarily disallow view in transpose scheduler
     // TODO Add more testing before enabling
     auto view_tvs = scheduler_utils::getViewTVs(fusion);
     if (view_tvs.size() > 0) {
       scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Reduction, "No support for view op");
+          ScheduleHeuristic::Transpose, "No support for view op");
       return false;
     }
@@ -1293,7 +1296,6 @@ class TransposeScheduler : public SchedulerEntry {
     }
 
     return true;
-#endif
   }
 
   static bool canScheduleRunTime(
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 6937685e7fb7..afb5f09f2ec6 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -126,13 +126,15 @@ class DomainMap : public pointwise_utils::DomainMap {
         auto group =
             scheduler_utils::getInputsOutputsWithInnerDim(tv, true, false);
         for (auto member_tv : group) {
-          TORCH_INTERNAL_ASSERT(
-              grouped.count(member_tv) == 0 || member_tv == tv,
-              "The group of ",
-              member_tv->toString(),
-              " is ambiguous. This is likely a bug.");
-          grouped.emplace(member_tv);
-          groups.back().emplace_back(member_tv);
+          if (grouped.count(member_tv) == 0) {
+            grouped.emplace(member_tv);
+            groups.back().emplace_back(member_tv);
+          } else if (member_tv != tv) {
+            // Ambiguous grouping. This should only happen at `canSchedule`, so
+            // we just return a null result which will tell the scheduler to
+            // reject the fusion
+            return {};
+          }
         }
       }
     }
@@ -229,15 +231,26 @@ void maybeBuildVirtualInnerDims(
       (merged_size2 < params.tile_size2)) {
     return; // no need to split
   }
-  // If one of them are not satisfied, this usually means that the satisfied one
-  // just merged in a large dim. We split this large dim, so that now we have
-  // two available dims to satisfy both virtual innermost dim.
+  // If one of them are not satisfied, there might be two cases:
+  // 1. The satisfied one just merged in a large dim. If this is the case, We
+  //    split this large dim, so that now we have two available dims to satisfy
+  //    both virtual innermost dim.
+  // 2. The satisfied one did not merge in anything. For example,
+  //    T0[I0{1024*1024}, I1{2}]
   int64_t large_dim;
   int64_t split_factor;
   if (merged_size1 < params.tile_size1) {
+    if (params.dims_merged_with_2.empty()) {
+      // case 2
+      return;
+    }
     large_dim = params.dims_merged_with_2.back();
     split_factor = ceilDiv(params.tile_size1, merged_size1);
   } else {
+    if (params.dims_merged_with_1.empty()) {
+      // case 2
+      return;
+    }
     large_dim = params.dims_merged_with_1.back();
     split_factor = ceilDiv(params.tile_size2, merged_size2);
   }
@@ -435,7 +448,6 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   auto max_unroll_factor_block =
       ceilDiv(params->tile_size1 * params->tile_size2, 32);
   max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_block);
-  max_unroll_factor = scheduler_utils::lastPow2(max_unroll_factor);
 
   // Compute maximum vectorize factor that can be used
   size_t vectorize_factor1 = max_unroll_factor;
@@ -456,15 +468,17 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
     vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor);
   }
 
-  params->vectorize_factor1 =
-      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1);
-  params->vectorize_factor2 =
-      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2);
+  params->vectorize_factor1 = scheduler_utils::lastPow2(
+      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1));
+  params->vectorize_factor2 = scheduler_utils::lastPow2(
+      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2));
 
   params->lparams.bind(params->getThreadsPerBlock(), ParallelType::TIDx);
 
   if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) {
     std::cerr << "\n===== Transpose Stats ========\n"
+              << "inputs: " << ir_utils::toString(fusion->inputs()) << "\n"
+              << "outputs: " << ir_utils::toString(fusion->outputs()) << "\n"
               << "num_elems: " << n_elems << "\n"
               << "n_input_tensors: " << n_input_tensors << "\n"
               << "max_input_dtype_size: " << max_input_dtype_size << "\n"
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
index 1fa0f16a7d78..dea4917fc34d 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
@@ -1390,13 +1390,8 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
 
   std::vector<TensorView*> vectorizable_tensors;
 
-  for (auto input_tv :
-       ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
-    if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
-      vectorizable_tensors.push_back(input_tv);
-    }
-  }
-
+  // We put outputs in front of inputs because this would make the transpose
+  // scheduler prefer to use output instead of input as reference tensor.
   for (auto output_tv :
        ir_utils::filterByType<TensorView>(reference_tv->fusion()->outputs())) {
     if (hasInnerDim(output_tv, vectorizable_dims, vectorize_pass)) {
@@ -1404,6 +1399,13 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
     }
   }
 
+  for (auto input_tv :
+       ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
+    if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
+      vectorizable_tensors.push_back(input_tv);
+    }
+  }
+
   return vectorizable_tensors;
 }
 
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp
index 91afac7270d7..5e82014c0c38 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/utils.cpp
@@ -169,7 +169,8 @@ auto parseEnableOptions() {
       {EnableOption::Complex, false},
       {EnableOption::KernelProfile, false},
       {EnableOption::LinearDecomposition, false},
-      {EnableOption::ConvDecomposition, false}};
+      {EnableOption::ConvDecomposition, false},
+      {EnableOption::TransposeScheduler, false}};
 
   if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) {
     c10::string_view options_view(dump_options);
@@ -184,13 +185,16 @@ auto parseEnableOptions() {
         options_map[EnableOption::LinearDecomposition] = true;
       } else if (token == "conv_decomposition") {
         options_map[EnableOption::ConvDecomposition] = true;
+      } else if (token == "transpose_scheduler") {
+        options_map[EnableOption::TransposeScheduler] = true;
       } else {
         TORCH_CHECK(
             false,
-            "Invalid disable option: '",
+            "Invalid enable option: '",
             token,
             "'\nAvailable options:\n",
-            "\tcomplex, kernel_profile");
+            "\tcomplex, kernel_profile, linear_decomposition,",
+            "conv_decomposition, transpose_scheduler");
       }
       options_view = (end_pos != c10::string_view::npos)
           ? options_view.substr(end_pos + 1)
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h
index 69e3aa4a4d68..679776b383af 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/torch/csrc/jit/codegen/cuda/utils.h
@@ -79,7 +79,8 @@ enum class EnableOption {
  Complex, //! Enable complex support on python
  KernelProfile, //! Enable intra-kernel performance profiling
  LinearDecomposition, //! Enable linear-bias decomposition
-  ConvDecomposition //! Enable conv-bias decomposition
+  ConvDecomposition, //! Enable conv-bias decomposition
+  TransposeScheduler //! Enable the experimental transpose scheduler
 };
 
 TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
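
Usage note: the last three files above wire a new "transpose_scheduler" token into the PYTORCH_NVFUSER_ENABLE parser. As an illustration only (the surrounding script is hypothetical and not part of this change), the experimental scheduler could be switched on from Python the same way test/test_jit_cuda_fuser.py handles the variable, appending to whatever the caller already set rather than overwriting it:

    import os

    # Keep any options already set in the environment, then add the new token.
    # nvfuser reads PYTORCH_NVFUSER_ENABLE once, the first time an option is
    # queried, so this must run before the first fused kernel is compiled.
    existing = os.environ.get('PYTORCH_NVFUSER_ENABLE', '')
    os.environ['PYTORCH_NVFUSER_ENABLE'] = 'transpose_scheduler,' + existing

    import torch  # then script and run the model as usual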