Changes from all commits (28 commits)
1d7776f
Beginning of work to properly reuse the output given to quantize
ptrendx Apr 8, 2025
d207cea
Add current scaling
ptrendx Apr 9, 2025
fa28e49
Beginning of the other recipes
ptrendx Apr 14, 2025
49ad122
Added MXFP8 and cleanup
ptrendx Apr 26, 2025
17678d9
Fix
ptrendx May 30, 2025
b488101
Actually reuse tensors and get rid of the hack for MXFP8
ptrendx May 30, 2025
209cb9f
Small cleaning
ptrendx May 30, 2025
c61b14b
Make sure dgrad is not needed in the test during eval phase
ptrendx May 31, 2025
7561fb4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 31, 2025
41b8fb4
Fixes
ptrendx Jun 2, 2025
20363a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 2, 2025
2acb07a
Fixes
ptrendx Jun 6, 2025
9550644
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 6, 2025
2f9f9ce
Merge branch 'main' into pr_quantize_output_respect_usages
ptrendx Jun 6, 2025
f0f96b9
Fix for integer overflow
ptrendx Jun 6, 2025
eb49987
Try copying the quantizer
ptrendx Jun 11, 2025
6dcd480
Fix
ptrendx Jun 11, 2025
b6f1aeb
Fix CUDA graphs test
ptrendx Jun 11, 2025
b92f3f5
Merge branch 'main' into pr_quantize_output_respect_usages
ptrendx Jun 12, 2025
1f4f894
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 12, 2025
53554f2
Fix
ptrendx Jun 12, 2025
343d43d
Fix the float8blockwise tests and MXFP8 cuda graphs tests
ptrendx Jun 13, 2025
817d8ce
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 13, 2025
b6b4af3
Merge branch 'main' into pr_quantize_output_respect_usages
ptrendx Jun 13, 2025
207e4b7
Fix issue from merge
ptrendx Jun 13, 2025
715cc53
Always use tex.quantize when updating cache to use proper quantizer
ptrendx Jun 13, 2025
d682178
Debug
ptrendx Jun 17, 2025
e6f38d1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 17, 2025
2 changes: 1 addition & 1 deletion tests/pytorch/test_float8blockwisetensor.py
@@ -206,7 +206,7 @@ def test_quantize_dequantize_dims(
@pytest.mark.parametrize(
"dims", [[], 256, 311, [264], [256, 512], [250, 500], [7, 5, 3], [2, 3, 5, 3]]
)
@pytest.mark.parametrize("block_scaling_dim", [1, 2])
@pytest.mark.parametrize("block_scaling_dim", [1])
@pytest.mark.parametrize("dq_columnwise", [True, False])
@pytest.mark.xfail(raises=NotImplementedError)
def test_quantize_dequantize_compact_format(
109 changes: 100 additions & 9 deletions tests/pytorch/test_sanity.py
@@ -104,7 +104,7 @@ def is_fp8_supported(self):

model_configs = {
"126m": ModelConfig(12, 2048, 2, 768, 12),
"small": ModelConfig(2, 32, 2, 64, 2),
"small": ModelConfig(2, 16, 2, 128, 1),
"weird": ModelConfig(2, 37, 3, 69, 3),
"large": ModelConfig(1, 128, 2, 512, 4, 128),
}
@@ -398,6 +398,34 @@ def _test_sanity_common(
loss.backward()
torch.cuda.synchronize()

# now try eval with weight caching
block.eval()
te_inp.requires_grad = False

with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe):
te_out = block(te_inp, is_first_microbatch=True)
with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe):
te_out = block(te_inp, is_first_microbatch=False)
torch.cuda.synchronize()

# now try regular execution again with weight caching
block.train()
te_inp.requires_grad = True

with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe):
te_out = block(te_inp, is_first_microbatch=True)
if isinstance(te_out, tuple):
te_out = te_out[0]
loss = te_out.sum()
loss.backward()
with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe):
te_out = block(te_inp, is_first_microbatch=False)
if isinstance(te_out, tuple):
te_out = te_out[0]
loss = te_out.sum()
loss.backward()
torch.cuda.synchronize()


def _test_sanity_normalization_amp(block, dtype, config, skip_wgrad, skip_dgrad):
if skip_dgrad and skip_wgrad:
@@ -1124,27 +1152,31 @@ def test_sanity_fp8_gemm_with_unalignment(N, datatype):
@pytest.mark.skipif(not fp8_available, reason=reason_for_no_fp8)
@pytest.mark.skipif(get_device_compute_capability() < (9, 0), reason="FP8 tests require Hopper.")
@pytest.mark.skipif(get_cudnn_version() < (9, 3, 0), reason="cuDNN 9.3.0+ is required.")
@pytest.mark.parametrize("model", ["large"])
@pytest.mark.parametrize("model", ["small"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_sanity_attention_extra_state(model, dtype):
config = model_configs[model]
print("regular")
outputs = _run_attention_extra_state(dtype, config, checkpoint=False)
print("checkpointed")
outputs_checkpoint = _run_attention_extra_state(dtype, config, checkpoint=True)
outputs_checkpoint_v1_6 = _run_attention_extra_state(
dtype, config, mimic_v1_6=True, checkpoint=True
)
# outputs_checkpoint_v1_6 = _run_attention_extra_state(
# dtype, config, mimic_v1_6=True, checkpoint=True
# )

# Check that results match
tols = dtype_tols(dtype)
if dtype in (torch.float16, torch.bfloat16):
tols.update(dict(rtol=2e-2, atol=2e-3))
for i, (ref, test) in enumerate(zip(outputs, outputs_checkpoint)):
print(i)
torch.testing.assert_close(
test,
ref,
**tols,
)
for i, (ref, test) in enumerate(zip(outputs, outputs_checkpoint_v1_6)):
print(f"Second loop {i}")
torch.testing.assert_close(
test,
ref,
@@ -1173,6 +1205,8 @@ def _run_attention_extra_state(dtype, config, checkpoint=False, mimic_v1_6=False
requires_grad=True,
)

torch.set_printoptions(threshold=100_000_000)

def get_model(dtype, config):
sigma = 0.023
init_method = init_method_normal(sigma)
@@ -1191,15 +1225,59 @@ def get_model(dtype, config):
params_dtype=dtype,
device="cuda",
)
# block = torch.nn.Sequential(
# Linear(config.hidden_size,
# config.hidden_size),
# Linear(config.hidden_size,
# config.hidden_size),
# Linear(config.hidden_size,
# config.hidden_size),
# Linear(config.hidden_size,
# config.hidden_size))
# block.to(dtype=dtype)
return block

block = get_model(dtype, config)
print("Before the first loop")
# for n,p in block.named_parameters():
# print(n)
# print(p)
print("data")
print(block.self_attention.proj.weight._data)
print("scale_inv")
print(block.self_attention.proj.weight._scale_inv)
print("transpose")
print(block.self_attention.proj.weight._transpose)

import transformer_engine.pytorch.attention.dot_product_attention.backends as bbb

bbb.DEBUG_BLOCK = block
print("set!")

print("End before the first loop")
print(f"scale inv: {block.self_attention.proj.weight._scale_inv}")
for i in range(steps // 2):
with fp8_autocast(enabled=fp8_enabled, fp8_recipe=fp8_recipe):
output = block(hidden_states, None)
print(f"scale inv 0: {block.self_attention.proj.weight._scale_inv}")
output = block(hidden_states)
print(f"scale inv 1: {block.self_attention.proj.weight._scale_inv}")
print(f"output {i}")
print(output)
loss = output.sum()
loss.backward()

loss.backward()
print(f"scale inv 2: {block.self_attention.proj.weight._scale_inv}")

print("Before the checkpoint")
# for n,p in block.named_parameters():
# print(n)
# print(p)
print("data")
print(block.self_attention.proj.weight._data)
print("scale_inv")
print(block.self_attention.proj.weight._scale_inv)
print("transpose")
print(block.self_attention.proj.weight._transpose)
print("End before the checkpoint")
if checkpoint:
sd = block.state_dict()
if mimic_v1_6:
@@ -1231,10 +1309,23 @@ def get_model(dtype, config):

for i in range((steps + 1) // 2):
with fp8_autocast(enabled=fp8_enabled, fp8_recipe=fp8_recipe):
output = block(hidden_states, None)
output = block(hidden_states)
print(f"after output {i}")
print(output)
loss = output.sum()
loss.backward()

print("After the checkpoint")
# for n,p in block.named_parameters():
# print(n)
# print(p)
print("data")
print(block.self_attention.proj.weight._data)
print("scale_inv")
print(block.self_attention.proj.weight._scale_inv)
print("transpose")
print(block.self_attention.proj.weight._transpose)
print("End after the checkpoint")
torch.cuda.synchronize()

if os.path.exists(path):
transformer_engine/pytorch/attention/dot_product_attention/backends.py
@@ -57,6 +57,8 @@
AttentionLogging as attn_log,
)

DEBUG_BLOCK = None

# Global vars for flash attn v2 and v3 imports
flash_attn_cuda_bwd = None
flash_attn_func = None
@@ -964,6 +966,8 @@ def forward(
case _:
raise "Invalid qkv_layout " + qkv_layout
# q_fp8, k_fp8, v_fp8, out_fp8: torch.float8_e4m3fn
print(f"Q quantizer scale: {q_fp8._quantizer.scale.shape}")
print(f"mixed quantizer scale: {qkv_fp8._quantizer.scale.shape}")
out_fp8, aux_ctx_tensors = fused_attn_fwd(
is_training,
max_seqlen_q,
@@ -1190,6 +1194,11 @@ def backward(ctx, d_out):
dqkv_dtype = TE_DType[d_out_fp8._data.dtype]
# q_fp8, k_fp8, v_fp8, out_fp8: torch.float8_e4m3fn
# d_out_fp8, dq_fp8, dk_fp8, dv_fp8: torch.float8_e5m2
print(DEBUG_BLOCK)
if DEBUG_BLOCK is not None:
print(
f"Inside attention: {DEBUG_BLOCK.self_attention.proj.weight._scale_inv}"
)
dq_fp8, dk_fp8, dv_fp8, *rest = fused_attn_bwd(
ctx.max_seqlen_q,
ctx.max_seqlen_kv,
@@ -1218,6 +1227,11 @@ def backward(ctx, d_out):
ctx.window_size,
ctx.deterministic,
)
if DEBUG_BLOCK is not None:
print(
"After Inside attention:"
f" {DEBUG_BLOCK.self_attention.proj.weight._scale_inv}"
)

# is_input_fp8 = False: dq, dk, dv: torch.float16 or torch.bfloat16
# is_input_fp8 = True: dq, dk, dv: torch.float8_e5m2
4 changes: 4 additions & 0 deletions transformer_engine/pytorch/cpp_extensions/fused_attn.py
@@ -445,6 +445,9 @@ def fused_attn_bwd(
len(aux_ctx_tensors) == 3
), "aux_ctx_tensors is required to be [M, ZInv, rng_state] for FP8 fused attention."

import transformer_engine.pytorch.attention.dot_product_attention.backends as bbb

debug = bbb.DEBUG_BLOCK.self_attention.proj.weight._scale_inv
output_tensors = tex.fused_attn_bwd(
max_seqlen_q,
max_seqlen_kv,
@@ -471,6 +474,7 @@
s_quantizer,
dp_quantizer,
dqkv_quantizer,
debug,
)

return output_tensors
7 changes: 1 addition & 6 deletions transformer_engine/pytorch/csrc/common.cpp
@@ -67,7 +67,7 @@ TensorWrapper makeTransformerEngineTensor(py::handle tensor, py::handle quantize
// also during dequantize, the quantizer param is unknown -> so quantizer is NoneQuantizer
for (auto [check_type, check_quantizer_type, create_tensor, _] :
detail::custom_types_converters) {
if (check_type(tensor.ptr())) {
if (check_type(tensor.ptr()) != PythonTensorType::INVALID) {
if (!(quantizer.is_none() || check_quantizer_type(quantizer.ptr()))) {
continue;
}
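Side note on the condition change above: check_type now appears to return the matched Python tensor type, with PythonTensorType::INVALID as the "no match" sentinel, rather than a bool. Below is a rough, self-contained C++ sketch of that pattern; only PythonTensorType::INVALID is visible in this diff, so the other enum members and every other name here are assumptions, not the real converter table.

#include <cassert>
#include <functional>
#include <vector>

// Only INVALID appears in the diff; the remaining members are illustrative guesses.
enum class PythonTensorType { INVALID, FLOAT8, MXFP8 };

struct FakePyObject {  // stand-in for the PyObject* handed to check_type
  PythonTensorType kind;
};

int main() {
  // Each converter entry carries a checker that identifies its tensor type, or INVALID.
  std::vector<std::function<PythonTensorType(const FakePyObject&)>> checkers = {
      [](const FakePyObject& o) {
        return o.kind == PythonTensorType::FLOAT8 ? PythonTensorType::FLOAT8
                                                  : PythonTensorType::INVALID;
      },
      [](const FakePyObject& o) {
        return o.kind == PythonTensorType::MXFP8 ? PythonTensorType::MXFP8
                                                 : PythonTensorType::INVALID;
      }};

  FakePyObject obj{PythonTensorType::MXFP8};
  int matches = 0;
  for (const auto& check_type : checkers) {
    if (check_type(obj) != PythonTensorType::INVALID) {  // same shape as the updated condition
      ++matches;
    }
  }
  assert(matches == 1);  // exactly one converter recognizes the object
  return 0;
}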
@@ -286,9 +286,4 @@ std::vector<size_t> convertShape(const NVTEShape& shape) {
return std::vector<size_t>(shape.data, shape.data + shape.ndim);
}

int roundup(const int value, const int multiple) {
assert(multiple > 0);
return ((value + multiple - 1) / multiple) * multiple;
}

} // namespace transformer_engine::pytorch
23 changes: 16 additions & 7 deletions transformer_engine/pytorch/csrc/common.h
@@ -98,7 +98,7 @@ class Quantizer {
virtual void set_quantization_params(TensorWrapper* tensor) const = 0;

virtual std::pair<TensorWrapper, py::object> create_tensor(
const std::vector<size_t>& shape, DType dtype,
const std::vector<size_t>& shape, DType dtype, const py::object& output = py::none(),
std::optional<at::Tensor> rowwise_data = std::nullopt) const = 0;
Comment on lines +101 to 102

Collaborator: Somewhat orthogonal, but since we're touching Quantizer::create_tensor, we should consider removing the rowwise_data arg. It was a UB-specific option that doesn't really make sense anymore. I believe all usages have been refactored away.

Member Author: ok, cool. I will do that - it will make the code nicer.

Member Author: Actually can't do that just yet. Attention also uses this unfortunately.

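For context on the signature discussed above, here is a minimal, self-contained sketch (mock types, not the real Transformer Engine classes) of the "optional pre-allocated output" pattern that the new output = py::none() parameter enables. MockQuantizer and MockTensor are illustrative stand-ins only.

// Illustrative sketch only: MockTensor / MockQuantizer are not TE types.
#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>

struct MockTensor {
  std::vector<std::size_t> shape;  // stands in for the quantized tensor / TensorWrapper pair
};

struct MockQuantizer {
  // If `output` is provided, reuse its storage instead of allocating a new tensor,
  // mirroring the intent of the new `output` argument on Quantizer::create_tensor.
  MockTensor create_tensor(const std::vector<std::size_t>& shape,
                           std::optional<MockTensor> output = std::nullopt) const {
    if (output.has_value()) {
      assert(output->shape == shape);  // reuse path: the existing tensor must match
      return *output;
    }
    return MockTensor{shape};  // allocation path: fresh tensor
  }
};

int main() {
  MockQuantizer q;
  MockTensor first = q.create_tensor({128, 64});         // first call allocates
  MockTensor again = q.create_tensor({128, 64}, first);  // second call reuses `first`
  assert(again.shape == first.shape);
  return 0;
}

In the real header the reuse goes through py::object and TensorWrapper rather than a mock struct, and, per the thread above, rowwise_data stays for now because attention still depends on it.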

virtual ~Quantizer() = default;
@@ -121,7 +121,7 @@ class NoneQuantizer : public Quantizer {
void set_quantization_params(TensorWrapper* tensor) const override {}

std::pair<TensorWrapper, py::object> create_tensor(
const std::vector<size_t>& shape, DType dtype,
const std::vector<size_t>& shape, DType dtype, const py::object& output = py::none(),
std::optional<at::Tensor> rowwise_data = std::nullopt) const override;
};

@@ -139,7 +139,7 @@ class Float8Quantizer : public Quantizer {
void set_quantization_params(TensorWrapper* tensor) const override;

std::pair<TensorWrapper, py::object> create_tensor(
const std::vector<size_t>& shape, DType dtype,
const std::vector<size_t>& shape, DType dtype, const py::object& output = py::none(),
std::optional<at::Tensor> rowwise_data = std::nullopt) const override;
};

@@ -161,7 +161,7 @@ class Float8CurrentScalingQuantizer : public Quantizer {
void set_quantization_params(TensorWrapper* tensor) const override;

std::pair<TensorWrapper, py::object> create_tensor(
const std::vector<size_t>& shape, DType dtype,
const std::vector<size_t>& shape, DType dtype, const py::object& output = py::none(),
std::optional<at::Tensor> rowwise_data = std::nullopt) const override;
};

@@ -195,7 +195,7 @@ class Float8BlockQuantizer : public Quantizer {
// for the tensor. Should set quantized data, scales for rowwise
// and optionally columnwise usage.
std::pair<TensorWrapper, py::object> create_tensor(
const std::vector<size_t>& shape, DType dtype,
const std::vector<size_t>& shape, DType dtype, const py::object& output = py::none(),
std::optional<at::Tensor> rowwise_data = std::nullopt) const override;
};

@@ -210,7 +210,7 @@ class MXFP8Quantizer : public Quantizer {
void set_quantization_params(TensorWrapper* tensor) const override;

std::pair<TensorWrapper, py::object> create_tensor(
const std::vector<size_t>& shape, DType dtype,
const std::vector<size_t>& shape, DType dtype, const py::object& output = py::none(),
std::optional<at::Tensor> rowwise_data = std::nullopt) const override;
};

@@ -354,7 +354,16 @@ void* getDataPtr(at::Tensor tensor, int offset = 0);

std::vector<size_t> convertShape(const NVTEShape& shape);

int roundup(const int value, const int multiple);
template <typename T>
T divup(const T value, const T multiple) {
assert(multiple > 0);
return ((value + multiple - 1) / multiple);
}

template <typename T>
T roundup(const T value, const T multiple) {
return divup(value, multiple) * multiple;
}

NVTEShape convertTorchShape(const c10::IntArrayRef torch_shape);
} // namespace transformer_engine::pytorch
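A quick standalone sanity check of the divup/roundup templates added above (a sketch that re-declares them rather than including common.h, so it is not part of the PR):

#include <cassert>
#include <cstddef>

// Same definitions as in common.h above, repeated to keep the sketch self-contained.
template <typename T>
T divup(const T value, const T multiple) {
  assert(multiple > 0);
  return (value + multiple - 1) / multiple;
}

template <typename T>
T roundup(const T value, const T multiple) {
  return divup(value, multiple) * multiple;
}

int main() {
  assert(divup(7, 4) == 2);    // ceiling division: 7/4 rounded up
  assert(roundup(7, 4) == 8);  // smallest multiple of 4 that is >= 7
  // A std::size_t instantiation handles values beyond INT_MAX, which the old
  // `int roundup(const int, const int)` removed from common.cpp could not.
  assert(roundup<std::size_t>(3'000'000'000ULL, 128) == 3'000'000'000ULL);
  return 0;
}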
2 changes: 1 addition & 1 deletion transformer_engine/pytorch/csrc/extensions.h
@@ -62,7 +62,7 @@ std::vector<py::object> fused_attn_bwd(
const std::vector<at::Tensor> Aux_CTX_Tensors,
const std::optional<at::Tensor> cu_seqlens_q_padded,
const std::optional<at::Tensor> cu_seqlens_kv_padded, py::handle s_quantizer,
py::handle dp_quantizer, py::handle dqkv_quantizer);
py::handle dp_quantizer, py::handle dqkv_quantizer, at::Tensor debug);

at::Tensor fa_prepare_fwd(at::Tensor qkvi);
at::Tensor fa_prepare_bwd(at::Tensor q, at::Tensor k, at::Tensor v);