This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit e4b126a
[wip] support float8 weight caching for gradient accumulation/PP
Summary:

In cases where the optimizer update does not happen after every forward, such as microbatching/PP, we can save the casted weight to trade some time for memory. For now I'm just testing out performance and accuracy; we can improve on the API in future PRs.

In terms of accuracy, there should be no change; I will validate this further if we want to land this.

For performance, on @drisspg's LLaMa 7B pretrain script, with bsz == 128 and micro_bsz == 1:

1. baseline bf16 + compile: 2.38 it/s
2. delayed scaling + compile: 2.80 it/s (1.18x over baseline)
3. delayed scaling + compile + this PR: 3.04 it/s (1.28x over baseline)

Test Plan:

```
pytest test/test_base.py -s -k test_weight_caching
```

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent: d0c6760

File tree

5 files changed: +107 −15 lines

float8_experimental/config.py

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+# If True, allocates buffers for float8 weight cache
+allocate_float8_weight_cache_buffers = False
+
+# A global flag for controlling the weight cache, off by default. Intended
+# usage is for users to modify this from their training loop directly
+# according to their microbatching/pipeline parallel setup.
+# Note: this is currently a global flag for simplicity and dynamo performance.
+weight_cache_enabled = False
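The intended-usage comment above is prose only; here is a minimal sketch of how a training loop with gradient accumulation might drive these two flags. The model, optimizer, sizes, and loop structure are hypothetical; only `Float8Linear.from_float`, `sync_float8_amax_and_scale_history`, and the two config flags come from this codebase, and the placement of the amax/scale sync mirrors the new test below rather than a settled recipe.

```python
import torch
import torch.nn as nn

import float8_experimental.config as config
from float8_experimental.float8_linear import Float8Linear
from float8_experimental.float8_linear_utils import sync_float8_amax_and_scale_history

# must be set before constructing Float8Linear so the cache buffer is registered
config.allocate_float8_weight_cache_buffers = True

model = Float8Linear.from_float(
    nn.Linear(4096, 4096, device="cuda", dtype=torch.bfloat16), emulate=False
)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
num_microbatches = 8  # hypothetical gradient accumulation factor

for step in range(10):
    sync_float8_amax_and_scale_history(model)
    for i in range(num_microbatches):
        # recompute and cache the fp8 weight cast on the first microbatch,
        # then reuse the cached cast while the weights are unchanged
        config.weight_cache_enabled = i > 0
        x = torch.randn(16, 4096, device="cuda", dtype=torch.bfloat16)
        model(x).sum().backward()
    optimizer.step()
    optimizer.zero_grad()
```

The key constraint is ordering: `allocate_float8_weight_cache_buffers` has to be set before the module is converted (the buffer is registered in `__init__`), while `weight_cache_enabled` is toggled per microbatch.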

float8_experimental/float8_linear.py

Lines changed: 34 additions & 2 deletions

@@ -22,7 +22,10 @@
     _maybe_initialize_amaxes_scales_for_float8_cast,
 )

-from float8_experimental.float8_tensor import Float8Tensor
+from float8_experimental.float8_tensor import (
+    Float8Tensor,
+    calculate_amax_and_cast_to_float8,
+)

 from float8_experimental.float8_utils import (
     E4M3_MAX_POS,
@@ -31,6 +34,8 @@
     to_fp8_saturated,
 )

+import float8_experimental.config as config
+

 class NoopFwToFloat8E5M2Bw(torch.autograd.Function):
     """
@@ -148,6 +153,15 @@ def __init__(self, *args, **kwargs):
         # will access the scale when it has ensured that it is on GPU.
         self._float8_tensor_ctor = lambda *args, **kwargs: Float8Tensor(*args, **kwargs)

+        if config.allocate_float8_weight_cache_buffers:
+            # this is a buffer to get `to(dtype)` for free
+            # TODO(future): hide this from serialization
+            # TODO(future): force this to stay in float8_e4m3fn
+            self.register_buffer(
+                'cached_fp8_weight',
+                torch.empty(self.weight.shape, dtype=torch.float8_e4m3fn),
+            )
+
     def register_always_float32_buffer(
         self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True
     ) -> None:
@@ -204,8 +218,26 @@ def cast_w_to_float8(
             torch.float8_e4m3fn,
             is_amax_initialized,
         )
+
+        if config.weight_cache_enabled:
+            assert config.allocate_float8_weight_cache_buffers
+            w_bits_fp8 = self.cached_fp8_weight
+        else:
+            # manual calculation of fp8 bits:
+            # 1. calculate the bits without Float8Tensor, without grad
+            # 2. store the bits here
+            # 3. create Float8Tensor from the bits calculated in 2
+            # motivation: this will take care of saving the bits without
+            # interacting with tensor subclasses, as w_fp8._data is not
+            # currently traceable by dynamo
+            w_bits_fp8 = calculate_amax_and_cast_to_float8(
+                self.weight, self.fp8_scale_w, torch.float8_e4m3fn,
+                self.fp8_amax_w)
+            if config.allocate_float8_weight_cache_buffers:
+                self.cached_fp8_weight.copy_(w_bits_fp8)
         w_fp8 = Float8Tensor.to_float8(
-            w, self.fp8_scale_w, torch.float8_e4m3fn, self.fp8_amax_w, self.emulate
+            w, self.fp8_scale_w, torch.float8_e4m3fn, self.fp8_amax_w,
+            self.emulate, cached_casted_weight=w_bits_fp8,
         )
         return w_fp8
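The two TODOs in the buffer registration above (hide the cache from serialization, pin its dtype) follow from how `nn.Module` buffers behave; a quick standalone illustration with a made-up `Demo` module, using only stock PyTorch:

```python
import torch
import torch.nn as nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(4, 4))
        # registering as a buffer is what gives `to(dtype)`/`to(device)` for free
        self.register_buffer("cached_weight", torch.empty(4, 4, dtype=torch.bfloat16))

m = Demo().to(torch.float16)
# floating-point buffers follow module-wide dtype conversions, which is why the
# real code wants to force cached_fp8_weight to stay in float8_e4m3fn
print(m.cached_weight.dtype)              # torch.float16
# buffers land in state_dict by default; register_buffer(..., persistent=False)
# would hide them, hence the serialization TODO
print("cached_weight" in m.state_dict())  # True
```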

float8_experimental/float8_linear_utils.py

Lines changed: 3 additions & 0 deletions

@@ -149,6 +149,9 @@ def sync_float8_amax_and_scale_history(
         if not isinstance(child, fp8_classes):
             continue

+        # TODO(future): enable skipping weight related syncing if weight cache
+        # is on
+
         #
         # 1. in distributed contexts, syncs amax values across workers
         #

float8_experimental/float8_tensor.py

Lines changed: 22 additions & 13 deletions

@@ -11,6 +11,15 @@

 aten = torch.ops.aten

+@torch.no_grad()
+def calculate_amax_and_cast_to_float8(tensor, scale, float8_dtype, amax_buffer):
+    if amax_buffer is not None:
+        amax_buffer.fill_(tensor_to_amax(tensor))
+
+    tensor_scaled = tensor * scale
+    bits_fp8 = to_fp8_saturated(tensor_scaled, float8_dtype)
+    return bits_fp8
+

 class ToFloat8ConstrFunc(torch.autograd.Function):
     """
@@ -25,24 +34,21 @@ def forward(
         float8_dtype=torch.float8_e4m3fn,
         amax_buffer=None,
         emulate: bool = False,
+        cached_casted_weight=None,
     ):
-        # In TransformerEngine, the casts to float8 are fused with calculating
-        # the new amax value. In this codebase, the eager mode code for those
-        # two things is colocated in this function. We expect PT2.0 to fuse it
-        # for us.
-        if amax_buffer is not None:
-            amax_buffer.fill_(tensor_to_amax(tensor))
-
-        tensor_scaled = tensor * scale
-        bits_fp8 = to_fp8_saturated(tensor_scaled, float8_dtype)
+        if cached_casted_weight is not None:
+            return Float8Tensor(cached_casted_weight, scale, tensor.dtype,
+                                emulate=emulate)
+        bits_fp8 = calculate_amax_and_cast_to_float8(
+            tensor, scale, float8_dtype, amax_buffer)
         return Float8Tensor(bits_fp8, scale, tensor.dtype, emulate=emulate)

     @staticmethod
     def backward(ctx, g):
         if isinstance(g, Float8Tensor):
-            return g.to_original_precision(), None, None, None, None
+            return g.to_original_precision(), None, None, None, None, None
         else:
-            return g, None, None, None, None
+            return g, None, None, None, None, None


 class FromFloat8ConstrFunc(torch.autograd.Function):
@@ -123,6 +129,9 @@ def __tensor_flatten__(self):

     @staticmethod
     def __tensor_unflatten__(inner_tensors: Dict, metadata):
+        # TODO(TBD): this seems unused, and it's out of date after
+        # the new args in https://github.com/pytorch/pytorch/pull/114311
+        # we should just delete it
         assert len(inner_tensors) == 2
         return Float8Tensor(
             inner_tensors["_data"],
@@ -136,7 +145,7 @@ def to_original_precision(self):

     @staticmethod
     @torch._dynamo.allow_in_graph
-    def to_float8(tensor, scale, float8_dtype, amax_buffer=None, emulate: bool = False):
+    def to_float8(tensor, scale, float8_dtype, amax_buffer=None, emulate: bool = False, cached_casted_weight=None):
         """Converts a higher precision tensor to float8 in a differentiable way.

         Args:
@@ -149,7 +158,7 @@ def to_float8(tensor, scale, float8_dtype, amax_buffer=None, emulate: bool = Fal
             Float8Tensor: a float8 tensor
         """
         return ToFloat8ConstrFunc.apply(
-            tensor, scale, float8_dtype, amax_buffer, emulate
+            tensor, scale, float8_dtype, amax_buffer, emulate, cached_casted_weight,
         )

     @classmethod
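One note on the `backward` change above: `torch.autograd.Function.backward` must return exactly one value per `forward` input, with `None` for inputs that don't need gradients, so adding `cached_casted_weight` to `forward` requires one more trailing `None`. A tiny self-contained example of that rule (the `ScaleByConst` class is made up for illustration):

```python
import torch

class ScaleByConst(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, scale, some_flag):
        ctx.scale = scale
        return x * scale

    @staticmethod
    def backward(ctx, grad_out):
        # three forward inputs -> three return values; the non-tensor
        # inputs (scale, some_flag) get None instead of a gradient
        return grad_out * ctx.scale, None, None

x = torch.randn(4, requires_grad=True)
ScaleByConst.apply(x, 2.0, True).sum().backward()
assert torch.allclose(x.grad, torch.full_like(x, 2.0))
```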

test/test_base.py

Lines changed: 34 additions & 0 deletions

@@ -14,6 +14,7 @@

 import torch
 import torch.nn as nn
+import float8_experimental.float8_linear as float8_linear
 from float8_experimental.float8_linear import Float8Linear
 from float8_experimental.float8_linear_utils import (
     get_float8_linear,
@@ -32,6 +33,8 @@
     tensor_to_scale,
 )

+import float8_experimental.config as config
+
 random.seed(0)
 torch.manual_seed(0)

@@ -231,6 +234,37 @@ def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
             y.dtype == torch.bfloat16
         ), f"y.dtype is {y.dtype}, expected {torch.bfloat16}"

+    @pytest.mark.parametrize("use_compile", [False, True])
+    def test_weight_caching(self, use_compile):
+        M, K, N = 16, 32, 64
+        dtype = torch.bfloat16
+        config.allocate_float8_weight_cache_buffers = True
+
+        x = torch.randn(M, K, device="cuda", dtype=dtype)
+        m_ref = nn.Linear(K, N, bias=True, device="cuda", dtype=dtype)
+        m = Float8Linear.from_float(copy.deepcopy(m_ref), emulate=False)
+
+        if use_compile:
+            m = torch.compile(m)
+
+        config.weight_cache_enabled = False
+
+        y1 = m(x)
+        y1.sum().backward()
+        grad1 = m.weight.grad.clone().detach()
+
+        config.weight_cache_enabled = True
+        sync_float8_amax_and_scale_history(m)
+
+        y2 = m(x)
+        y2.sum().backward()
+        grad2 = m.weight.grad.clone().detach()
+
+        torch.testing.assert_close(grad2, grad1 * 2)
+
+        config.allocate_float8_weight_cache_buffers = False
+

 class TestScaledMM:
     @unittest.skipIf(
