Commit 1a52791

fegin authored and pytorchmergebot committed
[DSD] Correctly handle shared parameters for optimizer state_dict (pytorch#128685)
Fixes pytorch#128011. See the discussion in pytorch#128076.

The current implementation of `set_optimizer_state_dict()` assumes that every FQN returned by `_get_fqns()` exists in the optimizer state_dict. This is not true when the model has shared parameters: only one FQN of a shared parameter appears in the optimizer state_dict. This PR addresses the issue.

Differential Revision: [D58573487](https://our.internmc.facebook.com/intern/diff/D58573487/)

Pull Request resolved: pytorch#128685

Approved by: https://github.com/LucasLLC
1 parent d77a1aa commit 1a52791
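
To make the failure mode concrete, here is a minimal, self-contained sketch (not taken from the PR) of why tied weights break the assumption: two FQNs refer to one tensor, but the optimizer tracks that tensor only once, so an FQN-keyed optimizer state_dict can carry state under only one of the names.

# Illustrative sketch, not PR code: tied weights yield two FQNs for a single
# parameter, while the optimizer holds that parameter only once.
import torch
import torch.nn as nn

class Tied(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.embedding = nn.Embedding(8, 4)
        self.decoder = nn.Linear(4, 8)
        self.decoder.weight = self.embedding.weight  # tie the weights

model = Tied()

# Two fully qualified names point at the same tensor object.
fqns = [name for name, _ in model.named_parameters(remove_duplicate=False)]
print(fqns)  # ['embedding.weight', 'decoder.weight', 'decoder.bias']

# The optimizer deduplicates shared parameters, so it tracks 2 tensors, not 3.
optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
print(sum(len(g["params"]) for g in optim.param_groups))  # 2

# Consequently an FQN-keyed optimizer state_dict can hold state under only one
# of 'embedding.weight' / 'decoder.weight', which is what the fixed
# set_optimizer_state_dict() has to tolerate.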

File tree

2 files changed: +63, -6 lines

test/distributed/checkpoint/test_state_dict.py

Lines changed: 27 additions & 0 deletions
@@ -851,6 +851,33 @@ def test_deprecate_fsdp_api(self) -> None:
         ):
             get_model_state_dict(model)
 
+    @with_comms
+    @skip_if_lt_x_gpu(2)
+    def test_shared_weight(self):
+        class TiedEmbeddingModel(nn.Module):
+            def __init__(self, vocab_size, embedding_dim):
+                super().__init__()
+                self.embedding = nn.Embedding(vocab_size, embedding_dim)
+                self.decoder = nn.Linear(embedding_dim, vocab_size)
+                self.decoder.weight = self.embedding.weight  # Tying weights
+
+            def forward(self, input):
+                input = (input * 10).to(torch.int)
+                embedded = self.embedding(input)
+                output = self.decoder(embedded)
+                return output
+
+        def init_model_optim():
+            device_mesh = init_device_mesh("cuda", (self.world_size,))
+            orig_model = TiedEmbeddingModel(10000, 300).to(torch.device("cuda"))
+            orig_optim = torch.optim.AdamW(orig_model.parameters(), lr=1e-3)
+            copy_optim = torch.optim.AdamW(orig_model.parameters(), lr=1e-3)
+            dist_model = FSDP(copy.deepcopy(orig_model), device_mesh=device_mesh)
+            dist_optim = torch.optim.AdamW(dist_model.parameters(), lr=1e-3)
+            return orig_model, orig_optim, copy_optim, dist_model, dist_optim
+
+        self._test_save_load(init_model_optim)
+
 
 class TestNoComm(MultiProcessTestCase):
     def setUp(self) -> None:
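
For context, the following is a hedged sketch (not part of the test file) of roughly the save/load round trip that `_test_save_load` drives for a tied-weight model through the public DSD API; the exact harness details differ, and a process group plus CUDA GPUs are assumed.

# Hedged sketch, not the actual test harness: an FSDP-wrapped tied-weight model
# whose FQN-keyed state_dicts are taken and loaded back via the public DSD API.
# Assumes torch.distributed is initialized (e.g. under torchrun) with CUDA GPUs.
import copy

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP


def round_trip(orig_model: nn.Module) -> None:
    mesh = init_device_mesh("cuda", (dist.get_world_size(),))
    dist_model = FSDP(copy.deepcopy(orig_model).cuda(), device_mesh=mesh)
    dist_optim = torch.optim.AdamW(dist_model.parameters(), lr=1e-3)

    # The optimizer state_dict is keyed by FQN; a tied weight appears under
    # only one of its FQNs even though two module attributes share the tensor.
    model_sd, optim_sd = get_state_dict(dist_model, dist_optim)

    # With the fix, loading no longer assumes that every FQN of the shared
    # parameter is present in optim_sd.
    set_state_dict(
        dist_model,
        dist_optim,
        model_state_dict=model_sd,
        optim_state_dict=optim_sd,
    )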

torch/distributed/checkpoint/state_dict.py

Lines changed: 36 additions & 6 deletions
@@ -153,6 +153,9 @@ class _StateDictInfo(StateDictOptions):
     fqn_param_mapping: Dict[
         Union[str, torch.Tensor], Union[FQNS_T, torch.Tensor]
     ] = field(default_factory=dict)
+    shared_params_mapping: Dict[
+        Union[str, torch.Tensor], Union[FQNS_T, torch.Tensor]
+    ] = field(default_factory=dict)
     submodule_prefixes: Set[str] = field(default_factory=set)
     handle_model: bool = True
     handle_optim: bool = True
@@ -286,14 +289,29 @@ def _verify_options(
     fqn_param_mapping: Dict[
         Union[str, torch.Tensor], Union[Set[str], torch.Tensor]
     ] = {}
+    shared_params_mapping: Dict[
+        Union[str, torch.Tensor], Union[Set[str], torch.Tensor]
+    ] = {}
     for name, param in _iterate_valid_model_state(model):
+        if isinstance(param, _EXTRA_STATE):
+            continue
+
         fqns = _get_fqns(model, name)
-        if not isinstance(param, _EXTRA_STATE):
-            fqn_param_mapping[param] = fqns
+        fqn = fqn_param_mapping.get(param, None)
+        if fqn is not None:
+            cast(Set[str], fqn_param_mapping[param]).update(fqns)
+            shared_params_mapping[param] = fqn_param_mapping[param]
+        else:
+            # We need to do copy as _get_fqns is lru_cached
+            fqn_param_mapping[param] = fqns.copy()
         for fqn in fqns:
             if not isinstance(param, _EXTRA_STATE):
                 fqn_param_mapping[fqn] = param
 
+    for param_, fqns_ in list(shared_params_mapping.items()):
+        for fqn in fqns_:
+            shared_params_mapping[fqn] = cast(torch.Tensor, param_)
+
     submodule_prefixes: Set[str] = set()
     if submodules:
         submodules = set(submodules)
@@ -361,6 +379,7 @@ def fsdp_state_dict_type_without_warning(
     return _StateDictInfo(
         **asdict(options),
         fqn_param_mapping=fqn_param_mapping,
+        shared_params_mapping=shared_params_mapping,
         submodule_prefixes=submodule_prefixes,
         fsdp_context=fsdp_context,
         fsdp_modules=cast(List[nn.Module], fsdp_modules),
@@ -450,7 +469,7 @@ def _get_model_state_dict(
 
     for key in list(state_dict.keys()):
         fqns = _get_fqns(model, key)
-        assert len(fqns) == 1
+        assert len(fqns) == 1, (key, fqns)
         fqn = next(iter(fqns))
         if fqn != key:
             # As we only support FSDP, DDP, and TP, the only cases are
@@ -797,6 +816,19 @@ def _split_optim_state_dict(
         pg_state.append({_PARAMS: []})
         for param in param_group[_PARAMS]:
             for fqn in info.fqn_param_mapping[param]:
+                if fqn in info.shared_params_mapping:
+                    in_params = False
+                    for loaded_param_group in cast(
+                        ListDictValueType, optim_state_dict[_PG]
+                    ):
+                        if fqn in cast(List[str], loaded_param_group[_PARAMS]):
+                            in_params = True
+                            break
+                else:
+                    in_params = True
+                if not in_params:
+                    continue
+
                 params = pg_state[-1][_PARAMS]
                 assert isinstance(params, list)
                 params.append(fqn)
@@ -805,9 +837,7 @@ def _split_optim_state_dict(
                 for loaded_param_group in cast(
                     ListDictValueType, optim_state_dict[_PG]
                 ):
-                    params = loaded_param_group[_PARAMS]
-                    assert isinstance(params, list)
-                    if fqn in params:
+                    if fqn in cast(List[str], loaded_param_group[_PARAMS]):
                         pg_mapping[id(loaded_param_group)] = len(return_osd[_PG]) - 1
 
     for param_group in cast(ListDictValueType, optim_state_dict[_PG]):
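
Putting the two pieces together: `_verify_options` now records, for every parameter reached through more than one FQN, both the merged FQN set and an FQN-keyed `shared_params_mapping`, and `_split_optim_state_dict` consults that mapping so an FQN of a shared parameter is skipped when no loaded param_group contains it. Below is a hedged, plain-Python sketch of that bookkeeping; it uses `named_parameters(remove_duplicate=False)` as a stand-in for the library's `_iterate_valid_model_state`/`_get_fqns` helpers and is not the library code.

# Simplified, illustrative re-implementation of the bookkeeping added above.
from typing import Dict, List, Set, Union

import torch
import torch.nn as nn


def build_mappings(model: nn.Module):
    # Values are either a set of FQNs (when keyed by parameter) or a parameter
    # (when keyed by FQN), mirroring the structure used in _verify_options.
    fqn_param_mapping: Dict[Union[str, torch.Tensor], Union[Set[str], torch.Tensor]] = {}
    shared_params_mapping: Dict[Union[str, torch.Tensor], Union[Set[str], torch.Tensor]] = {}
    for name, param in model.named_parameters(remove_duplicate=False):
        fqns = {name}  # stand-in for _get_fqns(model, name)
        if param in fqn_param_mapping:
            # A parameter already seen under another FQN: merge the FQN sets
            # and remember the parameter as shared.
            fqn_param_mapping[param].update(fqns)
            shared_params_mapping[param] = fqn_param_mapping[param]
        else:
            fqn_param_mapping[param] = set(fqns)
        for fqn in fqns:
            fqn_param_mapping[fqn] = param
    # Also key shared_params_mapping by FQN so "fqn in shared_params_mapping"
    # is a cheap membership test while splitting the optimizer state_dict.
    for param_, fqns_ in list(shared_params_mapping.items()):
        for fqn in fqns_:
            shared_params_mapping[fqn] = param_
    return fqn_param_mapping, shared_params_mapping


def can_skip_fqn(
    fqn: str,
    shared_params_mapping: Dict[Union[str, torch.Tensor], Union[Set[str], torch.Tensor]],
    loaded_param_groups: List[Dict[str, List[str]]],
) -> bool:
    # Mirrors the new check in _split_optim_state_dict: an FQN of a shared
    # parameter is dropped only when no loaded param_group ("params" key, the
    # _PARAMS constant in the real code) lists it.
    if fqn not in shared_params_mapping:
        return False
    return not any(fqn in pg["params"] for pg in loaded_param_groups)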
