
Commit 72b814e

emlin authored and facebook-github-bot committed
fix non sharding model publish (#3333)
Summary:
Pull Request resolved: #3333

Since the ZCH weight tensor includes extra metadata in the checkpoint, we need to shift the start and end column indices used for weight processing. Currently, weight processing relies on the DI sharding pass to generate shifted_weight_shard, but not every model has DI sharding. This diff adds a default behavior that generates shifted_weight_shard when it is empty and the tensor is a ZCH weight tensor.

Reviewed By: EddyLXJ

Differential Revision: D80434727

fbshipit-source-id: a40903217e4c661780915b3675bc25bca0f4b9c0
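A minimal sketch of the fallback described in the summary above. The publish-path code itself is not part of this diff, so the names here (WeightSpec, shard_offsets, is_zch_weight, ensure_shifted_weight_shard) are hypothetical stand-ins for the idea: when the DI sharding pass has not populated shifted_weight_shard, derive it by offsetting the column range past the eviction metaheader.

# Hypothetical sketch only -- the real publish code is not shown in this diff.
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class WeightSpec:  # hypothetical stand-in for the real weight spec
    shard_offsets: Tuple[int, int]  # (start_col, end_col) of the weight shard
    shifted_weight_shard: Optional[Tuple[int, int]] = None
    is_zch_weight: bool = False
    meta_header_len: int = 0  # metaheader length in elements (dtype-dependent)


def ensure_shifted_weight_shard(spec: WeightSpec) -> WeightSpec:
    # DI sharding normally fills shifted_weight_shard; for models without
    # DI sharding, fall back to a default for ZCH weight tensors.
    if spec.shifted_weight_shard is None and spec.is_zch_weight:
        start_col, end_col = spec.shard_offsets
        # Shift past the metaheader columns that precede the embedding values.
        spec.shifted_weight_shard = (
            start_col + spec.meta_header_len,
            end_col + spec.meta_header_len,
        )
    return spec

The key point is that the shift amount is the metaheader length in elements, which depends on the weight dtype (see the embedding_configs.py change below).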
1 parent 61b7449 commit 72b814e

File tree

2 files changed: +21 -8 lines changed

torchrec/distributed/tests/test_infer_shardings.py
torchrec/modules/embedding_configs.py

torchrec/distributed/tests/test_infer_shardings.py

Lines changed: 15 additions & 7 deletions
@@ -380,7 +380,9 @@ def test_rw_with_virtual_table_eviction(
         batch_size = 4
         local_device = torch.device(f"{device_type}:0")
         eviction_policy = TimestampBasedEvictionPolicy()
-        eviction_policy.init_metaheader_config(dtype_to_data_type(torch.float16))
+        eviction_policy.init_metaheader_config(
+            dtype_to_data_type(torch.float16), emb_dim
+        )
         mi = create_test_model(
             num_embeddings,
             emb_dim,
@@ -392,6 +394,13 @@ def test_rw_with_virtual_table_eviction(
             weight_dtype=weight_dtype,
             virtual_table_eviction_policy=eviction_policy,
         )
+        for t in mi.tables:
+            self.assertIsNotNone(t.virtual_table_eviction_policy)
+            self.assertEqual(
+                # pyre-ignore [16]
+                t.virtual_table_eviction_policy.get_embedding_dim(),
+                emb_dim,
+            )
 
         non_sharded_model = mi.quant_model
         num_emb_half = num_embeddings // 2
@@ -430,19 +439,18 @@ def test_rw_with_virtual_table_eviction(
             ["table_0"],
             ShardingType.ROW_WISE.value,
         )
-        print(weights_spec)
-        assert (
+
+        self.assertIsNotNone(
             weights_spec[
                 "_module.sparse.ebc.tbes.0.0.table_0.weight"
             ].virtual_table_dim_offsets
-            is not None
         )
-        assert (
+        self.assertEqual(
             # pyre-ignore [16]
             weights_spec[
                 "_module.sparse.ebc.tbes.0.0.table_0.weight"
-            ].virtual_table_dim_offsets[0]
-            == 8
+            ].virtual_table_dim_offsets[0],
+            8,
         )
 
     @unittest.skipIf(
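For reference, a hedged usage sketch of the updated init_metaheader_config signature exercised by this test. It assumes TimestampBasedEvictionPolicy and dtype_to_data_type are importable from torchrec.modules.embedding_configs; the test's own import block is not shown in this diff.

import torch

# Assumed import location; not shown in this commit's diff.
from torchrec.modules.embedding_configs import (
    TimestampBasedEvictionPolicy,
    dtype_to_data_type,
)

emb_dim = 16  # example embedding dimension
policy = TimestampBasedEvictionPolicy()
# The embedding dimension is now passed alongside the training data type.
policy.init_metaheader_config(dtype_to_data_type(torch.float16), emb_dim)
assert policy.get_embedding_dim() == emb_dim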

torchrec/modules/embedding_configs.py

Lines changed: 6 additions & 1 deletion
@@ -172,24 +172,29 @@ def data_type_to_dtype(data_type: DataType) -> torch.dtype:
 class VirtualTableEvictionPolicy:
     # metadata header length in element size for virtual table in weight tensor value
     meta_header_len: int = 0
+    embedding_dim: int = 0
     initialized: bool = False
 
     """
     Eviction policy for virtual table.
     """
 
-    def init_metaheader_config(self, data_type: DataType) -> None:
+    def init_metaheader_config(self, data_type: DataType, embedding_dim: int) -> None:
         # the eviction metaheader is set for training data type only. Once initialized, we don't need to reinitialize again
         if self.initialized:
             return
         # 8 bytes for key, 4 bytes timestamp, 4 bytes shared by used and count: 1 bit for used, 31 bits for count
         # for more details, please refer to: https://github.com/pytorch/FBGEMM/pull/4187
         self.meta_header_len = 16 // data_type_to_dtype(data_type).itemsize
+        self.embedding_dim = embedding_dim
         self.initialized = True
 
     def get_meta_header_len(self) -> int:
         return self.meta_header_len
 
+    def get_embedding_dim(self) -> int:
+        return self.embedding_dim
+
 
 @dataclass
 class CountBasedEvictionPolicy(VirtualTableEvictionPolicy):
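A quick check of the metaheader arithmetic above, and of why the test expects virtual_table_dim_offsets[0] == 8: the 16-byte metaheader divided by the element size of the training dtype gives the header length in elements.

import torch

META_HEADER_BYTES = 16  # 8-byte key + 4-byte timestamp + 4 bytes used/count

# For float16 (2 bytes per element), the metaheader occupies 16 // 2 = 8
# elements, consistent with the test asserting
# virtual_table_dim_offsets[0] == 8.
meta_header_len = META_HEADER_BYTES // torch.float16.itemsize
assert meta_header_len == 8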
