NVIDIA · dongxuy04 · Aug 12, 2025 · Aug 11, 2025 · Aug 12, 2025
@@ -533,7 +533,7 @@ def forward_chunk(
                             self.fc31_input_scale,
                             self.scaling_vector_size,
                             sfUseUE8M0=False,
-                            swizzedLayout=False)
+                            isSfSwizzledLayout=False)
                     x_sf = x_sf.view((x_row, -1))
 
             elif self.has_deepseek_fp8_block_scales:

@@ -347,8 +347,8 @@ def load_expert_w3_w1_weight(self, module: torch.nn.Module,
         Load w1 and w3 weights for each expert.
         Override this method if you need to preprocess the weights differently.
         """
+        # device doesn't have to be 'cuda', e.g. 'cpu' for online EPLB
         device = dst_w3_w1_weight.device
-        assert device.type == "cuda"
         w1_weight_shard = load_weight_shard(w1_weight,
                                             module.tp_size,
                                             module.tp_rank,
@@ -372,8 +372,8 @@ def load_expert_w2_weight(self, module: torch.nn.Module,
         Load w2 weight for each expert.
         Override this method if you need to preprocess the weights differently.
         """
+        # device doesn't have to be 'cuda', e.g. 'cpu' for online EPLB
         device = dst_w2_weight.device
-        assert device.type == "cuda"
         w2_weight_shard = load_weight_shard(w2_weight,
                                             module.tp_size,
                                             module.tp_rank,
@@ -1537,8 +1537,8 @@ def load_expert_w3_w1_weight_scale_nvfp4(
             self, module: torch.nn.Module, w1_weight_scale: torch.Tensor,
             w3_weight_scale: torch.Tensor,
             dst_w3_w1_weight_scale: torch.Tensor):
+        # device doesn't have to be 'cuda', e.g. 'cpu' for online EPLB
         device = dst_w3_w1_weight_scale.device
-        assert device.type == "cuda"
         w1_weight_scale = load_weight_shard(w1_weight_scale,
                                             module.tp_size,
                                             module.tp_rank,
@@ -1577,8 +1577,8 @@ def load_expert_w3_w1_weight_scale_nvfp4(
     def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
                                           w2_weight_scale: torch.Tensor,
                                           dst_w2_weight_scale: torch.Tensor):
+        # device doesn't have to be 'cuda', e.g. 'cpu' for online EPLB
         device = dst_w2_weight_scale.device
-        assert device.type == "cuda"
         w2_weight_scale = load_weight_shard(w2_weight_scale,
                                             module.tp_size,
                                             module.tp_rank,