
Commit f99b667

[CPU offload optim] Fix when there are non-trainable params (#1210)
fix non-trainable params
1 parent 59dab15 commit f99b667

File tree: 2 files changed, +4 −0 lines


test/prototype/test_low_bit_optim.py

Lines changed: 1 addition & 0 deletions
@@ -211,6 +211,7 @@ def test_optim_4bit_correctness(self, optim_name):
    def test_optim_cpu_offload_correctness(self, offload_grad, grad_accum):
        device = "cuda"
        model1 = nn.Sequential(nn.Linear(32, 1024), nn.ReLU(), nn.Linear(1024, 128)).to(device)
+       model1[0].requires_grad_(False)  # make sure it can work in the presence of non-trainable params
        model2 = copy.deepcopy(model1)

        optim1 = torch.optim.AdamW(model1.parameters())

torchao/prototype/low_bit_optim/cpu_offload.py

Lines changed: 3 additions & 0 deletions
@@ -66,6 +66,9 @@ def backward_hook(p_cuda):
            params = param_group.pop("params")

            for p_cuda in params:
+               if not p_cuda.requires_grad:
+                   continue
+
                # pre-allocate CPU params and grads
                p_cpu = torch.empty_like(p_cuda, device="cpu", pin_memory=True)
                p_cpu.grad = torch.empty_like(p_cpu, pin_memory=True)
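
For context, here is a minimal sketch of the scenario this fix enables: a model with a frozen (non-trainable) layer driven by the CPU-offload optimizer. The import path and constructor arguments of CPUOffloadOptimizer below are assumptions based on the module being patched, not something shown in this diff.

# Sketch only: the CPUOffloadOptimizer import path and constructor signature
# are assumptions about the torchao prototype API, not taken from this commit.
import torch
import torch.nn as nn
from torchao.prototype.low_bit_optim import CPUOffloadOptimizer

model = nn.Sequential(nn.Linear(32, 1024), nn.ReLU(), nn.Linear(1024, 128)).cuda()
model[0].requires_grad_(False)  # frozen layer: previously broke the offload setup

# All parameters can be passed in; with this fix, params with
# requires_grad=False are skipped when pinned CPU buffers and
# backward hooks are set up, so only trainable params are offloaded.
optim = CPUOffloadOptimizer(model.parameters(), torch.optim.AdamW, lr=1e-3)

x = torch.randn(8, 32, device="cuda")
model(x).sum().backward()
optim.step()
optim.zero_grad()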
