address issue 2 in #83

foundation-model-stack · Oct 14, 2024 · 1f8cc16 · 1f8cc16
1 parent a50ff63
commit 1f8cc16
Showing 1 changed file with 11 additions and 2 deletions.
diff --git a/...s/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py b/...s/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py
@@ -54,10 +54,19 @@ def _all_reduce_hook(grad):
 
         # because FSDP will ignore these, we need to manually
         # move them to the GPU if they are not already on it
+        # - if the adapters are on the meta device, we assume this is for
+        #   FSDP low_cpu_mem_mode, and that the real values will be synced
+        #   over later, so just initialize them to empty tensors.
         if not A.weight.is_cuda:
-            set_module_tensor_to_device(A, "weight", "cuda")
+            value = None
+            if A.weight.device == torch.device('meta'):
+                value = torch.empty(*A.weight.size(), dtype=A.weight.dtype)
+            set_module_tensor_to_device(A, "weight", "cuda", value)
         if not B.weight.is_cuda:
-            set_module_tensor_to_device(B, "weight", "cuda")
+            value = None
+            if B.weight.device == torch.device('meta'):
+                value = torch.empty(*B.weight.size(), dtype=B.weight.dtype)
+            set_module_tensor_to_device(B, "weight", "cuda", value)
 
 
 def register_foak_model_patch_rules(base_type):