Make clearer about zero_init requirements #29879

Merged: 8 commits, Apr 3, 2024
Changes from 4 commits
13 changes: 12 additions & 1 deletion src/transformers/integrations/deepspeed.py
@@ -17,6 +17,7 @@
 import copy
 import importlib.metadata as importlib_metadata
 import importlib.util
+import os
 import weakref
 from functools import partialmethod

@@ -282,9 +283,19 @@ def unset_hf_deepspeed_config():
     _hf_deepspeed_config_weak_ref = None


-def is_deepspeed_zero3_enabled():
+def is_deepspeed_zero3_enabled(check_accelerate=False):
+    """
+    If `check_accelerate`, will also check whether ZeRO-3 has been enabled through the
+    environment variables set up during `accelerate launch`, and return a
+    `(zero3_enabled, config_initialized)` tuple instead of a single bool.
+    """
+    accelerate_zero_stage = int(os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE", -1))
+    accelerate_zero_init = os.environ.get("ACCELERATE_DEEPSPEED_ZERO3_INIT", "0")
     if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None:
-        return _hf_deepspeed_config_weak_ref().is_zero3()
+        zero3_enabled = _hf_deepspeed_config_weak_ref().is_zero3()
+        return (zero3_enabled, True) if check_accelerate else zero3_enabled
+    # This only gets triggered passively if the user launches code with a configured
+    # `accelerate launch` without creating `TrainingArguments` first
+    elif check_accelerate and accelerate_zero_stage == 3 and accelerate_zero_init != "0":
+        return True, False
     else:
-        return False
+        return (False, False) if check_accelerate else False

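To make the new contract concrete, here is a minimal usage sketch (not part of the diff; the environment variable values mirror what `accelerate launch` is assumed to export for a ZeRO-3 config):

```python
import os

from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

# Simulate a process started by `accelerate launch` with a ZeRO-3 config but
# no `TrainingArguments` yet (so no HfDeepSpeedConfig has been registered).
os.environ["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = "3"
os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"  # assumed launcher value

zero3_enabled, config_initialized = is_deepspeed_zero3_enabled(check_accelerate=True)
assert zero3_enabled and not config_initialized  # zero.init() wanted, env not ready

# The bare call keeps the old single-bool behavior for existing callers.
assert is_deepspeed_zero3_enabled() is False
```

A `(True, False)` result is exactly the state the new `ValueError` in `modeling_utils.py` below guards against.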
20 changes: 18 additions & 2 deletions src/transformers/modeling_utils.py
@@ -1312,7 +1312,15 @@ def _from_config(cls, config, **kwargs):
                torch_dtype=torch_dtype,
            )

-        if is_deepspeed_zero3_enabled():
+        deepspeed_enabled, accelerate_enabled = is_deepspeed_zero3_enabled(check_accelerate=True)
+
+        if deepspeed_enabled:
+            if not accelerate_enabled:
+                raise ValueError(
+                    "Detected that you want to use ZeRO-3 init, but the environment "
+                    "has not been set up yet. Please create `TrainingArguments` before "
+                    "initializing the model."
+                )
             import deepspeed

             logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
@@ -3386,7 +3394,15 @@ def from_pretrained(
         # Instantiate model.
         init_contexts = [no_init_weights(_enable=_fast_init)]

-        if is_deepspeed_zero3_enabled() and not is_quantized:
+        deepspeed_enabled, accelerate_enabled = is_deepspeed_zero3_enabled(check_accelerate=True)
+
+        if deepspeed_enabled and not is_quantized:
+            if not accelerate_enabled:
+                raise ValueError(
+                    "Detected that you want to use ZeRO-3 init, but the environment "
+                    "has not been set up yet. Please create `TrainingArguments` before "
+                    "initializing the model."
+                )
             import deepspeed

             logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
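The same ordering applies on the `from_pretrained` path; a minimal sketch of what the new error catches (checkpoint name is a placeholder):

```python
from transformers import AutoModelForCausalLM, TrainingArguments

# Under `accelerate launch` with a ZeRO-3 config, this would previously load
# full weights on every rank; with this PR it raises the ValueError above:
#     model = AutoModelForCausalLM.from_pretrained("gpt2")

# Correct order: set up TrainingArguments (and thus the DeepSpeed config) first.
args = TrainingArguments(output_dir="out", deepspeed="ds_config.json")
model = AutoModelForCausalLM.from_pretrained("gpt2")  # enters zero.init()
```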
5 changes: 5 additions & 0 deletions src/transformers/training_args.py
@@ -503,6 +503,11 @@ class TrainingArguments:
            evolve in the future. The value is either the location of DeepSpeed json config file (e.g.,
            `ds_config.json`) or an already loaded json file as a `dict`"

+            <Tip warning={true}>
+            If enabling any ZeRO init, make sure that your model is not instantiated until
+            *after* creating the `TrainingArguments`; otherwise `zero.init()` will not be applied.
+            </Tip>
+
         accelerator_config (`str`, `dict`, or `AcceleratorConfig`, *optional*):
            Config to be used with the internal `Accelerator` implementation. The value is either a location of
            accelerator json config file (e.g., `accelerator_config.json`), an already loaded json file as `dict`,