
add configurations and minor updates
fabianlim committed Aug 19, 2024
1 parent 0f48421 commit 736d3c1
Showing 8 changed files with 36 additions and 23 deletions.
2 changes: 1 addition & 1 deletion plugins/accelerated-moe/README.md
@@ -9,5 +9,5 @@ This library contains plugins to accelerate finetuning with the following optimi
Currently databricks megablocks is not published on PyPI and does not have a proper release, so we have to install from the GitHub repository as below. Please note that installing from GitHub will require the CUDA Toolkit to build.

```
-pip install git+https://github.com/databricks/megablocks.git
+pip install git+https://github.com/databricks/megablocks.git@bce5d7b2aaf5038bc93b36f76c2baf51c2939bd2
```
@@ -44,7 +44,7 @@ def requires_custom_loading(self):
def model_loader(self, model_name: str, **kwargs):
# guarded
from .megablocks_utils.config_utils import update_mlp_registry
-from megablocks_utils.shard_moe_utils import shard_moe, get_moe_kwargs
+from .megablocks_utils.shard_moe_utils import shard_moe, get_moe_kwargs

# this one does a forward patching on MLP, but needs to be fixed
# properly as the load balancing loss is currently not properly
@@ -88,6 +88,8 @@ def model_loader(self, model_name: str, **kwargs):
),
)

+return model

def get_callbacks_and_ready_for_train(
self, model: torch.nn.Module = None, accelerator=None
):
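The second hunk adds the `return model` that `model_loader` was previously missing. A small, self-contained sketch of that failure mode, using made-up stand-ins rather than the plugin's real classes and helpers:

```python
# Illustrative only: a loader that shards the model in place but forgets to
# return it hands None back to the caller, which is the bug this hunk fixes.
class DummyModel:
    def __init__(self):
        self.sharded = False


def shard_in_place(model: DummyModel) -> None:
    model.sharded = True  # mutation only, nothing is returned


def model_loader_without_return(model_name: str):
    model = DummyModel()
    shard_in_place(model)
    # missing "return model" -> the caller receives None


def model_loader_with_return(model_name: str) -> DummyModel:
    model = DummyModel()
    shard_in_place(model)
    return model  # the line added in this commit


assert model_loader_without_return("mixtral") is None
assert model_loader_with_return("mixtral").sharded is True
```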
@@ -14,10 +14,8 @@ class SparseMLPv2(torch.nn.Module):
def __init__(self, args : Arguments):
super().__init__()
self.args = args
-self._num_rows_per_rank = (
-(mpu.experts_per_rank(args) * mpu.features_per_rank(args)) //
-mpu.get_weight_parallel_world_size(args)
-)
+self._num_rows_per_rank = mpu.experts_per_rank(args) * mpu.features_per_rank(args)


self.w1 = torch.nn.Parameter(torch.empty(
self._num_rows_per_rank,
@@ -46,8 +44,7 @@ def __init__(self, args : Arguments):
args, args.moe_num_experts, args.ffn_hidden_size,
args.hidden_size, args.output_layer_init_method))

-self._should_set_parallelism_attribute = (
-args.moe_expert_model_parallelism or args.moe_weight_parallelism)
+self._should_set_parallelism_attribute = args.moe_expert_model_parallelism
mpu.set_expert_model_parallel_attributes(
self.w1, self._should_set_parallelism_attribute)
mpu.set_expert_model_parallel_attributes(
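These two hunks remove weight parallelism from `SparseMLPv2`: the local row count is no longer divided by the weight-parallel world size, and the parallelism attribute now depends only on `moe_expert_model_parallelism`. A rough sketch of the shape arithmetic with made-up sizes; the `mpu` helpers are approximated here, not megablocks' actual implementations:

```python
# Hypothetical sizes, for illustration only.
num_experts = 8          # args.moe_num_experts
ffn_hidden_size = 14336  # args.ffn_hidden_size
expert_parallel_world_size = 4
weight_parallel_world_size = 2

# Rough stand-ins for mpu.experts_per_rank(args) / mpu.features_per_rank(args).
experts_per_rank = num_experts // expert_parallel_world_size
features_per_rank = ffn_hidden_size

# Before this commit: rows were further split across weight-parallel ranks.
rows_before = (experts_per_rank * features_per_rank) // weight_parallel_world_size

# After this commit: only expert parallelism shards the expert weights.
rows_after = experts_per_rank * features_per_rank

print(rows_before, rows_after)  # 14336 28672
```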
2 changes: 1 addition & 1 deletion plugins/framework/src/fms_acceleration/constants.py
@@ -21,4 +21,4 @@
# and activated.
# - hence the plugins that have model loaders should be on top of this list

PLUGINS = ["peft", "foak", "aadp"]
PLUGINS = ["peft", "foak", "aadp", "moe"]
7 changes: 6 additions & 1 deletion sample-configurations/CONTENTS.yaml
@@ -36,4 +36,9 @@ framework_configs:
- shortname: aadp-padding-free
plugins:
- attention-and-distributed-packing
-filename: aadp-padding-free-sample-configuration.yaml
+filename: aadp-padding-free-sample-configuration.yaml

+- shortname: moe-megablocks
+plugins:
+- accelerated-moe
+filename: moe-megablocks-sample-configuration.yaml
14 changes: 14 additions & 0 deletions sample-configurations/moe-megablocks-sample-configuration.yaml
@@ -0,0 +1,14 @@
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  training:

    # mixture-of-experts configurations
    moe:

      # expert-parallel for MoE
      megablocks:

        dummy: 1
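The new sample file nests the Megablocks settings under `plugins -> training -> moe -> megablocks`, currently holding only a placeholder `dummy: 1`. A minimal sketch for sanity-checking that path with plain PyYAML, assuming the nesting shown above; this is not the framework's own configuration loader:

```python
import yaml  # requires pyyaml

# Illustrative only: confirm the stanza path used by the sample configuration.
path = "sample-configurations/moe-megablocks-sample-configuration.yaml"
with open(path, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

stanza = cfg["plugins"]["training"]["moe"]["megablocks"]
print(stanza)  # expected: {'dummy': 1}
```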
18 changes: 5 additions & 13 deletions scripts/benchmarks/scenarios.yaml
@@ -67,20 +67,12 @@ scenarios:
- 'mistralai/Mixtral-8x7B-Instruct-v0.1'
- 'NousResearch/Llama-2-70b-hf'

-- name: accelerated-peft-gptq
+- name: accelerated-moe-megablocks
framework_config:
-- accelerated-peft-autogptq
-- accelerated-peft-autogptq-foak
+- moe-megablocks
arguments:
learning_rate: 2e-4
-fp16: True
-torch_dtype: float16
-peft_method: lora
-r: 16
-lora_alpha: 16
-lora_dropout: 0.1
-target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
+bf16: True
+torch_dtype: bfloat16
model_name_or_path:
-- 'TheBloke/Mistral-7B-v0.1-GPTQ'
-- 'TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ'
-- 'TheBloke/Llama-2-70B-GPTQ'
+- 'mistralai/Mixtral-8x7B-Instruct-v0.1'
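The rewritten scenario benchmarks the unquantized Mixtral checkpoint with the Megablocks config in bfloat16, replacing the fp16 GPTQ-LoRA settings. As a side note for reproducing it, a quick capability check in plain PyTorch (illustrative only, not part of scenarios.yaml or the benchmark scripts):

```python
import torch

# bf16 training as configured above needs hardware support (e.g. Ampere or newer GPUs).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
dtype = torch.bfloat16 if use_bf16 else torch.float16  # fp16 fallback mirrors the old scenario
print("selected dtype:", dtype)
```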
3 changes: 3 additions & 0 deletions scripts/generate_sample_configurations.py
@@ -145,6 +145,7 @@ def read_configuration(path: str) -> Dict:
KEY_AUTO_GPTQ_FOAK = "auto-gptq-foak"
KEY_BNB_NF4_FOAK = "bnb-nf4-foak"
KEY_AADP_PADDING_FREE = "aadp-padding-free"
+KEY_MEGABLOCKS = "moe-megablocks"

CONFIGURATIONS = {
KEY_AUTO_GPTQ: "plugins/accelerated-peft/configs/autogptq.yaml",
@@ -168,6 +169,7 @@ def read_configuration(path: str) -> Dict:
[("peft.quantization.fused_ops_and_kernels.base_layer", "bitsandbytes")],
),
KEY_AADP_PADDING_FREE: "plugins/attention-and-distributed-packing/configs/aadp.yaml",
+KEY_MEGABLOCKS: "plugins/accelerated-moe/configs/megablocks.yaml",
}

# list of (tag, combi) tuples
@@ -182,6 +184,7 @@ def read_configuration(path: str) -> Dict:
("accelerated-peft-autogptq-foak", (KEY_AUTO_GPTQ, KEY_AUTO_GPTQ_FOAK)),
("accelerated-peft-bnb-nf4-foak", (KEY_BNB_NF4, KEY_BNB_NF4_FOAK)),
("aadp-padding-free", (KEY_AADP_PADDING_FREE,)),
("moe-megablocks", (KEY_MEGABLOCKS,)),
]


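The three additions register the new plugin with the generator: a key, the path of its config fragment, and a single-key combination tagged `moe-megablocks`. A rough illustration of how such a (tag, key-combination) entry resolves to configuration paths; the script's actual YAML merging is not reproduced here:

```python
# Stand-alone illustration built only from the entries added in this commit.
KEY_MEGABLOCKS = "moe-megablocks"

CONFIGURATIONS = {
    KEY_MEGABLOCKS: "plugins/accelerated-moe/configs/megablocks.yaml",
}

COMBINATIONS = [
    ("moe-megablocks", (KEY_MEGABLOCKS,)),
]

for tag, keys in COMBINATIONS:
    paths = [CONFIGURATIONS[key] for key in keys]
    # CONTENTS.yaml pairs this tag with moe-megablocks-sample-configuration.yaml,
    # which the generator presumably produces from the fragments listed here.
    print(tag, "->", paths)
```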

