
Commit

[lora] Add tests in multi LoRA integration test
xyang16 committed Nov 15, 2024
1 parent 06bd249 commit 98af0a3
Showing 4 changed files with 381 additions and 130 deletions.
20 changes: 10 additions & 10 deletions serving/docs/adapters.md
@@ -29,16 +29,16 @@ More details can be found in the user guide.

Here are the settings that are available when using LoRA Adapter.

| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
|----------------------------------|----------------------------------|-------------|--------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------|
| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
| option.max_loras                 | OPTION_MAX_LORAS                 | \>= 0.27.0  | Pass Through       | This config determines the maximum number of LoRA adapters that can be run at once. GPU memory is allocated for that number of adapters.                                                                                                                                                 | Default: `4`                         |
| option.max_lora_rank             | OPTION_MAX_LORA_RANK             | \>= 0.27.0  | Pass Through       | This config determines the maximum rank allowed for a LoRA adapter. Set this value to the maximum rank of your adapters. Setting a larger value enables more adapters at a greater memory usage cost.                                                                                    | Default: `16`                        |
| option.max_cpu_loras             | OPTION_MAX_CPU_LORAS             | \>= 0.27.0  | Pass Through       | Maximum number of LoRA adapters to store in CPU memory. Must be >= max_loras. Defaults to max_loras.                                                                                                                                                                                     | Default: `None`                      |
| option.fully_sharded_loras       | OPTION_FULLY_SHARDED_LORAS       | \>= 0.31.0  | Pass Through       | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this uses the fully sharded layers. At high sequence lengths, large max rank, or large tensor parallel size, this is likely faster.                                                            | Default: `true`                      |
| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can be different from base model scaling factor) to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None`. Example: "3.0,4.0". |
| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16, float32. If auto, will default to base model dtype. | Default: `auto` |
| Item | Environment Variable | LMI Version | Configuration Type | Description | Example value |
|----------------------------------|----------------------------------|-------------|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------|
| option.enable_lora | OPTION_ENABLE_LORA | \>= 0.27.0 | Pass Through | This config enables support for LoRA adapters. | Default: `false` |
| option.max_loras                 | OPTION_MAX_LORAS                 | \>= 0.27.0  | Pass Through       | This config determines the maximum number of LoRA adapters that can be run at once. GPU memory is allocated for that number of adapters.                                                                                                                                                 | Default: `4`                         |
| option.max_lora_rank             | OPTION_MAX_LORA_RANK             | \>= 0.27.0  | Pass Through       | This config determines the maximum rank allowed for a LoRA adapter. Set this value to the maximum rank of your adapters. Setting a larger value enables more adapters at a greater memory usage cost.                                                                                    | Default: `16`                        |
| option.max_cpu_loras             | OPTION_MAX_CPU_LORAS             | \>= 0.27.0  | Pass Through       | Maximum number of LoRA adapters to store in CPU memory. Must be >= max_loras. Defaults to max_loras.                                                                                                                                                                                     | Default: `None`                      |
| option.fully_sharded_loras       | OPTION_FULLY_SHARDED_LORAS       | \>= 0.31.0  | Pass Through       | By default, only half of the LoRA computation is sharded with tensor parallelism. Enabling this uses the fully sharded layers. At high sequence lengths, large max rank, or large tensor parallel size, this is likely faster.                                                            | Default: `true`                      |
| option.lora_extra_vocab_size | OPTION_LORA_EXTRA_VOCAB_SIZE | \>= 0.31.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
| option.long_lora_scaling_factors | OPTION_LONG_LORA_SCALING_FACTORS | \>= 0.31.0 | Pass Through | Specify multiple scaling factors (which can be different from base model scaling factor) to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time. If not specified, only adapters trained with the base model scaling factor are allowed. | Default: `None`. Example: "3.0,4.0". |
| option.lora_dtype | OPTION_LORA_DTYPE | \>= 0.31.0 | Pass Through | Data type for LoRA. Valid values are auto, float16, bfloat16. If auto, will default to base model dtype. | Default: `auto` |
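
These are Pass Through configurations, so they can be supplied either as the `OPTION_*` environment variables listed above or as `option.*` keys in a `serving.properties` file. A minimal sketch follows; the model id and values are illustrative placeholders, not settings taken from this commit:

```properties
# Illustrative serving.properties sketch for multi-LoRA serving
# (option.model_id and the values below are placeholders, not from this commit)
option.model_id=TheBloke/Llama-2-7B-fp16
option.enable_lora=true
option.max_loras=4
option.max_lora_rank=16
option.max_cpu_loras=8
option.lora_dtype=auto
```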

## Managing Adapters

138 changes: 97 additions & 41 deletions tests/integration/llm/client.py
@@ -330,48 +330,76 @@ def get_model_name():
"tokenizer": "TheBloke/Llama-2-13B-fp16",
},
"llama-7b-unmerged-lora": {
"max_memory_per_gpu": [15.0, 15.0],
"max_memory_per_gpu": [15.0, 15.0, 20.0],
"batch_size": [3],
"seq_length": [16, 32],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["english-alpaca", "portugese-alpaca", "english-alpaca"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
"tokenizer": "TheBloke/Llama-2-7B-fp16"
},
"llama2-13b-awq-unmerged-lora": {
"batch_size": [3],
"llama-7b-unmerged-lora-overflow": {
"max_memory_per_gpu": [15.0, 15.0, 20.0],
"batch_size": [4],
"seq_length": [16, 32],
"worker": 1,
"adapters": [f"english-alpaca-{i}" for i in range(20)],
"tokenizer": "TheBloke/Llama-2-7B-fp16"
},
"llama2-13b-awq-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["french", "spanish"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
},
"mistral-7b-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
"tokenizer": "unsloth/mistral-7b-instruct-v0.2"
},
"mistral-7b-awq-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
"tokenizer": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
},
"llama-7b-unmerged-lora-overflow": {
"max_memory_per_gpu": [15.0, 15.0],
"batch_size": [3],
"seq_length": [16, 32],
"mistral-7b-gptq-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": [f"english-alpaca-{i}" for i in range(20)],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
"adapters": ["spanish", "german"],
"tokenizer": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
},
"mixtral-tiny-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["test"],
"tokenizer": "yujiepan/mixtral-tiny-random"
},
"llama3-8b-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["french", "spanish"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
"tokenizer": "unsloth/llama-3-8b-Instruct"
},
"gemma-7b-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["alpaca"],
"tokenizer": "unsloth/gemma-7b"
},
"phi2-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapter_names": ["sql"],
"tokenizer": "microsoft/phi-2"
},
"llama-2-tiny": {
"max_memory_per_gpu": [23.0],
@@ -516,48 +544,76 @@ def get_model_name():
"tokenizer": "JackFram/llama-68m"
},
"llama-7b-unmerged-lora": {
"max_memory_per_gpu": [15.0, 15.0],
"max_memory_per_gpu": [15.0, 15.0, 20.0],
"batch_size": [3],
"seq_length": [16, 32],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["english-alpaca", "portugese-alpaca", "english-alpaca"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
"tokenizer": "TheBloke/Llama-2-7B-fp16"
},
"llama2-13b-awq-unmerged-lora": {
"batch_size": [3],
"llama-7b-unmerged-lora-overflow": {
"max_memory_per_gpu": [15.0, 15.0, 20.0],
"batch_size": [4],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["russian", "spanish"],
"adapters": [f"english-alpaca-{i}" for i in range(20)],
"tokenizer": "TheBloke/Llama-2-7B-fp16"
},
"llama2-13b-awq-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["french", "spanish"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
},
"mistral-7b-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
"tokenizer": "unsloth/mistral-7b-instruct-v0.2"
},
"mistral-7b-awq-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "amazon/MegaBeam-Mistral-7B-300k"
"tokenizer": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"
},
"llama-7b-unmerged-lora-overflow": {
"max_memory_per_gpu": [15.0, 15.0],
"batch_size": [3],
"seq_length": [16, 32],
"mistral-7b-gptq-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": [f"english-alpaca-{i}" for i in range(20)],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
"adapters": ["spanish", "german"],
"tokenizer": "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
},
"mixtral-tiny-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["test"],
"tokenizer": "yujiepan/mixtral-tiny-random"
},
"llama3-8b-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["french", "spanish"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
"tokenizer": "unsloth/llama-3-8b-Instruct"
},
"gemma-7b-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapters": ["alpaca"],
"tokenizer": "unsloth/gemma-7b"
},
"phi2-unmerged-lora": {
"batch_size": [4],
"seq_length": [16, 32, 1024],
"worker": 1,
"adapter_names": ["sql"],
"tokenizer": "microsoft/phi-2"
},
"starcoder2-7b": {
"max_memory_per_gpu": [25.0],
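
Each spec above pairs a base model tokenizer with a list of adapter names that the integration test exercises. As a hypothetical sketch of how a client could cycle those adapters across requests, see below; the endpoint URL and the `adapters` payload field are assumptions for illustration, not taken from `client.py`:

```python
# Hypothetical multi-LoRA request loop (endpoint and payload shape are assumed,
# not read from client.py).
import requests

spec = {
    "batch_size": [4],
    "seq_length": [16, 32, 1024],
    "adapters": ["spanish", "german"],  # adapter names assumed to be registered with the server
}

prompts = ["Translate to the adapter's language: good morning"] * spec["batch_size"][0]

for i, prompt in enumerate(prompts):
    adapter = spec["adapters"][i % len(spec["adapters"])]  # round-robin adapters across the batch
    payload = {
        "inputs": prompt,
        "adapters": adapter,                               # assumed field selecting the LoRA adapter
        "parameters": {"max_new_tokens": spec["seq_length"][0]},
    }
    resp = requests.post("http://127.0.0.1:8080/invocations", json=payload, timeout=60)
    resp.raise_for_status()
    print(adapter, resp.json())
```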