[ci] Updating lmi-dist ci tests for rubikon-engine #1651

Merged
1 commit merged on Mar 23, 2024
4 changes: 0 additions & 4 deletions .github/workflows/llm_integration_p4d.yml
@@ -66,7 +66,6 @@ jobs:
python3 llm/client.py lmi_dist_aiccl mixtral-8x7b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
return 1
else
echo "Using aiccl backend"
fi
@@ -81,7 +80,6 @@ jobs:
python3 llm/client.py lmi_dist_aiccl llama-2-70b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
return 1
else
echo "Using aiccl backend"
fi
@@ -96,7 +94,6 @@ jobs:
python3 llm/client.py lmi_dist_aiccl codellama-34b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
return 1
else
echo "Using aiccl backend"
fi
@@ -111,7 +108,6 @@ jobs:
python3 llm/client.py lmi_dist_aiccl falcon-40b-aiccl
if [ "$(docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend')" -lt 8 ]; then
echo "aiccl backend not used"
return 1
else
echo "Using aiccl backend"
fi
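The four deleted lines are the `return 1` statements in these aiccl log checks: each check still reports whether the aiccl backend was picked up, but the `if` branch no longer tries to set a failing status with a shell `return`. For reference, a rough Python equivalent of the check itself (illustrative only, not code from this repository; the log message and the threshold of 8 are taken from the shell above):

```python
# Illustrative Python equivalent of the shell pipeline
#   docker logs $(docker ps -aq) 2>&1 | grep -c 'Starting torch distributed with aiccl backend'
# Not part of the repository; shown only to spell out what the CI step verifies.
import subprocess

def aiccl_backend_used(threshold: int = 8) -> bool:
    # Same as `docker ps -aq`: list all container ids.
    container_ids = subprocess.run(
        ["docker", "ps", "-aq"], capture_output=True, text=True
    ).stdout.split()
    combined_logs = ""
    for cid in container_ids:
        result = subprocess.run(
            ["docker", "logs", cid], capture_output=True, text=True
        )
        combined_logs += result.stdout + result.stderr
    hits = combined_logs.count("Starting torch distributed with aiccl backend")
    if hits < threshold:
        print("aiccl backend not used")
        return False
    print("Using aiccl backend")
    return True
```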
39 changes: 33 additions & 6 deletions .github/workflows/rolling_batch_integration.yml
@@ -299,14 +299,32 @@ jobs:
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py lmi_dist octocoder
docker rm -f $(docker ps -aq)
- name: Test gpt-neox-20b-bitsandbytes
- name: Test speculative-llama-13b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist gpt-neox-20b-bitsandbytes
python3 llm/prepare.py lmi_dist speculative-llama-13b
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py lmi_dist gpt-neox-20b-bitsandbytes
python3 llm/client.py lmi_dist speculative-llama-13b
docker rm -f $(docker ps -aq)
- name: Test starcoder2-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist starcoder2-7b
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py lmi_dist starcoder2-7b
docker rm -f $(docker ps -aq)
- name: Test gemma-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist gemma-7b
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py lmi_dist gemma-7b
docker rm -f $(docker ps -aq)
- name: Test llama2-13b-gptq
working-directory: tests/integration
@@ -426,14 +444,23 @@ jobs:
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py vllm phi-2
docker rm -f $(docker ps -aq)
- name: Test Speculative Decoding with LLAMA 13B model
- name: Test starcoder2-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py vllm starcoder2-7b
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py vllm starcoder2-7b
docker rm -f $(docker ps -aq)
- name: Test gemma-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py vllm speculative-llama-13b
python3 llm/prepare.py vllm gemma-7b
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py vllm speculative-llama-13b
python3 llm/client.py vllm gemma-7b
docker rm -f $(docker ps -aq)
- name: On fail step
if: ${{ failure() }}
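Every added step above follows the same prepare → launch → query → cleanup sequence; only the engine (`lmi_dist` or `vllm`) and the model handle change. A minimal Python sketch of that flow, assuming the scripts and paths shown in the workflow (the wrapper itself is hypothetical and only documents the step order):

```python
# Hypothetical wrapper mirroring the shell steps of each test above; the
# script names, image tag, and model handles come from the workflow diff.
import os
import subprocess

def run_integration_test(engine: str, model: str, docker_tag: str) -> None:
    cwd = "tests/integration"
    models_dir = os.path.abspath(os.path.join(cwd, "models"))
    # rm -rf models && python3 llm/prepare.py <engine> <model>
    subprocess.run(["rm", "-rf", "models"], cwd=cwd, check=True)
    subprocess.run(["python3", "llm/prepare.py", engine, model], cwd=cwd, check=True)
    # ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG ...
    subprocess.run(
        ["./launch_container.sh",
         f"deepjavalibrary/djl-serving:{docker_tag}",
         models_dir, "deepspeed",
         "serve", "-m", "test=file:/opt/ml/model/test/"],
        cwd=cwd, check=True)
    # python3 llm/client.py <engine> <model>
    subprocess.run(["python3", "llm/client.py", engine, model], cwd=cwd, check=True)
    # docker rm -f $(docker ps -aq)
    ids = subprocess.run(["docker", "ps", "-aq"],
                         capture_output=True, text=True).stdout.split()
    if ids:
        subprocess.run(["docker", "rm", "-f", *ids], check=True)

# e.g. run_integration_test("lmi_dist", "gemma-7b", os.environ["DJLSERVING_DOCKER_TAG"])
```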
2 changes: 1 addition & 1 deletion engines/python/setup/djl_python/huggingface.py
@@ -95,7 +95,7 @@ def get_rolling_batch_class_from_str(rolling_batch_type: str, is_mpi: bool,
from djl_python.rolling_batch.scheduler_rolling_batch import SchedulerRollingBatch
return SchedulerRollingBatch
elif rolling_batch_type == "lmi-dist":
from djl_python.rolling_batch.lmi_dist_v2_rolling_batch import LmiDistRollingBatch
from djl_python.rolling_batch.lmi_dist_rolling_batch import LmiDistRollingBatch
return LmiDistRollingBatch
elif rolling_batch_type == "vllm":
from djl_python.rolling_batch.vllm_rolling_batch import VLLMRollingBatch
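With the v2 implementation folded back into `lmi_dist_rolling_batch`, only the import path in the dispatcher changes. A condensed sketch of `get_rolling_batch_class_from_str` after this change (the real signature takes additional arguments and covers more engines; the trailing error branch here is an assumption):

```python
# Condensed sketch of the dispatcher shown above -- simplified signature, and
# the final error is illustrative rather than the function's actual fallthrough.
def get_rolling_batch_class_from_str(rolling_batch_type: str):
    if rolling_batch_type == "scheduler":
        from djl_python.rolling_batch.scheduler_rolling_batch import SchedulerRollingBatch
        return SchedulerRollingBatch
    elif rolling_batch_type == "lmi-dist":
        # Previously imported from lmi_dist_v2_rolling_batch; the v2 module is gone.
        from djl_python.rolling_batch.lmi_dist_rolling_batch import LmiDistRollingBatch
        return LmiDistRollingBatch
    elif rolling_batch_type == "vllm":
        from djl_python.rolling_batch.vllm_rolling_batch import VLLMRollingBatch
        return VLLMRollingBatch
    raise ValueError(f"Unsupported rolling batch type: {rolling_batch_type}")
```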
@@ -10,75 +10,44 @@
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import logging
import os
from enum import Enum
from typing import Optional

import torch
from pydantic.v1.class_validators import root_validator
from pydantic.v1.class_validators import validator, root_validator

from djl_python.properties_manager.hf_properties import get_torch_dtype_from_str
from djl_python.properties_manager.properties import Properties


class LmiDistQuantizeMethods(str, Enum):
# added for backward compatibility lmi-dist
bitsandbytes = 'bitsandbytes'
bitsandbytes8 = 'bitsandbytes8'
gptq = 'gptq'
awq = 'awq'
gptq = 'gptq'
squeezellm = 'squeezellm'


class LmiDistRbProperties(Properties):
engine: Optional[str] = None
dtype: Optional[str] = "auto"
load_format: Optional[str] = "auto"
quantize: Optional[LmiDistQuantizeMethods] = None
tensor_parallel_degree: Optional[int] = 1
max_rolling_batch_prefill_tokens: Optional[int] = 4096
device: Optional[int] = None
dtype: Optional[str] = None
torch_dtype: Optional[torch.dtype] = None

@root_validator(skip_on_failure=True)
def validate_mpi_mode(cls, properties):
if not properties.get("is_mpi") and int(
properties.get("tensor_parallel_degree", "1")) != 1:
raise ValueError(f"Need mpi_mode to start lmi-dist RollingBatcher."
f"Try with engine=MPI in your serving.properties")
return properties

@root_validator(skip_on_failure=True)
def validate_quantize(cls, properties):
if properties.get('quantize') is None:
if properties.get('dtype') == "int8":
properties['quantize'] = LmiDistQuantizeMethods.bitsandbytes
else:
# parsing bitsandbytes8, so it can be directly passed to lmi dist model loader.
if properties.get(
'quantize') == LmiDistQuantizeMethods.bitsandbytes8:
properties['quantize'] = LmiDistQuantizeMethods.bitsandbytes
if properties.get('dtype') is not None:
raise ValueError(
f"Can't set both dtype: {properties['dtype']} and quantize: {properties['quantize']}"
)
return properties

@root_validator(skip_on_failure=True)
def set_device(cls, properties):
if properties.get('is_mpi'):
properties['device'] = int(os.getenv("LOCAL_RANK", 0))
return properties

@root_validator(skip_on_failure=True)
def construct_dtype(cls, properties):
if properties.get('dtype'):
properties["torch_dtype"] = get_torch_dtype_from_str(
properties['dtype'].lower())
elif properties.get('data_type'):
logging.warning('option.data_type is deprecated.'
'Please use option.dtype')
properties["torch_dtype"] = get_torch_dtype_from_str(
properties['data_type'].lower())
else:
properties['torch_dtype'] = torch.float16
return properties
tensor_parallel_degree: Optional[int] = None
max_rolling_batch_prefill_tokens: Optional[int] = None
# Adjustable prefix model length for certain 32k or longer model
max_model_len: Optional[int] = None
# TODO: change Enforce eager to False once SageMaker driver issue resolved
enforce_eager: Optional[bool] = False
# TODO: this default may change with different vLLM versions
# TODO: try to get good default from vLLM to prevent revisiting
# TODO: last time check: vllm 0.3.1
gpu_memory_utilization: Optional[float] = 0.9
# TODO: speculative decoding changes
speculative_draft_model: Optional[str] = None
speculative_length: int = 5
draft_model_tp_size: int = 1
record_acceptance_rate: Optional[bool] = False

@validator('engine')
def validate_engine(cls, engine):
if engine != "MPI":
raise AssertionError(
f"Need MPI engine to start lmidist_v2 RollingBatcher")
return engine
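The rewritten properties class drops the old dtype/quantize root validators in favour of vLLM-style fields (including the speculative-decoding options removed from `VllmRbProperties` further below) plus a single field validator that requires the MPI engine. A minimal, self-contained sketch of how that validator behaves, using the pydantic v1 API as in the diff; the model below is a stand-in with a subset of fields, not the real `LmiDistRbProperties`:

```python
# Stand-in model illustrating the engine check added in this PR. Field names
# mirror the diff, but this is not the actual LmiDistRbProperties class.
from typing import Optional
from pydantic.v1 import BaseModel, validator  # pydantic>=2 ships this v1 compatibility API

class LmiDistProps(BaseModel):
    engine: Optional[str] = None
    quantize: Optional[str] = None
    tensor_parallel_degree: Optional[int] = None
    max_rolling_batch_prefill_tokens: Optional[int] = None
    speculative_draft_model: Optional[str] = None
    speculative_length: int = 5
    draft_model_tp_size: int = 1

    @validator('engine')
    def validate_engine(cls, engine):
        if engine != "MPI":
            raise AssertionError(
                "Need MPI engine to start lmidist_v2 RollingBatcher")
        return engine

# engine=MPI (what serving.properties sets for lmi-dist) passes validation:
props = LmiDistProps(engine="MPI", speculative_draft_model="TinyLlama-1.1B")  # draft model name is illustrative
# Any other engine raises a ValidationError wrapping the AssertionError:
# LmiDistProps(engine="Python")
```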

This file was deleted.

@@ -38,11 +38,6 @@ class VllmRbProperties(Properties):
# TODO: try to get good default from vLLM to prevent revisiting
# TODO: last time check: vllm 0.3.1
gpu_memory_utilization: Optional[float] = 0.9
# TODO: speculative decoding changes
speculative_draft_model: Optional[str] = None
speculative_length: int = 5
draft_model_tp_size: int = 1
record_acceptance_rate: Optional[bool] = False
enable_lora: Optional[bool] = False
max_loras: Optional[int] = 4
max_lora_rank: Optional[int] = 16