Merge branch 'main' into binbin/router
ChenBinbin1996 committed Jan 3, 2025
2 parents ec2cff9 + b479e56 commit 22e0fa6
Showing 77 changed files with 2,287 additions and 652 deletions.
11 changes: 6 additions & 5 deletions .github/workflows/installation-tests.yml
@@ -31,7 +31,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
image: [controller-manager, plugins, runtime, users]
image: [controller-manager, gateway-plugins, runtime, metadata-service]
steps:
- name: Check out code
uses: actions/checkout@v4
@@ -84,7 +84,7 @@ jobs:
- name: Load image into Kind
run: |
for image in controller-manager plugins runtime users; do
for image in controller-manager gateway-plugins runtime metadata-service; do
docker load < ${image}-image/${image}.tar
# Retag the image
@@ -99,9 +99,10 @@
- name: Deploy controller with the built image
run: |
kubectl create -k config/dependency
cd config/manager && kustomize edit set image controller=aibrix/controller-manager:${{ github.sha }}
cd ${{ github.workspace }}
cd config/gateway && kustomize edit set image plugins=aibrix/plugins:${{ github.sha }} && kustomize edit set image users=aibrix/users:${{ github.sha }}
cd config/default
kustomize edit set image controller=aibrix/controller-manager:${{ github.sha }}
kustomize edit set image gateway-plugins=aibrix/gateway-plugins:${{ github.sha }}
kustomize edit set image metadata-service=aibrix/metadata-service:${{ github.sha }}
cd ${{ github.workspace }}
kubectl create -k config/default
21 changes: 11 additions & 10 deletions .github/workflows/release-build.yaml
@@ -81,13 +81,12 @@ jobs:
run: |
cd python/aibrix
poetry build
mkdir -p $GITHUB_WORKSPACE/artifacts
cp dist/* $GITHUB_WORKSPACE/artifacts/
ls -al dist/*
- name: Upload release artifacts
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: aibrix-python-packages
path: $GITHUB_WORKSPACE/artifacts/
path: python/aibrix/dist/*.whl

artifact-release:
runs-on: ubuntu-latest
@@ -110,13 +109,13 @@
# Upload the Kustomize YAML as a release artifact
- name: Upload Kustomize YAML
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: aibrix-dependency-${{ github.ref_name }}.yaml
path: aibrix-dependency-${{ github.ref_name }}.yaml

- name: Upload Kustomize YAML
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: aibrix-core-${{ github.ref_name }}.yaml
path: aibrix-core-${{ github.ref_name }}.yaml
@@ -132,17 +131,17 @@

# Download the Kustomize artifact from the previous job
- name: Download Kustomize YAML
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: aibrix-dependency-${{ github.ref_name }}.yaml

- name: Download Kustomize YAML
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: aibrix-core-${{ github.ref_name }}.yaml

- name: Download PYTHON wheels
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: aibrix-python-packages

@@ -172,4 +171,6 @@ jobs:
prerelease: ${{ env.prerelease }}
files: |
aibrix-dependency-${{ github.ref_name }}.yaml
aibrix-core-${{ github.ref_name }}.yaml
aibrix-core-${{ github.ref_name }}.yaml
aibrix-*.whl
aibrix-python-packages/aibrix-*.whl
38 changes: 23 additions & 15 deletions Makefile
@@ -4,7 +4,7 @@ GIT_COMMIT_HASH ?= $(shell git rev-parse HEAD)
# Image URL to use all building/pushing image targets
AIBRIX_CONTAINER_REGISTRY_NAMESPACE ?= aibrix
DOCKERFILE_PATH ?= build/container
IMAGES := controller-manager plugins runtime users
IMAGES := controller-manager gateway-plugins runtime metadata-service

# note: this is not being used, only for tracking some commands we have not updated yet.
IMG ?= ${AIBRIX_CONTAINER_REGISTRY_NAMESPACE}/controller-manager:${GIT_COMMIT_HASH}
@@ -137,42 +137,42 @@ define push_image
endef

.PHONY: docker-build-all
docker-build-all: docker-build-controller-manager docker-build-plugins docker-build-runtime docker-build-users ## Build all docker images
docker-build-all: docker-build-controller-manager docker-build-gateway-plugins docker-build-runtime docker-build-metadata-service ## Build all docker images

.PHONY: docker-build-controller-manager
docker-build-controller-manager: ## Build docker image with the manager.
$(call build_and_tag,controller-manager,Dockerfile)

.PHONY: docker-build-plugins
docker-build-plugins: ## Build docker image with the plugins.
$(call build_and_tag,plugins,Dockerfile.gateway)
.PHONY: docker-build-gateway-plugins
docker-build-gateway-plugins: ## Build docker image with the gateway plugins.
$(call build_and_tag,gateway-plugins,Dockerfile.gateway)

.PHONY: docker-build-runtime
docker-build-runtime: ## Build docker image with the AI Runtime.
$(call build_and_tag,runtime,Dockerfile.runtime)

.PHONY: docker-build-users
docker-build-users: ## Build docker image with the users.
$(call build_and_tag,users,Dockerfile.users)
.PHONY: docker-build-metadata-service
docker-build-metadata-service: ## Build docker image with the metadata-service.
$(call build_and_tag,metadata-service,Dockerfile.metadata)

.PHONY: docker-push-all
docker-push-all: docker-push-controller-manager docker-push-plugins docker-push-runtime docker-push-users ## Push all docker images
docker-push-all: docker-push-controller-manager docker-push-gateway-plugins docker-push-runtime docker-push-metadata-service ## Push all docker images

.PHONY: docker-push-controller-manager
docker-push-controller-manager: ## Push docker image with the manager.
$(call push_image,controller-manager)

.PHONY: docker-push-plugins
docker-push-plugins: ## Push docker image with the plugins.
$(call push_image,plugins)
.PHONY: docker-push-gateway-plugins
docker-push-gateway-plugins: ## Push docker image with the gateway plugins.
$(call push_image,gateway-plugins)

.PHONY: docker-push-runtime
docker-push-runtime: ## Push docker image with the AI Runtime.
$(call push_image,runtime)

.PHONY: docker-push-users
docker-push-users: ## Push docker image with the users.
$(call push_image,users)
.PHONY: docker-push-metadata-service
docker-push-metadata-service: ## Push docker image with the metadata-service.
$(call push_image,metadata-service)

# PLATFORMS defines the target platforms for the manager image be built to provide support to multiple
# architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to:
@@ -245,6 +245,14 @@ deploy-vke: manifests kustomize ## Deploy controller to the K8s cluster specifie
undeploy-vke: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
$(KUSTOMIZE) build config/overlays/vke/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -

.PHONY: deploy-vke-ipv4
deploy-vke-ipv4: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
$(KUSTOMIZE) build config/overlays/vke-ipv4/default | $(KUBECTL) create -f -

.PHONY: undeploy-vke-ipv4
undeploy-vke-ipv4: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
$(KUSTOMIZE) build config/overlays/vke-ipv4/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -

##@ Dependencies

## Location to install dependencies to
8 changes: 4 additions & 4 deletions benchmarks/autoscaling/bench_workload_generator.py
@@ -120,15 +120,15 @@ def plot_workload(workload_dict, interval_sec, output_path: str = None):
interval = 30
# Generate workloads with different parameters
workload_dict = {
'Quick Rising':
'quick_rising':
generate_workload(demo_requests, duration_sec=600, interval_sec=interval, A=5, period=5, only_rise=True),
'Slow Rising':
'slow_rising':
generate_workload(demo_requests, duration_sec=600, interval_sec=interval, A=5, period=0.25,
only_rise=True),
'Slight Fluctuation':
'slight_fluctuation':
generate_workload(demo_requests, duration_sec=600, interval_sec=interval, A=5, B=5, period=1,
only_rise=False),
'Severe Fluctuation':
'severe_fluctuation':
generate_workload(demo_requests, duration_sec=600, interval_sec=interval, A=5, B=10, period=12,
only_rise=False),
}
2 changes: 2 additions & 0 deletions benchmarks/generator/.gitignore
@@ -0,0 +1,2 @@
output
plot
69 changes: 44 additions & 25 deletions benchmarks/generator/README.md
@@ -1,76 +1,95 @@
# Using Workload Generator

## Generate workload file
### Generate a workload file based on workload patterns (synthetic patterns)
If no trace file path is specified, the generator will generate workload file based on 4 synthetic pattern described [here](https://github.com/aibrix/aibrix/blob/main/benchmarks/autoscaling/bench_workload_generator.py):

### Prerequisite

```shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -O /tmp/ShareGPT_V3_unfiltered_cleaned_split.json
export SHAREGPT_FILE_PATH=/tmp/ShareGPT_V3_unfiltered_cleaned_split.json
```
export SHARE_GPT_PATH=${PATH_TO_SHARE_GPT_FILE}
python workload_generator.py --prompt-file $SHARE_GPT_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 600000 --trace-type synthetic --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output "output"
```
Here ```--interval-ms``` specifies the granularity of concurently dispatched requests (in milliseconds). ```--duration-ms``` specifies the total length of the trace in milliseconds.

The file would be stored under ```output``` folder based on the name of different patterns. And the plot illustrates the workload pattern will be under the ```plot``` directory.
### Generate a workload file based on workload patterns (synthetic patterns)

If no trace file path is specified, the generator will generate a workload file based on the 4 synthetic patterns described [here](https://github.com/aibrix/aibrix/blob/main/benchmarks/autoscaling/bench_workload_generator.py):
```shell
python workload_generator.py --prompt-file $SHAREGPT_FILE_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 600000 --trace-type synthetic --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output-dir "output"
```
Here `--interval-ms` specifies the granularity of concurrently dispatched requests (in milliseconds). `--duration-ms` specifies the total length of the trace in milliseconds.
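
For intuition, the sketch below (an illustration only, not the generator's actual code) shows how these two flags together determine the number of dispatch points in the generated trace:

```python
# Rough illustration (assumption, not the generator's code): --interval-ms and
# --duration-ms jointly determine how many dispatch points the trace contains.
interval_ms = 1000      # one batch of requests per second
duration_ms = 600_000   # 10 minutes in total

timestamps = list(range(0, duration_ms, interval_ms))
print(len(timestamps), timestamps[:3])  # 600 dispatch points: 0, 1000, 2000, ...
```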

The generated files are stored under the `output` folder, named after the different patterns; a plot illustrating each workload pattern is saved under the `plot` directory.

## Generate a workload file based on internal load summary .csv file
```

```shell
export SUMMARY_FILE=${PATH_TO_SUMMARY_FILE}
export SHARE_GPT_PATH=${PATH_TO_SHARE_GPT_FILE}
python workload_generator.py --prompt-file $SHARE_GPT_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 3600000 --trace-type internal --trace-file "$SUMMARY_FILE" --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output "output"
python workload_generator.py --prompt-file $SHAREGPT_FILE_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 600000 --trace-type internal --traffic-file "$SUMMARY_FILE" --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output-dir "output"
```

This generator assumes the trace file to be in the following format:
```
"Time","Total","Success","4xx Error"
2024-10-1 00:00:00,100,99,1
```
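
For reference, a minimal sketch of reading a summary file in this format (an assumption about usage, not the generator's own parser):

```python
# Minimal sketch (assumption): parse an internal load summary in the format above
# into (timestamp, total, success, 4xx) tuples.
import csv
from datetime import datetime

def read_summary(path):
    points = []
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            ts = datetime.strptime(row["Time"], "%Y-%m-%d %H:%M:%S")
            points.append((ts, int(row["Total"]), int(row["Success"]), int(row["4xx Error"])))
    return points
```
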
### Indicate the length of prompt/completion
In this case, you can also specify the request prompt length via the `--prompt-len-file` option, or the output length via `--completion-len-file`. Based on these parameters, the generator selects prompts of the appropriate length from the prompt file to simulate the length distribution of the real traffic.

This generator generate workload file (in .json format) under ```output``` folder. The file would look like the following:
The file should follow the header format below and contain the **exact same number of rows** as the traffic file:
```
P50,P70,P99
2000,4000,10000
...
2000,4000,10000
```
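
A minimal sketch of reading such a percentile file, where row *i* is meant to line up with row *i* of the traffic file (an illustration under that assumption, not the generator's own code):

```python
# Sketch (assumption): read per-row prompt-length percentiles,
# e.g. {"P50": 2000, "P70": 4000, "P99": 10000} for each traffic row.
import csv

def read_length_percentiles(path):
    with open(path, newline="") as f:
        return [{k: int(v) for k, v in row.items()} for row in csv.DictReader(f)]
```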

This generator generates a workload file (in .json format) under the `output` folder. The file looks like the following:
```
[
[["Prompt1", prompt_len_1, output_len_1, null],["Prompt2", prompt_len_2, output_len_2, null], ...],
[["Prompt3", prompt_len_3, output_len_3, null],["Prompt4", prompt_len_4, output_len_4, null], ...],
...
]
```
And the plot illustrates the workload pattern will be under the ```plot``` directory.

A plot illustrating the workload pattern is saved under the `plot` directory.
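
A minimal sketch of walking through a file with the nested-list layout shown above (an illustration only; note that `client.py` in this repository expects keyed entries such as `timestamp`, `requests`, and `prompt`, so verify which layout your generator version emits):

```python
# Sketch (assumption): iterate over a workload file with the nested-list layout above.
import json

with open("output/quick_rising.json") as f:  # hypothetical file name
    workload = json.load(f)

for batch in workload:
    for prompt, prompt_len, output_len, _ in batch:
        print(prompt_len, output_len)
```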


## Generate a workload file based on Azure LLM Trace

To produce a workload based on [Azure LLM Trace](https://github.com/Azure/AzurePublicDataset/tree/master/data), use the following commands:

```
export AZURE_TRACE_NAME=${PATH_TO_AZURE_TRACE_NAME}
export SHARE_GPT_PATH=${PATH_TO_SHARE_GPT_FILE}
python workload_generator.py --prompt-file $SHARE_GPT_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 3600000 --trace-type azure --trace-file "$AZURE_TRACE_NAME" --group-interval-seconds 1 --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output "output"
wget https://raw.githubusercontent.com/Azure/AzurePublicDataset/refs/heads/master/data/AzureLLMInferenceTrace_conv.csv -O /tmp/AzureLLMInferenceTrace_conv.csv
export AZURE_TRACE_NAME=/tmp/AzureLLMInferenceTrace_conv.csv
python workload_generator.py --prompt-file $SHAREGPT_FILE_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 600000 --trace-type azure --trace-file "$AZURE_TRACE_NAME" --group-interval-seconds 1 --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output-dir "output"
```

Note that the trace file contains both input and output lengths. And therefore dataset in ```$SHARE_GPT_PATH``` needs to be tokenized to be able to sampled based on their input/output token lengths. Therefore it is required to specify tokenizer to generate based on this trace. Use ```--group-interval-seconds``` to specify grouping interval from the origianl trace. The file would be stored under ```output``` folder and the plot illustrates the workload pattern will be under the ```plot``` directory.

Note that the trace file contains both input and output lengths, so the dataset in `$SHAREGPT_FILE_PATH` needs to be tokenized so that prompts can be sampled by their input/output token lengths; a tokenizer therefore has to be specified when generating from this trace. Use `--group-interval-seconds` to specify the grouping interval applied to the original trace. The file is stored under the `output` folder, and a plot illustrating the workload pattern is saved under the `plot` directory.
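
For illustration, a minimal sketch of computing prompt token lengths with a Hugging Face tokenizer so that ShareGPT prompts can be matched against the trace's token counts (an assumption about the workflow, not the generator's own code):

```python
# Sketch (assumption): measure prompt token lengths for sampling against the
# input/output token counts provided by the Azure trace.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")

def token_length(text: str) -> int:
    return len(tokenizer.encode(text, add_special_tokens=False))

print(token_length("Write a quicksort function in Python."))
```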

## Run Workload Generator

Starting vllm server:

```
```shell
python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 \
--port "8000" --model /root/models/deepseek-coder-6.7b-instruct \
--trust-remote-code --max-model-len "14304" \
--port "8000" \
--model /root/models/deepseek-coder-6.7b-instruct \
--trust-remote-code \
--max-model-len "14304" \
--api-key sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi \
--enable-chunked-prefill
```

Using a sample workload in a client:
```
python3 client.py --workload-path "output/Quick Rising.jsonl" \

```shell
python3 client.py \
--workload-path "output/quick_rising.jsonl" \
--endpoint "http://localhost:8000" \
--model /root/models/deepseek-coder-6.7b-instruct \
--api-key sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi \
--output-file-path output.jsonl
```

The output will be stored as a ```.jsonl``` file in ```output.jsonl```
The output will be stored as a `.jsonl` file in `output.jsonl`.
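
To get a quick latency summary from the results, a minimal sketch is shown below; it assumes each line of `output.jsonl` is a JSON object containing a numeric `latency` field, which may not match the exact fields `client.py` writes:

```python
# Sketch with assumptions: summarize per-request latencies from output.jsonl,
# assuming a "latency" field per line (verify against client.py's actual output).
import json
import statistics

latencies = []
with open("output.jsonl") as f:
    for line in f:
        if line.strip():
            latencies.append(float(json.loads(line)["latency"]))

if latencies:
    print(f"n={len(latencies)}  mean={statistics.mean(latencies):.3f}s  "
          f"p50={statistics.median(latencies):.3f}s  max={max(latencies):.3f}s")
```
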
28 changes: 15 additions & 13 deletions benchmarks/generator/client.py
@@ -7,15 +7,16 @@

from utils import (load_workload, wrap_prompt_as_chat_message)


# Asynchronous request handler
async def send_request(client, model, endpoint, prompt, output_file):
start_time = asyncio.get_event_loop().time()
try:
response = await client.chat.completions.create(
model = model,
messages = prompt,
temperature = 0,
max_tokens = 128
model=model,
messages=prompt,
temperature=0,
max_tokens=128
)

latency = asyncio.get_event_loop().time() - start_time
@@ -45,43 +46,44 @@ async def send_request(client, model, endpoint, prompt, output_file):
logging.error(f"Error sending request to at {endpoint}: {str(e)}")
return None


async def benchmark(endpoint, model, api_key, workload_path, output_file_path):
client = openai.AsyncOpenAI(
api_key=api_key,
base_url=endpoint+"/v1",
base_url=endpoint + "/v1",
)
with open(output_file_path, 'a', encoding='utf-8') as output_file:
load_struct = load_workload(workload_path)
batch_tasks = []
base_time = time.time()
num_requests = 0
for requests_dict in load_struct:
ts = int(requests_dict["Timestamp"])
requests = requests_dict["Requests"]
ts = int(requests_dict["timestamp"])
requests = requests_dict["requests"]
cur_time = time.time()
target_time = base_time + ts/1000.0
target_time = base_time + ts / 1000.0
logging.warning(f"Prepare to launch {len(requests)} tasks after {target_time - cur_time}")
if target_time > cur_time:
await asyncio.sleep(target_time - cur_time)
formatted_prompts = [wrap_prompt_as_chat_message(request["Prompt"]) for request in requests]
formatted_prompts = [wrap_prompt_as_chat_message(request["prompt"]) for request in requests]
for formatted_prompt in formatted_prompts:
task = asyncio.create_task(
send_request(client, model, endpoint, formatted_prompt, output_file)
send_request(client, model, endpoint, formatted_prompt, output_file)
)
batch_tasks.append(task)
num_requests += len(requests)
await asyncio.gather(*batch_tasks)
logging.warning(f"All {num_requests} requests completed for deployment.")


def main(args):
logging.info(f"Starting benchmark on endpoint {args.endpoint}")
start_time = time.time()
asyncio.run(benchmark(args.endpoint, args.model, args.api_key, args.workload_path, args.output_file_path))
end_time = time.time()
logging.info(f"Benchmark completed in {end_time - start_time:.2f} seconds")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Workload Generator')
parser.add_argument("--workload-path", type=str, default=None, help="File path to the workload file.")