# Using Workload Generator

## Generate workload file

### Prerequisite

```shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -O /tmp/ShareGPT_V3_unfiltered_cleaned_split.json
export SHAREGPT_FILE_PATH=/tmp/ShareGPT_V3_unfiltered_cleaned_split.json
```
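
Optionally, sanity-check the download before generating workloads. This is a minimal sketch assuming the standard ShareGPT_V3 schema (a JSON list of records, each carrying a `conversations` list of `{"from", "value"}` turns):

```python
# Sanity check: load the file and print the first two turns of the first record.
import json

with open("/tmp/ShareGPT_V3_unfiltered_cleaned_split.json") as f:
    data = json.load(f)

print(f"{len(data)} records loaded")
for turn in data[0]["conversations"][:2]:
    # Assumed schema: each turn has "from" ("human"/"gpt") and "value" (text).
    print(turn["from"], "->", turn["value"][:80])
```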

### Generate a workload file based on workload patterns (synthetic patterns)

If no trace file path is specified, the generator produces workload files based on the four synthetic patterns described [here](https://github.com/aibrix/aibrix/blob/main/benchmarks/autoscaling/bench_workload_generator.py):

```shell
python workload_generator.py --prompt-file $SHAREGPT_FILE_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 600000 --trace-type synthetic --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output-dir "output"
```

Here `--interval-ms` specifies the granularity of concurrently dispatched requests (in milliseconds), and `--duration-ms` specifies the total length of the trace in milliseconds.

The files are stored under the `output` folder, named after each pattern, and plots illustrating the workload patterns are written to the `plot` directory.
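
The actual pattern definitions live in the script linked above. As a rough illustration, a "quick rising" load ramps the per-interval request count up sharply and then plateaus; the function below is an illustrative sketch, not the generator's code:

```python
import math

def quick_rising(num_intervals: int, peak: int) -> list[int]:
    """Illustrative sketch only: per-interval request counts that ramp up
    quickly (within roughly the first fifth of the trace) and then plateau."""
    return [round(peak * (1 - math.exp(-10 * i / num_intervals)))
            for i in range(num_intervals)]

print(quick_rising(num_intervals=20, peak=8))  # [0, 3, 5, 6, 7, 7, 8, 8, ...]
```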

## Generate a workload file based on an internal load summary .csv file

```shell
export SUMMARY_FILE=${PATH_TO_SUMMARY_FILE}
python workload_generator.py --prompt-file $SHAREGPT_FILE_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 600000 --trace-type internal --traffic-file "$SUMMARY_FILE" --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output-dir "output"
```

This generator assumes the trace file to be in the following format:

```
"Time","Total","Success","4xx Error"
2024-10-1 00:00:00,100,99,1
```

### Indicate the length of prompt/completion

In this case, you can also set the request's prompt length with the `--prompt-len-file` option, or the output length with `--completion-len-file`. Based on these parameters, the generator selects prompts of the proper length from the prompt file to simulate the lengths of the real flow's load.

The length file should follow the table-header format below and have the **exact same number of rows** as the traffic file:

```
P50,P70,P99
2000,4000,10000
...
2000,4000,10000
```
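
The generator's own selection logic lives in `workload_generator.py`; conceptually, length-aware sampling draws a target length from a row's percentile values and picks the candidate prompt whose token length is closest to it. An illustrative sketch, with all names hypothetical:

```python
import bisect
import random

def pick_prompt_by_length(prompts: list[tuple[str, int]], target_len: int) -> str:
    """prompts: (text, token_length) pairs sorted by token_length."""
    lengths = [length for _, length in prompts]
    i = bisect.bisect_left(lengths, target_len)
    candidates = [j for j in (i - 1, i) if 0 <= j < len(prompts)]
    # Pick whichever neighbor is closest to the target length.
    best = min(candidates, key=lambda j: abs(lengths[j] - target_len))
    return prompts[best][0]

pool = sorted([("short", 120), ("medium", 2000), ("long", 9500)], key=lambda p: p[1])
row = {"P50": 2000, "P70": 4000, "P99": 10000}  # one row from the length file
print(pick_prompt_by_length(pool, target_len=row[random.choice(["P50", "P70", "P99"])]))
```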

This generator writes the workload file (in .json format) under the `output` folder. The file looks like the following:

```
[
  [["Prompt1", prompt_len_1, output_len_1, null],["Prompt2", prompt_len_2, output_len_2, null], ...],
  [["Prompt3", prompt_len_3, output_len_3, null],["Prompt4", prompt_len_4, output_len_4, null], ...],
  ...
]
```

A plot illustrating the workload pattern is written to the `plot` directory.
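
A file in this shape is straightforward to consume: each top-level entry is one dispatch interval holding the batch of requests sent concurrently. A minimal reader sketch (the file name `output/internal.json` is a placeholder; use whatever name the generator produced):

```python
# Replay-loop sketch: iterate over intervals, then over each interval's batch.
import json

with open("output/internal.json") as f:  # hypothetical file name
    workload = json.load(f)

for interval_idx, batch in enumerate(workload):
    for prompt, prompt_len, output_len, _ in batch:
        print(f"interval {interval_idx}: {prompt_len} prompt tokens, "
              f"{output_len} expected output tokens")
```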

## Generate a workload file based on Azure LLM Trace

To produce a workload based on the [Azure LLM Trace](https://github.com/Azure/AzurePublicDataset/tree/master/data), use the following commands:

```shell
wget https://raw.githubusercontent.com/Azure/AzurePublicDataset/refs/heads/master/data/AzureLLMInferenceTrace_conv.csv -O /tmp/AzureLLMInferenceTrace_conv.csv
export AZURE_TRACE_NAME=/tmp/AzureLLMInferenceTrace_conv.csv
python workload_generator.py --prompt-file $SHAREGPT_FILE_PATH --num-prompts 100 --interval-ms 1000 --duration-ms 600000 --trace-type azure --trace-file "$AZURE_TRACE_NAME" --group-interval-seconds 1 --model "Qwen/Qwen2.5-Coder-7B-Instruct" --output-dir "output"
```

Note that the trace file contains both input and output lengths. The dataset in `$SHAREGPT_FILE_PATH` therefore needs to be tokenized so that prompts can be sampled based on their input/output token lengths, which is why a tokenizer must be specified when generating from this trace. Use `--group-interval-seconds` to set the grouping interval applied to the original trace. The workload file is stored under the `output` folder, and a plot illustrating the workload pattern is written to the `plot` directory.
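
If you want to see the request rate the generator will reproduce, you can bucket the trace yourself. The sketch below assumes the published column names (`TIMESTAMP`, `ContextTokens`, `GeneratedTokens`); verify them against your copy of the trace:

```python
# Group the Azure trace into 1-second buckets to inspect the request rate
# and the input/output token length distributions.
import pandas as pd

df = pd.read_csv("/tmp/AzureLLMInferenceTrace_conv.csv", parse_dates=["TIMESTAMP"])
per_second = df.set_index("TIMESTAMP").resample("1s").size()
print(per_second.describe())  # min/mean/max requests per second
print(df[["ContextTokens", "GeneratedTokens"]].describe())
```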

## Run Workload Generator

Start the vLLM server:

```shell
python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 \
    --port "8000" \
    --model /root/models/deepseek-coder-6.7b-instruct \
    --trust-remote-code \
    --max-model-len "14304" \
    --api-key sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi \
    --enable-chunked-prefill
```
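
Before replaying the workload, it is worth confirming the server is reachable. A minimal check against vLLM's OpenAI-compatible `/v1/models` endpoint, reusing the API key from the command above:

```python
# Ping /v1/models to confirm the server is up and serving the expected model.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000/v1/models",
    headers={"Authorization": "Bearer sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["data"][0]["id"])  # should print the served model path
```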

Run the client with a sample workload:

```shell
python3 client.py \
    --workload-path "output/quick_rising.jsonl" \
    --endpoint "http://localhost:8000" \
    --model /root/models/deepseek-coder-6.7b-instruct \
    --api-key sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi \
    --output-file-path output.jsonl
```

The output will be stored as a `.jsonl` file in `output.jsonl`.
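
The exact fields in `output.jsonl` depend on `client.py`, so the sketch below stays generic: it assumes only one JSON object per line and reports the record count and field names:

```python
# Generic pass over the client's JSONL output.
import json

with open("output.jsonl") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} responses recorded")
if records:
    print("fields:", sorted(records[0].keys()))
```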