diff --git a/core/testcasecontroller/algorithm/paradigm/joint_inference/joint_inference.py b/core/testcasecontroller/algorithm/paradigm/joint_inference/joint_inference.py index 5681b449..0a95b102 100644 --- a/core/testcasecontroller/algorithm/paradigm/joint_inference/joint_inference.py +++ b/core/testcasecontroller/algorithm/paradigm/joint_inference/joint_inference.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KubeEdge Authors. +# Copyright 2024 The KubeEdge Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ """Cloud-Edge Joint Inference""" -# Ianvs imports import os from tqdm import tqdm @@ -43,8 +42,8 @@ class JointInference(ParadigmBase): workspace: string the output required for multi-edge inference paradigm. kwargs: dict - config required for the test process of multi-edge inference paradigm, - e.g.: algorithm modules, dataset, initial model, etc. + config required for the test process of joint inference paradigm, + e.g.: hard_example_mining_mode """ @@ -58,12 +57,15 @@ def __init__(self, workspace, **kwargs): ) def set_config(self): - """Configure output_dir, dataset, modules + """ Set the configuration for the joint inference paradigm. - Raises: - KeyError: Required Modules are not fully loaded. + Raises + ------ + KeyError + If required modules are not provided. """ + inference_output_dir = os.path.dirname(self.workspace) os.environ["RESULT_SAVED_URL"] = inference_output_dir os.makedirs(inference_output_dir, exist_ok=True) @@ -98,12 +100,12 @@ def set_config(self): def run(self): """ - run the test flow of multi-edge inference paradigm. + run the test flow of joint inference paradigm. Returns ------ - test result: numpy.ndarray - system metric info: dict + inference_result: list + system_metric_info: dict information needed to compute system metrics. """ @@ -118,14 +120,22 @@ def run(self): return inference_result, self.system_metric_info def _cleanup(self, job): + """Call module's cleanup method to release resources + + Parameters + ---------- + job : Sedna JointInference + Sedna JointInference API + """ + LOGGER.info("Release models") # release module resources for module in self.module_instances.values(): if hasattr(module, "cleanup"): module.cleanup() - # Since the hard example mining module is instantiated within the job, - # special call is required. + # Special call is required for hard example mining module + # since it is instantiated within the job. 
mining_instance = job.hard_example_mining_algorithm if hasattr(mining_instance, "cleanup"): mining_instance.cleanup() @@ -133,6 +143,18 @@ def _cleanup(self, job): del job def _inference(self, job): + """Inference each data in Inference Dataset + + Parameters + ---------- + job : Sedna JointInference + Sedna JointInference API + + Returns + ------- + tuple + Inference Result with the format of `(is_hard_example, res, edge_result, cloud_result)` + """ results = [] cloud_count, edge_count = 0,0 @@ -163,4 +185,4 @@ def _inference(self, job): LOGGER.info("Inference Finished") - return results # (is_hard_example, res, edge_result, cloud_result) + return results diff --git a/examples/cloud-edge-collaborative-inference-for-llm/README.md b/examples/cloud-edge-collaborative-inference-for-llm/README.md index 44c8fa62..85705406 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/README.md +++ b/examples/cloud-edge-collaborative-inference-for-llm/README.md @@ -76,15 +76,14 @@ conda activate ianvs-experiment git clone https://github.com/kubeedge/ianvs.git cd ianvs -# Install a modified sedna wheel (a small bug and dependencies was fixed) -wget https://github.com/FuryMartin/sedna/releases/download/v0.4.1.1/sedna-0.4.1.1-py3-none-any.whl -pip install sedna-0.4.1.1-py3-none-any.whl +# Install Sedna +pip install examples/resources/third_party/sedna-0.6.0.1-py3-none-any.whl # Install dependencies for this example. -pip install examples/cloud-edge-collaborative-inference-for-llm/requirements.txt +pip install -r examples/cloud-edge-collaborative-inference-for-llm/requirements.txt # Install dependencies for Ianvs Core. -pip install requirements.txt +pip install -r requirements.txt # Install ianvs python setup.py install @@ -94,14 +93,25 @@ python setup.py install ### Dataset Configuration -Note: The currently supported dataset includes MMLU, and you can also construct the dataset you need for testing according to the format requirements of the dataset. +1. Download `mmlu-5-shot` from [Ianvs-MMLU-5-shot](https://huggingface.co/datasets/FuryMartin/Ianvs-MMLU-5-shot), which is a transformed MMLU-5-shot dataset formatted to fit Ianvs's requirements. -You need to create a dataset folder in`ianvs/` in the following structure. +2. Create a `dataset` folder in the root directory of Ianvs and move `mmlu-5-shot` into the `dataset` folder. + +3. Then, check the path of `train_data` and `test_dat` in +`examples/cloud-edge-collaborative-inference-for-llm/testenv/testenv.yaml`. + + - If you created the `dataset` folder inside `ianvs/` as mentioned earlier, then the relative path is correct and does not need to be modified. + + - If your `dataset` is created in a different location, please use an absolute path, and using `~` to represent the home directory is not supported. + +#### Dataset Details + +If you want to construct your own dataset, please see the details below and follow the instruction. ``` . ├── dataset -│ └── mmlu +│ └── mmlu-5-shot │ ├── test_data │ │ ├── data.jsonl │ │ └── metadata.json @@ -135,12 +145,7 @@ Here is an example: } ``` -Then, check the path of `train_data` and `test_dat` in -`examples/cloud-edge-collaborative-inference-for-llm/testenv/testenv.yaml`. - -- If you created the `dataset` folder inside `ianvs/` as mentioned earlier, then the relative path is correct and does not need to be modified. -- If your `dataset` is created in a different location, please use an absolute path, and using `~` to represent the home directory is not supported. 
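If you prefer to script the download described above rather than fetching the files by hand, here is a minimal sketch using the Hugging Face Hub client; it assumes `huggingface_hub` is installed and that the dataset repo stores the data under an `mmlu-5-shot/` prefix, so double-check the resulting layout against the directory tree shown above.

```python
# Hypothetical helper: fetch the transformed MMLU-5-shot data into ianvs/dataset/.
# Verify the downloaded layout matches dataset/mmlu-5-shot/{test_data,train_data}.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="FuryMartin/Ianvs-MMLU-5-shot",
    repo_type="dataset",
    allow_patterns=["mmlu-5-shot/*"],  # assumption: data lives under this prefix
    local_dir="dataset",               # run from the ianvs/ root
)
```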
### Metric Configuration @@ -151,7 +156,7 @@ We have designed multiple metrics for edge-cloud collaborative inference, includ | Metric | Description | Unit | | :---------------------- | :------------------------------------------------------ | ------- | | Accuracy | Accuracy on the test dataset | - | -| Rate to Edge | proportion of queries router to edge | - | +| Edge Ratio | Proportion of queries routed to the edge | - | | Time to First Token | Time taken to generate the first token | s | | Internal Token Latency | Time taken to generate each token | s | | Throughput | Token generation speed | token/s | @@ -227,36 +232,62 @@ You can modify the `router` parameter in `test_queryrouting.yaml` to select the For BERT router, you can use [routellm/bert](https://huggingface.co/routellm/bert) or [routellm/bert_mmlu_augmented](https://huggingface.co/routellm/bert_mmlu_augmented) or your own BERT model. #### Data Processor Configuration -Data Processor 允许你在 `ianvs` 读取数据集后,自行实现需要的数据构造形式,如 few-shot、CoT 等复杂的 prompts 等。 +The Data Processor allows you to customize the data format after the dataset is loaded, e.g., to build complex prompts such as few-shot or CoT. Currently, supported data processors include: | Data Processor | Description | Parameters | | ------------ | ------------------------------------------------------------ | ---------------- | -| MultiShotGenertor | Few-shot query generator | shot-nums | +| OracleRouterDatasetProcessor | Expose `gold` label to OracleRouter | - | ## Step 3. Run Ianvs +### Provided Response Cache +The testing process may take a long time, depending on the number of test cases and the inference speed of the model. + +To let you obtain results directly, we provide a workspace folder with cached results of `Qwen/Qwen2.5-1.5B-Instruct`, `Qwen/Qwen2.5-3B-Instruct`, `Qwen/Qwen2.5-7B-Instruct` and `gpt-4o-mini`. + +You can download the `workspace-mmlu` folder from [Ianvs-MMLU-5-shot](https://huggingface.co/datasets/FuryMartin/Ianvs-MMLU-5-shot) and put it under your `ianvs` folder. + +### Run Joint Inference example + Run the following command: -`ianvs -f examples/llm/singletask_learning_bench/simple_qa/benchmarkingjob.yaml` +`ianvs -f examples/cloud-edge-collaborative-inference-for-llm/benchmarkingjob.yaml` + +After the process finishes, you will see output like this: -After the process finished, you will see output. +```bash +[2024-10-28 18:03:37,314] edge_model.py(43) [INFO] - {'model': 'Qwen/Qwen2.5-1.5B-Instruct', 'backend': 'vllm', 'temperature': 0, 'top_p': 0.8, 'max_tokens': 512, 'repetition_penalty': 1.05, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.9, 'use_cache': True} +[2024-10-28 18:03:37,314] cloud_model.py(34) [INFO] - {'model': 'gpt-4o-mini', 'temperature': 0, 'top_p': 0.8, 'max_tokens': 512, 'repetition_penalty': 1.05, 'use_cache': True} +[2024-10-28 18:03:37,850] joint_inference.py(73) [INFO] - Loading dataset +[2024-10-28 18:03:38,703] hard_sample_mining.py(30) [INFO] - USING EdgeOnlyFilter +[2024-10-28 18:03:38,704] joint_inference.py(162) [INFO] - Inference Start +100%|██████████████████████████████████| 14042/14042 [00:02<00:00, 6182.92it/s, Edge=14042, Cloud=0] +[2024-10-28 18:03:40,975] joint_inference.py(186) [INFO] - Inference Finished +[2024-10-28 18:03:40,976] joint_inference.py(131) [INFO] - Release models +``` + +### Results + +Changing the Router type to `EdgeOnly`, `CloudOnly`, `OracleRouter`, or another router will yield different results. + +The recommended testing order is `EdgeOnly`, `CloudOnly`, `OracleRouter`, `BERTRouter`, `RandomRouter`.
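For reference, the `Edge Ratio` column in the results below is simply the share of queries the router kept on the edge. A minimal sketch of that computation, assuming each result is the `(is_hard_example, res, edge_result, cloud_result)` tuple produced by the joint inference paradigm (hard examples are the ones escalated to the cloud), could look like this; it is an illustration, not the benchmark's own metric script:

```python
# Illustrative only: derive the share of edge-served queries from joint
# inference results shaped as (is_hard_example, res, edge_result, cloud_result).
def edge_ratio(results):
    routed_to_edge = sum(1 for is_hard_example, *_ in results if not is_hard_example)
    return round(routed_to_edge / len(results) * 100, 2)  # e.g. 100.0 for EdgeOnly
```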
By changing different models and Router parameters, you may see output like: ```bash -+------+---------------+----------+--------------+---------------------+------------+------------------------+---------------------+-------------------------+--------------------+------------------------+----------------+---------------------+----------------------------+-------------------+------------------+---------------------+-------------------------------------------------------------------------------------+ -| rank | algorithm | Accuracy | Rate to Edge | Time to First Token | Throughput | Internal Token Latency | Cloud Prompt Tokens | Cloud Completion Tokens | Edge Prompt Tokens | Edge Completion Tokens | paradigm | hard_example_mining | edgemodel-model | edgemodel-backend | cloudmodel-model | time | url | -+------+---------------+----------+--------------+---------------------+------------+------------------------+---------------------+-------------------------+--------------------+------------------------+----------------+---------------------+----------------------------+-------------------+------------------+---------------------+-------------------------------------------------------------------------------------+ -| 1 | query-routing | 83.48 | 88.32 | 0.362 | 139.53 | 0.007 | 1416860 | 11836 | 10987945 | 48533 | jointinference | OracleRouter | Qwen/Qwen2.5-7B-Instruct | vllm | gpt-4o-mini | 2024-10-17 15:52:21 | ./workspace-mmlu/benchmarkingjob/query-routing/9f85b598-8c5c-11ef-ad26-51366965e425 | -| 2 | query-routing | 82.64 | 76.89 | 0.277 | 338.51 | 0.003 | 2804317 | 15707 | 9547941 | 24060 | jointinference | OracleRouter | Qwen/Qwen2.5-1.5B-Instruct | vllm | gpt-4o-mini | 2024-10-17 15:51:51 | ./workspace-mmlu/benchmarkingjob/query-routing/9f85b596-8c5c-11ef-ad26-51366965e425 | -| 3 | query-routing | 82.1 | 81.78 | 0.313 | 248.38 | 0.004 | 2214701 | 11887 | 10161486 | 81147 | jointinference | OracleRouter | Qwen/Qwen2.5-3B-Instruct | vllm | gpt-4o-mini | 2024-10-17 15:52:06 | ./workspace-mmlu/benchmarkingjob/query-routing/9f85b597-8c5c-11ef-ad26-51366965e425 | -| 4 | query-routing | 76.43 | 0.0 | 0.782 | 1194.58 | 0.001 | 12017546 | 47583 | 0 | 0 | jointinference | CloudOnly | Qwen/Qwen2.5-1.5B-Instruct | vllm | gpt-4o-mini | 2024-10-17 15:50:39 | ./workspace-mmlu/benchmarkingjob/query-routing/747c4176-8c5c-11ef-ad26-51366965e425 | -| 5 | query-routing | 71.8 | 100.0 | 0.306 | 125.22 | 0.008 | 0 | 0 | 12456589 | 55634 | jointinference | EdgeOnly | Qwen/Qwen2.5-7B-Instruct | vllm | gpt-4o-mini | 2024-10-17 13:46:20 | ./workspace-mmlu/benchmarkingjob/query-routing/0ca33c20-8c4b-11ef-ad26-51366965e425 | -| 6 | query-routing | 63.89 | 100.0 | 0.209 | 210.62 | 0.005 | 0 | 0 | 12456589 | 103378 | jointinference | EdgeOnly | Qwen/Qwen2.5-3B-Instruct | vllm | gpt-4o-mini | 2024-10-17 13:46:09 | ./workspace-mmlu/benchmarkingjob/query-routing/0ca33c1f-8c4b-11ef-ad26-51366965e425 | -| 7 | query-routing | 59.53 | 100.0 | 0.124 | 278.34 | 0.004 | 0 | 0 | 12454484 | 31193 | jointinference | EdgeOnly | Qwen/Qwen2.5-1.5B-Instruct | vllm | gpt-4o-mini | 2024-10-17 13:45:58 | ./workspace-mmlu/benchmarkingjob/query-routing/0ca33c1e-8c4b-11ef-ad26-51366965e425 | 
-+------+---------------+----------+--------------+---------------------+------------+------------------------+---------------------+-------------------------+--------------------+------------------------+----------------+---------------------+----------------------------+-------------------+------------------+---------------------+-------------------------------------------------------------------------------------+ ++------+---------------+----------+------------+---------------------+------------+------------------------+---------------------+-------------------------+--------------------+------------------------+----------------+---------------------+----------------------------+-------------------+------------------+---------------------+-------------------------------------------------------------------------------------+ +| rank | algorithm | Accuracy | Edge Ratio | Time to First Token | Throughput | Internal Token Latency | Cloud Prompt Tokens | Cloud Completion Tokens | Edge Prompt Tokens | Edge Completion Tokens | paradigm | hard_example_mining | edgemodel-model | edgemodel-backend | cloudmodel-model | time | url | ++------+---------------+----------+------------+---------------------+------------+------------------------+---------------------+-------------------------+--------------------+------------------------+----------------+---------------------+----------------------------+-------------------+------------------+---------------------+-------------------------------------------------------------------------------------+ +| 1 | query-routing | 84.22 | 87.62 | 0.347 | 179.28 | 0.006 | 1560307 | 20339 | 10695142 | 30104 | jointinference | OracleRouter | Qwen/Qwen2.5-7B-Instruct | vllm | gpt-4o-mini | 2024-10-28 16:58:30 | ./workspace-mmlu/benchmarkingjob/query-routing/b8eb2606-950a-11ef-8cbc-c97e05df5d14 | +| 2 | query-routing | 82.75 | 77.55 | 0.316 | 216.72 | 0.005 | 2727792 | 18177 | 9470276 | 291364 | jointinference | OracleRouter | Qwen/Qwen2.5-3B-Instruct | vllm | gpt-4o-mini | 2024-10-28 16:58:19 | ./workspace-mmlu/benchmarkingjob/query-routing/b8eb2605-950a-11ef-8cbc-c97e05df5d14 | +| 3 | query-routing | 82.22 | 76.12 | 0.256 | 320.39 | 0.003 | 2978026 | 23254 | 9209538 | 29126 | jointinference | OracleRouter | Qwen/Qwen2.5-1.5B-Instruct | vllm | gpt-4o-mini | 2024-10-28 16:58:09 | ./workspace-mmlu/benchmarkingjob/query-routing/b8eb2604-950a-11ef-8cbc-c97e05df5d14 | +| 4 | query-routing | 75.99 | 0.0 | 0.691 | 698.83 | 0.001 | 11739216 | 79115 | 0 | 0 | jointinference | CloudOnly | Qwen/Qwen2.5-1.5B-Instruct | vllm | gpt-4o-mini | 2024-10-28 16:57:43 | ./workspace-mmlu/benchmarkingjob/query-routing/abe4062e-950a-11ef-8cbc-c97e05df5d14 | +| 5 | query-routing | 71.84 | 100.0 | 0.301 | 164.34 | 0.006 | 0 | 0 | 12335559 | 34817 | jointinference | EdgeOnly | Qwen/Qwen2.5-7B-Instruct | vllm | gpt-4o-mini | 2024-10-28 16:57:30 | ./workspace-mmlu/benchmarkingjob/query-routing/9b726328-950a-11ef-8cbc-c97e05df5d14 | +| 6 | query-routing | 60.3 | 100.0 | 0.206 | 176.71 | 0.006 | 0 | 0 | 12335559 | 397386 | jointinference | EdgeOnly | Qwen/Qwen2.5-3B-Instruct | vllm | gpt-4o-mini | 2024-10-28 16:57:23 | ./workspace-mmlu/benchmarkingjob/query-routing/9b726327-950a-11ef-8cbc-c97e05df5d14 | +| 7 | query-routing | 58.35 | 100.0 | 0.123 | 271.81 | 0.004 | 0 | 0 | 12335559 | 38982 | jointinference | EdgeOnly | Qwen/Qwen2.5-1.5B-Instruct | vllm | gpt-4o-mini | 2024-10-28 16:57:16 | ./workspace-mmlu/benchmarkingjob/query-routing/9b726326-950a-11ef-8cbc-c97e05df5d14 | 
++------+---------------+----------+------------+---------------------+------------+------------------------+---------------------+-------------------------+--------------------+------------------------+----------------+---------------------+----------------------------+-------------------+------------------+---------------------+-------------------------------------------------------------------------------------+ ``` Ianvs will output a `rank.csv` and `selected_rank.csv` in `ianvs/workspace`, which record the test results of each run. @@ -269,16 +300,18 @@ You can modify the relevant model parameters in `examples/cloud-edge-collaborati Query Routing is a very useful cloud-edge collaboration strategy based on two facts: -- Calling top-tier large language models is expensive: For GPT-4o, the pricing is $5.00 / 1M input tokens and \$15.00 / 1M output tokens. +- Calling top-tier large language models is expensive: For GPT-4o, the pricing is \$5.00 / 1M input tokens and \$15.00 / 1M output tokens. - Not all tasks require calling top-tier models: For tasks like translation, organization, summarization, data formatting, and casual conversation, small models with 3B parameters or less can achieve satisfactory results. These two facts suggest that if we can call different models based on the difficulty of the task, it will help save unnecessary API calls and thus reduce costs. Additionally, if edge device performance is sufficient, locally deployed small models can also demonstrate excellent latency and throughput metrics, further enhancing user experience. -Our Oracle Router is the ideal router that can route problems where the actual performance of edge small models outperforms that of cloud large models to the edge. Experiments have shown that when Qwen2.5-7B-Instruct collaborates with gpt-4o-mini, the accuracy on the MMLU (5-shot) dataset is +11.68% compared to pure edge and +8.85% compared to pure cloud, with 88.32% of queries routed to edge. +Our Oracle Router is an ideal router that routes a query to the edge whenever the edge's small model actually outperforms the cloud's large model on that query. Experiments have shown that when Qwen2.5-7B-Instruct collaborates with gpt-4o-mini, accuracy on the MMLU (5-shot) dataset is 12.38 percentage points higher than pure edge inference and 8.23 percentage points higher than pure cloud inference, with 87.62% of queries routed to the edge. ![](./assets/Oracle%20Router%20Demo.png) +You can modify and run `performance-cost-plot.py` to generate your own performance-cost figure. + Some related research $^{[1]}$ has trained practical routers that can save up to 40% of GPT-4 API calls while maintaining essentially unchanged accuracy on the test set.
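To make the cost argument concrete, here is an illustrative back-of-the-envelope calculation that blends the edge and cloud per-token costs by the Edge Ratio, the same weighting used in `performance-cost-plot.py` further down in this PR. The per-token costs are the rough estimates from that script, not measured prices.

```python
# Illustrative cost blend for the Oracle Router + Qwen2.5-7B-Instruct setup:
# 87.62% of queries stay on the edge, the rest go to gpt-4o-mini.
edge_ratio = 0.8762                  # from the results table above
edge_cost, cloud_cost = 0.10, 0.60   # $/M tokens, estimates used in performance-cost-plot.py
blended = edge_ratio * edge_cost + (1 - edge_ratio) * cloud_cost
print(f"~${blended:.2f}/M tokens vs ${cloud_cost:.2f}/M tokens cloud-only")  # ~$0.16 vs $0.60
```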
diff --git a/examples/cloud-edge-collaborative-inference-for-llm/assets/Oracle Router Demo.png b/examples/cloud-edge-collaborative-inference-for-llm/assets/Oracle Router Demo.png index ad0b36d2..dac855a7 100644 Binary files a/examples/cloud-edge-collaborative-inference-for-llm/assets/Oracle Router Demo.png and b/examples/cloud-edge-collaborative-inference-for-llm/assets/Oracle Router Demo.png differ diff --git a/examples/cloud-edge-collaborative-inference-for-llm/benchmarkingjob.yaml b/examples/cloud-edge-collaborative-inference-for-llm/benchmarkingjob.yaml index 5060db0d..d86f9beb 100755 --- a/examples/cloud-edge-collaborative-inference-for-llm/benchmarkingjob.yaml +++ b/examples/cloud-edge-collaborative-inference-for-llm/benchmarkingjob.yaml @@ -61,7 +61,7 @@ benchmarkingjob: # 1> "all": select all metrics in the leaderboard; # 2> metrics in the leaderboard, e.g., "f1_score" # metrics: [ "acc" , "edge-rate", "cloud-prompt", "cloud-completion", "edge-prompt", "edge-completion", "input-throughput", "output-throughput", "latency"] - metrics: ["Accuracy", "Rate to Edge", "Time to First Token", "Throughput", "Internal Token Latency", "Cloud Prompt Tokens", "Cloud Completion Tokens", "Edge Prompt Tokens", "Edge Completion Tokens"] + metrics: ["Accuracy", "Edge Ratio", "Time to First Token", "Throughput", "Internal Token Latency", "Cloud Prompt Tokens", "Cloud Completion Tokens", "Edge Prompt Tokens", "Edge Completion Tokens"] # model of save selected and all dataitems in workspace; string type; # currently the options of value are as follows: diff --git a/examples/cloud-edge-collaborative-inference-for-llm/performance-cost-plot.py b/examples/cloud-edge-collaborative-inference-for-llm/performance-cost-plot.py new file mode 100644 index 00000000..4f97358c --- /dev/null +++ b/examples/cloud-edge-collaborative-inference-for-llm/performance-cost-plot.py @@ -0,0 +1,61 @@ +import numpy as np + +import matplotlib.pyplot as plt +from scipy.optimize import curve_fit + +colors = plt.cm.Paired.colors # Set1 调色板 +plt.rcParams["axes.prop_cycle"] = plt.cycler("color", colors) + +# a sigmoid function to fit non-oracle models' performance vs cost +def sigmoid_fit(x, L, k, x0): + return L / (1 + np.exp(-k * (x - x0))) + +def plot_accuracy_cost(models, costs, accuracy, non_oracle_costs, non_oracle_accuracy): + # Fit the sigmoid model + params_sigmoid, _ = curve_fit(sigmoid_fit, non_oracle_costs, non_oracle_accuracy, p0=[100, 1, 0.2]) + + # Generate points for the sigmoid fitted curve + curve_x_sigmoid = np.linspace(min(non_oracle_costs), max(non_oracle_costs), 100) + curve_y_sigmoid = sigmoid_fit(curve_x_sigmoid, *params_sigmoid) + + plt.figure(figsize=(10, 6)) + + # Plot all models + for i in range(len(models)): + if "Oracle" in models[i]: + marker = '^' # Triangle marker for Oracle models + else: + marker = 'o' # Circle marker for non-Oracle models + plt.scatter(costs[i], accuracy[i], label=models[i], marker=marker) + + # Plot the sigmoid fitted curve + plt.plot(curve_x_sigmoid, curve_y_sigmoid, 'gray', linestyle='dashed') # Gray dashed line for the curve + + plt.title('Model Performance vs Cost') + plt.xlabel('Cost($/M token)') + plt.ylabel('Accuracy (%)') + plt.legend(title='Model Name') + plt.grid(True) + plt.savefig('model_performance_sigmoid_fitted_curve.png', dpi=300) + plt.show() + +if __name__ == '__main__': + models = [ + "Oracle-Qwen2.5-7b-instruct + gpt-4o-mini", + "Oracle-Qwen2.5-1.5b-instruct + gpt-4o-mini", + "Oracle-Qwen2.5-3b-instruct + gpt-4o-mini", + "gpt-4o-mini", + "Qwen2.5-7B-Instruct", + 
"Qwen2.5-3B-Instruct", + "Qwen2.5-1.5B-Instruct" + ] + # The Oracle Routed Model's cost is an average weighted by the Edge Ratio between edge model costs and cloud model costs. + # The edge model’s cost is estimated based on its parameter size. + costs = [0.16, 0.18, 0.17, 0.60, 0.10, 0.08, 0.05] + accuracy = [84.22, 82.75, 82.22, 75.99, 71.84, 60.3, 58.35] + + # Non Oracle Models: gpt-4o-mini, Qwen2.5-7B-Instruct, Qwen2.5-3B-Instruct, Qwen2.5-1.5B-Instruct + non_oracle_costs = costs[-4:] # Costs in $/M token + non_oracle_accuracy = accuracy[-4:] # Accuracies in % + + plot_accuracy_cost(models, costs, accuracy, non_oracle_costs, non_oracle_accuracy) \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/cloud_model.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/cloud_model.py index 10d76e6f..f466b367 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/cloud_model.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/cloud_model.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KubeEdge Authors. +# Copyright 2024 The KubeEdge Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,12 +15,10 @@ from __future__ import absolute_import, division, print_function import os + from core.common.log import LOGGER from sedna.common.class_factory import ClassType, ClassFactory - from models import APIBasedLLM -device = "cuda" # the device to load the model onto - os.environ['BACKEND_TYPE'] = 'TORCH' @@ -28,14 +26,46 @@ @ClassFactory.register(ClassType.GENERAL, alias="CloudModel") class CloudModel: + """Models being deployed on the Cloud + """ def __init__(self, **kwargs): + """Initialize the CloudModel. See `APIBasedLLM` for details about `kwargs`. + """ LOGGER.info(kwargs) self.model = APIBasedLLM(**kwargs) - self.model.load(model = kwargs.get("model", "gpt-4o-mini")) - - def inference(self, data, input_shape=None, **kwargs): + self.load(kwargs.get("model", "gpt-4o-mini")) + + def load(self, model): + """Set the model. + + Parameters + ---------- + model : str + Existing model from your OpenAI provider. Example: `gpt-4o-mini` + """ + self.model._load(model = model) + + def inference(self, data, **kwargs): + """Inference the model with the given data. + + Parameters + ---------- + data : dict + The data to be used for inference. See format at BaseLLM's `inference()`. + kwargs : dict + To Align with Sedna's JointInference interface. + + Returns + ------- + dict + Formatted Response. See `model._format_response()` for more details. + """ + return self.model.inference(data) - + def cleanup(self): + """Save the cache and cleanup the model. + """ + self.model.save_cache() self.model.cleanup() \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/data_processor.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/data_processor.py index 99206a5f..ee29ed40 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/data_processor.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/data_processor.py @@ -1,12 +1,39 @@ -import numpy as np +# Copyright 2024 The KubeEdge Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from sedna.common.class_factory import ClassFactory, ClassType +from sedna.datasources import BaseDataSource @ClassFactory.register(ClassType.GENERAL, alias="OracleRouterDatasetProcessor") class OracleRouterDatasetProcessor: + """ A Customized Dataset Processor for Oracle Router""" def __init__(self, **kwargs): pass def __call__(self, dataset): + """Transform the dataset to another format for Oracle Router + + Parameters + ---------- + dataset : sedna.datasources.BaseDataSource + The dataset loaded by Sedna + + Returns + ------- + sedna.datasources.BaseDataSource + Transformed dataset + """ dataset.x = [{"query": x, "gold": y} for x,y in zip(dataset.x, dataset.y)] return dataset \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/edge_model.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/edge_model.py index 96931a6f..42ca2542 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/edge_model.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/edge_model.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KubeEdge Authors. +# Copyright 2024 The KubeEdge Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,41 +15,51 @@ from __future__ import absolute_import, division, print_function import os + from core.common.log import LOGGER from sedna.common.class_factory import ClassType, ClassFactory - from models import HuggingfaceLLM, APIBasedLLM, VllmLLM -device = "cuda" # the device to load the model onto - os.environ['BACKEND_TYPE'] = 'TORCH' __all__ = ["BaseModel"] @ClassFactory.register(ClassType.GENERAL, alias="EdgeModel") class EdgeModel: - """ - This is actually the Edge Model. + """Models being deployed on the Edge """ def __init__(self, **kwargs): - LOGGER.info(kwargs) + """Initialize the CloudModel. + + Parameters + ---------- + kwargs : dict + Parameters that are passed to the model. Details can be found in the `VllmLLM`, `HuggingfaceLLM`, `APIBasedLLM` class. + Special keys: + - `backend`: str, default "huggingface". The serving framework to be used. 
+ """ + + LOGGER.info(kwargs) self.kwargs = kwargs self.model_name = kwargs.get("model", None) - self.backend = kwargs.get("backend", "huggingface") - self.quantization = kwargs.get("quantization", "full") + self.backend = kwargs.get("backend", "huggingface") self._set_config() - # 'backend' means serving framework: "huggingface", "vllm" - # 'quantization' means quantization mode:"full","4-bit","8-bit" - + def _set_config(self): - # # Some parameters are passed to Sedna through environment variables - parameters = os.environ - # # EdgeModel URL, see at https://github.com/kubeedge/sedna/blob/ac623ab32dc37caa04b9e8480dbe1a8c41c4a6c2/lib/sedna/core/base.py#L132 - parameters["model_path"] = self.model_name + """Set the model path in our environment variables due to Sedna’s [check](https://github.com/kubeedge/sedna/blob/ac623ab32dc37caa04b9e8480dbe1a8c41c4a6c2/lib/sedna/core/base.py#L132). + """ + # + os.environ["model_path"] = self.model_name def load(self, **kwargs): - # Align with Sedna's TorchBackend interface, see at https://github.com/kubeedge/sedna/blob/ac623ab32dc37caa04b9e8480dbe1a8c41c4a6c2/lib/sedna/backend/torch/__init__.py#L55-L67 + """Set the model backend to be used. Will be called by Sedna's JointInference interface. + + Raises + ------ + Exception + When the backend is not supported. + """ if self.backend == "huggingface": self.model = HuggingfaceLLM(**self.kwargs) elif self.backend == "vllm": @@ -57,15 +67,31 @@ def load(self, **kwargs): elif self.backend == "api": self.model = APIBasedLLM(**self.kwargs) else: - raise Exception(f"Backend {self.backend} is not supported") - - # TODO cloud service must be configured in JointInference + raise Exception(f"Backend {self.backend} is not supported. Please use 'huggingface', 'vllm', or `api` ") + + def predict(self, data, **kwargs): + """Inference the model with the given data. + + Parameters + ---------- + data : dict + The data to be used for inference. See format at BaseLLM's `inference()`. + kwargs : dict + To Align with Sedna's JointInference interface. - def predict(self, data, input_shape=None, **kwargs): - answer_list = self.model.inference(data) + Returns + ------- + dict + Formatted Response. See `model._format_response()` for more details. + """ + + answer = self.model.inference(data) + + return answer - return answer_list - def cleanup(self): + """Save the cache and cleanup the model. + """ + self.model.save_cache() self.model.cleanup() diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/hard_sample_mining.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/hard_sample_mining.py index 8d643c37..469c9f3e 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/hard_sample_mining.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/hard_sample_mining.py @@ -1,4 +1,4 @@ -# Copyright 2021 The KubeEdge Authors. +# Copyright 2024 The KubeEdge Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,7 +20,8 @@ from sedna.common.class_factory import ClassFactory, ClassType from core.common.log import LOGGER -__all__ = ('ThresholdFilter', 'CrossEntropyFilter', 'IBTFilter') +__all__ = ('BERTFilter', 'EdgeOnlyFilter', 'CloudOnlyFilter', + 'RandomRouterFilter', 'OracleRouterFilter') class BaseFilter(metaclass=abc.ABCMeta): """The base class to define unified interface.""" @@ -48,11 +49,21 @@ def __call__(self, infer_result=None): def data_check(cls, data): """Check the data in [0,1].""" return 0 <= float(data) <= 1 - + @ClassFactory.register(ClassType.HEM, alias="BERTRouter") class BERTFilter(BaseFilter, abc.ABC): def __init__(self, **kwargs): + """Initialize the BERTFilter. + + Parameters + ---------- + kwargs: dict + Possible kwargs are: + - `model`: str, default "routellm/bert". The model to be used. + - `task`: str, default "text-classification". The task to be used. + - `max_length`: int, default 512. The maximum length of the input. + """ super().__init__(**kwargs) self.kwargs = kwargs @@ -65,6 +76,23 @@ def __init__(self, **kwargs): self.classifier = pipeline(self.task, model=self.model, device="cuda") def _text_classification_postprocess(self, result): + """Postprocess the text classification result + + Parameters + ---------- + result : list + The result from the classifier. Example: + ``` + [{"label": "LABEL_0", "score": 0.5}, + {"label": "LABEL_1", "score": 0.4}, + {"label": "LABEL_2", "score": 0.1}] + + Returns + ------- + bool + `True` means hard sample, `False` means not. + """ + res = {item["label"]:item["score"] for item in result} scaled_score = res["LABEL_0"] / (res["LABEL_0"] + res["LABEL_1"]) @@ -73,45 +101,85 @@ def _text_classification_postprocess(self, result): return False if label == "LABEL_0" else True def _predict(self, data): - print(data) - # result = self.classifier(data) + """Predict the data label + + Parameters + ---------- + data : dict + See format at BaseLLM's `inference()`. + + Returns + ------- + bool + `True` means hard sample, `False` means not. + + Raises + ------ + NotImplementedError + If the task is not supported + """ + if self.task == "text-classification": result = self.classifier(data, top_k=None) - is_hard_example = self._text_classification_postprocess(result) + is_hard_sample = self._text_classification_postprocess(result) else: raise NotImplementedError - return is_hard_example - + return is_hard_sample + def _preprocess(self, data): - messages = data.get("messages") - return messages[-1]["content"][:self.max_length] - + """Preprocess the data + + Parameters + ---------- + data : dict + See format at BaseLLM's `inference()`. + + Returns + ------- + str + query string + """ + query = data.get("query") + if "query" in query: + return query["query"][:self.max_length] + else: + return query[:self.max_length] + + def cleanup(self): + """Release the classifier model + """ del self.classifier def __call__(self, data=None) -> bool: data = self._preprocess(data) return self._predict(data) - + @ClassFactory.register(ClassType.HEM, alias="EdgeOnly") class EdgeOnlyFilter(BaseFilter, abc.ABC): + """Route all queries to edge. + """ def __init__(self, **kwargs): super().__init__(**kwargs) def __call__(self, data=None) -> bool: return False - + @ClassFactory.register(ClassType.HEM, alias="CloudOnly") class CloudOnlyFilter(BaseFilter, abc.ABC): + """Route all queries to cloud. 
+ """ def __init__(self, **kwargs): super().__init__(**kwargs) def __call__(self, data=None) -> bool: return True - + @ClassFactory.register(ClassType.HEM, alias="RandomRouter") class RandomRouterFilter(BaseFilter, abc.ABC): + """Randomly route the queries to edge or cloud. + """ def __init__(self, **kwargs): super().__init__(**kwargs) self.threshold = kwargs.get("threshold", 0) @@ -121,6 +189,8 @@ def __call__(self, data=None) -> bool: @ClassFactory.register(ClassType.HEM, alias="OracleRouter") class OracleRouterFilter(BaseFilter, abc.ABC): + """The Opitmal Router, which routes the queries to edge or cloud based on the models' prediction. + """ def __init__(self, **kwargs): super().__init__(**kwargs) self.edge_better = 0 @@ -131,7 +201,19 @@ def __init__(self, **kwargs): self.edge_model = kwargs.get("edgemodel") self.cloud_model = kwargs.get("cloudmodel") - def __call__(self, data=None) -> bool: + def __call__(self, data=None): + """Route the query to edge or cloud based on the models' prediction. + + Parameters + ---------- + data : dict + See format at BaseLLM's `inference()`. + + Returns + ------- + bool + `True` means hard sample, `False` means not. + """ gold = data.get("gold", None) edge_result = self.edge_model.predict(data).get("prediction") @@ -157,8 +239,10 @@ def __call__(self, data=None) -> bool: else: # both correct + both wrong + edge_better, easy sample return False - + def cleanup(self): + """Leverage the `cleanup()` interface to print the statistics. + """ message = [ f"OracleRouter Statistics: \n", f"Both Wrong: {self.both_wrong}, ", diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/api_llm.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/api_llm.py index 496d859c..96ce42a4 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/api_llm.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/api_llm.py @@ -1,11 +1,27 @@ -import os +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os import time -from openai import OpenAI +from openai import OpenAI from models.base_llm import BaseLLM class APIBasedLLM(BaseLLM): def __init__(self, **kwargs) -> None: + """ Initialize the APIBasedLLM class + """ BaseLLM.__init__(self, **kwargs) api_key=os.environ.get("OPENAI_API_KEY") @@ -16,11 +32,33 @@ def __init__(self, **kwargs) -> None: base_url=base_url ) - def load(self, model): + def _load(self, model): + """Set the model to be used. + + Parameters + ---------- + model : str + Existing model from your OpenAI provider. Example: `gpt-4o-mini` + """ + self.model = model - + def _infer(self, messages): - # messages = self.get_message_chain(question, system_prompt) + """Call the OpenAI API to get the response + + Parameters + ---------- + messages : list + OpenAI style message chain. 
Example: + ``` + [{"role": "user", "content": "Hello, how are you?"}] + ``` + + Returns + ------- + dict + Formatted Response. See `_format_response()` for more details. + """ time_to_first_token = 0.0 internal_token_latency = [] @@ -58,8 +96,8 @@ def _infer(self, messages): throughput = 1 / internal_token_latency response = self._format_response( - text, - prompt_tokens, + text, + prompt_tokens, completion_tokens, time_to_first_token, internal_token_latency, @@ -67,9 +105,3 @@ def _infer(self, messages): ) return response - -if __name__ == '__main__': - llm = APIBasedLLM(model="gpt-4o-mini") - data = ["你好吗?介绍一下自己"] - res = llm.inference(data) - print(res) \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/base_llm.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/base_llm.py index debe2bdd..f80bb0eb 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/base_llm.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/base_llm.py @@ -1,32 +1,91 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import json -# from evals import extract_prediction def extract_prediction(input_string): - # 检查输入是否为空或只包含非字母字符 + """Extract the prediction from the completion. This function is used when caching the responses. + """ if not input_string or not any(char.isalpha() for char in input_string): return None - # 倒序遍历字符串,找到最后一个字母 + # Find the last letter in the string for char in reversed(input_string): if 'A' <= char <= 'D': return char - # 如果没有找到字母,返回None return None class BaseLLM: def __init__(self, **kwargs) -> None: + """ Initialize the BaseLLM class + + Parameters + ---------- + kwargs : dict + Parameters that are passed to the model. For details, see `_parse_kwargs()` + """ self.config = kwargs self._parse_kwargs(**kwargs) self.is_cache_loaded = False self.model_loaded = False - def load(self): + def _load(self): + """Interface for Model Loading + + Raises + ------ + NotImplementedError + When the method is not implemented + """ + raise NotImplementedError + + + def _infer(self, messages): + """Interface for Model Inference + + Parameters + ---------- + messages : list + OpenAI style message chain. Example: + ``` + [{"role": "user", "content": "Hello, how are you?"}] + ``` + + Raises + ------ + NotImplementedError + When the method is not implemented + """ raise NotImplementedError def _parse_kwargs(self, **kwargs): + """Parse the kwargs and set the attributes + + Parameters + ---------- + kwargs : dict + Parameters that are passed to the model. Possible keys are: + - `model`: str, default None. Model name + - `temperature`: float, default 0.8. Temperature for sampling + - `top_p`: float, default 0.8. Top p for sampling + - `repetition_penalty`: float, default 1.05. Repetition penalty + - `max_tokens`: int, default 512. 
Maximum tokens to generate + - `use_cache`: bool, default True. Whether to use reponse cache + """ + self.model_name = kwargs.get("model", None) - self.quantization = kwargs.get("quantization", "full") self.temperature = kwargs.get("temperature", 0.8) self.top_p = kwargs.get("top_p", 0.8) self.repetition_penalty = kwargs.get("repetition_penalty", 1.05) @@ -34,15 +93,31 @@ def _parse_kwargs(self, **kwargs): self.use_cache = kwargs.get("use_cache", True) def inference(self, data): - - if isinstance(data, list): - return [self._infer(line) for line in data] - - elif isinstance(data, str): - return self._infer(data) - - elif isinstance(data, dict): - + """Inference the model + + Parameters + ---------- + data : dict + The input data. Example: + ``` + # With Gold Answer (For special uses like OracleRouter) + {"query": "What is the capital of China?", "gold": "A"} + # Without Gold Answer + {"query": "What is the capital of China?"} + ``` + + Returns + ------- + dict + Formatted Response. See `_format_response()` for more details. + + Raises + ------ + ValueError + If the data is not a dict + """ + + if isinstance(data, dict): gold = data.get("gold", None) query = data.get("query") @@ -55,7 +130,7 @@ def inference(self, data): return response if not self.model_loaded: - self.load(self.model_name) + self._load(self.model_name) self.model_loaded = True response = self._infer(messages) @@ -70,9 +145,24 @@ def inference(self, data): return response else: - raise ValueError(f"DataType {type(data)} is not supported, it must be `list` or `str` or `dict`") + raise ValueError(f"DataType {type(data)} is not supported, it must be `dict`") def get_message_chain(self, question, system = None): + """Get the OpenAI Chat style message chain + + Parameters + ---------- + question : str + User prompt. + system : str, optional + System Prompt, by default None + + Returns + ------- + list + OpenAI Chat style message chain. + """ + if system: messages = [ {"role": "system", "content": system}, @@ -85,18 +175,45 @@ def get_message_chain(self, question, system = None): return messages - def validate_input(self, data): - expected_format = """{'question':'Lorem', "prompts": {infer_system_prompt:"Lorem"}}""" - - if "question" not in data: - raise ValueError(f"Missing Key 'question' in data, data should have format like {expected_format}") - if "prompts" not in data: - raise ValueError(f"Missing Key 'prompts' in data, data should have format like {expected_format}") - - def _infer(self, messages): - raise NotImplementedError def _format_response(self, text, prompt_tokens, completion_tokens, time_to_first_token, internal_token_latency, throughput): + """Format the response + + Parameters + ---------- + text : str + The completion text + prompt_tokens : int + The number of tokens in the prompt + completion_tokens : int + The number of tokens in the completion + time_to_first_token : float + The time consumed to generate the first token. Unit: s(seconds) + internal_token_latency : float + The average time consumed to generate a token. Unit: s(seconds) + throughput : float + The throughput of the completion. 
Unit: tokens/s + + Returns + ------- + dict + Example: + ``` + { + "completion": "A", + "usage": { + "prompt_tokens": 505, + "completion_tokens": 1, + "total_tokens": 506 + }, + "perf": { + "time_to_first_token": 0.6393, + "internal_token_latency": 0.0005, + "throughput": 1750.6698 + } + } + ``` + """ total_tokens = prompt_tokens + completion_tokens @@ -120,6 +237,8 @@ def _format_response(self, text, prompt_tokens, completion_tokens, time_to_first return resposne def _load_cache(self): + """Load cached Responses from `$RESULT_SAVED_URL/cache.json`. + """ self.cache = None self.cache_hash = {} self.cache_models = [] @@ -135,6 +254,18 @@ def _load_cache(self): self.is_cache_loaded = True def _try_cache(self, question): + """Try to get the response from cache + + Parameters + ---------- + question : str + User prompt + + Returns + ------- + dict + If the question is found in cache, return the Formatted Response. Otherwise, return None. + """ if not self.is_cache_loaded: self._load_cache() @@ -142,6 +273,19 @@ def _try_cache(self, question): return self.cache_hash.get(question, None) def _update_cache(self, question, response, prediction, gold): + """Update the cache with the new item + + Parameters + ---------- + question : str + User prompt + response : dict + Formatted Response. See `_format_response()` for more details. + prediction : str + The prediction extracted from the response + gold : str + The gold answer for the question + """ if not self.is_cache_loaded: self._load_cache() @@ -162,6 +306,8 @@ def _update_cache(self, question, response, prediction, gold): self.cache_models.append(self.cache) def save_cache(self): + """Save the cache to `$RESULT_SAVED_URL/cache.json`. + """ cache_file = os.path.join(os.environ["RESULT_SAVED_URL"], "cache.json") @@ -170,4 +316,6 @@ def save_cache(self): json.dump(self.cache_models, f, indent=4) def cleanup(self): + """Default Cleanup Method to release resources + """ pass diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/huggingface_llm.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/huggingface_llm.py index 93a95676..8bf87385 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/huggingface_llm.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/huggingface_llm.py @@ -1,29 +1,74 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +from threading import Thread + from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer from models.base_llm import BaseLLM -from threading import Thread -import time -import os device = "cuda" os.environ["TOKENIZERS_PARALLELISM"] = "true" class HuggingfaceLLM(BaseLLM): def __init__(self, **kwargs) -> None: + """ Initialize the HuggingfaceLLM class + + Parameters + ---------- + kwargs : dict + Parameters that are passed to the model. 
Details can be found in the BaseLLM class. + """ BaseLLM.__init__(self, **kwargs) - def load(self, model_url): + def _load(self, model): + """Load the model via Hugging Face API + + Parameters + ---------- + model : str + Hugging Face style model name. Example: `Qwen/Qwen2.5-0.5B-Instruct` + """ self.model = AutoModelForCausalLM.from_pretrained( - model_url, + model, torch_dtype="auto", device_map="auto", trust_remote_code=True ) self.tokenizer = AutoTokenizer.from_pretrained( - model_url, + model, trust_remote_code=True ) def _infer(self, messages): + """Call the transformers inference API to get the response + + Parameters + ---------- + messages : list + OpenAI style message chain. Example: + ``` + [{"role": "user", "content": "Hello, how are you?"}] + ``` + + Returns + ------- + dict + Formatted Response. See `_format_response()` for more details. + """ + st = time.perf_counter() most_recent_timestamp = st @@ -42,8 +87,8 @@ def _infer(self, messages): streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True) generation_kwargs = dict( - model_inputs, - streamer=streamer, + model_inputs, + streamer=streamer, max_new_tokens=self.max_tokens, temperature=self.temperature, top_p=self.top_p, @@ -77,12 +122,12 @@ def _infer(self, messages): throughput = 1 / internal_token_latency response = self._format_response( - text, - prompt_tokens, + text, + prompt_tokens, completion_tokens, time_to_first_token, internal_token_latency, throughput ) - - return response \ No newline at end of file + + return response diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/vllm_llm.py b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/vllm_llm.py index 2735b295..5d572306 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/vllm_llm.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/models/vllm_llm.py @@ -1,7 +1,22 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + from vllm import LLM, SamplingParams from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment from models.base_llm import BaseLLM -import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -11,14 +26,35 @@ class VllmLLM(BaseLLM): def __init__(self, **kwargs) -> None: + """ Initialize the VllmLLM class + + Parameters + ---------- + kwargs : dict + Parameters that are passed to the model. Details can be found in the BaseLLM class. + + Special keys: + - `tensor_parallel_size`: int, default 1. Number of tensor parallelism. + - `gpu_memory_utilization`: float, default 0.8. GPU memory utilization. + + See details about special parameters in [vLLM's Named Arguments](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html). 
+ """ + BaseLLM.__init__(self, **kwargs) self.tensor_parallel_size = kwargs.get("tensor_parallel_size", 1) self.gpu_memory_utilization = kwargs.get("gpu_memory_utilization", 0.8) - - def load(self, model_url): + + def _load(self, model): + """Load the model via vLLM API + + Parameters + ---------- + model : str + Hugging Face style model name. Example: `Qwen/Qwen2.5-0.5B-Instruct` + """ self.model = LLM( - model=model_url, + model=model, trust_remote_code=True, dtype="float16", tensor_parallel_size=self.tensor_parallel_size, @@ -28,24 +64,41 @@ def load(self, model_url): ) self.sampling_params = SamplingParams( - temperature=self.temperature, - top_p=self.top_p, - repetition_penalty=self.repetition_penalty, + temperature=self.temperature, + top_p=self.top_p, + repetition_penalty=self.repetition_penalty, max_tokens=self.max_tokens - ) - + ) + # Warmup to make metrics more accurate self.warmup() def warmup(self): + """Warm up the Model for more accurate performance metrics + """ + self.model.chat( - [{"role": "user", "content": "Hello"}], + [{"role": "user", "content": "Hello"}], self.sampling_params, use_tqdm=False ) def _infer(self, messages): - # messages = self.get_message_chain(question, system_prompt) + """Call the vLLM Offline Inference API to get the response + + Parameters + ---------- + messages : list + OpenAI style message chain. Example: + ``` + [{"role": "user", "content": "Hello, how are you?"}] + ``` + + Returns + ------- + dict + Formatted Response. See `_format_response()` for more details. + """ outputs = self.model.chat( messages=messages, @@ -67,19 +120,21 @@ def _infer(self, messages): throughput = 1 / internal_token_latency response = self._format_response( - text, - prompt_tokens, + text, + prompt_tokens, completion_tokens, time_to_first_token, internal_token_latency, throughput ) - return response + return response def cleanup(self): + """Release the model from GPU + """ destroy_model_parallel() destroy_distributed_environment() if hasattr(self, "model"): - del self.model.llm_engine.model_executor \ No newline at end of file + del self.model.llm_engine.model_executor diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/test_queryrouting.yaml b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/test_queryrouting.yaml index 31644a70..a3926146 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/test_queryrouting.yaml +++ b/examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/test_queryrouting.yaml @@ -1,25 +1,20 @@ algorithm: # paradigm name; string type; - # currently the options of value are as follows: - # 1> "singletasklearning" - # 2> "incrementallearning" paradigm_type: "jointinference" # algorithm module configuration in the paradigm; list type; modules: # kind of algorithm module; string type; - # currently the options of value are as follows: - # 1> "basemodel" - type: "dataset_processor" + # name of custom dataset processor; string type; name: "OracleRouterDatasetProcessor" + # the url address of custom dataset processor; string type; url: "./examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/data_processor.py" - type: "edgemodel" - # name of python module; string type; - # example: basemodel.py has BaseModel module that the alias is "FPN" for this benchmarking; - # This defines the edge model + # name of edge model module; string type; name: "EdgeModel" - # the url address of python module; 
string type; + # the url address of edge model module; string type; url: "./examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/edge_model.py" hyperparameters: @@ -30,35 +25,45 @@ algorithm: - "Qwen/Qwen2.5-3B-Instruct" - "Qwen/Qwen2.5-7B-Instruct" - backend: + # backend; string type; + # currently the options of value are as follows: + # 1> "huggingface": transformers backend; + # 2> "vllm": vLLM backend; + # 3> "api": OpenAI API backend; values: - "vllm" - temperature: + # What sampling temperature to use, between 0 and 2; float type; + # For reproducable results, the temperature should be set to 0; values: - 0 - top_p: + # nucleus sampling parameter; float type; values: - 0.8 - max_tokens: + # The maximum number of tokens that can be generated in the chat completion; int type; values: - 512 - repetition_penalty: + # The parameter for repetition penalty; float type; values: - 1.05 - tensor_parallel_size: + # The size of tensor parallelism (Used for vLLM) values: - # 1 or total count of gpu - 4 - gpu_memory_utilization: + # The percentage of GPU memory utilization (Used for vLLM) values: - 0.9 - use_cache: + # Whether to use reponse cache; boolean type; values: - true - type: "cloudmodel" # name of python module; string type; - # example: basemodel.py has BaseModel module that the alias is "FPN" for this benchmarking; - # This defines the edge model name: "CloudModel" # the url address of python module; string type; url: "./examples/cloud-edge-collaborative-inference-for-llm/testalgorithms/query-routing/cloud_model.py" @@ -83,9 +88,9 @@ algorithm: - use_cache: values: - true - + - type: "hard_example_mining" - # name of python module; string type; + # name of Router module; string type; # BERTRouter, EdgeOnly, CloudOnly, RandomRouter, OracleRouter name: "EdgeOnly" # the url address of python module; string type; diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/accuracy.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/accuracy.py index bc03f146..a3dc088f 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/accuracy.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/accuracy.py @@ -1,4 +1,4 @@ -# Copyright 2022 The KubeEdge Authors. +# Copyright 2024 The KubeEdge Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,30 +18,42 @@ __all__ = ["acc"] def get_last_letter(input_string): - # 检查输入是否为空或只包含非字母字符 + """Extract the prediction from the completion. This function is used when caching the responses. + """ if not input_string or not any(char.isalpha() for char in input_string): return None - - # 倒序遍历字符串,找到最后一个字母 + # Find the last letter in the string for char in reversed(input_string): if 'A' <= char <= 'D': return char - - # 如果没有找到字母,返回None return None @ClassFactory.register(ClassType.GENERAL, alias="Accuracy") def acc(y_true, y_pred): - + """Calculate the accuracy. 
+ + Parameters + ---------- + y_true : list + Ground truth + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + float + The accuracy (%) + """ + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] y_pred = [get_last_letter(pred.result.completion) for pred in infer_res] y_true = [get_last_letter(y) for y in y_true] - + # Use a list comprehension to compare the elements of the two lists same_elements = [y_pred[i] == y_true[i] for i in range(len(y_pred))] # Count the number of matching elements acc = sum(same_elements) / len(same_elements) - + return round(acc * 100, 2) diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_completion_tokens.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_completion_tokens.py index 2adc61d0..06c09b78 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_completion_tokens.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_completion_tokens.py @@ -1,11 +1,39 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from sedna.common.class_factory import ClassType, ClassFactory from result_parser import JointInferenceResult @ClassFactory.register(ClassType.GENERAL, alias="Cloud Completion Tokens") -def cloud_completion_tokens(y_true, y_pred): - +def cloud_completion_tokens(_, y_pred): + """Calculate the number of completion tokens generated by the cloud model. + + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + int + Number of completion tokens generated by the cloud model + """ + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] cloud_completion_tokens = sum([pred.cloud_result.completion_tokens for pred in infer_res]) - + return cloud_completion_tokens \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_prompt_tokens.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_prompt_tokens.py index 884ad5a6..f3aed044 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_prompt_tokens.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_prompt_tokens.py @@ -1,11 +1,39 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ from sedna.common.class_factory import ClassType, ClassFactory from result_parser import JointInferenceResult @ClassFactory.register(ClassType.GENERAL, alias="Cloud Prompt Tokens") -def cloud_prompt_tokens(y_true, y_pred): - +def cloud_prompt_tokens(_, y_pred): + """Calculate the number of prompt tokens generated by the cloud model. + + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + int + Number of prompt tokens generated by the cloud model + """ + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] cloud_prompt_tokens = sum([pred.cloud_result.prompt_tokens for pred in infer_res]) - + return cloud_prompt_tokens \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_completion_tokens.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_completion_tokens.py index 94b314ef..05af423f 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_completion_tokens.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_completion_tokens.py @@ -1,11 +1,40 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from sedna.common.class_factory import ClassType, ClassFactory from result_parser import JointInferenceResult @ClassFactory.register(ClassType.GENERAL, alias="Edge Completion Tokens") -def edge_completion_tokens(y_true, y_pred): - +def edge_completion_tokens(_, y_pred): + """Calculate the number of completion tokens generated by the edge model. + + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + int + Number of completion tokens generated by the edge model + """ + + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] edge_completion_tokens = sum([pred.edge_result.completion_tokens for pred in infer_res]) - + return edge_completion_tokens \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_prompt_tokens.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_prompt_tokens.py index 58560489..f2a30b3b 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_prompt_tokens.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_prompt_tokens.py @@ -1,11 +1,40 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from sedna.common.class_factory import ClassType, ClassFactory from result_parser import JointInferenceResult @ClassFactory.register(ClassType.GENERAL, alias="Edge Prompt Tokens") -def edge_prompt_tokens(y_true, y_pred): - +def edge_prompt_tokens(_, y_pred): + """Calculate the number of prompt tokens generated by the edge model. + + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + int + Number of prompt tokens generated by the edge model + """ + + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] edge_prompt_tokens = sum([pred.edge_result.prompt_tokens for pred in infer_res]) - + return edge_prompt_tokens \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_rate.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_rate.py deleted file mode 100644 index ac411108..00000000 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_rate.py +++ /dev/null @@ -1,14 +0,0 @@ -from sedna.common.class_factory import ClassType, ClassFactory -from result_parser import JointInferenceResult - -@ClassFactory.register(ClassType.GENERAL, alias="Rate to Edge") -def edge_rate(y_true, y_pred): - - infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] - - y_pred = [pred.is_hard_example for pred in infer_res] - - edge_rate = 1 - sum(y_pred) / len(y_pred) - - return round(edge_rate * 100,2) - diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_ratio.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_ratio.py new file mode 100644 index 00000000..6ae5a1f5 --- /dev/null +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_ratio.py @@ -0,0 +1,42 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from sedna.common.class_factory import ClassType, ClassFactory +from result_parser import JointInferenceResult + +@ClassFactory.register(ClassType.GENERAL, alias="Edge Ratio") +def edge_ratio(_, y_pred): + """Calculate the ratio of queries routed to EdgeModel.
+ + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + float + The ratio of queries routed to EdgeModel (%) + """ + + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] + + y_pred = [pred.is_hard_example for pred in infer_res] + + edge_ratio = 1 - sum(y_pred) / len(y_pred) + + return round(edge_ratio * 100,2) + diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/internal_token_latency.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/internal_token_latency.py index 40c4c9e0..9024a2b3 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/internal_token_latency.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/internal_token_latency.py @@ -1,11 +1,41 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from sedna.common.class_factory import ClassType, ClassFactory from result_parser import JointInferenceResult @ClassFactory.register(ClassType.GENERAL, alias="Internal Token Latency") -def internal_token_latency(y_true, y_pred): - +def internal_token_latency(_, y_pred): + """Calculate the Internal Token Latency of the system. + + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + float + Average Internal Token Latency (s) of the system + """ + + + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] average_itl = sum([pred.result.internal_token_latency for pred in infer_res]) / len(infer_res) - + return round(average_itl,3) \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/result_parser.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/result_parser.py index 6fcfd4c2..bbaba90e 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/result_parser.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/result_parser.py @@ -1,8 +1,24 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ from dataclasses import dataclass from typing import TypedDict @dataclass class Response: + """Formatted Response Parser""" + completion: str prompt_tokens : int completion_tokens : int @@ -13,6 +29,19 @@ class Response: @classmethod def from_dict(cls, response): + """Create a Response object from a dictionary + + Parameters + ---------- + response : dict + Formatted Response. See `BaseLLM._format_response()` for more details. + + Returns + ------- + Response + `Response` Object + """ + if response: return cls( response["completion"], @@ -28,6 +57,7 @@ def from_dict(cls, response): @dataclass class JointInferenceResult: + """Joint Inference Result Parser""" is_hard_example : bool result : Response edge_result: Response @@ -35,6 +65,25 @@ class JointInferenceResult: @classmethod def from_list(cls, is_hard_example, result, edge_result, cloud_reslut): + """Create a JointInferenceResult object from a list + + Parameters + ---------- + is_hard_example : bool + Whether the example is hard or not + result : dict + Formatted Response. See `BaseLLM._format_response()` for more details. + edge_result : dict + Formatted Response from the Edge Model. See `BaseLLM._format_response()` for more details. + cloud_reslut : dict + Formatted Response from the Cloud Model. See `BaseLLM._format_response()` for more details. + + Returns + ------- + JointInferenceResult + `JointInferenceResult` object built from the list + """ + return cls( is_hard_example, Response.from_dict(result), diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/testenv.yaml b/examples/cloud-edge-collaborative-inference-for-llm/testenv/testenv.yaml index d13390c4..18510b64 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/testenv.yaml +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/testenv.yaml @@ -5,7 +5,7 @@ testenv: train_data: "./dataset/mmlu-5-shot/train_data/data.json" # the url address of test dataset index; string type; test_data_info: "./dataset/mmlu-5-shot/test_data/metadata.json" - + # metrics configuration for test case's evaluation; list type; metrics: # metric name; string type; # the url address of python file url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/accuracy.py" - - name: "Rate to Edge" - url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_rate.py" + - name: "Edge Ratio" + url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_ratio.py" - name: "Cloud Prompt Tokens" url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_prompt_tokens.py" - name: "Cloud Completion Tokens" url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/cloud_completion_tokens.py" - + - name: "Edge Prompt Tokens" url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/edge_prompt_tokens.py" @@ -30,7 +30,7 @@ testenv: - name: "Time to First Token" url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/time_to_first_token.py" - + - name: "Throughput" url: "./examples/cloud-edge-collaborative-inference-for-llm/testenv/throughput.py" diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/throughput.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/throughput.py index 49037e11..6a2ccc61 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/throughput.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/throughput.py @@ -1,13 +1,41 @@ +# Copyright 2024 The KubeEdge Authors.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from sedna.common.class_factory import ClassType, ClassFactory from result_parser import JointInferenceResult @ClassFactory.register(ClassType.GENERAL, alias="Throughput") -def throughput(y_true, y_pred): - +def throughput(_, y_pred): + """Calculate the Throughput of the system. + + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + float + Average Throughput (token/s) of the system + """ + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] average_itl = sum([pred.result.internal_token_latency for pred in infer_res]) / len(infer_res) - + average_throughput = 1 / average_itl - + return round(average_throughput,2) \ No newline at end of file diff --git a/examples/cloud-edge-collaborative-inference-for-llm/testenv/time_to_first_token.py b/examples/cloud-edge-collaborative-inference-for-llm/testenv/time_to_first_token.py index 2e92b73b..0f0964e7 100644 --- a/examples/cloud-edge-collaborative-inference-for-llm/testenv/time_to_first_token.py +++ b/examples/cloud-edge-collaborative-inference-for-llm/testenv/time_to_first_token.py @@ -1,11 +1,39 @@ +# Copyright 2024 The KubeEdge Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from sedna.common.class_factory import ClassType, ClassFactory from result_parser import JointInferenceResult @ClassFactory.register(ClassType.GENERAL, alias="Time to First Token") -def time_to_first_token(y_true, y_pred): - +def time_to_first_token(_, y_pred): + """Calculate the Time to First Token of the system. + + Parameters + ---------- + _ : + Ignored + y_pred : list + List of predictions from the JointInference paradigm + + Returns + ------- + float + Average Time to First Token (s) of the system + """ + infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred] average_ttft = sum([pred.result.time_to_first_token for pred in infer_res]) / len(infer_res) - + return round(average_ttft, 3) \ No newline at end of file
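
Each metric file in this patch follows the same pattern: a plain function registered with Sedna's ClassFactory under a display alias, which receives the ground truth (ignored for system metrics) and the list of per-query JointInference result tuples. A minimal sketch of an additional metric is shown below; the module name mean_completion_tokens.py, the alias "Mean Completion Tokens", and the averaging logic are illustrative assumptions and are not part of this patch.

# mean_completion_tokens.py -- hypothetical sketch, not included in this patch.
from sedna.common.class_factory import ClassType, ClassFactory
from result_parser import JointInferenceResult

@ClassFactory.register(ClassType.GENERAL, alias="Mean Completion Tokens")
def mean_completion_tokens(_, y_pred):
    """Average number of completion tokens per query in the final responses."""
    # Each prediction is a (is_hard_example, result, edge_result, cloud_result) tuple.
    infer_res = [JointInferenceResult.from_list(*pred) for pred in y_pred]
    total = sum(pred.result.completion_tokens for pred in infer_res)
    return round(total / len(infer_res), 2)

To have Ianvs report such a metric, it would be listed in testenv.yaml with a name matching the alias and a url pointing at the new file, exactly like the metric entries in the testenv.yaml hunk above.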