diff --git a/.github/workflows/instant_benchmark.yml b/.github/workflows/instant_benchmark.yml
index 82f1b8d3e7..0b8a48598a 100644
--- a/.github/workflows/instant_benchmark.yml
+++ b/.github/workflows/instant_benchmark.yml
@@ -15,9 +15,14 @@ on:
           - g5.2xlarge
           - g5.12xlarge
           - g5.48xlarge
+          - g6.2xlarge
+          - g6.12xlarge
+          - g6.48xlarge
           - g4dn.12xlarge
           - g4dn.2xlarge
           - p4d.24xlarge
+          - p4de.24xlarge
+          - p5.24xlarge
           - inf2.8xlarge
           - inf2.24xlarge
           - trn1.2xlarge
@@ -127,26 +132,20 @@ jobs:
         run: |
           wget https://publish.djl.ai/awscurl/awscurl
           chmod +x awscurl
-      - name: Run benchmark job
-        working-directory: tests/integration
-        run: |
-          echo "${{ needs.environment-setup.outputs.template }}" >> template.json
-          python3 instant_benchmark.py --template template.json \
-          --job ${{ matrix.job }} --instance ${{ inputs.instance }}
-
-          bash instant_benchmark.sh
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
          role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
          aws-region: us-east-1
-      - name: Record benchmark job
-        if: ${{ inputs.record == 'table' || inputs.record == 'cloudwatch' }}
+      - name: Run benchmark job
         working-directory: tests/integration
         run: |
-          python3 record_benchmark.py --template template.json \
+          echo "${{ needs.environment-setup.outputs.template }}" >> template.json
+          python3 instant_benchmark.py --template template.json \
           --job ${{ matrix.job }} --instance ${{ inputs.instance }} \
-          --model models/test --record ${{ inputs.record }}
+          --record ${{ inputs.record }}
+
+          bash instant_benchmark.sh
       - name: Get serving logs
         if: always()
         working-directory: tests/integration
diff --git a/tests/integration/benchmark/nightly/g5-12xl.txt b/tests/integration/benchmark/nightly/g5-12xl.txt
index 1244b5adea..cd73232d2b 100644
--- a/tests/integration/benchmark/nightly/g5-12xl.txt
+++ b/tests/integration/benchmark/nightly/g5-12xl.txt
@@ -13,6 +13,7 @@ option.tensor_parallel_degree=max
 TOKENIZER=TheBloke/Llama-2-7B-fp16 ./awscurl -c 32 -N 10 \
 -X POST http://127.0.0.1:8080/invocations \
 --connect-timeout 60 -H "Content-type: application/json" \
+--json-path benchmark_result.json \
 -d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
 -t -o /tmp/output.txt
 [test_name]
@@ -22,13 +23,13 @@ ENGINE={vllm,lmi-dist}
 [container]
 deepjavalibrary/djl-serving:lmi-nightly
 [serving_properties]
-engine=Python
 option.rolling_batch=$ENGINE
 option.model_id=s3://djl-llm/llama-3-8b-hf/
 option.tensor_parallel_degree=max
 [aws_curl]
 TOKENIZER=TheBloke/Llama-2-13B-fp16 ./awscurl -c 32 -N 10 \
 -X POST http://127.0.0.1:8080/invocations \
+--json-path benchmark_result.json \
 --connect-timeout 60 -H "Content-type: application/json" \
 -d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
 -t -o /tmp/output.txt
diff --git a/tests/integration/benchmark/nightly/g5-2xl.txt b/tests/integration/benchmark/nightly/g5-2xl.txt
index 097361342c..9ba10af681 100644
--- a/tests/integration/benchmark/nightly/g5-2xl.txt
+++ b/tests/integration/benchmark/nightly/g5-2xl.txt
@@ -2,17 +2,19 @@
 mistral
 [vars]
 ENGINE={vllm,lmi-dist}
+[benchmark_vars]
+CONCURRENCY={1,2,4,8}
 [container]
 deepjavalibrary/djl-serving:lmi-nightly
 [serving_properties]
-engine=Python
 option.rolling_batch=$ENGINE
 option.model_id=NousResearch/Hermes-2-Pro-Mistral-7B
 option.tensor_parallel_degree=max
 option.max_model_len=8192
 [aws_curl]
-TOKENIZER=TheBloke/Mistral-7B-Instruct-v0.2-AWQ ./awscurl -c 32 -N 10 \
+TOKENIZER=TheBloke/Mistral-7B-Instruct-v0.2-AWQ ./awscurl -c 32 -N $CONCURRENCY \
 -X POST http://127.0.0.1:8080/invocations \
 --connect-timeout 60 -H "Content-type: application/json" \
+--json-path benchmark_result.json \
 -d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
 -t -o /tmp/output.txt
diff --git a/tests/integration/benchmark/nightly/g5-48xl.txt b/tests/integration/benchmark/nightly/g5-48xl.txt
index bfdda8953e..e2749edabb 100644
--- a/tests/integration/benchmark/nightly/g5-48xl.txt
+++ b/tests/integration/benchmark/nightly/g5-48xl.txt
@@ -5,13 +5,13 @@ ENGINE={vllm,lmi-dist}
 [container]
 deepjavalibrary/djl-serving:lmi-nightly
 [serving_properties]
-engine=Python
 option.rolling_batch=$ENGINE
 option.model_id=s3://djl-llm/mixtral-8x7b
 option.tensor_parallel_degree=max
 [aws_curl]
 TOKENIZER=NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO ./awscurl -c 32 -N 10 \
 -X POST http://127.0.0.1:8080/invocations \
+--json-path benchmark_result.json \
 --connect-timeout 60 -H "Content-type: application/json" \
 -d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
 -t -o /tmp/output.txt
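The new `[benchmark_vars]` section works like `[vars]`, except that it multiplies the `[aws_curl]` command rather than the deployment options, so `CONCURRENCY={1,2,4,8}` produces four client invocations that are run and recorded separately. A minimal sketch of that fan-out, assuming simple `{a,b,c}` value lists and `$NAME` substitution; the `expand_benchmark_vars` helper below is illustrative only and is not the parser's actual `multiply_template_with_vars`:

```python
# Illustrative sketch: fan one [aws_curl] command out over [benchmark_vars] values.
import itertools
import re


def expand_benchmark_vars(command: str, benchmark_vars: list[str]) -> dict[str, str]:
    """Return {"VAR=value,...": command with $VAR substituted} for every combination."""
    names, value_lists = [], []
    for raw in benchmark_vars:
        name, values = raw.split("=", 1)
        names.append(name)
        # "{1,2,4,8}" -> ["1", "2", "4", "8"]; a bare value stays a single-item list
        value_lists.append(re.findall(r"[^{},]+", values))
    expanded = {}
    for combo in itertools.product(*value_lists):
        key = ",".join(f"{n}={v}" for n, v in zip(names, combo))
        cmd = command
        for n, v in zip(names, combo):
            cmd = cmd.replace(f"${n}", v)
        expanded[key] = cmd
    return expanded


variants = expand_benchmark_vars("./awscurl -c 32 -N $CONCURRENCY ...",
                                 ["CONCURRENCY={1,2,4,8}"])
for key, cmd in variants.items():
    print(key, "->", cmd)
```

The resulting keys, for example `CONCURRENCY=4`, are the strings later passed to `record_benchmark.py --benchmark-vars`, so each variant is published under its own metric name.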
diff --git a/tests/integration/instant_benchmark.py b/tests/integration/instant_benchmark.py
index fc10fa5579..3d0f3ed0dc 100644
--- a/tests/integration/instant_benchmark.py
+++ b/tests/integration/instant_benchmark.py
@@ -29,6 +29,10 @@
                     required=False,
                     type=str,
                     help="The current instance name")
+parser.add_argument("--record",
+                    required=False,
+                    type=str,
+                    help="Place to record metrics")
 parser.add_argument("--job", required=False, type=str, help="The job string")
 
 args = parser.parse_args()
@@ -110,6 +114,7 @@ def parse_raw_template(url, override_container):
     commandline = []
     requirements = []
     vars = []
+    benchmark_vars = []
     info = None
     while iterator < len(lines):
         if '[test_name]' == lines[iterator]:
@@ -148,6 +153,12 @@ def parse_raw_template(url, override_container):
                     lines[iterator]):
                 vars.append(lines[iterator])
                 iterator += 1
+        elif '[benchmark_vars]' == lines[iterator]:
+            iterator += 1
+            while iterator < len(lines) and not is_square_bracket(
+                    lines[iterator]):
+                benchmark_vars.append(lines[iterator])
+                iterator += 1
         elif '[info]' == lines[iterator]:
             info = []
             iterator += 1
@@ -174,8 +185,14 @@
             if info is not None:
                 cur_result['info'] = info
             mul_results = multiply_template_with_vars(name, cur_result, vars)
+            # each of the replicated deployment options
             for r in mul_results.values():
-                r['awscurl'] = r['awscurl'].encode().hex()
+                replicated_awscurl = multiply_template_with_vars(
+                    '', {'awscurl': cur_result['awscurl']}, benchmark_vars)
+                for option in replicated_awscurl.keys():
+                    replicated_awscurl[option] = replicated_awscurl[option][
+                        'awscurl'].encode().hex()
+                r['awscurl'] = replicated_awscurl
             final_result.update(mul_results)
             name = ''
             container = None
@@ -219,23 +236,33 @@ def machine_translation(machine_name: str):
     return "lmi"
 
 
-def build_running_script(template, job, instance):
+def build_running_script(template, job, instance, record):
     with open(template) as f:
         template = json.load(f)
     job_template = template[job]
-    job_template['awscurl'] = bytes.fromhex(
-        job_template['awscurl']).decode("utf-8")
+    for key in job_template['awscurl'].keys():
+        job_template['awscurl'][key] = bytes.fromhex(
+            job_template['awscurl'][key]).decode("utf-8")
     write_model_artifacts(job_template['properties'],
                           job_template['requirements'],
                           job_template['env'])
-
     container = job_template['container']
+    benchmark_command = []
+    record_benchmark = ('python3 record_benchmark.py --template template.json '
+                        f'--job {job} --instance {instance} '
+                        f'--model models/test --record {record}')
+
+    for key, value in job_template['awscurl'].items():
+        benchmark_command.append(value)
+        benchmark_command.append(record_benchmark + f' --benchmark-vars "{key}"')
+
     bash_command = [
-        'set -euo pipefail', 'echo "Start Launching container..."',
+        'set -euo pipefail',
+        'echo "Start Launching container..."',
         f"docker pull {container}",
         f"./launch_container.sh {container} $PWD/models {machine_translation(instance)}",
-        job_template['awscurl'] + " | tee benchmark.log"
     ]
+    bash_command.extend(benchmark_command)
     with open("instant_benchmark.sh", "w") as f:
         f.write('\n'.join(bash_command))
 
@@ -249,7 +276,8 @@
         command = f"echo \"template={json.dumps(json.dumps(json.dumps(result)))}\" >> $GITHUB_OUTPUT"
         sp.call(command, shell=True)
     elif args.template and args.job and args.instance:
-        build_running_script(args.template, args.job, args.instance)
+        build_running_script(args.template, args.job, args.instance,
+                             args.record)
     else:
         parser.print_help()
         raise ValueError("args not supported")
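Two details of `build_running_script` are worth keeping in mind when reading the generated `instant_benchmark.sh`: the per-variant awscurl commands travel through the template as hex strings (so quoting survives the `$GITHUB_OUTPUT` round trip), and each decoded command is now followed by its own `record_benchmark.py` invocation instead of a single `tee benchmark.log`. A small self-contained sketch of that round trip; the command string, job name, and `CONCURRENCY=1` key below are made-up sample data, not values from the repository:

```python
# Sketch of the hex round-trip and command/record interleaving; sample data only.
awscurl_by_vars = {
    "CONCURRENCY=1": "TOKENIZER=... ./awscurl -c 32 -N 1 ... -o /tmp/output.txt",
}

# parse_raw_template side: hide each shell command inside a hex string
encoded = {key: cmd.encode().hex() for key, cmd in awscurl_by_vars.items()}

# build_running_script side: decode and pair every command with a record call
record = ("python3 record_benchmark.py --template template.json "
          "--job sample_job --instance g5.2xlarge "
          "--model models/test --record cloudwatch")
script_lines = []
for key, hex_cmd in encoded.items():
    script_lines.append(bytes.fromhex(hex_cmd).decode("utf-8"))
    script_lines.append(record + f' --benchmark-vars "{key}"')
print("\n".join(script_lines))
```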
diff --git a/tests/integration/record_benchmark.py b/tests/integration/record_benchmark.py
index 218ecb6c47..df6cdcaeba 100755
--- a/tests/integration/record_benchmark.py
+++ b/tests/integration/record_benchmark.py
@@ -37,6 +37,11 @@
                     required=False,
                     type=str,
                     help="The path to the model input directory")
+parser.add_argument("--benchmark-vars",
+                    required=False,
+                    type=str,
+                    help="The benchmark variables used to differentiate in"
+                    " cloudwatch like [CONCURRENCY=2,DATASET=gsm8k]")
 parser.add_argument("--info",
                     required=False,
                     type=str,
@@ -46,55 +51,61 @@
 
 data = {}
 
+cloudwatch_report_schema = {
+    "totalTimeMills": 'Milliseconds',
+    "totalRequests": 'Count',
+    "failedRequests": 'Count',
+    "concurrentClients": 'Count',
+    "totalTokens": 'Count',
+    "tokenPerRequest": 'Count',
+    "averageLatency": 'Milliseconds',
+    "p50Latency": 'Milliseconds',
+    "p90Latency": 'Milliseconds',
+    "p99Latency": 'Milliseconds',
+    "timeToFirstByte": 'Milliseconds',
+    "p50TimeToFirstByte": 'Milliseconds',
+    "p90TimeToFirstByte": 'Milliseconds',
+    "p99TimeToFirstByte": 'Milliseconds',
+}
+
 
 class Benchmark:
 
-    def __init__(self, dyn_resource):
+    def __init__(self, dyn_resource, data: dict):
         self.dyn_resource = dyn_resource
         self.table = dyn_resource.Table("RubikonBenchmarks")
         self.table.load()
+        self.data = data
 
     def add_benchmark(self):
-        self.table.put_item(Item=data)
+        self.table.put_item(Item=self.data)
 
 
-def record_table():
+def record_table(data: dict):
     table = boto3.resource("dynamodb").Table("RubikonBenchmarks")
     table.put_item(Item=data)
 
 
-def record_cloudwatch():
+def record_cloudwatch(data: dict):
     esc = lambda n: n.replace("/", "-").replace(".", "-").replace("=", "-"
                                                                   ).strip(' -')
     job_name = data["modelId"] if "job" not in data else data["job"]
-    metric_name = lambda n: f"lmi_{data['instance']}_{esc(data['image'])}_{esc(job_name)}_{n}"
-    metric_data = [
-        {
-            'MetricName': metric_name("throughput"),
-            'Unit': 'Count/Second',
-            'Value': data['throughput']
-        },
-        {
-            'MetricName': metric_name("latency_p50"),
-            'Unit': 'Milliseconds',
-            'Value': data['P50']
-        },
-        {
-            'MetricName': metric_name("latency_p90"),
-            'Unit': 'Milliseconds',
-            'Value': data['P90']
-        },
-        {
-            'MetricName': metric_name("latency_p99"),
-            'Unit': 'Milliseconds',
-            'Value': data['P99']
-        },
-    ]
+    benchmark_vars = data["benchmark_vars"] if data["benchmark_vars"] else ""
+    metric_name = lambda n: (f"lmi_{data['instance']}_{esc(data['image'])}"
+                             f"_{esc(job_name)}_{esc(benchmark_vars)}_{n}")
+    metric_data = []
+    for metric, unit in cloudwatch_report_schema.items():
+        if metric in data.keys():
+            metric_data.append({
+                'MetricName': metric_name(metric),
+                'Unit': unit,
+                'Value': data[metric]
+            })
     cw = boto3.client('cloudwatch', region_name='us-east-1')
     cw.put_metric_data(Namespace="LMI_Benchmark", MetricData=metric_data)
 
 
-def data_basic():
+def data_basic(data: dict):
     data["modelServer"] = "DJLServing"
     data["service"] = "ec2"
     data["Timestamp"] = Decimal(time.time())
@@ -110,33 +121,39 @@ def data_basic():
             data[split[0]] = split[1]
 
 
-def data_from_client():
-    with open("benchmark.log", "r") as f:
-        for line in f.readlines():
-            line = line.strip()
-            if "Total time:" in line:
-                data["totalTime"] = Decimal(line.split(" ")[2])
-            if "error rate:" in line:
-                data["errorRate"] = Decimal(line.split(" ")[-1])
-            if "Concurrent clients:" in line:
-                data["concurrency"] = int(line.split(" ")[2])
-            if "Total requests:" in line:
-                data["requests"] = int(line.split(" ")[2])
-            if "TPS:" in line:
-                data["tps"] = Decimal(line.split(" ")[1].split("/")[0])
-            if "Average Latency:" in line:
-                data["avgLatency"] = Decimal(line.split(" ")[2])
-            if "P50:" in line:
-                data["P50"] = Decimal(line.split(" ")[1])
-            if "P90:" in line:
-                data["P90"] = Decimal(line.split(" ")[1])
-            if "P99:" in line:
-                data["P99"] = Decimal(line.split(" ")[1])
-    if "totalTime" in data and "requests" in data:
-        data["throughput"] = data["requests"] / data["totalTime"]
-
-
-def data_container():
+def data_from_client(data: dict):
+    if os.path.exists("benchmark_result.json"):
+        with open("benchmark_result.json", "r") as f:
+            data.update(json.load(f))
+    elif os.path.exists("benchmark.log"):
+        with open("benchmark.log", "r") as f:
+            for line in f.readlines():
+                line = line.strip()
+                if "Total time:" in line:
+                    data["totalTime"] = Decimal(line.split(" ")[2])
+                if "error rate:" in line:
+                    data["errorRate"] = Decimal(line.split(" ")[-1])
+                if "Concurrent clients:" in line:
+                    data["concurrency"] = int(line.split(" ")[2])
+                if "Total requests:" in line:
+                    data["requests"] = int(line.split(" ")[2])
+                if "TPS:" in line:
+                    data["tps"] = Decimal(line.split(" ")[1].split("/")[0])
+                if "Average Latency:" in line:
+                    data["avgLatency"] = Decimal(line.split(" ")[2])
+                if "P50:" in line:
+                    data["P50"] = Decimal(line.split(" ")[1])
+                if "P90:" in line:
+                    data["P90"] = Decimal(line.split(" ")[1])
+                if "P99:" in line:
+                    data["P99"] = Decimal(line.split(" ")[1])
+        if "totalTime" in data and "requests" in data:
+            data["throughput"] = data["requests"] / data["totalTime"]
+    else:
+        print("There is no benchmark logs found!")
+
+
+def data_container(data: dict):
     if "container" in data:
         container = data["container"]
         if container.startswith("deepjavalibrary/djl-serving:"):
@@ -158,7 +175,7 @@ def data_container():
             data["tgiVersion"] = version
 
 
-def data_from_model_files():
+def data_from_model_files(data: dict):
     if args.model:
         propsPath = os.path.join(args.model, "serving.properties")
         if os.path.isfile(propsPath):
@@ -211,14 +228,15 @@ def data_from_model_files():
                     data["modelId"] = envs["MODEL_ID"]
 
 
-def data_from_template():
+def data_from_template(data: dict):
     if args.template:
         with open(args.template, "r") as f:
             template = json.load(f)
         job_template = template[args.job]
         data["job"] = args.job
+        data['benchmark_vars'] = args.benchmark_vars
         data["awscurl"] = bytes.fromhex(
-            job_template['awscurl']).decode("utf-8")
+            job_template['awscurl'][args.benchmark_vars]).decode("utf-8")
         if "container" not in data and "container" in job_template:
             data["container"] = job_template["container"]
         if "info" in job_template:
@@ -231,19 +249,19 @@
 
 
 if __name__ == "__main__":
-    data_from_template()
-    data_basic()
-    data_container()
-    data_from_client()
-    data_from_model_files()
+    data_from_template(data)
+    data_basic(data)
+    data_container(data)
+    data_from_client(data)
+    data_from_model_files(data)
 
     if "errorRate" not in data or data["errorRate"] == 100:
         print("Not recording failed benchmark")
         print(data)
     else:
         if args.record == "table":
-            record_table()
+            record_table(data)
         elif args.record == "cloudwatch":
-            record_cloudwatch()
+            record_cloudwatch(data)
         else:
             print(data)
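With this change, the CloudWatch path publishes one metric per `cloudwatch_report_schema` key that is present in the collected data, and `data_from_client` prefers to load that data from awscurl's `benchmark_result.json` (produced by the new `--json-path` flag in the nightly templates), falling back to parsing `benchmark.log`. A sketch of that selection logic with invented sample values; the metric prefix is made up, and the `put_metric_data` call is left commented out so the snippet runs without AWS credentials:

```python
# Sketch of the schema-driven metric selection; sample values are invented.
import json

cloudwatch_report_schema = {
    "totalRequests": "Count",
    "p50Latency": "Milliseconds",
    "p90Latency": "Milliseconds",
}

# Stand-in for the contents of benchmark_result.json merged into `data`.
sample_result = {
    "totalRequests": 320,
    "p50Latency": 812.4,
    "p90Latency": 1103.9,
    "unrelatedField": "ignored",  # not in the schema, so never published
}

metric_prefix = "lmi_g5-2xlarge_lmi-nightly_mistral_CONCURRENCY-4"
metric_data = [{
    "MetricName": f"{metric_prefix}_{metric}",
    "Unit": unit,
    "Value": sample_result[metric],
} for metric, unit in cloudwatch_report_schema.items() if metric in sample_result]

print(json.dumps(metric_data, indent=2))
# boto3.client("cloudwatch").put_metric_data(Namespace="LMI_Benchmark",
#                                            MetricData=metric_data)
```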