Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IB] support benchmark matrix #1913

Merged
merged 1 commit into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 23 additions & 14 deletions .github/workflows/instant_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@ on:
- g5.2xlarge
- g5.12xlarge
- g5.48xlarge
- g6.2xlarge
- g6.12xlarge
- g6.48xlarge
- g4dn.12xlarge
- g4dn.2xlarge
- p4d.24xlarge
- p4de.24xlarge
- p5.24xlarge
- inf2.8xlarge
- inf2.24xlarge
- trn1.2xlarge
Expand All @@ -34,6 +39,11 @@ on:
- none
- table
- cloudwatch
repo:
description: '[Do not change] The repo for runner registration'
required: false
type: string
default: 'djl-serving'
workflow_call:
inputs:
running_template:
Expand All @@ -54,6 +64,11 @@ on:
required: false
type: string
default: 'none'
repo:
description: 'The repo for runner registration'
required: false
type: string
default: 'djl-serving'

permissions:
id-token: write
Expand All @@ -68,10 +83,10 @@ jobs:
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
https://api.github.com/repos/deepjavalibrary/${{ inputs.repo }}/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_ib_${{ inputs.instance }} $token djl-serving
./start_instance.sh action_ib_${{ inputs.instance }} $token ${{ inputs.repo }}
outputs:
gpu_instance_id: ${{ steps.create_instance.outputs.action_ib_instance_id }}

Expand Down Expand Up @@ -127,26 +142,20 @@ jobs:
run: |
wget https://publish.djl.ai/awscurl/awscurl
chmod +x awscurl
- name: Run benchmark job
working-directory: tests/integration
run: |
echo "${{ needs.environment-setup.outputs.template }}" >> template.json
python3 instant_benchmark.py --template template.json \
--job ${{ matrix.job }} --instance ${{ inputs.instance }}

bash instant_benchmark.sh
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
aws-region: us-east-1
- name: Record benchmark job
if: ${{ inputs.record == 'table' || inputs.record == 'cloudwatch' }}
- name: Run benchmark job
working-directory: tests/integration
run: |
python3 record_benchmark.py --template template.json \
echo "${{ needs.environment-setup.outputs.template }}" >> template.json
python3 instant_benchmark.py --template template.json \
--job ${{ matrix.job }} --instance ${{ inputs.instance }} \
--model models/test --record ${{ inputs.record }}
--record ${{ inputs.record }}

bash instant_benchmark.sh
- name: Get serving logs
if: always()
working-directory: tests/integration
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/benchmark/nightly/g5-12xl.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ option.tensor_parallel_degree=max
TOKENIZER=TheBloke/Llama-2-7B-fp16 ./awscurl -c 32 -N 10 \
-X POST http://127.0.0.1:8080/invocations \
--connect-timeout 60 -H "Content-type: application/json" \
--json-path benchmark_result.json \
-d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
-t -o /tmp/output.txt
[test_name]
Expand All @@ -22,13 +23,13 @@ ENGINE={vllm,lmi-dist}
[container]
deepjavalibrary/djl-serving:lmi-nightly
[serving_properties]
engine=Python
option.rolling_batch=$ENGINE
option.model_id=s3://djl-llm/llama-3-8b-hf/
option.tensor_parallel_degree=max
[aws_curl]
TOKENIZER=TheBloke/Llama-2-13B-fp16 ./awscurl -c 32 -N 10 \
-X POST http://127.0.0.1:8080/invocations \
--json-path benchmark_result.json \
--connect-timeout 60 -H "Content-type: application/json" \
-d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
-t -o /tmp/output.txt
6 changes: 4 additions & 2 deletions tests/integration/benchmark/nightly/g5-2xl.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@
mistral
[vars]
ENGINE={vllm,lmi-dist}
[benchmark_vars]
CONCURRENCY={1,2,4,8}
[container]
deepjavalibrary/djl-serving:lmi-nightly
[serving_properties]
engine=Python
option.rolling_batch=$ENGINE
option.model_id=NousResearch/Hermes-2-Pro-Mistral-7B
option.tensor_parallel_degree=max
option.max_model_len=8192
[aws_curl]
TOKENIZER=TheBloke/Mistral-7B-Instruct-v0.2-AWQ ./awscurl -c 32 -N 10 \
TOKENIZER=TheBloke/Mistral-7B-Instruct-v0.2-AWQ ./awscurl -c 32 -N $CONCURRENCY \
-X POST http://127.0.0.1:8080/invocations \
--connect-timeout 60 -H "Content-type: application/json" \
--json-path benchmark_result.json \
-d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
-t -o /tmp/output.txt
2 changes: 1 addition & 1 deletion tests/integration/benchmark/nightly/g5-48xl.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ ENGINE={vllm,lmi-dist}
[container]
deepjavalibrary/djl-serving:lmi-nightly
[serving_properties]
engine=Python
option.rolling_batch=$ENGINE
option.model_id=s3://djl-llm/mixtral-8x7b
option.tensor_parallel_degree=max
[aws_curl]
TOKENIZER=NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO ./awscurl -c 32 -N 10 \
-X POST http://127.0.0.1:8080/invocations \
--json-path benchmark_result.json \
--connect-timeout 60 -H "Content-type: application/json" \
-d '{"inputs":"The new movie that got Oscar this year","parameters":{"max_new_tokens":256, "do_sample":true}}' \
-t -o /tmp/output.txt
47 changes: 39 additions & 8 deletions tests/integration/instant_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@
required=False,
type=str,
help="The current instance name")
parser.add_argument("--record",
required=False,
type=str,
help="Place to record metrics")

parser.add_argument("--job", required=False, type=str, help="The job string")
args = parser.parse_args()
Expand Down Expand Up @@ -110,6 +114,7 @@ def parse_raw_template(url, override_container):
commandline = []
requirements = []
vars = []
benchmark_vars = []
info = None
while iterator < len(lines):
if '[test_name]' == lines[iterator]:
Expand Down Expand Up @@ -148,6 +153,12 @@ def parse_raw_template(url, override_container):
lines[iterator]):
vars.append(lines[iterator])
iterator += 1
elif '[benchmark_vars]' == lines[iterator]:
iterator += 1
while iterator < len(lines) and not is_square_bracket(
lines[iterator]):
benchmark_vars.append(lines[iterator])
iterator += 1
elif '[info]' == lines[iterator]:
info = []
iterator += 1
Expand All @@ -174,13 +185,20 @@ def parse_raw_template(url, override_container):
if info is not None:
cur_result['info'] = info
mul_results = multiply_template_with_vars(name, cur_result, vars)
# each of the replicated deployment options
for r in mul_results.values():
r['awscurl'] = r['awscurl'].encode().hex()
replicated_awscurl = multiply_template_with_vars(
'', {'awscurl': cur_result['awscurl']}, benchmark_vars)
for option in replicated_awscurl.keys():
replicated_awscurl[option] = replicated_awscurl[option][
'awscurl'].encode().hex()
r['awscurl'] = replicated_awscurl
lanking520 marked this conversation as resolved.
Show resolved Hide resolved
final_result.update(mul_results)
name = ''
container = None
properties = []
env = []
benchmark_vars = []
commandline = []
requirements = []
vars = []
Expand Down Expand Up @@ -219,23 +237,35 @@ def machine_translation(machine_name: str):
return "lmi"


def build_running_script(template, job, instance, record):
    """Generate ``instant_benchmark.sh`` for a single benchmark job.

    Loads the hex-encoded job template produced by the template parser,
    writes the model artifacts, and emits a bash script that pulls and
    launches the serving container, then runs one awscurl benchmark round
    per replicated benchmark-var option, recording metrics after each round.

    Args:
        template: Path to the JSON template file on disk.
        job: Key of the job entry inside the template to run.
        instance: Instance name (e.g. ``g5.12xlarge``); used to select the
            container flavor via ``machine_translation``.
        record: Where to record metrics (e.g. ``none``/``table``/``cloudwatch``);
            forwarded to ``record_benchmark.py``.
    """
    with open(template) as f:
        template = json.load(f)
    job_template = template[job]
    # Each awscurl variant is stored hex-encoded (keyed by its
    # benchmark-vars option); decode each back to a shell command line.
    for key in job_template['awscurl'].keys():
        job_template['awscurl'][key] = bytes.fromhex(
            job_template['awscurl'][key]).decode("utf-8")
    write_model_artifacts(job_template['properties'],
                          job_template['requirements'], job_template['env'])

    container = job_template['container']

    benchmark_command = ['set -x']
    record_benchmark = ('python3 record_benchmark.py --template template.json '
                        f'--job {job} --instance {instance} '
                        f'--model models/test --record {record}')

    # One benchmark + record round per awscurl variant; stale result files
    # are removed first so each round records only its own output.
    # NOTE(review): unlike the single-command script this replaces, output is
    # no longer tee'd to benchmark.log — confirm record_benchmark.py only
    # needs benchmark_result.json (written via awscurl --json-path).
    for key, value in job_template['awscurl'].items():
        benchmark_command.append("rm -rf benchmark_result.json benchmark.log")
        benchmark_command.append(value)
        benchmark_command.append(record_benchmark +
                                 f' --benchmark-vars "{key}"')

    bash_command = [
        'set -euo pipefail',
        'echo "Start Launching container..."',
        f"docker pull {container}",
        f"./launch_container.sh {container} $PWD/models {machine_translation(instance)}",
    ]
    bash_command.extend(benchmark_command)
    with open("instant_benchmark.sh", "w") as f:
        f.write('\n'.join(bash_command))

Expand All @@ -249,7 +279,8 @@ def build_running_script(template, job, instance):
command = f"echo \"template={json.dumps(json.dumps(json.dumps(result)))}\" >> $GITHUB_OUTPUT"
sp.call(command, shell=True)
elif args.template and args.job and args.instance:
build_running_script(args.template, args.job, args.instance)
build_running_script(args.template, args.job, args.instance,
args.record)
else:
parser.print_help()
raise ValueError("args not supported")
Loading
Loading