diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index 52015d5f..e07a20cf 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -188,14 +188,31 @@ jobs:
           # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
           # and we know that it will be written into a directory created by `mktemp -d`.
           # Given this information, we can use the following command to find the file:
-          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
-          mv "${log_file}" training-log.jsonl
+          # More than one matching log can exist (one per training phase), and `find` returns
+          # matches in no guaranteed order, so sort by modification time (oldest first) before
+          # numbering them. NOTE(review): assumes phase 1 finishes writing before phase 2 - confirm.
+          phase_num=1
+          find /tmp/ -name "training_params_and_metrics_global0.jsonl" -printf '%T@ %p\n' \
+            | sort -n \
+            | cut -d' ' -f2- \
+            | while IFS= read -r log_file; do
+              mv "${log_file}" "phase-${phase_num}-training-log.jsonl"
+              phase_num=$((phase_num + 1))
+            done
+
+      - name: Upload training logs Phase 1
+        uses: actions/upload-artifact@v4
+        with:
+          name: phase-1-training-log.jsonl
+          path: ./instructlab/phase-1-training-log.jsonl
+          retention-days: 1
+          overwrite: true
 
-      - name: Upload training logs
+      - name: Upload training logs Phase 2
         uses: actions/upload-artifact@v4
         with:
-          name: training-log.jsonl
-          path: ./instructlab/training-log.jsonl
+          name: phase-2-training-log.jsonl
+          path: ./instructlab/phase-2-training-log.jsonl
           retention-days: 1
           overwrite: true
 
@@ -269,25 +286,41 @@ jobs:
           label: ${{ needs.start-large-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
 
-      - name: Download loss data
-        id: download-logs
+      - name: Download loss data Phase 1
+        id: phase-1-download-logs
         uses: actions/download-artifact@v4
         with:
-          name: training-log.jsonl
+          name: phase-1-training-log.jsonl
           path: downloaded-data
 
+      - name: Download loss data Phase 2
+        id: phase-2-download-logs
+        uses: actions/download-artifact@v4
+        with:
+          name: phase-2-training-log.jsonl
+          path: downloaded-data
+
+      - name: Checkout instructlab/training
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/training"
+          path: "training"
+          fetch-depth: 0
+
       - name: Install dependencies
+        working-directory: ./training
         run: |
+          python -m pip install --upgrade pip
           pip install -r requirements-dev.txt
-
-      - name: Try to upload to s3
-        id: upload-s3
+
+      - name: Try to upload Phase 1 to s3
+        id: phase-1-upload-s3
         continue-on-error: true
         run: |
-          output_file='./test.md'
-          python scripts/create-loss-graph.py \
-            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
-            --output-file "${output_file}" \
+          output_file_phase_1='./phase-1-test.md'
+          python training/scripts/create-loss-graph.py \
+            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+            --output-file "${output_file_phase_1}" \
             --aws-region "${{ vars.AWS_REGION }}" \
             --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
             --base-branch "${{ github.event.pull_request.base.ref }}" \
@@ -295,10 +328,43 @@ jobs:
             --head-sha "${{ github.event.pull_request.head.sha }}" \
             --origin-repository "${{ github.repository }}"
 
-      - name: Check S3 upload status
-        if: steps.upload-s3.outcome == 'failure'
+      - name: Try to upload Phase 2 to s3
+        id: phase-2-upload-s3
+        continue-on-error: true
+        run: |
+          output_file_phase_2='./phase-2-test.md'
+          python training/scripts/create-loss-graph.py \
+            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+            --output-file "${output_file_phase_2}" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${{ github.event.pull_request.base.ref }}" \
+            --pr-number "${{ github.event.pull_request.number }}" \
+            --head-sha "${{ github.event.pull_request.head.sha }}" \
+            --origin-repository "${{ github.repository }}"
+
+      - name: Report Phase 1 S3 upload success
+        if: steps.phase-1-upload-s3.outcome == 'success'
         run: |
-          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Uploaded Phase 1 loss graph to S3."
+          # Shell variables do not persist between steps, so reference the upload step's output file by path.
+          cat './phase-1-test.md' >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Report Phase 2 S3 upload success
+        if: steps.phase-2-upload-s3.outcome == 'success'
+        run: |
+          echo "Uploaded Phase 2 loss graph to S3."
+          # Shell variables do not persist between steps, so reference the upload step's output file by path.
+          cat './phase-2-test.md' >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 1 S3 upload status
+        if: steps.phase-1-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 2 S3 upload status
+        if: steps.phase-2-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
           echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
-
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
\ No newline at end of file