Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCm] Update ROCm and MIGraphX CI pipeline to ROCm5.3 #13257

Merged
merged 3 commits into from
Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ trigger: none
name: 'linux_ci_$(Date:yyyyMMdd)_$(Rev:r)'
jobs:
- job: AMDMIGraphX_CI
workspace:
clean: all
pool: 'AMD-GPU'
timeoutInMinutes: 180

Expand All @@ -13,7 +15,7 @@ jobs:
- name: render
value: 109
- name: RocmVersion
value: 5.2.3
value: 5.3

steps:
- checkout: self
Expand All @@ -24,12 +26,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/pai/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/pai
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
inputs:
script: rm -rf $(Build.BinariesDirectory)/*
displayName: 'Clean Build.BinariesDirectory'
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
inputs:
Expand All @@ -47,14 +44,14 @@ jobs:
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /onnxruntime_src \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
python tools/ci_build/build.py \
--config RelWithDebInfo \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
--mpi_home /opt/ompi \
--use_migraphx \
--rocm_version=5.2.3 \
--rocm_version=$(RocmVersion) \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--update \
Expand Down Expand Up @@ -83,7 +80,7 @@ jobs:
--volume $(Build.SourcesDirectory):/onnxruntime_src \
--volume $(Build.BinariesDirectory):/build \
--workdir /build/RelWithDebInfo \
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion) \
/onnxruntime_src/tools/ci_build/github/pai/migraphx_test_launcher.sh
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Run onnxruntime unit tests'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ trigger: none
name: 'orttraining_ci_$(Date:yyyyMMdd)_$(Rev:r)'
jobs:
- job: AMD_CI
workspace:
clean: all
pool: 'AMD-GPU'
timeoutInMinutes: 150

Expand All @@ -14,10 +16,12 @@ jobs:
value: 109
- name: onnxruntimeBuildSucceeded
value: false
- name: RocmVersion
value: 5.3

# generated from tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
container:
image: onnxruntimecibuildenvironment.azurecr.io/rocm-ci-pipeline-env:rocm5.2.3
image: onnxruntimecibuildenvironment.azurecr.io/rocm-ci-pipeline-env:rocm$(RocmVersion)
endpoint: onnxruntimecibuildenvironmentforamd
options: --privileged -e HIP_VISIBLE_DEVICES --security-opt seccomp=unconfined --shm-size=1024m --device=/dev/kfd --device=/dev/dri --group-add $(video) --group-add $(render)

Expand Down Expand Up @@ -54,7 +58,7 @@ jobs:
--cmake_extra_defines \
CMAKE_HIP_COMPILER=${ROCM_HOME}/llvm/bin/clang++ \
--use_rocm \
--rocm_version=5.2.3 \
--rocm_version=$(RocmVersion) \
--rocm_home ${ROCM_HOME} \
--nccl_home ${ROCM_HOME}\
--update \
Expand Down Expand Up @@ -90,7 +94,7 @@ jobs:
script: |-
cd ./build/RelWithDebInfo
export PYTHONPATH=$PWD
export ORTMODULE_ONNX_OPSET_VERSION=14
export ORTMODULE_ONNX_OPSET_VERSION=15
python \
/stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
--model_name_or_path bert-large-uncased \
Expand All @@ -108,7 +112,7 @@ jobs:
--skip_memory_metrics
python ../../orttraining/tools/ci_test/compare_huggingface.py \
ci-pipeline-actual.json \
../../orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm5.2.3.json
../../orttraining/tools/ci_test/results/ci-mi100.huggingface.bert-large-rocm$(RocmVersion).json
displayName: 'Run Python Hugging-Face BERT-L test'
retryCountOnTaskFailure: 1
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded
Expand All @@ -118,7 +122,7 @@ jobs:
script: |-
cd ./build/RelWithDebInfo
export PYTHONPATH=$PWD
export ORTMODULE_ONNX_OPSET_VERSION=14
export ORTMODULE_ONNX_OPSET_VERSION=15
python \
/stage/huggingface-transformers/examples/pytorch/language-modeling/run_clm.py \
--model_name_or_path gpt2 \
Expand All @@ -137,7 +141,7 @@ jobs:
--skip_memory_metrics
python ../../orttraining/tools/ci_test/compare_huggingface.py \
ci-pipeline-actual.json \
../../orttraining/tools/ci_test/results/ci-mi100.huggingface.gpt2-rocm5.2.3.json
../../orttraining/tools/ci_test/results/ci-mi100.huggingface.gpt2-rocm$(RocmVersion).json
displayName: 'Run Python Hugging-Face GPT2 test'
retryCountOnTaskFailure: 1
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded
Expand Down Expand Up @@ -202,7 +206,7 @@ jobs:
script: |-
cd ./build/RelWithDebInfo
export PYTHONPATH=$PWD
export ORTMODULE_ONNX_OPSET_VERSION=14
export ORTMODULE_ONNX_OPSET_VERSION=15
python \
/stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \
--model_name_or_path distilbert-base-uncased \
Expand All @@ -220,7 +224,7 @@ jobs:
--skip_memory_metrics
python ../../orttraining/tools/ci_test/compare_huggingface.py \
ci-pipeline-actual.json \
../../orttraining/tools/ci_test/results/ci-mi100.huggingface.distilbert-base-rocm5.2.3.json
../../orttraining/tools/ci_test/results/ci-mi100.huggingface.distilbert-base-rocm$(RocmVersion).json
displayName: 'Run Python Hugging-Face DistilBERT test'
retryCountOnTaskFailure: 1
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true')) # ensure all tests are run when the build succeeded
Expand Down Expand Up @@ -326,18 +330,19 @@ jobs:
sudo apt-get update
sudo apt install -y cifs-utils
displayName: 'Install filesystems util'
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))

- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist"
displayName: 'Mount MNIST'
condition: succeededOrFailed()
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))

- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data"
displayName: 'Mount bert-data'
condition: succeededOrFailed()
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))

- bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache"
displayName: 'Mount hf-models-cache'
condition: succeededOrFailed()
condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))


# Entry point for all ORTModule tests
Expand All @@ -362,4 +367,4 @@ jobs:
# --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw \
# --transformers_cache /hf_models_cache/huggingface/transformers
# displayName: 'Run orttraining_ortmodule_tests.py'
# condition: succeededOrFailed()
# condition: and(succeededOrFailed(), eq(variables.onnxruntimeBuildSucceeded, 'true'))
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rocm/pytorch:rocm5.2.3_ubuntu20.04_py3.7_pytorch_1.12.1
FROM rocm/pytorch:rocm5.3_ubuntu20.04_py3.7_pytorch_1.12.1

ENV DEBIAN_FRONTEND noninteractive
ENV MIGRAPHX_DISABLE_FAST_GELU=1
Expand Down
2 changes: 1 addition & 1 deletion tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM rocm/pytorch:rocm5.2.3_ubuntu20.04_py3.7_pytorch_1.12.1
FROM rocm/pytorch:rocm5.3_ubuntu20.04_py3.7_pytorch_1.12.1

WORKDIR /stage

Expand Down