From 71830815957d4b6a07ecb0f20980ea2875d1ae73 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Thu, 5 Oct 2023 20:51:15 +0900 Subject: [PATCH] Build MXJob examples in CI (#1927) Signed-off-by: Yuki Iwai --- .github/workflows/publish-example-images.yaml | 9 +-- .../dist_gpu_v1.yaml} | 0 examples/mxnet/mxjob_dist_v1.yaml | 6 +- examples/mxnet/train/Dockerfile | 9 +-- examples/mxnet/train/mx_job_dist_gpu_v1.yaml | 36 ----------- examples/mxnet/tune/Dockerfile | 62 ++----------------- examples/mxnet/tune/mx_job_tune_gpu_v1.yaml | 22 ++++--- 7 files changed, 32 insertions(+), 112 deletions(-) rename examples/mxnet/{train/byteps_dist_gpu_v1.yaml => byteps/dist_gpu_v1.yaml} (100%) delete mode 100755 examples/mxnet/train/mx_job_dist_gpu_v1.yaml diff --git a/.github/workflows/publish-example-images.yaml b/.github/workflows/publish-example-images.yaml index 3248ec19b9..616c2f1072 100644 --- a/.github/workflows/publish-example-images.yaml +++ b/.github/workflows/publish-example-images.yaml @@ -47,12 +47,13 @@ jobs: - component-name: xgboost-dist-rabit-test dockerfile: examples/xgboost/smoke-dist/Dockerfile context: examples/xgboost/smoke-dist + - component-name: mxnet-gpu + dockerfile: examples/mxnet/train/Dockerfile + - component-name: mxnet-auto-tuning + dockerfile: examples/mxnet/tune/Dockerfile + context: examples/mxnet/tune # TODO (tenzen-y): Fix the below broken Dockerfiles -# - component-name: mxnet-gpu -# dockerfile: examples/mxnet/train/Dockerfile -# - component-name: mxnet-auto-tuning -# dockerfile: examples/mxnet/tune/Dockerfile # - component-name: pytorch-dist-mnist-mpi # dockerfile: examples/pytorch/mnist/Dockerfile-mpi # - component-name: pytorch-dist-mnist diff --git a/examples/mxnet/train/byteps_dist_gpu_v1.yaml b/examples/mxnet/byteps/dist_gpu_v1.yaml similarity index 100% rename from examples/mxnet/train/byteps_dist_gpu_v1.yaml rename to examples/mxnet/byteps/dist_gpu_v1.yaml diff --git a/examples/mxnet/mxjob_dist_v1.yaml b/examples/mxnet/mxjob_dist_v1.yaml index f54db4d6d5..3e117af007 100644 --- a/examples/mxnet/mxjob_dist_v1.yaml +++ b/examples/mxnet/mxjob_dist_v1.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: mxnet - image: mxjob/mxnet:gpu + image: kubeflow/mxnet-gpu:latest ports: - containerPort: 9991 name: mxjob-port @@ -23,7 +23,7 @@ spec: spec: containers: - name: mxnet - image: mxjob/mxnet:gpu + image: kubeflow/mxnet-gpu:latest ports: - containerPort: 9991 name: mxjob-port @@ -34,7 +34,7 @@ spec: spec: containers: - name: mxnet - image: mxjob/mxnet:gpu + image: kubeflow/mxnet-gpu:latest command: ["python"] args: ["/incubator-mxnet/example/image-classification/train_mnist.py","--num-epochs","10","--num-layers","2","--kv-store","dist_device_sync","--gpus","0"] resources: diff --git a/examples/mxnet/train/Dockerfile b/examples/mxnet/train/Dockerfile index 2c1c7d9cc0..dc6af9a96b 100755 --- a/examples/mxnet/train/Dockerfile +++ b/examples/mxnet/train/Dockerfile @@ -1,7 +1,8 @@ -FROM mxnet/python:gpu +FROM mxnet/python:1.9.1_gpu_cu112_py3 -RUN apt-get update && \ +RUN apt-get update -y && \ apt-get install -y git && \ - git clone https://github.com/apache/incubator-mxnet.git -b v1.6.x + git clone https://github.com/apache/mxnet.git -b v1.9.x && \ + rm -rf /var/lib/apt/lists/* -ENTRYPOINT ["python", "/incubator-mxnet/example/image-classification/train_mnist.py"] +ENTRYPOINT ["python3", "/mxnet/mxnet/example/image-classification/train_mnist.py"] diff --git a/examples/mxnet/train/mx_job_dist_gpu_v1.yaml b/examples/mxnet/train/mx_job_dist_gpu_v1.yaml deleted file mode 100755 index a9c6c0894d..0000000000 --- a/examples/mxnet/train/mx_job_dist_gpu_v1.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: "kubeflow.org/v1" -kind: "MXJob" -metadata: - name: "mxnet-job" -spec: - jobMode: MXTrain - mxReplicaSpecs: - Scheduler: - replicas: 1 - restartPolicy: Never - template: - spec: - containers: - - name: mxnet - image: mxjob/mxnet:gpu - Server: - replicas: 1 - restartPolicy: Never - template: - spec: - containers: - - name: mxnet - image: mxjob/mxnet:gpu - Worker: - replicas: 1 - restartPolicy: Never - template: - spec: - containers: - - name: mxnet - image: mxjob/mxnet:gpu - command: ["python"] - args: ["/incubator-mxnet/example/image-classification/train_mnist.py","--num-epochs","10","--num-layers","2","--kv-store","dist_device_sync","--gpus","0"] - resources: - limits: - nvidia.com/gpu: 1 diff --git a/examples/mxnet/tune/Dockerfile b/examples/mxnet/tune/Dockerfile index 7e7dc53372..9ae7c2e98a 100644 --- a/examples/mxnet/tune/Dockerfile +++ b/examples/mxnet/tune/Dockerfile @@ -1,61 +1,7 @@ -FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 - -# Download usefull tools and mxnet, tvm -WORKDIR /home/scripts -RUN apt-get update && apt-get install -y git vim cmake wget sed && \ - git clone --recursive https://github.com/dmlc/tvm && \ - git clone --recursive https://github.com/apache/incubator-mxnet mxnet - -# Download necessary dependence -RUN apt-get update && \ - apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev && \ - apt-get install -y python3-pip && \ - apt-get install -y build-essential - -# mxnet dependence -RUN apt-get install -y libopenblas-dev liblapack-dev && \ - apt-get install -y libopencv-dev - -# tvm dependence -RUN pip3 install --user numpy decorator && \ - pip3 install --user tornado psutil xgboost - -# get llvm 4.0.0 for tvm -RUN wget http://releases.llvm.org/4.0.0/clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ - tar -xf clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ - mv clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04 llvm && \ - rm clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz - -# Compile mxnet -RUN cd mxnet && \ - make clean && \ - make -j $(nproc) USE_OPENCV=1 USE_BLAS=openblas USE_DIST_KVSTORE=1 USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 - -# Install mxnet -RUN cd mxnet/python && \ - pip3 install -e . - -# Compile tvm -RUN cd tvm && \ - mkdir build && \ - cp cmake/config.cmake build && \ - cd build && \ - sed -i 's/set(USE_CUDA OFF)/set(USE_CUDA ON)/g' config.cmake && \ - sed -i 's/set(USE_CUDNN OFF)/set(USE_CUDNN ON)/g' config.cmake && \ - sed -i 's/set(USE_CUBLAS OFF)/set(USE_CUBLAS ON)/g' config.cmake && \ - sed -i 's/set(USE_LLVM OFF)/set(USE_LLVM ..\/..\/llvm\/bin\/llvm-config)/g' config.cmake && \ - cmake .. && \ - make -j $(nproc) - -# Install tvm -RUN cd tvm && \ - cd python; python3 setup.py install --user; cd .. && \ - cd topi/python; python3 setup.py install --user; cd ../.. && \ - cd nnvm/python; python3 setup.py install --user; cd ../.. +FROM nvcr.io/nvidia/mxnet:23.09-py3 # COPY custom code to container -COPY start-job.py . -COPY auto-tuning.py . +COPY start-job.py / +COPY auto-tuning.py / -# Change working path -WORKDIR /home/log +ENTRYPOINT ["python3", "/start-job.py"] diff --git a/examples/mxnet/tune/mx_job_tune_gpu_v1.yaml b/examples/mxnet/tune/mx_job_tune_gpu_v1.yaml index eac02ad3da..749c4317c3 100644 --- a/examples/mxnet/tune/mx_job_tune_gpu_v1.yaml +++ b/examples/mxnet/tune/mx_job_tune_gpu_v1.yaml @@ -12,23 +12,28 @@ spec: spec: containers: - name: mxnet - image: mxjob/auto-tuning:gpu + image: kubeflow/mxnet-auto-tuning:latest command: ["python3"] - args: ["/home/scripts/start-job.py"] + args: ["/start-job.py"] + ports: + - containerPort: 9991 + name: mxjob-port TunerServer: - label: 2080ti replicas: 1 restartPolicy: Never template: spec: containers: - name: mxnet - image: mxjob/auto-tuning:gpu + image: kubeflow/mxnet-auto-tuning:latest command: ["python3"] - args: ["/home/scripts/start-job.py"] + args: ["/start-job.py"] resources: limits: nvidia.com/gpu: 1 + ports: + - containerPort: 9991 + name: mxjob-port Tuner: replicas: 1 restartPolicy: Never @@ -36,6 +41,9 @@ spec: spec: containers: - name: mxnet - image: mxjob/auto-tuning:gpu + image: kubeflow/mxnet-auto-tuning:latest command: ["python3"] - args: ["/home/scripts/start-job.py"] + args: ["/start-job.py"] + ports: + - containerPort: 9991 + name: mxjob-port