Build MXJob examples in CI (#1927)
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
tenzen-y authored Oct 5, 2023
1 parent 52ccd67 commit 7183081
Showing 7 changed files with 32 additions and 112 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/publish-example-images.yaml
@@ -47,12 +47,13 @@ jobs:
- component-name: xgboost-dist-rabit-test
dockerfile: examples/xgboost/smoke-dist/Dockerfile
context: examples/xgboost/smoke-dist
- component-name: mxnet-gpu
dockerfile: examples/mxnet/train/Dockerfile
- component-name: mxnet-auto-tuning
dockerfile: examples/mxnet/tune/Dockerfile
context: examples/mxnet/tune

# TODO (tenzen-y): Fix the below broken Dockerfiles
# - component-name: mxnet-gpu
# dockerfile: examples/mxnet/train/Dockerfile
# - component-name: mxnet-auto-tuning
# dockerfile: examples/mxnet/tune/Dockerfile
# - component-name: pytorch-dist-mnist-mpi
# dockerfile: examples/pytorch/mnist/Dockerfile-mpi
# - component-name: pytorch-dist-mnist
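
Note: each matrix entry above maps to a docker build invocation in the workflow. A minimal local sketch for the newly enabled mxnet-auto-tuning entry, assuming the Docker CLI is available and the repository root is the working directory; the image tag here is illustrative, the tag actually pushed is determined by the CI workflow:

docker build \
  -f examples/mxnet/tune/Dockerfile \
  -t kubeflow/mxnet-auto-tuning:latest \
  examples/mxnet/tune
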
File renamed without changes.
6 changes: 3 additions & 3 deletions examples/mxnet/mxjob_dist_v1.yaml
@@ -12,7 +12,7 @@ spec:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
image: kubeflow/mxnet-gpu:latest
ports:
- containerPort: 9991
name: mxjob-port
@@ -23,7 +23,7 @@ spec:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
image: kubeflow/mxnet-gpu:latest
ports:
- containerPort: 9991
name: mxjob-port
@@ -34,7 +34,7 @@ spec:
spec:
containers:
- name: mxnet
image: mxjob/mxnet:gpu
image: kubeflow/mxnet-gpu:latest
command: ["python"]
args: ["/incubator-mxnet/example/image-classification/train_mnist.py","--num-epochs","10","--num-layers","2","--kv-store","dist_device_sync","--gpus","0"]
resources:
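
Note: with the example image switched to the published kubeflow/mxnet-gpu:latest tag, the manifest can be submitted as-is. A minimal sketch, assuming a cluster that already has the training operator and the MXJob CRD installed:

kubectl apply -f examples/mxnet/mxjob_dist_v1.yaml
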
9 changes: 5 additions & 4 deletions examples/mxnet/train/Dockerfile
@@ -1,7 +1,8 @@
FROM mxnet/python:gpu
FROM mxnet/python:1.9.1_gpu_cu112_py3

RUN apt-get update && \
RUN apt-get update -y && \
apt-get install -y git && \
git clone https://github.com/apache/incubator-mxnet.git -b v1.6.x
git clone https://github.com/apache/mxnet.git -b v1.9.x && \
rm -rf /var/lib/apt/lists/*

ENTRYPOINT ["python", "/incubator-mxnet/example/image-classification/train_mnist.py"]
ENTRYPOINT ["python3", "/mxnet/mxnet/example/image-classification/train_mnist.py"]
36 changes: 0 additions & 36 deletions examples/mxnet/train/mx_job_dist_gpu_v1.yaml

This file was deleted.

62 changes: 4 additions & 58 deletions examples/mxnet/tune/Dockerfile
@@ -1,61 +1,7 @@
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04

# Download usefull tools and mxnet, tvm
WORKDIR /home/scripts
RUN apt-get update && apt-get install -y git vim cmake wget sed && \
git clone --recursive https://github.com/dmlc/tvm && \
git clone --recursive https://github.com/apache/incubator-mxnet mxnet

# Download necessary dependence
RUN apt-get update && \
apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev && \
apt-get install -y python3-pip && \
apt-get install -y build-essential

# mxnet dependence
RUN apt-get install -y libopenblas-dev liblapack-dev && \
apt-get install -y libopencv-dev

# tvm dependence
RUN pip3 install --user numpy decorator && \
pip3 install --user tornado psutil xgboost

# get llvm 4.0.0 for tvm
RUN wget http://releases.llvm.org/4.0.0/clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \
tar -xf clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \
mv clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04 llvm && \
rm clang+llvm-4.0.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz

# Compile mxnet
RUN cd mxnet && \
make clean && \
make -j $(nproc) USE_OPENCV=1 USE_BLAS=openblas USE_DIST_KVSTORE=1 USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1

# Install mxnet
RUN cd mxnet/python && \
pip3 install -e .

# Compile tvm
RUN cd tvm && \
mkdir build && \
cp cmake/config.cmake build && \
cd build && \
sed -i 's/set(USE_CUDA OFF)/set(USE_CUDA ON)/g' config.cmake && \
sed -i 's/set(USE_CUDNN OFF)/set(USE_CUDNN ON)/g' config.cmake && \
sed -i 's/set(USE_CUBLAS OFF)/set(USE_CUBLAS ON)/g' config.cmake && \
sed -i 's/set(USE_LLVM OFF)/set(USE_LLVM ..\/..\/llvm\/bin\/llvm-config)/g' config.cmake && \
cmake .. && \
make -j $(nproc)

# Install tvm
RUN cd tvm && \
cd python; python3 setup.py install --user; cd .. && \
cd topi/python; python3 setup.py install --user; cd ../.. && \
cd nnvm/python; python3 setup.py install --user; cd ../..
FROM nvcr.io/nvidia/mxnet:23.09-py3

# COPY custom code to container
COPY start-job.py .
COPY auto-tuning.py .
COPY start-job.py /
COPY auto-tuning.py /

# Change working path
WORKDIR /home/log
ENTRYPOINT ["python3", "/start-job.py"]
22 changes: 15 additions & 7 deletions examples/mxnet/tune/mx_job_tune_gpu_v1.yaml
@@ -12,30 +12,38 @@ spec:
spec:
containers:
- name: mxnet
image: mxjob/auto-tuning:gpu
image: kubeflow/mxnet-auto-tuning:latest
command: ["python3"]
args: ["/home/scripts/start-job.py"]
args: ["/start-job.py"]
ports:
- containerPort: 9991
name: mxjob-port
TunerServer:
label: 2080ti
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/auto-tuning:gpu
image: kubeflow/mxnet-auto-tuning:latest
command: ["python3"]
args: ["/home/scripts/start-job.py"]
args: ["/start-job.py"]
resources:
limits:
nvidia.com/gpu: 1
ports:
- containerPort: 9991
name: mxjob-port
Tuner:
replicas: 1
restartPolicy: Never
template:
spec:
containers:
- name: mxnet
image: mxjob/auto-tuning:gpu
image: kubeflow/mxnet-auto-tuning:latest
command: ["python3"]
args: ["/home/scripts/start-job.py"]
args: ["/start-job.py"]
ports:
- containerPort: 9991
name: mxjob-port
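
Note: a minimal sketch for running the updated auto-tuning example, assuming a cluster with the training operator and the MXJob CRD installed; mxjobs is assumed to be the CRD's plural resource name as defined by the operator:

kubectl apply -f examples/mxnet/tune/mx_job_tune_gpu_v1.yaml
kubectl get mxjobs -w
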
