Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

[NCCL] - Various improvements #13

Open
wants to merge 26 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 70 additions & 47 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,47 @@ concurrency:
group: build-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build-dependencies:
name: Build the FlexFlow dependencies
download-nccl:
name: Download and Package NCCL
strategy:
matrix:
os: [ubuntu-18.04, ubuntu-20.04]
fail-fast: false
runs-on: ${{ matrix.os }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3

- name: Free additional space on runner
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Download NCCL
run: .github/workflows/helpers/download_nccl.sh

- name: Prepare library files
working-directory: nccl_downloads
run: |
for folder in *; do
if [ -d "$folder" ]; then
cd $folder
export NCCL_TARBALL="nccl_${{ matrix.os }}_${folder}.tar.gz"
echo "Creating archive $NCCL_TARBALL"
tar -zcvf $NCCL_TARBALL nccl
echo "Checking the size of the NCCL tarball..."
du -h $NCCL_TARBALL
mv $NCCL_TARBALL ../
cd ..
fi
done

- name: Archive compiled NCCL libraries
uses: actions/upload-artifact@v3
with:
name: nccl_${{ matrix.os }}
path: nccl_downloads/*.tar.gz

build-legion:
name: Build Legion
strategy:
matrix:
os: [ubuntu-18.04, ubuntu-20.04]
Expand All @@ -23,17 +62,15 @@ jobs:
"10.2.89",
"11.0.3",
"11.1.1",
"11.2.2",
"11.3.1",
"11.4.3",
"11.5.2",
"11.6.2",
"11.2.0",
"11.3.0",
"11.4.0",
"11.5.0",
"11.6.0",
"11.7.0",
]
gpu_backend: [cuda, hip_rocm]
# uncomment the line below (and related ones) to build nccl, legion in parallel. Because
# git only supports up to 20 jobs in parallel, building in parallel is currently not needed.
#dependency: ["nccl", "legion"]
python_version: ["3.7", "3.8", "3.9", "3.10"]
exclude:
- os: ubuntu-20.04
cuda_version: "10.1.243"
Expand All @@ -48,15 +85,15 @@ jobs:
gpu_backend: "hip_rocm"
- cuda_version: "11.0.3"
gpu_backend: "hip_rocm"
- cuda_version: "11.2.2"
- cuda_version: "11.2.0"
gpu_backend: "hip_rocm"
- cuda_version: "11.3.1"
- cuda_version: "11.3.0"
gpu_backend: "hip_rocm"
- cuda_version: "11.4.3"
- cuda_version: "11.4.0"
gpu_backend: "hip_rocm"
- cuda_version: "11.5.2"
- cuda_version: "11.5.0"
gpu_backend: "hip_rocm"
- cuda_version: "11.6.2"
- cuda_version: "11.6.0"
gpu_backend: "hip_rocm"
- cuda_version: "11.7.0"
gpu_backend: "hip_rocm"
Expand All @@ -82,11 +119,11 @@ jobs:
env:
CUDA_VERSION: ${{ matrix.cuda_version }}
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
PY_VERSION: ${{ matrix.python_version }}
run: .github/workflows/helpers/install_dependencies.sh

- name: Build NCCL/Legion
- name: Build Legion
env:
#DEPENDENCY: ${{ matrix.dependency }}
CUDA_VERSION: ${{ matrix.cuda_version }}
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
run: |
Expand All @@ -95,9 +132,7 @@ jobs:
export CUDA_DIR=/usr/local/cuda

export FF_BUILD_LEGION=ON
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export FF_USE_NCCL=ON
fi
export FF_USE_NCCL=OFF

cores_available=$(nproc --all)
n_build_cores=$(( cores_available -1 ))
Expand All @@ -112,55 +147,43 @@ jobs:
- name: Prepare library files
env:
FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
CUDA_VERSION: ${{ matrix.cuda_version }}
run: |
# Remove unnecessary files
echo "Removing unnecessary files..."
rm -rf build/deps/nccl/obj build/deps/nccl/src build/deps/nccl/tmp
rm -f build/export/legion/lib/libflexflow.so
export CUDA_VERSION_MAJOR="${CUDA_VERSION:0:4}"
echo "CUDA_VERSION_MAJOR=${CUDA_VERSION:0:4}" >> $GITHUB_ENV

if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
export NCCL_TARBALL="nccl_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz"
export LEGION_TARBALL="legion_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz"

# Only build NCCL tarball for CUDA backends
echo "Creating archive $NCCL_TARBALL"
tar -zcvf $NCCL_TARBALL build/deps/nccl/
echo "Checking the size of the NCCL tarball..."
du -h $NCCL_TARBALL
export LEGION_TARBALL="legion_${{ matrix.os }}_cuda-${CUDA_VERSION_MAJOR}_python${{ matrix.python_version }}.tar.gz"
else
export LEGION_TARBALL="legion_${{ matrix.os }}_${{ matrix.gpu_backend }}.tar.gz"
export LEGION_TARBALL="legion_${{ matrix.os }}_${{ matrix.gpu_backend }}_python${{ matrix.python_version }}.tar.gz"
fi

echo "Creating archive $LEGION_TARBALL"
tar -zcvf $LEGION_TARBALL build/export/legion/
cd build/export
tar -zcvf $LEGION_TARBALL legion
echo "Checking the size of the Legion tarball..."
du -h $LEGION_TARBALL
mv $LEGION_TARBALL ../../
cd ../../

- name: Archive compiled Legion library (CUDA)
uses: actions/upload-artifact@v3
if: ${{ matrix.gpu_backend == 'cuda' }}
with:
name: legion_${{ matrix.os }}_${{ matrix.cuda_version }}
path: legion_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz
name: legion_${{ matrix.os }}_cuda-${{ env.CUDA_VERSION_MAJOR }}_python${{ matrix.python_version }}
path: legion_${{ matrix.os }}_cuda-${{ env.CUDA_VERSION_MAJOR }}_python${{ matrix.python_version }}.tar.gz

- name: Archive compiled Legion library (HIP)
uses: actions/upload-artifact@v3
if: ${{ matrix.gpu_backend != 'cuda' }}
with:
name: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}
path: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}.tar.gz

- name: Archive compiled NCCL library (CUDA)
uses: actions/upload-artifact@v3
if: ${{ matrix.gpu_backend == 'cuda' }}
with:
name: nccl_${{ matrix.os }}_${{ matrix.cuda_version }}
path: nccl_${{ matrix.os }}_${{ matrix.cuda_version }}.tar.gz
name: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}_python${{ matrix.python_version }}
path: legion_${{ matrix.os }}_${{ matrix.gpu_backend }}_python${{ matrix.python_version }}.tar.gz

notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
needs: build-dependencies
needs: [download-nccl, build-legion]
if: ${{ failure() && github.event_name == 'schedule' }}
steps:
- name: Send Slack message
Expand All @@ -172,7 +195,7 @@ jobs:
create-release:
name: Create new release
runs-on: ubuntu-20.04
needs: build-dependencies
needs: [download-nccl, build-legion]
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
Expand Down
67 changes: 67 additions & 0 deletions .github/workflows/helpers/download_nccl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
# Download NVIDIA's prebuilt NCCL packages (libnccl2 + libnccl-dev) for every
# CUDA version listed below and unpack each of them into
#   nccl_downloads/cuda-<cuda version>/nccl/{include,lib}
# relative to the directory the script is started from.
# Only Ubuntu 18.04 and 20.04 are handled.
set -euo pipefail
set -x

mkdir -p nccl_downloads
cd nccl_downloads

# Ubuntu release with the dot removed, e.g. "20.04" -> "2004".
ubuntu_version=$(lsb_release -rs)
ubuntu_version=${ubuntu_version//./}

# Fail early with a clear message instead of falling through with nothing
# downloaded and tripping over an unmatched "*.deb" glob further down.
case "$ubuntu_version" in
  1804 | 2004) ;;
  *)
    echo "Unsupported Ubuntu release: ${ubuntu_version}" >&2
    exit 1
    ;;
esac

# Register NVIDIA's CUDA apt repository for this release.
keyring_deb="cuda-keyring_1.0-1_all.deb"
wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/${keyring_deb}"
sudo dpkg -i "$keyring_deb"
sudo apt-get update -y
rm -f "$keyring_deb"

# "<NCCL version>+cuda<CUDA version>" builds downloaded on both releases.
nccl_builds=(
  "2.15.5-1+cuda11.0"
  "2.8.4-1+cuda11.1"
  "2.8.4-1+cuda11.2"
  "2.9.9-1+cuda11.3"
  "2.11.4-1+cuda11.4"
  "2.11.4-1+cuda11.5"
  "2.12.12-1+cuda11.6"
  "2.14.3-1+cuda11.7"
)

if [[ "$ubuntu_version" == "1804" ]]; then
  # Additional key required to download the CUDA 10.1 version.
  # The key is fetched over HTTPS so it cannot be tampered with in transit.
  ml_repo_url="https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64"
  ml_repo_deb="nvidia-machine-learning-repo-ubuntu1804_1.0.0-1_amd64.deb"
  sudo apt-key adv --fetch-keys "${ml_repo_url}/7fa2af80.pub"
  wget "${ml_repo_url}/${ml_repo_deb}"
  sudo dpkg -i "$ml_repo_deb"
  sudo apt-get update -y
  rm -f "$ml_repo_deb"
  # The CUDA 10.x builds are only downloaded on Ubuntu 18.04.
  nccl_builds=("2.8.3-1+cuda10.1" "2.15.5-1+cuda10.2" "${nccl_builds[@]}")
fi

# Fetch the runtime and development package of every build into the current
# directory (apt-get is the interface intended for scripts).
for build in "${nccl_builds[@]}"; do
  sudo apt-get download "libnccl2=${build}" "libnccl-dev=${build}"
done

# Patterns without a match expand to nothing instead of to themselves.
shopt -s nullglob

for debfile in *.deb; do
  # e.g. libnccl2_2.15.5-1+cuda11.0_amd64.deb -> cuda11.0 -> 11.0
  temp_str=${debfile#*+}
  temp_str=${temp_str%_*}
  cuda_version=${temp_str:4}

  mkdir -p "cuda-$cuda_version/nccl"
  dpkg-deb -xv "$debfile" "./cuda-$cuda_version/nccl"
  cd "cuda-$cuda_version/nccl"

  if [ -d ./usr/include ]; then
    mv ./usr/include ./
  fi

  # Flatten usr/lib/x86_64-linux-gnu into ./lib. Each entry is moved on its
  # own: mv renames symlinks as they are (including ones whose target lives
  # in the sibling package and is therefore dangling here), so no entry is
  # skipped because of what the first glob match happens to be, and no
  # word-splitting of `find` output is needed.
  mkdir -p lib
  pkg_libdir=./usr/lib/x86_64-linux-gnu
  for lib_entry in "$pkg_libdir"/*.a "$pkg_libdir"/*.so "$pkg_libdir"/*.so.*; do
    mv "$lib_entry" ./lib/
  done

  rm -rf usr
  cd ../../
done

rm -rf ./*.deb
6 changes: 3 additions & 3 deletions .github/workflows/helpers/install_cudnn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ wget -c -q $CUDNN_LINK
if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" ]]; then
tar -xf $CUDNN_TARBALL_NAME -C ./
CUDNN_EXTRACTED_TARBALL_NAME="${CUDNN_TARBALL_NAME::-7}"
sudo cp -r $CUDNN_EXTRACTED_TARBALL_NAME/include/* /usr/local/include
sudo cp -r $CUDNN_EXTRACTED_TARBALL_NAME/lib/* /usr/local/lib
rm -rf $CUDNN_EXTRACTED_TARBALL_NAME
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* "/usr/local/include"
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* "/usr/local/lib"
rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
else
sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
fi
Expand Down
32 changes: 27 additions & 5 deletions .github/workflows/helpers/install_dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,34 @@ sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binu
CUDA_VERSION=${CUDA_VERSION:-11.1.1}
./install_cudnn.sh "${CUDA_VERSION}"

# Install Miniconda
#Install Miniconda
echo "Installing Miniconda..."
wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ./Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
rm ./Miniconda3-latest-Linux-x86_64.sh && \
PY_VERSION=${PY_VERSION:-latest}
MINICONDA_BASE_URL="https://repo.continuum.io/miniconda/"
if [[ "$PY_VERSION" == "latest" ]]; then
echo "Installing latest Python version"
MINICONDA_INSTALLER="Miniconda3-latest-Linux-x86_64.sh"
elif [[ "$PY_VERSION" == "3.10" ]]; then
echo "Installing Python version ${PY_VERSION}"
MINICONDA_INSTALLER="Miniconda3-py310_22.11.1-1-Linux-x86_64.sh"
elif [[ "$PY_VERSION" == "3.9" ]]; then
echo "Installing Python version ${PY_VERSION}"
MINICONDA_INSTALLER="Miniconda3-py39_22.11.1-1-Linux-x86_64.sh"
elif [[ "$PY_VERSION" == "3.8" ]]; then
echo "Installing Python version ${PY_VERSION}"
MINICONDA_INSTALLER="Miniconda3-py38_22.11.1-1-Linux-x86_64.sh"
elif [[ "$PY_VERSION" == "3.7" ]]; then
echo "Installing Python version ${PY_VERSION}"
MINICONDA_INSTALLER="Miniconda3-py37_22.11.1-1-Linux-x86_64.sh"
else
echo "Request Python version (${PY_VERSION}) not supported"
exit 1
fi
MINICONDA_URL="${MINICONDA_BASE_URL}${MINICONDA_INSTALLER}"
wget -c -q $MINICONDA_URL && \
chmod +x $MINICONDA_INSTALLER && \
bash $MINICONDA_INSTALLER -b -p /opt/conda && \
rm $MINICONDA_INSTALLER && \
/opt/conda/bin/conda upgrade --all && \
/opt/conda/bin/conda install conda-build conda-verify && \
/opt/conda/bin/conda clean -ya
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/shell-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Lints the repository's shell scripts with ShellCheck.
name: Shell Check
on: [push, pull_request, workflow_dispatch]
jobs:
  shellcheck:
    name: Shellcheck
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Run ShellCheck
        # Pinned to a release tag rather than the mutable master branch so
        # the linter cannot change underneath the workflow.
        uses: ludeeus/action-shellcheck@2.0.0
2 changes: 1 addition & 1 deletion deps/legion
Submodule legion updated from 15b23c to 7f8df4