Skip to content

Commit

Permalink
Update ROCm CI to use HIP LANGUAGE (#13214)
Browse files Browse the repository at this point in the history
Update for ROCm CI before reland tunable GEMM #12853. This PR also update
composable kernel to use CMakes's HIP language support so that we can
mix C/C++ compiler with HIP compiler instead of locking to hip-clang
  • Loading branch information
cloudhan authored Oct 5, 2022
1 parent 4fc8f71 commit 72076b1
Show file tree
Hide file tree
Showing 13 changed files with 130 additions and 66 deletions.
27 changes: 21 additions & 6 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1722,7 +1722,7 @@ if (onnxruntime_USE_OPENVINO)
add_definitions(-DOPENVINO_CONFIG_GPU_FP16=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
endif()

if (onnxruntime_USE_OPENVINO_CPU_FP32_NP)
add_definitions(-DOPENVINO_CONFIG_CPU_FP32=1)
add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
Expand Down Expand Up @@ -1761,7 +1761,7 @@ if (onnxruntime_USE_OPENVINO)
if($ENV{FIL_ENABLED})
add_definitions(-DOPENVINO_FIL_ENABLED=1)
endif()

endif()

if (onnxruntime_USE_VITISAI)
Expand Down Expand Up @@ -1861,12 +1861,21 @@ if (onnxruntime_USE_ROCM)
message(FATAL_ERROR "ROCM does not support build with CUDA!")
endif()

if (NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH /opt/rocm)
else()
set(ROCM_PATH $ENV{ROCM_PATH})
set(ROCM_PATH ${onnxruntime_ROCM_HOME})

if (NOT CMAKE_HIP_COMPILER)
set(CMAKE_HIP_COMPILER "${ROCM_PATH}/llvm/bin/clang++")
endif()

if (NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030")
endif()

# NOTE: HIP language is added in 3.21 and there are bugs before 3.23.1
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
find_package(HIP)
enable_language(HIP)

# replicate strategy used by pytorch to get ROCM_VERSION
# https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173
file(READ "${ROCM_PATH}/.info/version-dev" ROCM_VERSION_DEV_RAW)
Expand All @@ -1884,6 +1893,12 @@ if (onnxruntime_USE_ROCM)
message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}")
message("ROCM_VERSION_DEV_PATCH: ${ROCM_VERSION_DEV_PATCH}")
message("ROCM_VERSION_DEV_INT: ${ROCM_VERSION_DEV_INT}")
message("\n***** HIP LANGUAGE CONFIG INFO ****\n")
message("CMAKE_HIP_COMPILER: ${CMAKE_HIP_COMPILER}")
message("CMAKE_HIP_ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
message("CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")
string(TOUPPER ${CMAKE_BUILD_TYPE} BUILD_TYPE)
message("CMAKE_HIP_FLAGS_${BUILD_TYPE}: ${CMAKE_HIP_FLAGS_${BUILD_TYPE}}")
add_definitions(-DROCM_VERSION=${ROCM_VERSION_DEV_INT})
endif()

Expand Down
2 changes: 1 addition & 1 deletion cmake/external/composable_kernel.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
set(composable_kernel_URL https://github.com/ROCmSoftwarePlatform/composable_kernel.git)
set(composable_kernel_TAG e1a3fff67510be2af023b31587e411230b994631) # 2022-08-25 07:43:43 +0800

set(BUILD_DEV OFF)
set(PATCH ${PROJECT_SOURCE_DIR}/patches/composable_kernel/Fix_Clang_Build.patch)

include(FetchContent)
Expand All @@ -14,6 +13,7 @@ FetchContent_Declare(composable_kernel
FetchContent_GetProperties(composable_kernel)
if(NOT composable_kernel_POPULATED)
FetchContent_Populate(composable_kernel)
set(BUILD_DEV OFF CACHE BOOL "Disable -Weverything, otherwise, error: 'constexpr' specifier is incompatible with C++98 [-Werror,-Wc++98-compat]" FORCE)
add_subdirectory(${composable_kernel_SOURCE_DIR} ${composable_kernel_BINARY_DIR} EXCLUDE_FROM_ALL)

add_library(onnxruntime_composable_kernel_includes INTERFACE)
Expand Down
12 changes: 5 additions & 7 deletions cmake/onnxruntime_kernel_explorer.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,19 @@ endif()
if(NOT HIP_FOUND)
message(FATAL_ERROR "hip is required but is not found")
endif()
enable_language(HIP)

include(composable_kernel)

set(KERNEL_EXPLORER_ROOT ${ONNXRUNTIME_ROOT}/python/tools/kernel_explorer)
set(BERT_DIR ${ONNXRUNTIME_ROOT}/contrib_ops/rocm/bert)

file(GLOB kernel_explorer_srcs CONFIGURE_DEPENDS "${KERNEL_EXPLORER_ROOT}/*.cc")
# NOTE: This should not be necessary, but hip* symbols are hiding by some ifdef in LANGUAGE CXX mode, weird...
set_source_files_properties(${kernel_explorer_srcs} PROPERTIES LANGUAGE HIP)

file(GLOB kernel_explorer_kernel_srcs CONFIGURE_DEPENDS "${KERNEL_EXPLORER_ROOT}/kernels/*.cc")
set_source_files_properties(${kernel_explorer_kernel_srcs} PROPERTIES LANGUAGE HIP)

onnxruntime_add_shared_library_module(kernel_explorer
${kernel_explorer_srcs}
Expand All @@ -41,13 +46,6 @@ target_compile_definitions(kernel_explorer
PUBLIC ROCM_USE_FLOAT16
PRIVATE $<TARGET_PROPERTY:onnxruntime_pybind11_state,COMPILE_DEFINITIONS>)

# handle kernel_explorer sources as hip language
target_compile_options(kernel_explorer PRIVATE "-xhip")
# TODO: use predefined AMDGPU_TARGETS
target_compile_options(kernel_explorer PRIVATE "--offload-arch=gfx908" "--offload-arch=gfx90a")
# https://github.com/ROCm-Developer-Tools/HIP/blob/4514f350849b1090954295f8f87a5f8d78bd781b/hip-lang-config.cmake.in
target_link_libraries(kernel_explorer PRIVATE ${CLANGRT_BUILTINS})

add_dependencies(kernel_explorer onnxruntime_pybind11_state)

enable_testing()
Expand Down
10 changes: 5 additions & 5 deletions cmake/onnxruntime_providers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1368,15 +1368,15 @@ if (onnxruntime_USE_ROCM)
endif(CMAKE_BUILD_TYPE MATCHES Debug)

list(APPEND HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
list(APPEND HIP_CLANG_FLAGS ${CMAKE_HIP_FLAGS})

# Generate GPU code during compilation
list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)

# Generate GPU code for GFX9 Generation
list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906 --amdgpu-target=gfx908)
if (ROCM_VERSION_DEV_INT GREATER_EQUAL 50000)
list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx90a --amdgpu-target=gfx1030)
endif()
# Generate GPU code
foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES})
list(APPEND HIP_CLANG_FLAGS --offload-arch=${HIP_ARCH})
endforeach()

#onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src})
hip_add_library(onnxruntime_providers_rocm MODULE ${onnxruntime_providers_rocm_src})
Expand Down
76 changes: 58 additions & 18 deletions cmake/patches/composable_kernel/Fix_Clang_Build.patch
Original file line number Diff line number Diff line change
@@ -1,26 +1,45 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e1174ec..f8795475 100644
index 3e1174ec..65648cb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,17 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.14)

# Check support for CUDA/HIP in Cmake
-project(composable_kernel)
+project(composable_kernel LANGUAGES CXX HIP)

list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

@@ -36,27 +36,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")

+add_compile_options(
+ -xhip
+ --offload-arch=gfx908
+ --offload-arch=gfx90a
+ -O3 # otherwise, "Illegal instruction detected" for gfx908
+ "SHELL:-mllvm -amdgpu-early-inline-all=true"
+ "SHELL:-mllvm -amdgpu-function-calls=false" # otherwise, "local memory (65920) exceeds limit (65536) in function"
+ -fhip-new-launch-api
+)
+
+
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
@@ -245,9 +256,6 @@ rocm_package_setup_component(tests
-## OpenMP
-if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
- # workaround issue hipcc in rocm3.5 cannot find openmp
- set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
- set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
- set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
- set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
- set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
- set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
-else()
- find_package(OpenMP REQUIRED)
-endif()
-
-message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
-message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
-message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
-message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
-
-link_libraries(${OpenMP_gomp_LIBRARY})
-link_libraries(${OpenMP_pthread_LIBRARY})
-
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
@@ -245,9 +224,6 @@ rocm_package_setup_component(tests
)

add_subdirectory(library)
Expand All @@ -30,7 +49,7 @@ index 3e1174ec..f8795475 100644

#Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers)
@@ -273,11 +281,3 @@ rocm_install(FILES
@@ -273,11 +249,3 @@ rocm_install(FILES

set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
Expand All @@ -56,3 +75,24 @@ index fcaec592..8ea06421 100644
#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1

// hack: have underlying assumption that need to be satsified, otherwise it's a bug
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 6f3f900b..594d983d 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -1,5 +1,6 @@
function(add_instance_library INSTANCE_NAME)
message("adding instance ${INSTANCE_NAME}")
+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP)
add_library(${INSTANCE_NAME} OBJECT ${ARGN})
target_compile_features(${INSTANCE_NAME} PUBLIC)
set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
index 5dc20332..78eedca5 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_library(device_grouped_conv3d_fwd_instance
+add_instance_library(device_grouped_conv3d_fwd_instance
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
6 changes: 1 addition & 5 deletions onnxruntime/python/tools/kernel_explorer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ Kernel Explorer hooks up GPU kernel code with a Python frontend to help develop,

## Build

Install `ninja` from system package manager or pip to speedup the building. If you don't have `ninja` installed, you can also set it to use the default cmake generator by removing the `--cmake_generator` option.

```bash
#!/bin/bash

Expand All @@ -19,10 +17,8 @@ rocm_home="/opt/rocm"
./build.sh --update \
--build_dir ${build_dir} \
--config ${config} \
--cmake_generator Ninja \
--cmake_extra_defines \
CMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
CMAKE_EXPORT_COMPILE_COMMANDS=ON \
onnxruntime_BUILD_KERNEL_EXPLORER=ON \
onnxruntime_DISABLE_CONTRIB_OPS=ON \
Expand Down
7 changes: 4 additions & 3 deletions orttraining/orttraining/test/graph/bert_toy_fetches.h

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ name: 'linux_ci_$(Date:yyyyMMdd)_$(Rev:r)'
jobs:
- job: AMDMIGraphX_CI
pool: 'AMD-GPU'
timeoutInMinutes: 60
timeoutInMinutes: 180

# gid of video and render group on gcr-openpai-35 and -36
variables:
Expand All @@ -26,6 +26,11 @@ jobs:
Context: tools/ci_build/github/pai
Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
inputs:
script: rm -rf $(Build.BinariesDirectory)/*
displayName: 'Clean Build.BinariesDirectory'

- task: CmdLine@2
inputs:
script: |
Expand All @@ -45,6 +50,9 @@ jobs:
onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion) \
python tools/ci_build/build.py \
--config RelWithDebInfo \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_DEV_MODE=OFF \
--mpi_home /opt/ompi \
--use_migraphx \
--rocm_version=5.2.3 \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,23 @@ jobs:
script: |-
export ROCM_HOME=/opt/rocm
python tools/ci_build/build.py \
--config RelWithDebInfo \
--enable_training \
--enable_training_torch_interop \
--mpi_home /opt/ompi \
--use_rocm \
--rocm_version=5.2.3 \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--update \
--build_dir ./build \
--build \
--parallel 8 \
--build_wheel \
--skip_tests
--config RelWithDebInfo \
--enable_training \
--enable_training_torch_interop \
--mpi_home /opt/ompi \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=${ROCM_HOME}/llvm/bin/clang++ \
onnxruntime_DEV_MODE=OFF \
--use_rocm \
--rocm_version=5.2.3 \
--rocm_home ${ROCM_HOME} \
--nccl_home ${ROCM_HOME}\
--update \
--build_dir ./build \
--build \
--parallel 8 \
--build_wheel \
--skip_tests
displayName: 'Build onnxruntime'

- bash: |-
Expand Down
5 changes: 4 additions & 1 deletion tools/ci_build/github/azure-pipelines/templates/rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ jobs:
--build_wheel \
--skip_tests \
--enable_training \
--cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF \
--cmake_extra_defines \
CMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
onnxruntime_DEV_MODE=OFF \
onnxruntime_BUILD_UNIT_TESTS=OFF \
--enable_training_torch_interop
workingDirectory: $(Build.SourcesDirectory)
displayName: 'Build onnxruntime (in container)'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ LABEL maintainer="The ManyLinux project"
RUN yum install -y hipify-clang

# CMake
ENV CMAKE_VERSION=3.22.1
ENV CMAKE_VERSION=3.24.2
RUN cd /usr/local && \
wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}
Expand Down
4 changes: 2 additions & 2 deletions tools/ci_build/github/pai/migraphx-ci-pipeline-env.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ RUN cd /opt/mpi_install/ucx/build &&\
make install

# CMake
ENV CMAKE_VERSION=3.18.2
ENV CMAKE_VERSION=3.24.2
RUN cd /usr/local && \
wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}

RUN apt-get update &&\
apt-get install -y half libnuma-dev
Expand Down
4 changes: 2 additions & 2 deletions tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ RUN cd /opt/mpi_install/ucx/build &&\
make install

# CMake
ENV CMAKE_VERSION=3.18.2
ENV CMAKE_VERSION=3.24.2
RUN cd /usr/local && \
wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz | tar zxf -
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}

# rocm-ci branch contains instrumentation needed for loss curves and perf
RUN git clone https://github.com/microsoft/huggingface-transformers.git &&\
Expand Down

0 comments on commit 72076b1

Please sign in to comment.