Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mixed precision ELL #717

Merged
merged 14 commits into from
May 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/osx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "omp/debug/shared"}
- {shared: "OFF", build_type: "Release", name: "omp/release/static"}
- {shared: "ON", build_type: "Debug", name: "omp/debug/shared", "mixed": "OFF"}
- {shared: "OFF", build_type: "Release", name: "omp/release/static", "mixed": "ON"}
name: ${{ matrix.config.name }}
runs-on: [macos-latest]

Expand All @@ -40,7 +40,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }}
cmake .. -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }}
make -j8
ctest -j10 --output-on-failure

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/windows-msvc-cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {version: "10.2.89.20191206", name: "cuda102/release/shared"}
- {version: "latest", name: "cuda-latest/release/shared"}
- {version: "10.2.89.20191206", name: "cuda102/release/shared", "mixed": "ON"}
- {version: "latest", name: "cuda-latest/release/shared", "mixed": "OFF"}
name: msvc/${{ matrix.config.name }} (only compile)
runs-on: [windows-latest]

Expand Down Expand Up @@ -46,5 +46,5 @@ jobs:
$env:PATH="$env:PATH;$pwd\build\windows_shared_library"
mkdir build
cd build
cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_CUDA_ARCHITECTURES=60 ..
cmake -DCMAKE_CXX_FLAGS=/bigobj -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_CUDA_ARCHITECTURES=60 ..
cmake --build . -j4 --config Release
6 changes: 3 additions & 3 deletions .github/workflows/windows-msvc-ref.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ jobs:
fail-fast: false
matrix:
config:
- {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
- {shared: "OFF", build_type: "Release", name: "reference/release/static"}
- {shared: "ON", build_type: "Debug", name: "reference/debug/shared", "mixed": "ON"}
- {shared: "OFF", build_type: "Release", name: "reference/release/static", "mixed": "OFF"}
# Debug static needs too much storage
# - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
name: msvc/${{ matrix.config.name }}
Expand All @@ -35,7 +35,7 @@ jobs:
$env:PATH="$env:PATH;$pwd\build\windows_shared_library"
mkdir build
cd build
cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
cmake -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} ..
cmake --build . -j4 --config ${{ matrix.config.build_type }}
ctest . -C ${{ matrix.config.build_type }} --output-on-failure

Expand Down
41 changes: 41 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ include:
BUILD_HWLOC: "ON"
FAST_TESTS: "OFF"
DPCPP_SINGLE_MODE: "OFF"
MIXED_PRECISION: "ON"
RUN_EXAMPLES: "OFF"
CONFIG_LOG: "ON"
CXX_FLAGS: ""
Expand Down Expand Up @@ -77,6 +78,7 @@ include:
-DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
-DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
-DGINKGO_FAST_TESTS=${FAST_TESTS}
-DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
-DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
-DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
-DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
Expand Down Expand Up @@ -111,6 +113,7 @@ include:
-DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
-DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
-DGINKGO_FAST_TESTS=${FAST_TESTS}
-DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
-DGINKGO_CONFIG_LOG_DETAILED=${CONFIG_LOG}
-DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
-DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
Expand Down Expand Up @@ -681,6 +684,44 @@ build/nocuda/intel/omp/release/static:
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"

build/nocuda-nomixed/gcc/omp/release/shared:
<<: *default_build_with_test
extends:
- .quick_test_condition
- .use_gko-nocuda-gnu9-llvm8
variables:
<<: *default_variables
BUILD_OMP: "ON"
BUILD_TYPE: "Release"
MIXED_PRECISION: "OFF"

build/nocuda-nomixed/clang/omp/debug/static:
<<: *default_build_with_test
extends:
- .full_test_condition
- .use_gko-nocuda-gnu9-llvm8
variables:
<<: *default_variables
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
BUILD_OMP: "ON"
BUILD_TYPE: "Debug"
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/nocuda-nomixed/intel/omp/release/static:
<<: *default_build_with_test
extends:
- .full_test_condition
- .use_gko-nocuda-gnu9-llvm8-intel
variables:
<<: *default_variables
C_COMPILER: "icc"
CXX_COMPILER: "icpc"
BUILD_OMP: "ON"
BUILD_TYPE: "Release"
BUILD_SHARED_LIBS: "OFF"
MIXED_PRECISION: "OFF"

build/dpcpp/cpu/release/static:
<<: *default_build_with_test
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
option(GINKGO_BUILD_DOC "Generate documentation" OFF)
option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
option(GINKGO_SKIP_DEPENDENCY_UPDATE
"Do not update dependencies each time the project is rebuilt" ON)
option(GINKGO_EXPORT_BUILD_DIR
Expand Down
4 changes: 4 additions & 0 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ Ginkgo adds the following additional switches to control what is being built:
* `-DGINKGO_DEVEL_TOOLS={ON, OFF}` sets up the build system for development
(requires clang-format, will also download git-cmake-format),
default is `OFF`.
* `-DGINKGO_MIXED_PRECISION={ON, OFF}` compiles true mixed-precision kernels
instead of converting data on the fly, default is `OFF`.
Enabling this flag increases the library size, but improves performance of
mixed-precision kernels.
* `-DGINKGO_BUILD_TESTS={ON, OFF}` builds Ginkgo's tests
(will download googletest), default is `ON`.
* `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests
Expand Down
29 changes: 25 additions & 4 deletions benchmark/utils/formats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ namespace formats {


std::string available_format =
"coo, csr, ell, sellp, hybrid, hybrid0, hybrid25, hybrid33, hybrid40, "
"coo, csr, ell, ell-mixed, sellp, hybrid, hybrid0, hybrid25, hybrid33, "
"hybrid40, "
"hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, "
"hybridminstorage"
#ifdef HAS_CUDA
Expand Down Expand Up @@ -90,6 +91,8 @@ std::string format_description =
"csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
"ell: Ellpack format according to Bell and Garland: Efficient Sparse "
"Matrix-Vector Multiplication on CUDA.\n"
"ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: "
"Efficient Sparse Matrix-Vector Multiplication on CUDA.\n"
"sellp: Sliced Ellpack uses a default block size of 32.\n"
"hybrid: Hybrid uses ell and coo to represent the matrix.\n"
"hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses "
Expand Down Expand Up @@ -204,6 +207,23 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
{"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
{"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
{"ell-mixed",
[](std::shared_ptr<const gko::Executor> exec,
const gko::matrix_data<etype> &data) {
gko::matrix_data<gko::next_precision<etype>> conv_data;
conv_data.size = data.size;
conv_data.nonzeros.resize(data.nonzeros.size());
auto it = conv_data.nonzeros.begin();
for (auto &el : data.nonzeros) {
it->row = el.row;
it->column = el.column;
it->value = el.value;
++it;
}
auto mat = gko::matrix::Ell<gko::next_precision<etype>>::create(std::move(exec));
mat->read(conv_data);
return mat;
}},
#ifdef HAS_CUDA
#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
{"cusp_csr", read_matrix_from_data<cusp_csr>},
Expand All @@ -212,8 +232,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"cusp_hybrid", read_matrix_from_data<cusp_hybrid>},
{"cusp_coo", read_matrix_from_data<cusp_coo>},
{"cusp_ell", read_matrix_from_data<cusp_ell>},
#else // CUDA_VERSION >= 11000
// cusp_csr, cusp_coo use the generic ones from CUDA 11
#else // CUDA_VERSION >= 11000
// cusp_csr, cusp_coo use the generic ones from CUDA 11
{"cusp_csr", read_matrix_from_data<cusp_gcsr>},
{"cusp_coo", read_matrix_from_data<cusp_gcoo>},
#endif
Expand Down Expand Up @@ -260,7 +280,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
{"hybridminstorage",
READ_MATRIX(hybrid,
std::make_shared<hybrid::minimal_storage_limit>())},
{"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}};
{"sellp", read_matrix_from_data<gko::matrix::Sellp<etype>>}
};
// clang-format on


Expand Down
3 changes: 3 additions & 0 deletions cmake/get_info.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ foreach(log_type ${log_types})
ginkgo_print_module_footer(${${log_type}} " Enabled modules:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_OMP;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP")
ginkgo_print_module_footer(${${log_type}} " Enabled features:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_MIXED_PRECISION")
ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
Expand Down
63 changes: 32 additions & 31 deletions common/matrix/ell_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -34,29 +34,30 @@ namespace kernel {
namespace {


template <int num_thread_per_worker, bool atomic, typename ValueType,
typename IndexType, typename Closure>
template <int num_thread_per_worker, bool atomic, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType,
typename Closure>
__device__ void spmv_kernel(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
acc::range<a_accessor> val, const IndexType *__restrict__ col,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

after using accessor, it can not have const *__restrict__ anymore, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but I think the accessor itself uses __restrict__, so we shouldn't need to worry about it.

const size_type stride, const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
ValueType *__restrict__ c, const size_type c_stride, Closure op)
acc::range<b_accessor> b, OutputValueType *__restrict__ c,
const size_type c_stride, Closure op)
{
const auto tidx = thread::get_thread_id_flat();
const auto column_id = blockIdx.y;
const decltype(tidx) column_id = blockIdx.y;
if (num_thread_per_worker == 1) {
// Specialize the num_thread_per_worker = 1. It doesn't need the shared
// memory, __syncthreads, and atomic_add
if (tidx < num_rows) {
ValueType temp = zero<ValueType>();
auto temp = zero<OutputValueType>();
for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
const auto ind = tidx + idx * stride;
const auto col_idx = col[ind];
if (col_idx < idx) {
break;
} else {
temp += val[ind] * b[col_idx * b_stride + column_id];
temp += val(ind) * b(col_idx, column_id);
}
}
const auto c_ind = tidx * c_stride + column_id;
Expand All @@ -68,14 +69,14 @@ __device__ void spmv_kernel(
const auto x = tidx % num_rows;
const auto worker_id = tidx / num_rows;
const auto step_size = num_worker_per_row * num_thread_per_worker;
__shared__ UninitializedArray<ValueType, default_block_size /
num_thread_per_worker>
__shared__ UninitializedArray<
OutputValueType, default_block_size / num_thread_per_worker>
storage;
if (idx_in_worker == 0) {
storage[threadIdx.x] = 0;
}
__syncthreads();
ValueType temp = zero<ValueType>();
auto temp = zero<OutputValueType>();
for (size_type idx =
worker_id * num_thread_per_worker + idx_in_worker;
idx < num_stored_elements_per_row; idx += step_size) {
Expand All @@ -84,7 +85,7 @@ __device__ void spmv_kernel(
if (col_idx < idx) {
break;
} else {
temp += val[ind] * b[col_idx * b_stride + column_id];
temp += val(ind) * b(col_idx, column_id);
}
}
atomic_add(&storage[threadIdx.x], temp);
Expand All @@ -102,51 +103,51 @@ __device__ void spmv_kernel(
}


template <int num_thread_per_worker, bool atomic = false, typename ValueType,
typename IndexType>
template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ val, const IndexType *__restrict__ col,
acc::range<a_accessor> val, const IndexType *__restrict__ col,
const size_type stride, const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
ValueType *__restrict__ c, const size_type c_stride)
acc::range<b_accessor> b, OutputValueType *__restrict__ c,
const size_type c_stride)
{
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[](const ValueType &x, const ValueType &y) { return x; });
num_stored_elements_per_row, b, c, c_stride,
[](const OutputValueType &x, const OutputValueType &y) { return x; });
}


template <int num_thread_per_worker, bool atomic = false, typename ValueType,
typename IndexType>
template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
typename a_accessor, typename OutputValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void spmv(
const size_type num_rows, const int num_worker_per_row,
const ValueType *__restrict__ alpha, const ValueType *__restrict__ val,
acc::range<a_accessor> alpha, acc::range<a_accessor> val,
const IndexType *__restrict__ col, const size_type stride,
const size_type num_stored_elements_per_row,
const ValueType *__restrict__ b, const size_type b_stride,
const ValueType *__restrict__ beta, ValueType *__restrict__ c,
const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
const OutputValueType *__restrict__ beta, OutputValueType *__restrict__ c,
const size_type c_stride)
{
const ValueType alpha_val = alpha[0];
const ValueType beta_val = beta[0];
const OutputValueType alpha_val = alpha(0);
const OutputValueType beta_val = beta[0];
// Because the atomic operation changes the values of c during computation,
// it can not do the right alpha * a * b + beta * c operation.
// Thus, the cuda kernel only computes alpha * a * b when it uses atomic
// operation.
if (atomic) {
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val](const ValueType &x, const ValueType &y) {
num_stored_elements_per_row, b, c, c_stride,
[&alpha_val](const OutputValueType &x, const OutputValueType &y) {
return alpha_val * x;
});
} else {
spmv_kernel<num_thread_per_worker, atomic>(
num_rows, num_worker_per_row, val, col, stride,
num_stored_elements_per_row, b, b_stride, c, c_stride,
[&alpha_val, &beta_val](const ValueType &x, const ValueType &y) {
num_stored_elements_per_row, b, c, c_stride,
[&alpha_val, &beta_val](const OutputValueType &x,
const OutputValueType &y) {
return alpha_val * x + beta_val * y;
});
}
Expand Down
Loading