Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Half base type #1706

Merged
merged 13 commits
Dec 3, 2024
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: end-of-file-fixer
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: 'v14.0.0' # The default in Ubuntu 22.04, which is used in our CI
hooks:
Expand Down
15 changes: 14 additions & 1 deletion accessor/cuda_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,15 @@
#include "utils.hpp"


struct __half;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there's a way around the issue, but we should be aware that __half is a reserved identifier according to the C++ standard, so we should technically not be defining anything with that name. Also there might be a tiny potential of name mangling issues if this is ever handled as a class instead of a struct with MSVC.



namespace gko {


class half;


namespace acc {
namespace detail {

Expand All @@ -27,6 +35,11 @@ struct cuda_type {
using type = T;
};

// Map the library's host-side half-precision type (gko::half,
// forward-declared above) to CUDA's native __half type for device code.
// NOTE(review): __half is only forward-declared here to avoid pulling in
// cuda_fp16.h; the full definition must be visible wherever `type` is
// actually instantiated — presumably guaranteed by device compilation
// units; verify against includers.
template <>
struct cuda_type<gko::half> {
using type = __half;
};

// Unpack cv and reference / pointer qualifiers
template <typename T>
struct cuda_type<const T> {
Expand Down Expand Up @@ -57,7 +70,7 @@ struct cuda_type<T&&> {
// Transform std::complex to thrust::complex
template <typename T>
struct cuda_type<std::complex<T>> {
using type = thrust::complex<T>;
using type = thrust::complex<typename cuda_type<T>::type>;
};


Expand Down
14 changes: 13 additions & 1 deletion accessor/hip_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,15 @@
#include "utils.hpp"


struct __half;


namespace gko {


class half;


namespace acc {
namespace detail {

Expand Down Expand Up @@ -53,11 +61,15 @@ struct hip_type<T&&> {
using type = typename hip_type<T>::type&&;
};

// Map the library's host-side half-precision type (gko::half,
// forward-declared above) to the device-native __half type for HIP code.
// NOTE(review): relies on the forward declaration `struct __half;` above;
// the complete type definition must be provided by the HIP toolchain in
// translation units that instantiate `type` — confirm in device sources.
template <>
struct hip_type<gko::half> {
using type = __half;
};

// Transform std::complex to thrust::complex
template <typename T>
struct hip_type<std::complex<T>> {
using type = thrust::complex<T>;
using type = thrust::complex<typename hip_type<T>::type>;
};


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ __launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply(
ValueType, block_precisions[block_id],
multiply_vec<max_block_size>(
subwarp, block_size, v,
reinterpret_cast<const resolved_precision*>(
reinterpret_cast<const device_type<resolved_precision>*>(
blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id) +
subwarp.thread_rank(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility(
}
}

return succeeded && block_cond >= 1.0 &&
return succeeded && block_cond >= remove_complex<ValueType>{1.0} &&
block_cond * static_cast<remove_complex<ValueType>>(
float_traits<remove_complex<ValueType>>::eps) <
remove_complex<ValueType>{1e-3};
Expand Down Expand Up @@ -160,7 +160,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
accuracy, block_cond,
[&subwarp, &block_size, &row, &block_data, &storage_scheme,
&block_id] {
using target = reduce_precision<ValueType>;
using target = device_type<reduce_precision<ValueType>>;
return validate_precision_reduction_feasibility<
max_block_size, target>(
subwarp, block_size, row,
Expand All @@ -170,8 +170,8 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
},
[&subwarp, &block_size, &row, &block_data, &storage_scheme,
&block_id] {
using target =
reduce_precision<reduce_precision<ValueType>>;
using target = device_type<
reduce_precision<reduce_precision<ValueType>>>;
return validate_precision_reduction_feasibility<
max_block_size, target>(
subwarp, block_size, row,
Expand All @@ -195,7 +195,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
ValueType, prec,
copy_matrix<max_block_size, and_transpose>(
subwarp, block_size, row, 1, perm, trans_perm,
reinterpret_cast<resolved_precision*>(
reinterpret_cast<device_type<resolved_precision>*>(
block_data + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id),
storage_scheme.get_stride()));
Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/preconditioner/jacobi_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,11 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi(
GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
ValueType, block_precisions[block_id],
auto local_block =
reinterpret_cast<const resolved_precision*>(
reinterpret_cast<const device_type<resolved_precision>*>(
blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id);
auto local_out_block =
reinterpret_cast<resolved_precision*>(
reinterpret_cast<device_type<resolved_precision>*>(
out_blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id);
for (int i = rank; i < block_size * block_size; i += subwarp_size) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size)
ValueType, block_precisions[block_id],
multiply_vec<max_block_size>(
subwarp, block_size, v,
reinterpret_cast<const resolved_precision*>(
reinterpret_cast<const device_type<resolved_precision>*>(
blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id) +
subwarp.thread_rank(),
Expand Down
Loading
Loading