Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Half base type #1706

Merged
merged 13 commits
Dec 3, 2024
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: end-of-file-fixer
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: 'v14.0.0' # The default in Ubuntu 22.04, which is used in our CI
hooks:
Expand Down
15 changes: 14 additions & 1 deletion accessor/cuda_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,15 @@
#include "utils.hpp"


struct __half;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there's a way around the issue, but we should be aware that __half is a reserved identifier according to the C++ standard, so we should technically not be defining anything with that name. Also there might be a tiny potential of name mangling issues if this is ever handled as a class instead of a struct with MSVC.



namespace gko {


class half;


namespace acc {
namespace detail {

Expand All @@ -27,6 +35,11 @@ struct cuda_type {
using type = T;
};

// Map the library's host-side half-precision type (gko::half,
// forward-declared above) to CUDA's native __half type for device code.
// NOTE(review): __half is only forward-declared here to avoid pulling in
// cuda_fp16.h; the full definition must be visible wherever `type` is
// actually instantiated — presumably guaranteed by device compilation
// units; verify against includers.
template <>
struct cuda_type<gko::half> {
using type = __half;
};

// Unpack cv and reference / pointer qualifiers
template <typename T>
struct cuda_type<const T> {
Expand Down Expand Up @@ -57,7 +70,7 @@ struct cuda_type<T&&> {
// Transform std::complex to thrust::complex
template <typename T>
struct cuda_type<std::complex<T>> {
using type = thrust::complex<T>;
using type = thrust::complex<typename cuda_type<T>::type>;
};


Expand Down
14 changes: 13 additions & 1 deletion accessor/hip_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,15 @@
#include "utils.hpp"


struct __half;


namespace gko {


class half;


namespace acc {
namespace detail {

Expand Down Expand Up @@ -53,11 +61,15 @@ struct hip_type<T&&> {
using type = typename hip_type<T>::type&&;
};

// Map the library's host-side half-precision type (gko::half,
// forward-declared above) to the device-native __half type for HIP code.
// NOTE(review): relies on the forward declaration `struct __half;` above;
// the complete type definition must be provided by the HIP toolchain in
// translation units that instantiate `type` — confirm in device sources.
template <>
struct hip_type<gko::half> {
using type = __half;
};

// Transform std::complex to thrust::complex
template <typename T>
struct hip_type<std::complex<T>> {
using type = thrust::complex<T>;
using type = thrust::complex<typename hip_type<T>::type>;
};


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ __launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply(
ValueType, block_precisions[block_id],
multiply_vec<max_block_size>(
subwarp, block_size, v,
reinterpret_cast<const resolved_precision*>(
reinterpret_cast<const device_type<resolved_precision>*>(
blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id) +
subwarp.thread_rank(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility(
}
}

return succeeded && block_cond >= 1.0 &&
return succeeded && block_cond >= remove_complex<ValueType>{1.0} &&
block_cond * static_cast<remove_complex<ValueType>>(
float_traits<remove_complex<ValueType>>::eps) <
remove_complex<ValueType>{1e-3};
Expand Down Expand Up @@ -160,7 +160,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
accuracy, block_cond,
[&subwarp, &block_size, &row, &block_data, &storage_scheme,
&block_id] {
using target = reduce_precision<ValueType>;
using target = device_type<reduce_precision<ValueType>>;
return validate_precision_reduction_feasibility<
max_block_size, target>(
subwarp, block_size, row,
Expand All @@ -170,8 +170,8 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
},
[&subwarp, &block_size, &row, &block_data, &storage_scheme,
&block_id] {
using target =
reduce_precision<reduce_precision<ValueType>>;
using target = device_type<
reduce_precision<reduce_precision<ValueType>>>;
return validate_precision_reduction_feasibility<
max_block_size, target>(
subwarp, block_size, row,
Expand All @@ -195,7 +195,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate(
ValueType, prec,
copy_matrix<max_block_size, and_transpose>(
subwarp, block_size, row, 1, perm, trans_perm,
reinterpret_cast<resolved_precision*>(
reinterpret_cast<device_type<resolved_precision>*>(
block_data + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id),
storage_scheme.get_stride()));
Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/preconditioner/jacobi_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,11 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi(
GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
ValueType, block_precisions[block_id],
auto local_block =
reinterpret_cast<const resolved_precision*>(
reinterpret_cast<const device_type<resolved_precision>*>(
blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id);
auto local_out_block =
reinterpret_cast<resolved_precision*>(
reinterpret_cast<device_type<resolved_precision>*>(
out_blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id);
for (int i = rank; i < block_size * block_size; i += subwarp_size) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size)
ValueType, block_precisions[block_id],
multiply_vec<max_block_size>(
subwarp, block_size, v,
reinterpret_cast<const resolved_precision*>(
reinterpret_cast<const device_type<resolved_precision>*>(
blocks + storage_scheme.get_group_offset(block_id)) +
storage_scheme.get_block_offset(block_id) +
subwarp.thread_rank(),
Expand Down
Loading
Loading