Add CUDA/HIP RCM kernels #1503

Merged 29 commits on Jan 19, 2024

Commits (29)
f1fa710  remove get_node_degrees kernel (upsj, Dec 2, 2023)
356e97c  unify RCM source CUDA/HIP (upsj, Dec 3, 2023)
d7bd22f  use CUDA/HIP RCM reordering (upsj, Dec 3, 2023)
3345649  fix unsigned abs warnings (upsj, Dec 5, 2023)
1b151f5  add RCM GPU kernel (upsj, Dec 9, 2023)
8e4ee0f  tests (upsj, Dec 11, 2023)
1e150cf  add load_*_local loads for workgroup-coherent global memory atomics (upsj, Dec 11, 2023)
100d220  fix connected component search (upsj, Dec 11, 2023)
d541d19  fix level initialization (upsj, Dec 11, 2023)
eeb18e9  fix membar for local atomics (upsj, Dec 14, 2023)
69b298d  fix min degree level initialization (upsj, Dec 14, 2023)
4479266  test multiple connected components (upsj, Dec 14, 2023)
02b71cd  fix ubfs (upsj, Dec 15, 2023)
b258af7  fix test (upsj, Dec 15, 2023)
42833cf  work around rocThrust bug (upsj, Dec 15, 2023)
3e9420d  fix handling of multiple connected components in OpenMP (upsj, Dec 18, 2023)
0de9baa  compute and test inverse permutation (upsj, Dec 20, 2023)
8d7c35e  review updates (upsj, Dec 20, 2023)
c9b2f10  remove old reordering test (upsj, Dec 20, 2023)
ed75c86  reenable dpcpp test (upsj, Jan 9, 2024)
faf720e  formatting (upsj, Jan 9, 2024)
9086dab  clearer connected component detection (upsj, Jan 9, 2024)
211aecf  fix minimum degree choice in OMP starting node (upsj, Jan 9, 2024)
78a1f5c  rename kernel get_permutation -> compute_permutation (upsj, Jan 9, 2024)
147c280  review updates (upsj, Jan 10, 2024)
8a9a7df  use stable sort in RCM (upsj, Jan 17, 2024)
3f62741  add isolated vertices to test (upsj, Jan 17, 2024)
7e0f19a  add failing test case (upsj, Jan 18, 2024)
9f65902  fix test (upsj, Jan 18, 2024)
336 changes: 336 additions & 0 deletions common/cuda_hip/components/memory.nvidia.hpp.inc
@@ -55,6 +55,16 @@ __device__ __forceinline__ void membar_acq_rel_shared()
}


__device__ __forceinline__ void membar_acq_rel_local()
{
#if __CUDA_ARCH__ < 700
asm volatile("membar.cta;" ::: "memory");
#else
asm volatile("fence.acq_rel.cta;" ::: "memory");
#endif
}


__device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
{
int32 result;
@@ -343,6 +353,258 @@ __device__ __forceinline__ void store_release_shared(double* ptr, double result)
}


__device__ __forceinline__ int32 load_relaxed_local(const int32* ptr)
{
int32 result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
#else
asm volatile("ld.relaxed.cta.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
#endif

return result;
}


__device__ __forceinline__ void store_relaxed_local(int32* ptr, int32 result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#endif
}


__device__ __forceinline__ int64 load_relaxed_local(const int64* ptr)
{
int64 result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
#else
asm volatile("ld.relaxed.cta.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
#endif

return result;
}


__device__ __forceinline__ void store_relaxed_local(int64* ptr, int64 result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#endif
}


__device__ __forceinline__ float load_relaxed_local(const float* ptr)
{
float result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.f32 %0, [%1];"
: "=f"(result)
: "l"(const_cast<float*>(ptr))
: "memory");
#else
asm volatile("ld.relaxed.cta.f32 %0, [%1];"
: "=f"(result)
: "l"(const_cast<float*>(ptr))
: "memory");
#endif

return result;
}


__device__ __forceinline__ void store_relaxed_local(float* ptr, float result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#endif
}


__device__ __forceinline__ double load_relaxed_local(const double* ptr)
{
double result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.f64 %0, [%1];"
: "=d"(result)
: "l"(const_cast<double*>(ptr))
: "memory");
#else
asm volatile("ld.relaxed.cta.f64 %0, [%1];"
: "=d"(result)
: "l"(const_cast<double*>(ptr))
: "memory");
#endif

return result;
}


__device__ __forceinline__ void store_relaxed_local(double* ptr, double result)
{
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#else
asm volatile("st.relaxed.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#endif
}


__device__ __forceinline__ int32 load_acquire_local(const int32* ptr)
{
int32 result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
#else
asm volatile("ld.acquire.cta.s32 %0, [%1];"
: "=r"(result)
: "l"(const_cast<int32*>(ptr))
: "memory");
#endif
membar_acq_rel_local();
return result;
}


__device__ __forceinline__ void store_release_local(int32* ptr, int32 result)
{
membar_acq_rel_local();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#else
asm volatile("st.release.cta.s32 [%0], %1;" ::"l"(ptr), "r"(result)
: "memory");
#endif
}


__device__ __forceinline__ int64 load_acquire_local(const int64* ptr)
{
int64 result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
#else
asm volatile("ld.acquire.cta.s64 %0, [%1];"
: "=l"(result)
: "l"(const_cast<int64*>(ptr))
: "memory");
#endif
membar_acq_rel_local();
return result;
}


__device__ __forceinline__ void store_release_local(int64* ptr, int64 result)
{
membar_acq_rel_local();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#else
asm volatile("st.release.cta.s64 [%0], %1;" ::"l"(ptr), "l"(result)
: "memory");
#endif
}


__device__ __forceinline__ float load_acquire_local(const float* ptr)
{
float result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.f32 %0, [%1];"
: "=f"(result)
: "l"(const_cast<float*>(ptr))
: "memory");
#else
asm volatile("ld.acquire.cta.f32 %0, [%1];"
: "=f"(result)
: "l"(const_cast<float*>(ptr))
: "memory");
#endif
membar_acq_rel_local();
return result;
}


__device__ __forceinline__ void store_release_local(float* ptr, float result)
{
membar_acq_rel_local();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#else
asm volatile("st.release.cta.f32 [%0], %1;" ::"l"(ptr), "f"(result)
: "memory");
#endif
}


__device__ __forceinline__ double load_acquire_local(const double* ptr)
{
double result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.f64 %0, [%1];"
: "=d"(result)
: "l"(const_cast<double*>(ptr))
: "memory");
#else
asm volatile("ld.acquire.cta.f64 %0, [%1];"
: "=d"(result)
: "l"(const_cast<double*>(ptr))
: "memory");
#endif
membar_acq_rel_local();
return result;
}


__device__ __forceinline__ void store_release_local(double* ptr, double result)
{
membar_acq_rel_local();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#else
asm volatile("st.release.cta.f64 [%0], %1;" ::"l"(ptr), "d"(result)
: "memory");
#endif
}


__device__ __forceinline__ int32 load_relaxed(const int32* ptr)
{
int32 result;
@@ -677,6 +939,80 @@ __device__ __forceinline__ void store_relaxed_shared(
}


__device__ __forceinline__ thrust::complex<float> load_relaxed_local(
const thrust::complex<float>* ptr)
{
float real_result;
float imag_result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];"
: "=f"(real_result), "=f"(imag_result)
: "l"(const_cast<thrust::complex<float>*>(ptr))
: "memory");
#else
asm volatile("ld.relaxed.cta.v2.f32 {%0, %1}, [%2];"
: "=f"(real_result), "=f"(imag_result)
: "l"(const_cast<thrust::complex<float>*>(ptr))
: "memory");
#endif
return thrust::complex<float>{real_result, imag_result};
}


__device__ __forceinline__ void store_relaxed_local(
thrust::complex<float>* ptr, thrust::complex<float> result)
{
auto real_result = result.real();
auto imag_result = result.imag();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
"f"(real_result), "f"(imag_result)
: "memory");
#else
asm volatile("st.relaxed.cta.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
"f"(real_result), "f"(imag_result)
: "memory");
#endif
}


__device__ __forceinline__ thrust::complex<double> load_relaxed_local(
const thrust::complex<double>* ptr)
{
double real_result;
double imag_result;
#if __CUDA_ARCH__ < 700
asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];"
: "=d"(real_result), "=d"(imag_result)
: "l"(const_cast<thrust::complex<double>*>(ptr))
: "memory");
#else
asm volatile("ld.relaxed.cta.v2.f64 {%0, %1}, [%2];"
: "=d"(real_result), "=d"(imag_result)
: "l"(const_cast<thrust::complex<double>*>(ptr))
: "memory");
#endif
return thrust::complex<double>{real_result, imag_result};
}


__device__ __forceinline__ void store_relaxed_local(
thrust::complex<double>* ptr, thrust::complex<double> result)
{
auto real_result = result.real();
auto imag_result = result.imag();
#if __CUDA_ARCH__ < 700
asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
"d"(real_result), "d"(imag_result)
: "memory");
#else
asm volatile("st.relaxed.cta.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
"d"(real_result), "d"(imag_result)
: "memory");
#endif
}


__device__ __forceinline__ thrust::complex<float> load_relaxed(
const thrust::complex<float>* ptr)
{
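
Usage note (not part of the diff): the _local suffix on these primitives denotes CTA scope, i.e. the loads, stores and the membar/fence only order memory with respect to threads of the same thread block (workgroup), which is typically cheaper than the device-scoped variants. Below is a minimal sketch of the release/acquire hand-off through global memory that they enable, which is what the commit message "add load_*_local loads for workgroup-coherent global memory atomics" refers to. The kernel and buffer names (block_local_handoff, value, flag) are made up for illustration, and the sketch assumes the load/store functions above and Ginkgo's int32 type are visible in the current namespace.

__global__ void block_local_handoff(int32* value, int32* flag)
{
    // one slot per thread block; no two blocks touch the same entries
    const auto b = blockIdx.x;
    if (threadIdx.x / 32 == 0) {
        // the first warp publishes: write the payload first, then set the
        // flag with release semantics so the payload is visible along with it
        if (threadIdx.x == 0) {
            store_relaxed_local(value + b, 42);
            store_release_local(flag + b, 1);
        }
    } else {
        // the remaining warps spin on a CTA-scoped acquire load; keeping the
        // producer and the spinning threads in separate warps avoids
        // intra-warp divergence deadlocks on pre-Volta hardware
        while (load_acquire_local(flag + b) == 0) {
        }
        // ordered after the flag read by the acquire load above
        auto payload = load_relaxed_local(value + b);
        (void)payload;
    }
}

Because every communicating thread belongs to the same block, CTA-scoped ordering is sufficient here; the device-scoped load_acquire/store_release variants would also be correct, just more expensive.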