diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh
index 390d5b1..2cdd71b 100644
--- a/include/FastFFT.cuh
+++ b/include/FastFFT.cuh
@@ -138,7 +138,6 @@ __launch_bounds__(FFT::max_threads_per_block) __global__
                                                        const ComplexData_t* __restrict__ input_values,
                                                        ComplexData_t* __restrict__ output_values,
                                                        Offsets mem_offsets,
-                                                       int Q,
                                                        typename FFT::workspace_type workspace_fwd,
                                                        typename invFFT::workspace_type workspace_inv,
                                                        PreOpType pre_op_functor,
@@ -226,39 +225,9 @@ __global__ void clip_into_real_kernel(PositionSpaceType* real_values_gpu,
                                       int3 wanted_coordinate_of_box_center,
                                       OutputBaseType wanted_padding_value);
 
-template
-__device__ __forceinline__ T load_with_hint(const T* ptr, const int idx) {
-    static_assert(hint_type == 0 || hint_type == 1 || hint_type == 2 || hint_type == 3, "invalid mem op hint type");
-    // 0 is default for store or load
-    // Cache at all levels, likely to be accessed again. https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators
-    // Cache write-back all coherent levels.
-    if constexpr ( hint_type == 0 )
-        return __ldca(&ptr[idx]);
-    else if constexpr ( hint_type == 1 )
-        return __ldcg(&ptr[idx]);
-    else if constexpr ( hint_type == 2 )
-        return __ldcs(&ptr[idx]);
-    else
-        return ptr[idx]; // when we are doing a store, we still want to convert type but there is no need for a load
-};
-
-template
-__device__ __forceinline__ void store_with_hint(T* ptr, const int idx, T val_to_store) {
-    static_assert(hint_type == 0 || hint_type == 1 || hint_type == 2, "invalid mem op hint type");
-    // 0 is default for store or load
-    // Cache at all levels, likely to be accessed again. https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators
-    // Cache write-back all coherent levels.
-    if constexpr ( hint_type == 0 )
-        return __stwb(&ptr[idx]);
-    else if constexpr ( hint_type == 1 )
-        return __stcg(&ptr[idx]);
-    else
-        return __stcs(&ptr[idx]);
-}
-
 // TODO: This would be much cleaner if we could first go from complex_compute_t -> float 2 then do conversions
 // I think since this would be a compile time decision, it would be fine, but it would be good to confirm.
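Reviewer note on the block removed above: load_with_hint and store_with_hint were thin wrappers that picked a CUDA cache-operator intrinsic (__ldca, __ldcg, __ldcs on the load side; __stwb, __stcg, __stcs on the store side) at compile time with if constexpr. Since the template parameter lists are not visible in this diff, here is a minimal sketch of the load side under the assumption of a signature like template <typename T, int hint_type = 0>; it illustrates the pattern and is not the exact FastFFT declaration.

// Sketch only: a cache-hinted load in the style of the removed load_with_hint.
// T must be a type the __ldXX intrinsics are overloaded for (e.g. float, float2, float4, int).
// See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cache-operators
template <typename T, int hint_type = 0>
__device__ __forceinline__ T load_with_hint_sketch(const T* ptr, const int idx) {
    static_assert(hint_type >= 0 && hint_type <= 3, "invalid mem op hint type");
    if constexpr ( hint_type == 0 )
        return __ldca(&ptr[idx]); // ld.global.ca: cache at all levels, value likely re-used
    else if constexpr ( hint_type == 1 )
        return __ldcg(&ptr[idx]); // ld.global.cg: cache at L2, bypass L1
    else if constexpr ( hint_type == 2 )
        return __ldcs(&ptr[idx]); // ld.global.cs: streaming, evict-first
    else
        return ptr[idx]; // plain load, let the compiler choose the policy
}

// Hypothetical use inside a kernel: float v = load_with_hint_sketch<float, 2>(global_input, threadIdx.x);

As the replacement lines in the following hunks show, the patch drops these hints entirely and indexes the (possibly reinterpret_cast) pointers directly, leaving the caching policy to the compiler.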
-template
+template
 inline __device__ SetTo_t convert_if_needed(const GetFrom_t* __restrict__ ptr, const unsigned int idx) {
     using complex_compute_t = typename FFT::value_type;
     using scalar_compute_t = typename complex_compute_t::value_type;
@@ -277,41 +246,41 @@ inline __device__ SetTo_t convert_if_needed(const GetFrom_t* __restrict__ ptr, c
         if constexpr ( std::is_same_v || std::is_same_v ) {
             // In this case we assume we have a real valued result, packed into the first half of the complex array
             // TODO: think about cases where we may hit this block unintentionally and how to catch this
-            return std::move(load_with_hint(reinterpret_cast(ptr), idx));
+            return std::move(reinterpret_cast(ptr)[idx]);
         }
         else if constexpr ( std::is_same_v ) {
             // In this case we assume we have a real valued result, packed into the first half of the complex array
             // TODO: think about cases where we may hit this block unintentionally and how to catch this
-            return std::move(__float2half_rn(load_with_hint(reinterpret_cast(ptr), idx)));
+            return std::move(__float2half_rn(reinterpret_cast(ptr)[idx]));
         }
         else if constexpr ( std::is_same_v ) {
             // Note: we will eventually need a similar hase for __nv_bfloat16
             // I think I may need to strip the const news for this to work
             if constexpr ( std::is_same_v ) {
-                return std::move(__floats2half2_rn(load_with_hint(ptr, idx).real( ), 0.f));
+                return std::move(__floats2half2_rn((ptr[idx]).real( ), 0.f));
             }
             else {
-                return std::move(__floats2half2_rn(load_with_hint(static_cast(ptr), idx), 0.f));
+                return std::move(__floats2half2_rn(static_cast(ptr)[idx], 0.f));
             }
         }
         else if constexpr ( std::is_same_v, complex_compute_t> && std::is_same_v, complex_compute_t> ) {
             // return std::move(static_cast(ptr)[idx]);
-            float2 t = load_with_hint(reinterpret_cast(ptr), idx); // FIXME
+            float2 t = (reinterpret_cast(ptr)[idx]); // FIXME
             return std::move(complex_compute_t{t.x, t.y});
         }
         else if constexpr ( std::is_same_v, complex_compute_t> && std::is_same_v, float2> ) {
             // return std::move(static_cast(ptr)[idx]);
-            // return std::move(SetTo_t{load_with_hint(reinterpret_cast(ptr), idx).real( ), load_with_hint(reinterpret_cast(ptr), idx).imag( )});
-            return std::move(SetTo_t{load_with_hint(reinterpret_cast(ptr), idx).x, load_with_hint(reinterpret_cast(ptr), idx).y});
+            // return std::move(SetTo_t{(reinterpret_cast(ptr)[idx]).real( ), (reinterpret_cast(ptr)[idx]).imag( )});
+            return std::move(SetTo_t{(reinterpret_cast(ptr)[idx]).x, (reinterpret_cast(ptr)[idx]).y});
         }
         else if constexpr ( std::is_same_v, float2> && std::is_same_v, complex_compute_t> ) {
             // return std::move(static_cast(ptr)[idx]);
-            return std::move(SetTo_t{load_with_hint(ptr, idx).x, load_with_hint(ptr, idx).y});
+            return std::move(SetTo_t{(ptr[idx]).x, (ptr[idx]).y});
         }
         else if constexpr ( std::is_same_v, float2> && std::is_same_v, float2> ) {
             // return std::move(static_cast(ptr)[idx]);
-            return std::move(load_with_hint(ptr, idx));
+            return std::move((ptr[idx]));
         }
         else {
             static_no_match( );
@@ -321,21 +290,21 @@ inline __device__ SetTo_t convert_if_needed(const GetFrom_t* __restrict__ ptr, c
         if constexpr ( std::is_same_v || std::is_same_v ) {
             // In this case we assume we have a real valued result, packed into the first half of the complex array
             // TODO: think about cases where we may hit this block unintentionally and how to catch this
-            return std::move(load_with_hint(static_cast(ptr), idx));
+            return std::move((static_cast(ptr)[idx]));
         }
         else if constexpr ( std::is_same_v ) {
             // In this case we assume we have a real valued result, packed into the first half of the complex array
             // TODO: think about cases where we may hit this block unintentionally and how to catch this
-            return std::move(__float2half_rn(load_with_hint(static_cast(ptr), idx)));
+            return std::move(__float2half_rn((static_cast(ptr)[idx])));
         }
         else if constexpr ( std::is_same_v ) {
             // Here we assume we are reading a real value and placeing it in a complex array. Could this go sideways?
-            return std::move(__floats2half2_rn(load_with_hint(static_cast(ptr), idx), 0.f));
+            return std::move(__floats2half2_rn((static_cast(ptr)[idx]), 0.f));
         }
         else if constexpr ( std::is_same_v, complex_compute_t> || std::is_same_v, float2> ) {
             // Here we assume we are reading a real value and placeing it in a complex array. Could this go sideways?
-            return std::move(SetTo_t{load_with_hint(static_cast(ptr), idx), 0.f});
+            return std::move(SetTo_t{(static_cast(ptr)[idx]), 0.f});
         }
         else {
             static_no_match( );
@@ -343,21 +312,21 @@ inline __device__ SetTo_t convert_if_needed(const GetFrom_t* __restrict__ ptr, c
     }
     else if constexpr ( std::is_same_v, __half> ) {
         if constexpr ( std::is_same_v || std::is_same_v ) {
-            return std::move(__half2float(load_with_hint(ptr, idx)));
+            return std::move(__half2float((ptr[idx])));
         }
         else if constexpr ( std::is_same_v ) {
             // In this case we assume we have a real valued result, packed into the first half of the complex array
             // TODO: think about cases where we may hit this block unintentionally and how to catch this
-            return std::move(load_with_hint(static_cast(ptr), idx));
+            return std::move((static_cast(ptr)[idx]));
         }
         else if constexpr ( std::is_same_v ) {
             // Here we assume we are reading a real value and placeing it in a complex array. Could this go sideways?
             // FIXME: For some reason CUDART_ZERO_FP16 is not defined even with cuda_fp16.h included
-            return std::move(__halves2half2(load_with_hint(static_cast(ptr), idx), __ushort_as_half((unsigned short)0x0000U)));
+            return std::move(__halves2half2((static_cast(ptr)[idx]), __ushort_as_half((unsigned short)0x0000U)));
         }
         else if constexpr ( std::is_same_v, complex_compute_t> || std::is_same_v, float2> ) {
             // Here we assume we are reading a real value and placeing it in a complex array. Could this go sideways?
-            return std::move(SetTo_t{__half2float(load_with_hint(static_cast(ptr), idx)), 0.f});
+            return std::move(SetTo_t{__half2float((static_cast(ptr)[idx])), 0.f});
         }
         else {
             static_no_match( );
@@ -367,16 +336,16 @@ inline __device__ SetTo_t convert_if_needed(const GetFrom_t* __restrict__ ptr, c
         if constexpr ( std::is_same_v || std::is_same_v || std::is_same_v ) {
             // In this case we assume we have a real valued result, packed into the first half of the complex array
             // TODO: think about cases where we may hit this block unintentionally and how to catch this
-            return std::move(load_with_hint(reinterpret_cast(ptr), idx));
+            return std::move((reinterpret_cast(ptr)[idx]));
         }
         else if constexpr ( std::is_same_v ) {
             // Here we assume we are reading a real value and placeing it in a complex array. Could this go sideways?
             // FIXME: For some reason CUDART_ZERO_FP16 is not defined even with cuda_fp16.h included
-            return std::move(load_with_hint(static_cast(ptr), idx));
+            return std::move((static_cast(ptr)[idx]));
         }
         else if constexpr ( std::is_same_v, complex_compute_t> || std::is_same_v, float2> ) {
             // Here we assume we are reading a real value and placeing it in a complex array. Could this go sideways?
-            return std::move(SetTo_t{__low2float(load_with_hint(static_cast(ptr), idx)), __high2float(load_with_hint(static_cast(ptr), idx))});
+            return std::move(SetTo_t{__low2float((static_cast(ptr)[idx])), __high2float((static_cast(ptr)[idx]))});
         }
         else {
             static_no_match( );
@@ -864,9 +833,9 @@ struct io {
     // TODO: set user lambda to default = false, then get rid of other load_shared
     template
-    static inline __device__ void load_shared(const ExternalImage_t* __restrict__ image_to_search,
-                                              complex_compute_t* __restrict__ thread_data,
-                                              FunctionType intra_op_functor = nullptr) {
+    static inline __device__ void load_external_data(const ExternalImage_t* __restrict__ image_to_search,
+                                                     complex_compute_t* __restrict__ thread_data,
+                                                     FunctionType intra_op_functor = nullptr) {
         unsigned int index = threadIdx.x;
 
         if constexpr ( IS_IKF_t( ) ) {
@@ -1085,7 +1054,7 @@ struct io {
         for ( unsigned int i = 0; i < FFT::input_ept; i++ ) {
             if ( x_prime < SignalLength )
-                smem_buffer[smem_idx] = convert_if_needed(input, x_prime * pixel_pitch + fft_idx + blockIdx.y * n_coalesced_ffts);
+                smem_buffer[smem_idx] = convert_if_needed(input, x_prime * pixel_pitch + fft_idx + blockIdx.y * n_coalesced_ffts);
 
             __syncthreads( );
             if ( index < SignalLength )
@@ -1123,7 +1092,7 @@ struct io {
         unsigned int index = threadIdx.x;
         for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) {
-            thread_data[i] = convert_if_needed(input, index);
+            thread_data[i] = convert_if_needed(input, index);
             index += FFT::stride;
         }
     }
@@ -1141,7 +1110,7 @@ struct io {
         float2 temp;
         for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) {
             if ( index < last_index_to_load ) {
-                temp = pre_op_functor(convert_if_needed(input, index));
+                temp = pre_op_functor(convert_if_needed(input, index));
                 thread_data[i] = convert_if_needed(&temp, 0);
             }
             else {
diff --git a/include/FastFFT.h b/include/FastFFT.h
index c57e43b..6eda68d 100644
--- a/include/FastFFT.h
+++ b/include/FastFFT.h
@@ -18,7 +18,7 @@
 // #define FastFFT_build_sizes 32, 64, 128, 256, 512, 1024, 2048, 4096
 // #define FastFFT_build_sizes 16, 4, 32, 8, 64, 8, 128, 8, 256, 8, 512, 8, 1024, 8, 2048, 8, 4096, 16, 8192, 16
-#define FastFFT_build_sizes 64, 128
+#define FastFFT_build_sizes 512, 4096
 
 namespace FastFFT {
 
diff --git a/src/fastfft/FastFFT.cu b/src/fastfft/FastFFT.cu
index e323864..05968c6 100644
--- a/src/fastfft/FastFFT.cu
+++ b/src/fastfft/FastFFT.cu
@@ -27,6 +27,16 @@
 // rather than saying extern __shared__ __align__(16) value_type shared_mem[]; we define the following instead.
 #define FastFFT_SMEM extern __shared__ __align__(16)
 
+// clang-format off
+#define FastFFT_PRINT_DUMP(LP) \
+    std::cerr << "\nFastFFT Debug Dump\n"; \
+    PrintState(); \
+    PrintLaunchParameters(LP); \
+    std::cerr << "Smem :" << shared_memory << std::endl; \
+    std::cerr << "Ept: " << FFT::elements_per_thread << std::endl; \
+    std::cerr << "Shared memory size: " << FFT::shared_memory_size << std::endl;
+
+// clang-format on
 namespace FastFFT {
 
 // If we are using the suggested ept we need to scale some down that otherwise requiest too many registers as cufftdx is unaware of the higher order transforms.
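Reviewer note on the convert_if_needed hunks above: the edits remove the load_with_hint calls but keep the compile-time dispatch on the source and destination types. For readers skimming the hunks, a simplified, self-contained sketch of that dispatch pattern follows; it covers only a few scalar and packed fp16/fp32 pairs, and the name convert_on_load and the omission of the cufftdx complex_compute_t cases are illustrative choices, not the FastFFT signature.

#include <cuda_fp16.h>
#include <type_traits>

// Sketch only: compile-time conversion-on-load in the style of convert_if_needed.
template <typename SetTo_t, typename GetFrom_t>
__device__ __forceinline__ SetTo_t convert_on_load(const GetFrom_t* __restrict__ ptr, const unsigned int idx) {
    if constexpr ( std::is_same_v<GetFrom_t, SetTo_t> ) {
        return ptr[idx]; // same type, no conversion
    }
    else if constexpr ( std::is_same_v<GetFrom_t, float> && std::is_same_v<SetTo_t, __half> ) {
        return __float2half_rn(ptr[idx]); // fp32 -> fp16 scalar
    }
    else if constexpr ( std::is_same_v<GetFrom_t, __half> && std::is_same_v<SetTo_t, float> ) {
        return __half2float(ptr[idx]); // fp16 -> fp32 scalar
    }
    else if constexpr ( std::is_same_v<GetFrom_t, float> && std::is_same_v<SetTo_t, __half2> ) {
        return __floats2half2_rn(ptr[idx], 0.f); // real value into a packed fp16 complex slot
    }
    else if constexpr ( std::is_same_v<GetFrom_t, __half2> && std::is_same_v<SetTo_t, float2> ) {
        return float2{__low2float(ptr[idx]), __high2float(ptr[idx])}; // packed fp16 -> fp32 complex
    }
    else {
        static_assert(sizeof(SetTo_t) == 0, "no conversion defined for this type pair"); // plays the role of static_no_match()
        return SetTo_t{};
    }
}

A second, smaller note: FastFFT_PRINT_DUMP as added expands to several independent statements, so an unbraced if ( condition ) FastFFT_PRINT_DUMP(LP); would only guard the first std::cerr line. If the macro ever needs to be used conditionally, the usual do { ... } while ( 0 ) form makes it a single statement; a sketch with the same statements (this is not how the patch defines it):

// Sketch only: single-statement variant of the debug-dump macro added above.
#define FastFFT_PRINT_DUMP_SKETCH(LP)                                                \
    do {                                                                             \
        std::cerr << "\nFastFFT Debug Dump\n";                                       \
        PrintState( );                                                               \
        PrintLaunchParameters(LP);                                                   \
        std::cerr << "Smem: " << shared_memory << std::endl;                         \
        std::cerr << "Ept: " << FFT::elements_per_thread << std::endl;               \
        std::cerr << "Shared memory size: " << FFT::shared_memory_size << std::endl; \
    } while ( 0 )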
@@ -68,7 +78,7 @@ __launch_bounds__(FFT::max_threads_per_block) __global__
     void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImage_t* __restrict__ image_to_search,
                                                        const ComplexData_t* __restrict__ input_values,
                                                        ComplexData_t* __restrict__ output_values,
-                                                       Offsets mem_offsets, int apparent_Q,
+                                                       Offsets mem_offsets,
                                                        typename FFT::workspace_type workspace_fwd,
                                                        typename invFFT::workspace_type workspace_inv,
                                                        PreOpType pre_op_functor,
@@ -79,33 +89,34 @@ __launch_bounds__(FFT::max_threads_per_block) __global__
     using complex_compute_t = typename FFT::value_type;
     using scalar_compute_t = typename complex_compute_t::value_type;
 
-    // __shared__ complex_compute_t shared_mem[invFFT::shared_memory_size/sizeof(complex_compute_t)]; // Storage for the input data that is re-used each blcok
-    FastFFT_SMEM complex_compute_t shared_mem[]; // Storage for the input data that is re-used each blcok
+    FastFFT_SMEM complex_compute_t shared_mem[];
 
     complex_compute_t thread_data[FFT::storage_size];
 
-    // For simplicity, we explicitly zeropad the input data to the size of the FFT.
-    // It may be worth trying to use threadIdx.y as in the DECREASE methods.
-    // Until then, this
-
-    io::load(&input_values[Return1DFFTAddress(size_of::value / apparent_Q)], thread_data, size_of::value / apparent_Q, pre_op_functor);
+    // All threads must participate in the FFT due to shared memory fences, so there
+    // is no way to merge kernels w/ different sized FFTs; we are forced to explicitly
+    // zero-pad the input on the forward.
+    io::load(&input_values[Return1DFFTAddress(mem_offsets.physical_x_input)],
+             thread_data,
+             mem_offsets.physical_x_input,
+             pre_op_functor);
 
     // In the first FFT the modifying twiddle factor is 1 so the data are reeal
     FFT( ).execute(thread_data, shared_mem, workspace_fwd);
 
 #if FFT_DEBUG_STAGE > 3
     // * apparent_Q
-    io::load_shared(&image_to_search[Return1DFFTAddress(size_of::value)],
-                    thread_data,
-                    intra_op_functor);
+    io::load_external_data(&image_to_search[Return1DFFTAddress(size_of::value)],
+                           thread_data,
+                           intra_op_functor);
 #endif
 
 #if FFT_DEBUG_STAGE > 4
     invFFT( ).execute(thread_data, shared_mem, workspace_inv);
-    io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)], post_op_functor);
+    io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)], post_op_functor);
 #else
     // Do not do the post op lambda if the invFFT is not used.
-    io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)]);
+    io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)]);
 #endif
 }
 
@@ -1607,19 +1618,18 @@ __launch_bounds__(MAX_TPB) __global__
                                          shared_mem,
                                          gridDim.y * n_ffts);
-
+    FFT( ).execute(thread_data, &shared_mem[FFT::shared_memory_size / sizeof(complex_compute_t) * threadIdx.y], workspace);
 
     // To cut down on total smem needed split these
-    // FFT( ).execute(thread_data, &shared_mem[FFT::shared_memory_size / sizeof(complex_compute_t) * threadIdx.y], workspace);
-    // FIXME: the reduction by 2 is also used at the kernel call site and shoulid be set somehwere once.
-    const unsigned int eve_odd = threadIdx.y % 2;
-    if ( eve_odd == 0 ) {
-        FFT( ).execute(thread_data, &shared_mem[FFT::shared_memory_size / sizeof(complex_compute_t) * (threadIdx.y / 2)], workspace);
-    }
-    __syncthreads( );
-    if ( eve_odd == 1 ) {
-        FFT( ).execute(thread_data, &shared_mem[FFT::shared_memory_size / sizeof(complex_compute_t) * (threadIdx.y / 2)], workspace);
-    }
-    __syncthreads( );
+    // FIXME: I don't think there is any way to get this to work, b/c the FFT().execute method inserts barrier.sync ptx, which afaik behaves the same as __syncthreads
+    // const unsigned int eve_odd = threadIdx.y % 2;
+    // if ( eve_odd == 0 ) {
+    //     FFT( ).execute(thread_data, &shared_mem[FFT::shared_memory_size / sizeof(complex_compute_t) * (threadIdx.y / 2)], workspace);
+    // }
+    // __syncthreads( );
+    // if ( eve_odd == 1 ) {
+    //     FFT( ).execute(thread_data, &shared_mem[FFT::shared_memory_size / sizeof(complex_compute_t) * (threadIdx.y / 2)], workspace);
+    // }
+    // __syncthreads( );
 #else
     static_assert(n_ffts == 1, "C2R_BUFFER_LINES must be enabled, should not get hereonly for n_ffts == 1");
@@ -2056,11 +2066,6 @@ void FourierTransformer
 #if FFT_DEBUG_STAGE > 0
         cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_R2C_INCREASE_XY, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory));
@@ -2305,9 +2310,6 @@ void FourierTransformer
         (error_code);
         int shared_memory = FFT::shared_memory_size + (unsigned int)sizeof(complex_compute_t) * (LP.mem_offsets.shared_input + LP.mem_offsets.shared_output);
-        std::cerr << "\n\nShared Mem from c2c fwd increase " << shared_memory << std::endl; // revert
-        PrintLaunchParameters(LP);
-        PrintState( );
 #if FFT_DEBUG_STAGE > 2
         CheckSharedMemory(shared_memory, device_properties);
         cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_INCREASE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory));
@@ -2366,9 +2368,6 @@ void FourierTransformer
         (error_code);
         int shared_memory = FFT::shared_memory_size;
-        std::cerr << "\n\nShared Mem from c2c_inv_none " << shared_memory << std::endl; // revert
-        PrintLaunchParameters(LP);
-        PrintState( );
         CheckSharedMemory(shared_memory, device_properties);
 #if FFT_DEBUG_STAGE > 4
         cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory));
@@ -2595,8 +2594,11 @@ void FourierTransformer
 #if FFT_DEBUG_STAGE > 6
@@ -2803,6 +2799,7 @@ void FourierTransformer
     ( ) ) { // FIXME
 #if FFT_DEBUG_STAGE > 2
+            // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size.
             // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion.
             int apparent_Q = size_of::value / fwd_dims_in.y;
@@ -2840,7 +2838,6 @@ void FourierTransformer
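Reviewer note on the FIXME added in the C2R_BUFFER_LINES hunk above: as that comment records, cufftdx's FFT().execute emits barrier.sync, which has the same block-wide semantics as __syncthreads, and a block-wide barrier reached from a branch that only part of the block takes is undefined behavior. That is why the even/odd threadIdx.y split is now commented out and every thread calls execute on its own threadIdx.y slice of shared memory instead. A minimal stand-alone illustration of the constraint; the kernel and names here are hypothetical, not FastFFT code, and it assumes a launch with blockDim = (128, 2, 1).

// Sketch only: why a block-wide barrier cannot sit behind a threadIdx.y-dependent branch.
__global__ void barrier_divergence_demo(float* out) {
    __shared__ float buf[2][128];
    buf[threadIdx.y][threadIdx.x] = static_cast<float>(threadIdx.x);

    // OK: every thread in the block reaches the same barrier.
    __syncthreads( );

    // NOT OK: a barrier reached only by the even rows is undefined behavior
    // (typically a hang or corrupted shared memory). Because the barrier is
    // generated inside FFT().execute, splitting that call by threadIdx.y parity
    // hits exactly this case, which is what the FIXME above records.
    // if ( threadIdx.y % 2 == 0 ) {
    //     __syncthreads( );
    // }

    out[threadIdx.y * blockDim.x + threadIdx.x] = buf[threadIdx.y][(threadIdx.x + 1) % 128];
}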