diff --git a/.vscode_shared/BenHimes/settings.json b/.vscode_shared/BenHimes/settings.json index 5eb033f..f774bbd 100644 --- a/.vscode_shared/BenHimes/settings.json +++ b/.vscode_shared/BenHimes/settings.json @@ -77,7 +77,9 @@ "unordered_set": "cpp", "__config": "cpp", "target": "cpp", - "ios": "cpp" + "ios": "cpp", + "__pragma_push": "cpp", + "__locale": "cpp" }, "C_Cpp.clang_format_path": "/usr/bin/clang-format-14", "editor.formatOnSave": true, diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh index 17d094d..3c6e323 100644 --- a/include/FastFFT.cuh +++ b/include/FastFFT.cuh @@ -23,15 +23,7 @@ -template -constexpr inline bool IS_IKF_t( ) { - if constexpr ( std::is_final_v ) { - return true; - } - else { - return false; - } -}; + @@ -392,24 +384,24 @@ template __launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel_C2C_WithPadding_SwapRealSpaceQuadrants(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); -template +template __launch_bounds__(invFFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); -template +template __launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); -template +template __launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); -template -__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, +template +__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type 
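The kernel declarations in FastFFT.cuh above are reworked so the search-image argument is templated on its element type (ExternalImage_t) rather than on a pointer type, with const, * and __restrict__ spelled out once in the signature. A minimal sketch of the before/after pattern, using placeholder kernel names rather than the library's full parameter lists:

// Before: the template parameter is itself a pointer type (e.g. __half2* or float2*).
template <class FFT, class ComplexType, class ExternalImagePtr_t>
__global__ void kernel_before(const ExternalImagePtr_t __restrict__ image_to_search,
                              const ComplexType* __restrict__ input_values);

// After: the template parameter is the element type; pointer-ness, const and
// __restrict__ are written in the signature, matching the other arguments.
template <class FFT, class ComplexType, class ExternalImage_t>
__global__ void kernel_after(const ExternalImage_t* __restrict__ image_to_search,
                             const ComplexType* __restrict__ input_values);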
workspace_inv); template @@ -464,8 +456,8 @@ __global__ void thread_fft_kernel_R2C_decomposed_transposed(const ScalarType* __ template __global__ void thread_fft_kernel_C2C_decomposed(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); -template -__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); +template +__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); ///////////// // C2R @@ -493,6 +485,39 @@ __global__ void clip_into_real_kernel(InputType* real_values_gpu, int3 wanted_coordinate_of_box_center, OutputBaseType wanted_padding_value); +// TODO: this may be expanded, for now it is to be used in the case where we have +// packed the real values of a c2r into the first half of the complex array. +// The output type pointer needs to be cast to the correct type AND posibly converted +template +inline __device__ WantedType convert_if_needed(const GivenType* __restrict__ ptr, const int idx) { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + WantedType ret; + if constexpr ( std::is_same_v || std::is_same_v ) { + if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { + ret = __float2half_rn(reinterpret_cast(ptr)[idx]); + return std::move(ret); + } + if constexpr ( std::is_same_v ) { + ret = reinterpret_cast(ptr)[idx]; + return std::move(ret); + } + else { + static_assert_type_name(ret); + } + } + else { + // Currently only setup for the above + static_assert_type_name(ptr); + } + } + else { + // Currently only setup for the above + static_assert_type_name(ptr); + } +} + ////////////////////////////////////////////// // IO functions adapted from the cufftdx examples /////////////////////////////// @@ -502,7 +527,9 @@ struct io { using complex_type = typename FFT::value_type; using scalar_type = typename complex_type::value_type; - static inline __device__ unsigned int stride_size( ) { + static inline __device__ unsigned int + stride_size( ) { + return cufftdx::size_of::value / FFT::elements_per_thread; } @@ -517,9 +544,9 @@ struct io { } } - static inline __device__ void store_r2c(const complex_type* thread_data, - complex_type* output, - int offset) { + static inline __device__ void store_r2c(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int offset) { const unsigned int stride = stride_size( ); unsigned int index = offset + threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -539,14 +566,14 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q) { + static inline __device__ void load_shared(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + 
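The new convert_if_needed() helper added above reads element idx from a buffer that may hold the packed real output of a c2r transform and converts it to the requested output type at compile time. A minimal standalone sketch of the if-constexpr dispatch it relies on, with assumed template parameters (the real helper is additionally keyed off FFT::value_type and handles more type combinations):

#include <cuda_fp16.h>
#include <type_traits>

template <typename WantedType, typename GivenType>
inline __device__ WantedType convert_if_needed_sketch(const GivenType* __restrict__ ptr, const int idx) {
    if constexpr ( std::is_same_v<WantedType, __half> && std::is_same_v<GivenType, float> ) {
        return __float2half_rn(ptr[idx]); // round-to-nearest float -> half conversion
    }
    else if constexpr ( std::is_same_v<WantedType, GivenType> ) {
        return ptr[idx]; // types already match, plain copy
    }
    else {
        static_assert(std::is_same_v<WantedType, GivenType>, "conversion not handled in this sketch");
    }
}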
float* __restrict__ twiddle_factor_args, + float twiddle_in, + int* input_map, + int* __restrict__ output_map, + int Q) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -562,11 +589,11 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in) { + static inline __device__ void load_shared(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -577,15 +604,15 @@ struct io { } } - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q, - int number_of_elements) { + static inline __device__ void load_shared(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in, + int* __restrict__ input_map, + int* __restrict__ output_map, + int Q, + int number_of_elements) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -606,14 +633,14 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - alternatively, load to registers then copy but leave in register for firt compute - static inline __device__ void load_r2c_shared(const scalar_type* input, - scalar_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q) { + static inline __device__ void load_r2c_shared(const scalar_type* __restrict__ input, + scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in, + int* __restrict__ input_map, + int* __restrict__ output_map, + int Q) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -631,11 +658,11 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - alternatively, load to registers then copy but leave in register for firt compute - static inline __device__ void load_r2c_shared(const scalar_type* input, - scalar_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in) { + static inline __device__ void load_r2c_shared(const scalar_type* __restrict__ input, + scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in) { const 
unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -647,8 +674,8 @@ struct io { } } - static inline __device__ void load_r2c_shared_and_pad(const scalar_type* input, - complex_type* shared_mem) { + static inline __device__ void load_r2c_shared_and_pad(const scalar_type* __restrict__ input, + complex_type* __restrict__ shared_mem) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -658,9 +685,9 @@ struct io { __syncthreads( ); } - static inline __device__ void copy_from_shared(const complex_type* shared_mem, - complex_type* thread_data, - const unsigned int Q) { + static inline __device__ void copy_from_shared(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data, + const unsigned int Q) { const unsigned int stride = stride_size( ) * Q; // I think the Q is needed, but double check me TODO unsigned int index = (threadIdx.x * Q) + threadIdx.z; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -673,8 +700,8 @@ struct io { // Note that unlike most functions in this file, this one does not have a // const decorator on the thread mem, as we want to modify it with the twiddle factors // before reducing the full shared mem space. - static inline __device__ void reduce_block_fft(complex_type* thread_data, - complex_type* shared_mem, + static inline __device__ void reduce_block_fft(complex_type* __restrict__ thread_data, + complex_type* __restrict__ shared_mem, const float twiddle_in, const unsigned int Q) { const unsigned int stride = stride_size( ); @@ -706,10 +733,10 @@ struct io { } } - static inline __device__ void store_r2c_reduced(const complex_type* thread_data, - complex_type* output, - const unsigned int pixel_pitch, - const unsigned int memory_limit) { + static inline __device__ void store_r2c_reduced(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + const unsigned int pixel_pitch, + const unsigned int memory_limit) { if ( threadIdx.z == 0 ) { // Finally we write out the first size_of::values to global const unsigned int stride = stride_size( ); @@ -726,25 +753,25 @@ struct io { // when using load_shared || load_r2c_shared, we need then copy from shared mem into the registers. // notice we still need the packed complex values for the xform. 
- static inline __device__ void copy_from_shared(const scalar_type* shared_input, - complex_type* thread_data, - int* input_map) { + static inline __device__ void copy_from_shared(const scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + int* input_map) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { thread_data[i].x = shared_input[input_map[i]]; thread_data[i].y = 0.0f; } } - static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, - complex_type* thread_data, - int* input_map) { + static inline __device__ void copy_from_shared(const complex_type* __restrict__ shared_input_complex, + complex_type* __restrict__ thread_data, + int* __restrict__ input_map) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { thread_data[i] = shared_input_complex[input_map[i]]; } } - static inline __device__ void copy_from_shared(const scalar_type* shared_input, - complex_type* thread_data) { + static inline __device__ void copy_from_shared(const scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -754,8 +781,8 @@ struct io { } } - static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, - complex_type* thread_data) { + static inline __device__ void copy_from_shared(const complex_type* __restrict__ shared_input_complex, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -764,13 +791,13 @@ struct io { } } - template - static inline __device__ void load_shared_and_conj_multiply(const ExternalImagePtr_t* image_to_search, - complex_type* thread_data) { + template + static inline __device__ void load_shared_and_conj_multiply(ExternalImage_t* __restrict__ image_to_search, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; complex_type c; - if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { c.x = (thread_data[i].x * __low2float(image_to_search[index]) + thread_data[i].y * __high2float(image_to_search[index].y)); c.y = (thread_data[i].y * __low2float(image_to_search[index]) - thread_data[i].x * __high2float(image_to_search[index].y)); @@ -778,7 +805,7 @@ struct io { index += stride; } } - else if constexpr ( std::is_same_v ) { + else if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { c.x = (thread_data[i].x * image_to_search[index].x + thread_data[i].y * image_to_search[index].y); c.y = (thread_data[i].y * image_to_search[index].x - thread_data[i].x * image_to_search[index].y); @@ -792,14 +819,14 @@ struct io { } // TODO: set user lambda to default = false, then get rid of other load_shared - template - static inline __device__ void load_shared(const ExternalImagePtr_t image_to_search, - complex_type* thread_data, - FunctionType intra_op_functor = nullptr) { + template + static inline __device__ void load_shared(const ExternalImage_t* __restrict__ image_to_search, + complex_type* __restrict__ thread_data, + FunctionType intra_op_functor = nullptr) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; if constexpr ( IS_IKF_t( ) ) { - if constexpr ( 
std::is_same_v ) { + if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { intra_op_functor(thread_data[i].x, thread_data[i].y, __low2float(image_to_search[index]), __high2float(image_to_search[index])); //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); index += stride; @@ -823,8 +850,8 @@ struct io { // Now we need send to shared mem and transpose on the way // TODO: fix bank conflicts later. - static inline __device__ void transpose_r2c_in_shared_XZ(complex_type* shared_mem, - complex_type* thread_data) { + static inline __device__ void transpose_r2c_in_shared_XZ(complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -842,8 +869,8 @@ struct io { // Now we need send to shared mem and transpose on the way // TODO: fix bank conflicts later. - static inline __device__ void transpose_in_shared_XZ(complex_type* shared_mem, - complex_type* thread_data) { + static inline __device__ void transpose_in_shared_XZ(complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data) { const unsigned int stride = io::stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -855,8 +882,8 @@ struct io { __syncthreads( ); } - static inline __device__ void store_r2c_transposed_xz(const complex_type* thread_data, - complex_type* output) { + static inline __device__ void store_r2c_transposed_xz(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -873,8 +900,8 @@ struct io { } // Store a transposed tile, made up of contiguous (full) FFTS - static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output) { + static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; @@ -892,10 +919,10 @@ struct io { // Store a transposed tile, made up of non-contiguous (strided partial) FFTS // - static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { + static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output, + const unsigned int Q, + const unsigned int sub_fft) { const unsigned int stride = stride_size( ); constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; @@ -911,8 +938,8 @@ struct io { __syncthreads( ); } - static inline __device__ void store_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output) { + static inline __device__ void store_transposed_xz_strided_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + threadIdx.z * cufftdx::size_of::value; for ( 
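load_shared_and_conj_multiply() above correlates the thread's transform values against the search image by multiplying with the complex conjugate of the image value, with a separate branch when the image is stored as __half2. A minimal per-element sketch of that conjugate multiply, assuming a float2 accumulator (the kernels themselves work on FFT::value_type):

#include <cuda_fp16.h>

__device__ inline float2 conj_mul_sketch(const float2 a, const __half2 b) {
    const float br = __low2float(b);   // real part of the image value
    const float bi = __high2float(b);  // imaginary part of the image value
    float2 c;
    c.x = a.x * br + a.y * bi; // Re(a * conj(b))
    c.y = a.y * br - a.x * bi; // Im(a * conj(b))
    return c;
}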
unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -922,9 +949,9 @@ struct io { __syncthreads( ); } - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int pixel_pitch) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int pixel_pitch) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -940,10 +967,10 @@ struct io { } } - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int* output_MAP, - int pixel_pitch) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* output_MAP, + int pixel_pitch) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index @@ -958,11 +985,11 @@ struct io { } } - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int* output_MAP, - int pixel_pitch, - int memory_limit) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ output_MAP, + int pixel_pitch, + int memory_limit) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i <= FFT::elements_per_thread / 2; i++ ) { // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index @@ -982,8 +1009,8 @@ struct io { // } } - static inline __device__ void load_c2r(const complex_type* input, - complex_type* thread_data) { + static inline __device__ void load_c2r(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -999,9 +1026,9 @@ struct io { } } - static inline __device__ void load_c2r_transposed(const complex_type* input, - complex_type* thread_data, - unsigned int pixel_pitch) { + static inline __device__ void load_c2r_transposed(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + unsigned int pixel_pitch) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -1017,9 +1044,9 @@ struct io { } } - static inline __device__ void load_c2r_shared_and_pad(const complex_type* input, - complex_type* shared_mem, - const unsigned int pixel_pitch) { + static inline __device__ void load_c2r_shared_and_pad(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_mem, + const unsigned int pixel_pitch) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -1037,8 +1064,8 @@ struct io { } // this may benefit from asynchronous execution - static inline __device__ void load(const complex_type* input, - complex_type* thread_data) { + static inline __device__ void load(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data) { 
const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1050,9 +1077,9 @@ struct io { } // this may benefit from asynchronous execution - static inline __device__ void load(const complex_type* input, - complex_type* thread_data, - int last_index_to_load) { + static inline __device__ void load(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + int last_index_to_load) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1066,10 +1093,10 @@ struct io { // TODO: set pre_op_functor to default=false and get rid of other load template - static inline __device__ void load(const complex_type* input, - complex_type* thread_data, - int last_index_to_load, - FunctionType pre_op_functor = nullptr) { + static inline __device__ void load(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + int last_index_to_load, + FunctionType pre_op_functor = nullptr) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; if constexpr ( IS_IKF_t( ) ) { @@ -1092,9 +1119,9 @@ struct io { } } - static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, - complex_type* output, - int first_negative_index) { + static inline __device__ void store_and_swap_quadrants(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int first_negative_index) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; complex_type phase_shift; @@ -1112,10 +1139,10 @@ struct io { } } - static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, - complex_type* output, - int* source_idx, - int first_negative_index) { + static inline __device__ void store_and_swap_quadrants(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ source_idx, + int first_negative_index) { const unsigned int stride = stride_size( ); complex_type phase_shift; int logical_y; @@ -1132,9 +1159,9 @@ struct io { } template - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - FunctionType post_op_functor = nullptr) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + FunctionType post_op_functor = nullptr) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; if constexpr ( IS_IKF_t( ) ) { @@ -1151,10 +1178,10 @@ struct io { } } - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + const unsigned int Q, + const unsigned int sub_fft) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1163,8 +1190,8 @@ struct io { } } - static inline __device__ void store_Z(const complex_type* shared_mem, - complex_type* output) { + static inline __device__ void store_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + threadIdx.z * size_of::value; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ 
) { @@ -1174,10 +1201,10 @@ struct io { } } - static inline __device__ void store_Z(const complex_type* shared_mem, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { + static inline __device__ void store_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output, + const unsigned int Q, + const unsigned int sub_fft) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + threadIdx.z * size_of::value; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1187,9 +1214,9 @@ struct io { __syncthreads( ); } - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - unsigned int memory_limit) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + unsigned int memory_limit) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1199,9 +1226,9 @@ struct io { } } - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - int* source_idx) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ source_idx) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // If no kernel based changes are made to source_idx, this will be the same as the original index value @@ -1209,9 +1236,9 @@ struct io { } } - static inline __device__ void store_subset(const complex_type* thread_data, - complex_type* output, - int* source_idx) { + static inline __device__ void store_subset(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ source_idx) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // If no kernel based changes are made to source_idx, this will be the same as the original index value @@ -1220,9 +1247,9 @@ struct io { } } - static inline __device__ void store_coalesced(const complex_type* shared_output, - complex_type* global_output, - int offset) { + static inline __device__ void store_coalesced(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ global_output, + int offset) { const unsigned int stride = stride_size( ); unsigned int index = offset + threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1231,8 +1258,8 @@ struct io { } } - static inline __device__ void load_c2c_shared_and_pad(const complex_type* input, - complex_type* shared_mem) { + static inline __device__ void load_c2c_shared_and_pad(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_mem) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1242,8 +1269,8 @@ struct io { __syncthreads( ); } - static inline __device__ void store_c2c_reduced(const complex_type* thread_data, - complex_type* output) { + static inline __device__ void store_c2c_reduced(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output) { if ( threadIdx.z == 0 ) { // Finally we write out the first size_of::values to global const unsigned int stride = stride_size( ); @@ -1258,27 +1285,28 @@ struct io { } } - static inline __device__ void store_c2r_reduced(const complex_type* 
thread_data, - scalar_type* output) { + static inline __device__ void store_c2r_reduced(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ output) { if ( threadIdx.z == 0 ) { // Finally we write out the first size_of::values to global const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { if ( index < size_of::value ) { // transposed index. - output[index] = reinterpret_cast(thread_data)[i]; + output[index] = convert_if_needed(thread_data, i); } index += stride; } } } - static inline __device__ void store_transposed(const complex_type* thread_data, - complex_type* output, - int* output_map, - int* rotated_offset, - int memory_limit) { + static inline __device__ void store_transposed(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ output_map, + int* __restrict__ rotated_offset, + int memory_limit) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // If no kernel based changes are made to source_idx, this will be the same as the original index value if ( output_map[i] < memory_limit ) @@ -1286,25 +1314,27 @@ struct io { } } - static inline __device__ void store_c2r(const complex_type* thread_data, - scalar_type* output) { + static inline __device__ void store_c2r(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[index] = reinterpret_cast(thread_data)[i]; + output[index] = convert_if_needed(thread_data, i); index += stride; } } - static inline __device__ void store_c2r(const complex_type* thread_data, - scalar_type* output, - unsigned int memory_limit) { + static inline __device__ void store_c2r(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ output, + unsigned int memory_limit) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // TODO: does reinterpret_cast(thread_data)[i] make more sense than just thread_data[i].x?? 
if ( index < memory_limit ) - output[index] = reinterpret_cast(thread_data)[i]; + output[index] = convert_if_needed(thread_data, i); index += stride; } } @@ -1315,9 +1345,9 @@ struct io_thread { using complex_type = typename FFT::value_type; using scalar_type = typename complex_type::value_type; - static inline __device__ void load_r2c(const scalar_type* input, - complex_type* thread_data, - const int stride) { + static inline __device__ void load_r2c(const scalar_type* __restrict__ input, + complex_type* __restrict__ thread_data, + const int stride) { unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value; i++ ) { thread_data[i].x = input[index]; @@ -1326,10 +1356,10 @@ struct io_thread { } } - static inline __device__ void store_r2c(const complex_type* shared_output, - complex_type* output, - const int stride, - const int memory_limit) { + static inline __device__ void store_r2c(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ output, + const int stride, + const int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { @@ -1341,11 +1371,11 @@ struct io_thread { } } - static inline __device__ void store_r2c_transposed_xy(const complex_type* shared_output, - complex_type* output, - int stride, - int pixel_pitch, - int memory_limit) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ output, + int stride, + int pixel_pitch, + int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { @@ -1357,11 +1387,11 @@ struct io_thread { } } - static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, - complex_type* shared_output, - float twiddle_in, - int Q, - int memory_limit) { + static inline __device__ void remap_decomposed_segments(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ shared_output, + float twiddle_in, + int Q, + int memory_limit) { // Unroll the first loop and initialize the shared mem. 
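store_c2r() and store_c2r_reduced() above write out the c2r result, whose real samples sit packed two per complex register; the old reinterpret_cast read is replaced by convert_if_needed() so the write can also change precision. A sketch of the packing assumption, using float2 as a stand-in for the complex register type:

__device__ inline float read_packed_real_sketch(const float2* __restrict__ thread_data, const int i) {
    // After the c2r stage, real sample 2k sits in thread_data[k].x and
    // sample 2k+1 in thread_data[k].y, so a scalar view indexes them linearly.
    return reinterpret_cast<const float*>(thread_data)[i];
}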
complex_type twiddle; int index = threadIdx.x * size_of::value; @@ -1389,9 +1419,9 @@ struct io_thread { __syncthreads( ); } - static inline __device__ void load_c2c(const complex_type* input, - complex_type* thread_data, - const int stride) { + static inline __device__ void load_c2c(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + const int stride) { unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value; i++ ) { thread_data[i] = input[index]; @@ -1399,9 +1429,9 @@ struct io_thread { } } - static inline __device__ void store_c2c(const complex_type* shared_output, - complex_type* output, - const int stride) { + static inline __device__ void store_c2c(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ output, + const int stride) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value; i++ ) { @@ -1410,10 +1440,10 @@ struct io_thread { } } - static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, - complex_type* shared_output, - float twiddle_in, - int Q) { + static inline __device__ void remap_decomposed_segments(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ shared_output, + float twiddle_in, + int Q) { // Unroll the first loop and initialize the shared mem. complex_type twiddle; int index = threadIdx.x * size_of::value; @@ -1438,10 +1468,10 @@ struct io_thread { __syncthreads( ); } - static inline __device__ void load_c2r(const complex_type* input, - complex_type* thread_data, - const int stride, - const int memory_limit) { + static inline __device__ void load_c2r(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + const int stride, + const int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; unsigned int offset = 2 * memory_limit - 2; @@ -1460,11 +1490,11 @@ struct io_thread { } // FIXME as above - static inline __device__ void load_c2r_transposed(const complex_type* input, - complex_type* thread_data, - int stride, - int pixel_pitch, - int memory_limit) { + static inline __device__ void load_c2r_transposed(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + int stride, + int pixel_pitch, + int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; // unsigned int offset = 2*memory_limit - 2; @@ -1483,10 +1513,10 @@ struct io_thread { } } - static inline __device__ void remap_decomposed_segments_c2r(const complex_type* thread_data, - scalar_type* shared_output, - scalar_type twiddle_in, - int Q) { + static inline __device__ void remap_decomposed_segments_c2r(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ shared_output, + scalar_type twiddle_in, + int Q) { // Unroll the first loop and initialize the shared mem. 
complex_type twiddle; int index = threadIdx.x * size_of::value; @@ -1510,25 +1540,26 @@ struct io_thread { __syncthreads( ); } - static inline __device__ void store_c2r(const scalar_type* shared_output, - scalar_type* output, - const int stride) { + static inline __device__ void store_c2r(const scalar_type* __restrict__ shared_output, + scalar_type* __restrict__ output, + const int stride) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value; i++ ) { - output[index] = shared_output[index]; + output[index] = convert_if_needed(shared_output, index); index += stride; } } - template - static inline __device__ void load_shared_and_conj_multiply(const ExternalImagePtr_t image_to_search, - const complex_type* shared_mem, - complex_type* thread_data, - const int stride) { + template + static inline __device__ void load_shared_and_conj_multiply(const ExternalImage_t* __restrict__ image_to_search, + const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data, + const int stride) { unsigned int index = threadIdx.x; complex_type c; - if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { c.x = (shared_mem[index].x * __low2float(image_to_search[index]) + shared_mem[index].y * __high2float(image_to_search[index])); diff --git a/include/FastFFT.h b/include/FastFFT.h index ffd574f..25aaf72 100644 --- a/include/FastFFT.h +++ b/include/FastFFT.h @@ -30,6 +30,16 @@ inline constexpr _Tp pi_v = 3.141592653589793238462643383279502884L; #include "../src/fastfft/types.cuh" +template +constexpr inline bool IS_IKF_t( ) { + if constexpr ( std::is_final_v ) { + return true; + } + else { + return false; + } +}; + template __device__ __host__ inline void static_assert_type_name(T v) { @@ -45,6 +55,7 @@ __device__ __host__ inline void static_assert_type_name(T v) { static_assert(! std::is_same_v, "double*"); static_assert(! std::is_same_v, "__half*"); static_assert(! std::is_same_v, "__half2*"); + static_assert(! std::is_same_v, "nullptr_t"); } else { static_assert(! std::is_same_v, "int"); @@ -297,11 +308,11 @@ class FourierTransformer { // For the time being, the caller is responsible for having the memory allocated for any of these input/output pointers. void SetInputPointer(InputType* input_pointer); - template - void SetOutputPointer(ExternalImagePtr_t* output_pointer); + template + void SetOutputPointer(ExternalImage_t* output_pointer); - template - void SetExternalImagePointer(ExternalImagePtr_t* output_pointer); + template + void SetExternalImagePointer(ExternalImage_t* output_pointer); // When passing in a pointer from python (cupy or pytorch) it is a long, and needs to be cast to input type. // For now, we are assuming memory ops are all handled in the python code. void SetInputPointerFromPython(long input_pointer); @@ -562,6 +573,8 @@ class FourierTransformer { */ + // FIXME: in the execution blocks, we should have some check that the correct direction is implemented. 
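IS_IKF_t(), moved into FastFFT.h above, is the compile-time test for whether an intra-kernel functor was actually supplied: the default arguments are nullptr (type nullptr_t), which is not a final class type, whereas the lambda closure types passed in practice are what the std::is_final_v check keys on. A minimal sketch of that gate and of how a loader can use it (names assumed):

#include <type_traits>

template <class FunctorType>
constexpr inline bool is_intra_kernel_functor_sketch( ) {
    // nullptr_t and other non-class types report false; a supplied functor/lambda reports true.
    return std::is_final_v<FunctorType>;
}

template <class PreOpType>
__device__ void maybe_apply_pre_op_sketch(float& x, PreOpType pre_op) {
    if constexpr ( is_intra_kernel_functor_sketch<PreOpType>( ) ) {
        x = pre_op(x); // only instantiated when a callable was actually passed
    }
}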
+ // Or better yet, have this templated and enum KernelType { r2c_decomposed, // 1D fwd r2c_decomposed_transposed, // 2d fwd 1st stage r2c_none_XY, // 1d fwd // 2d fwd 1st stage @@ -579,7 +592,8 @@ class FourierTransformer { c2c_inv_none_XYZ, c2c_inv_decrease, c2c_inv_increase, - c2c_decomposed, + c2c_fwd_decomposed, + c2c_inv_decomposed, c2r_decomposed, c2r_decomposed_transposed, c2r_none, @@ -601,7 +615,8 @@ class FourierTransformer { "r2c_decrease_XY", "r2c_increase_XY", "r2c_increase_XZ", "c2c_fwd_none", "c2c_fwd_none_XYZ", "c2c_fwd_increase", "c2c_fwd_increase", "c2c_fwd_increase_XYZ", "c2c_inv_none", "c2c_inv_none_XZ", "c2c_inv_none_XYZ", "c2c_inv_increase", "c2c_inv_increase", - "c2c_decomposed", + "c2c_fwd_decomposed", + "c2c_inv_decomposed", "c2r_decomposed", "c2r_decomposed_transposed", "c2r_none", "c2r_none_XY", "c2r_decrease_XY", "c2r_increase", @@ -614,7 +629,7 @@ class FourierTransformer { inline bool IsThreadType(KernelType kernel_type) { if ( kernel_type == r2c_decomposed || kernel_type == r2c_decomposed_transposed || - kernel_type == c2c_decomposed || + kernel_type == c2c_fwd_decomposed || c2c_inv_decomposed || kernel_type == c2r_decomposed || kernel_type == c2r_decomposed_transposed || kernel_type == xcorr_decomposed ) { return true; } @@ -717,7 +732,7 @@ class FourierTransformer { void GetTransformSize(KernelType kernel_type); void GetTransformSize_thread(KernelType kernel_type, int thread_fft_size); - LaunchParams SetLaunchParameters(const int& ept, KernelType kernel_type, bool do_forward_transform = true); + LaunchParams SetLaunchParameters(const int& ept, KernelType kernel_type); inline int ReturnPaddedMemorySize(short4& wanted_dims) { // Assumes a) SetInputDimensionsAndType has been called and is_fftw_padded is set before this call. (Currently RuntimeAssert to die if false) FIXME @@ -756,12 +771,13 @@ class FourierTransformer { // 1. // First call passed from a public transform function, selects block or thread and the transform precision. template // bool is just used as a dummy type - void SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform = true, PreOpType pre_op_functor = nullptr, IntraOpType intra_op_functor = nullptr, PostOpType post_op_functor = nullptr); + typename std::enable_if_t( ))> + SetPrecisionAndExectutionMethod(KernelType kernel_type, PreOpType pre_op_functor = nullptr, IntraOpType intra_op_functor = nullptr, PostOpType post_op_functor = nullptr); // 2. // TODO: remove this now that the functors are working // Check to see if any intra kernel functions are wanted, and if so set the appropriate device pointers. template - void SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SetIntraKernelFunctions(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); // 3. // Second call, sets size of the transform kernel, selects the appropriate GPU arch @@ -771,15 +787,15 @@ class FourierTransformer { // This allows us to iterate through a set of constexpr sizes passed as a template parameter pack. The value is in providing a means to have different size packs // for different fft configurations, eg. 
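With c2c_decomposed split into the fwd/inv pair, each value in IsThreadType() presumably needs its own kernel_type == comparison; a sketch of the intended check (assumes the KernelType enum declared above):

inline bool IsThreadType_sketch(KernelType kernel_type) {
    return ( kernel_type == r2c_decomposed || kernel_type == r2c_decomposed_transposed ||
             kernel_type == c2c_fwd_decomposed || kernel_type == c2c_inv_decomposed ||
             kernel_type == c2r_decomposed || kernel_type == c2r_decomposed_transposed ||
             kernel_type == xcorr_decomposed );
}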
2d vs 3d template - void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); template - void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); // 3. // Third call, sets the input and output dimensions and type template - void SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SetAndLaunchKernel(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); void PrintLaunchParameters(LaunchParams LP) { std::cerr << "Launch parameters: " << std::endl; @@ -806,7 +822,8 @@ class FourierTransformer { // If the user doesn't specify input/output pointers, assume the are copied into the FastFFT bufferspace. // TODO: this will only work for 2d as the output in 3d should be in d_ptr.position_space_buffer template - void Generic_Fwd_Image_Inv(PreOpType pre_op = nullptr, IntraOpType intra_op = nullptr, PostOpType post_op = nullptr); + typename std::enable_if_t( )> + Generic_Fwd_Image_Inv(PreOpType pre_op = nullptr, IntraOpType intra_op = nullptr, PostOpType post_op = nullptr); template void Generic_Fwd(PreOpType pre_op = nullptr, IntraOpType intra_op = nullptr); diff --git a/src/fastfft/FastFFT.cu b/src/fastfft/FastFFT.cu index 2efc577..c4c6fcd 100644 --- a/src/fastfft/FastFFT.cu +++ b/src/fastfft/FastFFT.cu @@ -9,9 +9,9 @@ namespace FastFFT { -template +template __launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int apparent_Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { @@ -254,8 +254,8 @@ void FourierTransformer::SetIn } template -template -void FourierTransformer::SetOutputPointer(ExternalImagePtr_t* output_pointer) { +template +void FourierTransformer::SetOutputPointer(ExternalImage_t* output_pointer) { MyFFTDebugAssertTrue(is_set_input_params && is_set_output_params, "Input and output parameters not set"); if ( output_pointer == nullptr ) { @@ -263,11 +263,11 @@ void FourierTransformer::SetOu d_ptr.external_output_complex = nullptr; } else { - if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { d_ptr.external_output = output_pointer; d_ptr.external_output_complex = nullptr; } - else if constexpr ( std::is_same_v ) { + else if constexpr ( std::is_same_v ) { d_ptr.external_output_complex = output_pointer; d_ptr.external_output = nullptr; } @@ -295,8 +295,8 @@ void FourierTransformer::SetOu } template -template -void FourierTransformer::SetExternalImagePointer(ExternalImagePtr_t* ptr) { +template +void 
FourierTransformer::SetExternalImagePointer(ExternalImage_t* ptr) { MyFFTDebugAssertTrue(is_set_input_params && is_set_output_params, "Input and output parameters not set"); d_ptr.image_to_search = ptr; @@ -482,35 +482,34 @@ void FourierTransformer::Gener SetDimensions(DimensionCheckType::FwdTransform); // All placeholders - constexpr bool use_thread_method = false; - const bool do_forward_transform = true; + constexpr bool use_thread_method = false; // const bool swap_real_space_quadrants = false; // const bool transpose_output = true; - // SetPrecisionAndExectutionMethod(((r2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); //FFT_R2C_decomposed(transpose_output); + SetPrecisionAndExectutionMethod(r2c_decomposed, pre_op_functor, intra_op_functor); //FFT_R2C_decomposed(transpose_output); else - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decomposed, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; } else { if ( is_real_valued_input ) { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_none_XY, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_decrease_XY, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_increase_XY, pre_op_functor, intra_op_functor); break; } default: { @@ -522,15 +521,15 @@ void FourierTransformer::Gener switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { MyFFTDebugAssertTrue(false, "Complex input images are not yet supported"); // FIXME: - SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2c_fwd_decrease, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decrease, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, pre_op_functor, intra_op_functor); break; } default: { @@ -547,27 +546,27 @@ void FourierTransformer::Gener // FIXME there is some redundancy in specifying _decomposed and use_thread_method // Note: the only time the non-transposed method should be used is for 1d data. if ( use_thread_method ) { - SetPrecisionAndExectutionMethod(r2c_decomposed_transposed, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_decomposed_transposed, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decomposed, pre_op_functor, intra_op_functor); } else { - SetPrecisionAndExectutionMethod(r2c_none_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_none_XY, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none, pre_op_functor, intra_op_functor); } break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_increase_XY, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_decrease_XY, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_decrease, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decrease, pre_op_functor, intra_op_functor); break; } } @@ -575,17 +574,17 @@ void FourierTransformer::Gener else if constexpr ( Rank == 3 ) { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XZ, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_none_XZ, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_none_XYZ, do_forward_transform, pre_op_functor, intra_op_functor); - SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none_XYZ, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XZ, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_increase_XZ, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ, do_forward_transform, pre_op_functor, intra_op_functor); - SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, pre_op_functor, intra_op_functor); // SetPrecisionAndExectutionMethod(::Gener SetDimensions(DimensionCheckType::InvTransform); // All placeholders - constexpr bool use_thread_method = false; - const bool do_forward_transform = false; + constexpr bool use_thread_method = false; // const bool swap_real_space_quadrants = false; // const bool transpose_output = true; @@ -619,24 +617,24 @@ void FourierTransformer::Gener // Note: the only time the non-transposed method should be used is for 1d data. if constexpr ( use_thread_method ) { if ( is_real_valued_input ) - SetPrecisionAndExectutionMethod(c2r_decomposed, do_forward_transform); //FFT_R2C_decomposed(transpose_output); + SetPrecisionAndExectutionMethod(c2r_decomposed, intra_op, post_op); //FFT_R2C_decomposed(transpose_output); else - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2c_inv_decomposed, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; } else { if ( is_real_valued_input ) { switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(c2r_none_XY); + SetPrecisionAndExectutionMethod(c2r_none_XY, intra_op, post_op); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2r_decrease_XY); + SetPrecisionAndExectutionMethod(c2r_decrease_XY, intra_op, post_op); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2r_increase); + SetPrecisionAndExectutionMethod(c2r_increase, intra_op, post_op); break; } default: { @@ -647,15 +645,15 @@ void FourierTransformer::Gener else { switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(c2c_inv_none); + SetPrecisionAndExectutionMethod(c2c_inv_none, intra_op, post_op); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2c_inv_decrease); + SetPrecisionAndExectutionMethod(c2c_inv_decrease, intra_op, post_op); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2c_inv_increase); + SetPrecisionAndExectutionMethod(c2c_inv_increase, intra_op, post_op); break; } default: { @@ -673,27 +671,27 @@ void FourierTransformer::Gener // FIXME there is some redundancy in specifying _decomposed and use_thread_method // Note: the only time the non-transposed method should be used is for 1d data. if ( use_thread_method ) { - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2c_inv_decomposed, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2r_decomposed_transposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2r_decomposed_transposed, intra_op, post_op); } else { - SetPrecisionAndExectutionMethod(c2c_inv_none); + SetPrecisionAndExectutionMethod(c2c_inv_none, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2r_none_XY); + SetPrecisionAndExectutionMethod(c2r_none_XY, intra_op, post_op); } break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2c_inv_increase); + SetPrecisionAndExectutionMethod(c2c_inv_increase, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2r_increase); + SetPrecisionAndExectutionMethod(c2r_increase, intra_op, post_op); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2c_inv_decrease); + SetPrecisionAndExectutionMethod(c2c_inv_decrease, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2r_decrease_XY); + SetPrecisionAndExectutionMethod(c2r_decrease_XY, intra_op, post_op); break; } default: { @@ -706,14 +704,14 @@ void FourierTransformer::Gener case 3: { switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(c2c_inv_none_XZ); + SetPrecisionAndExectutionMethod(c2c_inv_none_XZ, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ); - SetPrecisionAndExectutionMethod(c2r_none); + SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ, intra_op, post_op); + SetPrecisionAndExectutionMethod(c2r_none, intra_op, post_op); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY); + SetPrecisionAndExectutionMethod(r2c_increase_XY, intra_op, post_op); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
// SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ); break; @@ -734,7 +732,8 @@ void FourierTransformer::Gener template template -void FourierTransformer::Generic_Fwd_Image_Inv(PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +typename std::enable_if_t( )> +FourierTransformer::Generic_Fwd_Image_Inv(PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // Set the member pointer to the passed pointer SetDimensions(DimensionCheckType::FwdTransform); @@ -747,7 +746,7 @@ void FourierTransformer::Gener case 2: { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XY, true); + SetPrecisionAndExectutionMethod(r2c_none_XY, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { MyFFTRunTimeAssertTrue(false, "2D FFT generic lambda no change/nochange not yet supported"); @@ -758,8 +757,8 @@ void FourierTransformer::Gener break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(xcorr_fwd_none_inv_decrease, true); - SetPrecisionAndExectutionMethod(c2r_decrease_XY, false); + SetPrecisionAndExectutionMethod(xcorr_fwd_none_inv_decrease, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_decrease_XY, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -770,11 +769,11 @@ void FourierTransformer::Gener break; } // case fwd no change case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY, true); + SetPrecisionAndExectutionMethod(r2c_increase_XY, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, true, pre_op_functor, intra_op_functor, post_op_functor); - SetPrecisionAndExectutionMethod(c2r_none_XY, false); + SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_none_XY, pre_op_functor, intra_op_functor, post_op_functor); transform_stage_completed = TransformStageCompleted::inv; break; } @@ -801,11 +800,11 @@ void FourierTransformer::Gener } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease_XY, true); + SetPrecisionAndExectutionMethod(r2c_decrease_XY, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(xcorr_fwd_increase_inv_none, true); - SetPrecisionAndExectutionMethod(c2r_none_XY, false); // TODO the output could be smaller + SetPrecisionAndExectutionMethod(xcorr_fwd_increase_inv_none, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_none_XY, pre_op_functor, intra_op_functor, post_op_functor); transform_stage_completed = TransformStageCompleted::inv; break; } @@ -852,17 +851,17 @@ void FourierTransformer::Gener break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XZ); + SetPrecisionAndExectutionMethod(r2c_increase_XZ, pre_op_functor, intra_op_functor, post_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
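The new declaration of Generic_Fwd_Image_Inv above returns typename std::enable_if_t<...> (the angle-bracket arguments have been lost in this paste), which removes the function from consideration unless the compile-time predicate on the functor types holds, instead of failing later inside the body. A self-contained sketch of that shape, with a hypothetical predicate standing in for the real one:

```cuda
#include <type_traits>

// Hypothetical predicate and class, shown only to illustrate the
// enable_if_t-as-return-type pattern; FastFFT's actual predicate differs.
template <typename T>
constexpr bool is_valid_intra_op() {
    return std::is_invocable_v<T, float&, const float&>;
}

struct TransformerSketch {
    template <typename IntraOpType>
    std::enable_if_t<is_valid_intra_op<IntraOpType>(), void>
    FwdImageInv(IntraOpType intra_op) {
        float value = 1.0f;
        intra_op(value, 2.0f); // only well-formed for functors with this call shape
    }
};
```

Calling FwdImageInv with a lambda such as [](float& a, const float& b) { a *= b; } compiles; a functor with the wrong call signature is rejected at overload resolution rather than deep inside the kernel launch code.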
- SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ); + SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { // TODO: will need a kernel for generic_fwd_increase_op_inv_none_XZ - SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none); + SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, pre_op_functor, intra_op_functor, post_op_functor); // SetPrecisionAndExectutionMethod(c2c_inv_none_XZ); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ); - SetPrecisionAndExectutionMethod(c2r_none); + SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_none, pre_op_functor, intra_op_functor, post_op_functor); break; } case SizeChangeType::increase: { @@ -1326,8 +1325,8 @@ __global__ void block_fft_kernel_R2C_DECREASE_XY(const ScalarType* __restrict__ // decomposed with conj multiplication -template -__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q) { +template +__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q) { using complex_type = ComplexType; // The data store is non-coalesced, so don't aggregate the data in shared mem. 
@@ -1359,9 +1358,9 @@ __global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImagePtr_ // C2C with conj multiplication -template +template __launch_bounds__(invFFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int apparent_Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { // // Initialize the shared memory, assuming everyting matches the input data X size in @@ -1393,9 +1392,9 @@ __launch_bounds__(invFFT::max_threads_per_block) __global__ io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)]); } -template +template __launch_bounds__(invFFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { // // Initialize the shared memory, assuming everyting matches the input data X size in using complex_type = ComplexType; @@ -1434,8 +1433,8 @@ __launch_bounds__(invFFT::max_threads_per_block) __global__ } // -template -__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, +template +__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int apparent_Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { using complex_type = ComplexType; @@ -1973,44 +1972,48 @@ __global__ void clip_into_real_kernel(InputType* real_values_gpu, template template -void FourierTransformer::SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +typename std::enable_if_t( ))> +FourierTransformer::SetPrecisionAndExectutionMethod(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // For kernels with fwd and inv transforms, we want to not set the direction yet. 
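The *_ConjMul kernels above fuse the forward transform, a conjugate multiplication against image_to_search, and the start of the inverse transform, which is the Fourier-space form of the cross-correlation driven by CrossCorrelate. Their signatures now take const ExternalImage_t* __restrict__, where ExternalImage_t is the pointee type (derived further down with std::remove_pointer_t), so const and __restrict__ apply to the pointed-to data rather than to a pointer typedef. A sketch of the per-element product, with float2 standing in for the library's complex type:

```cuda
// a * conj(b): the correlation product applied between the forward and
// inverse stages. float2 stands in for FastFFT's complex_type here.
__device__ __forceinline__ float2 complex_conj_mul(const float2 a, const float2 b) {
    return make_float2(a.x * b.x + a.y * b.y,
                       a.y * b.x - a.x * b.y);
}
```

Whether the conjugate is taken on the search image or on the transformed data is a library detail; the arithmetic shape of the fused step is the same.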
static const bool is_half = std::is_same_v; static const bool is_float = std::is_same_v; static_assert(is_half || is_float, "FourierTransformer::SetPrecisionAndExectutionMethod: Unsupported ComputeBaseType"); + if constexpr ( FFT_ALGO_t == Generic_Fwd_Image_Inv_FFT ) { + static_assert(IS_IKF_t( ), "FourierTransformer::SetPrecisionAndExectutionMethod: Unsupported IntraOpType"); + } if constexpr ( use_thread_method ) { using FFT = decltype(Thread( ) + Size<32>( ) + Precision( )); - SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetIntraKernelFunctions(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } else { using FFT = decltype(Block( ) + Precision( ) + FFTsPerBlock<1>( )); - SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetIntraKernelFunctions(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } } template template -void FourierTransformer::SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SetIntraKernelFunctions(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { if constexpr ( ! detail::has_any_block_operator::value ) { - // SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + // SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } else { if constexpr ( Rank == 3 ) { - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } else { // TODO: 8192 will fail for sm75 if wanted need some extra logic ... , 8192, 16 - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } } } template template -void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // This provides both a termination point for the recursive version needed for the block transform case as well as the actual function for thread transform with fixed size 32 GetTransformSize(kernel_type); if constexpr ( ! 
detail::has_any_block_operator::value ) { @@ -2018,22 +2021,22 @@ void FourierTransformer::Selec switch ( device_properties.device_arch ) { case 700: { using FFT = decltype(FFT_base( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 750: { using FFT = decltype(FFT_base( ) + SM<750>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 800: { using FFT = decltype(FFT_base( ) + SM<800>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 860: { using FFT = decltype(FFT_base( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -2046,7 +2049,7 @@ void FourierTransformer::Selec template template -void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // Use recursion to step through the allowed sizes. GetTransformSize(kernel_type); @@ -2055,24 +2058,24 @@ void FourierTransformer::Selec switch ( device_properties.device_arch ) { case 700: { using FFT = decltype(FFT_base( ) + Size( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 750: { if constexpr ( SizeValue <= 4096 ) { using FFT = decltype(FFT_base( ) + Size( ) + SM<750>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } break; } case 800: { using FFT = decltype(FFT_base( ) + Size( ) + SM<800>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 860: { using FFT = decltype(FFT_base( ) + Size( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -2082,12 +2085,12 @@ void FourierTransformer::Selec } } - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } template template -void FourierTransformer::SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType 
intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SetAndLaunchKernel(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { using complex_type = typename FFT_base_arch::value_type; using scalar_type = typename complex_type::value_type; @@ -2130,7 +2133,7 @@ void FourierTransformer::SetAn } // FIXME: this only works if the only intraop functor is conj mul (which it currently is.) - if constexpr ( IS_IKF_t ) { + if constexpr ( IS_IKF_t( ) ) { if constexpr ( FFT_ALGO_t == Generic_Fwd_Image_Inv_FFT ) { MyFFTDebugAssertTrue(d_ptr.image_to_search != nullptr, "SetExternalImagePointer has not been called."); } @@ -2138,7 +2141,7 @@ void FourierTransformer::SetAn MyFFTDebugAssertTrue(d_ptr.image_to_search == nullptr, "SetExternalImagePointer has been called twice."); } } - using ExternalImagePtr_t = decltype(d_ptr.image_to_search); + using ExternalImage_t = std::remove_pointer_t; if ( d_ptr.external_output ) { MyFFTDebugAssertTrue(d_ptr.external_output_complex == nullptr, "SetExternalImagePointer has been called twice."); @@ -2276,20 +2279,20 @@ void FourierTransformer::SetAn if ( swap_real_space_quadrants ) { MyFFTRunTimeAssertTrue(false, "decomposed xcorr with swap real space quadrants is not implemented."); - // cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + // cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); // precheck; - // block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants<< > > - // ( (ExternalImagePtr_t*) image_to_search, (complex_type*) d_ptr.momentum_space_buffer, (complex_type*) d_ptr.momentum_space, LP.mem_offsets, LP.twiddle_in,LP.Q, workspace_fwd, workspace_inv); + // block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants<< > > + // ( (ExternalImage_t*) image_to_search, (complex_type*) d_ptr.momentum_space_buffer, (complex_type*) d_ptr.momentum_space, LP.mem_offsets, LP.twiddle_in,LP.Q, workspace_fwd, workspace_inv); // postcheck; } else { - cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); #if FFT_DEBUG_STAGE > 2 // the image_to_search pointer is set during call to CrossCorrelate, precheck; - thread_fft_kernel_C2C_decomposed_ConjMul<<>>((ExternalImagePtr_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + thread_fft_kernel_C2C_decomposed_ConjMul<<>>((ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); postcheck; is_in_second_buffer_partition = ! 
is_in_second_buffer_partition; #endif @@ -2297,36 +2300,40 @@ void FourierTransformer::SetAn break; } - case c2c_decomposed: { + case c2c_fwd_decomposed: { // TODO: FFT_ALGO_t using FFT_nodir = decltype(FFT_base_arch( ) + Type( )); - LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_decomposed, do_forward_transform); + LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_fwd_decomposed); - if ( do_forward_transform ) { - using FFT = decltype(FFT_nodir( ) + Direction( )); - int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); - CheckSharedMemory(shared_memory, device_properties); - cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + using FFT = decltype(FFT_nodir( ) + Direction( )); + int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); + CheckSharedMemory(shared_memory, device_properties); + cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); #if FFT_DEBUG_STAGE > 2 - precheck; - thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck; - is_in_second_buffer_partition = ! is_in_second_buffer_partition; + precheck; + thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; + is_in_second_buffer_partition = ! is_in_second_buffer_partition; #endif - } - else { - using FFT = decltype(FFT_nodir( ) + Direction( )); - int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); - CheckSharedMemory(shared_memory, device_properties); - cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + break; + } + case c2c_inv_decomposed: { + + using FFT_nodir = decltype(FFT_base_arch( ) + Type( )); + LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_inv_decomposed); + + using FFT = decltype(FFT_nodir( ) + Direction( )); + int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); + CheckSharedMemory(shared_memory, device_properties); + cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); #if FFT_DEBUG_STAGE > 4 - precheck; - thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck; - is_in_second_buffer_partition = ! is_in_second_buffer_partition; + precheck; + thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; + is_in_second_buffer_partition = ! is_in_second_buffer_partition; #endif - } + break; } } // switch on (thread) kernel_type @@ -2874,15 +2881,15 @@ void FourierTransformer::SetAn } else { - cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); precheck; // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. 
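The apparent_Q computed just below is the ratio of the padded transform length (size_of<FFT>::value; the angle-bracket arguments are stripped in this paste) to the unpadded line length, as the surrounding comments describe. A worked example with illustrative numbers:

```cuda
// Illustrative numbers only: a 512-point kernel instantiation working on
// lines that were 128 points long before explicit zero padding.
constexpr int padded_transform_size = 512;  // size_of<FFT>::value
constexpr int unpadded_line_length  = 128;  // e.g. fwd_dims_in.y or inv_dims_out.y
constexpr int apparent_Q            = padded_transform_size / unpadded_line_length;
static_assert(apparent_Q == 4, "each input line fills 1/Q of the transform length");
```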
// Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / fwd_dims_in.y; - block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul<<>>( - (ExternalImagePtr_t)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv); + block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul<<>>( + (ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv); postcheck; } is_in_second_buffer_partition = ! is_in_second_buffer_partition; @@ -2926,13 +2933,13 @@ void FourierTransformer::SetAn // postcheck; } else { - cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / inv_dims_out.y; precheck; - block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul<<>>( - (ExternalImagePtr_t)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, apparent_Q, workspace_fwd, workspace_inv); + block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul<<>>( + (ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, apparent_Q, workspace_fwd, workspace_inv); postcheck; } is_in_second_buffer_partition = ! is_in_second_buffer_partition; @@ -2971,10 +2978,10 @@ void FourierTransformer::SetAn // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / fwd_dims_in.y; - cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); precheck; - block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE<<>>( - (ExternalImagePtr_t)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv, pre_op_functor, intra_op_functor, post_op_functor); + block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE<<>>( + (ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv, pre_op_functor, intra_op_functor, post_op_functor); postcheck; // FIXME: this is set in the public method calls for other functions. Since it will be changed to 0-7 to match FFT_DEBUG_STAGE, fix it then. 
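Each of these block-kernel launches follows the same CUDA pattern: compute the dynamic shared-memory requirement, raise the kernel's limit with cudaFuncSetAttribute (required for anything above the default 48 KiB opt-in), then pass the byte count as the third launch-configuration argument. A stand-alone sketch of that pattern, independent of the FastFFT kernels:

```cuda
#include <cuda_runtime.h>

// Minimal dynamic shared-memory example (assumes n_threads <= 1024 and a
// device that supports the requested size).
__global__ void staging_kernel_sketch(float* data) {
    extern __shared__ float smem[]; // sized by the 3rd <<< >>> argument
    smem[threadIdx.x] = data[threadIdx.x];
    __syncthreads();
    data[threadIdx.x] = smem[threadIdx.x];
}

void launch_sketch(float* d_data, const int n_threads, cudaStream_t stream) {
    const int shared_bytes = n_threads * int(sizeof(float));
    // Opt in to more than the default dynamic shared-memory limit.
    cudaFuncSetAttribute((void*)staging_kernel_sketch,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes);
    staging_kernel_sketch<<<1, n_threads, shared_bytes, stream>>>(d_data);
}
```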
@@ -3099,7 +3106,14 @@ void FourierTransformer::GetTr case r2c_decomposed_transposed: transform_size.N = fwd_dims_in.x; break; - case c2c_decomposed: + case c2c_fwd_decomposed: + // FIXME fwd vs inv + if ( fwd_dims_in.y == 1 ) + transform_size.N = fwd_dims_in.x; + else + transform_size.N = fwd_dims_in.y; + break; + case c2c_inv_decomposed: // FIXME fwd vs inv if ( fwd_dims_in.y == 1 ) transform_size.N = fwd_dims_in.x; @@ -3133,7 +3147,7 @@ void FourierTransformer::GetTr } // end GetTransformSize_thread function template -LaunchParams FourierTransformer::SetLaunchParameters(const int& ept, KernelType kernel_type, bool do_forward_transform) { +LaunchParams FourierTransformer::SetLaunchParameters(const int& ept, KernelType kernel_type) { /* Assuming: 1) r2c/c2r imply forward/inverse transform. diff --git a/src/tests/constant_image_test.cu b/src/tests/constant_image_test.cu index a3ca709..4650215 100644 --- a/src/tests/constant_image_test.cu +++ b/src/tests/constant_image_test.cu @@ -167,11 +167,11 @@ bool const_image_test(std::vector& size) { FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 2.0f); if constexpr ( use_fp16_io_buffers ) { - FT_fp16.SetInputPointer(output_buffer_fp16); + FT_fp16.SetInputPointer(output_buffer_fp16); // FIXME: this has no effect for this kernel which is using intra_complex_ptr __half* dummy_ptr = nullptr; // FIXME: - FT_fp16.SetOutputPointer(dummy_ptr); + FT_fp16.SetOutputPointer(output_buffer_fp16); FT_fp16.InvFFT( ); - FT_fp16.CopyDeviceToHostAndSynchronize(reinterpret_cast<__half*>(host_output.real_values), true); + cudaErr(cudaMemcpyAsync(host_output.real_values, output_buffer_fp16, sizeof(__half) * host_output.real_memory_allocated, cudaMemcpyDeviceToHost, cudaStreamPerThread)); host_output.data_is_fp16 = true; // we need to over-ride this as we already convertted but are overwriting. host_output.ConvertFP16ToFP32( ); } @@ -193,7 +193,7 @@ bool const_image_test(std::vector& size) { all_passed = false; FastFFT_roundTrip_passed[n] = false; } - std::cerr << "sum" << sum << "full sum " << full_sum << std::endl; + std::cerr << "sum" << sum << " full sum " << full_sum << std::endl; MyFFTDebugAssertTestTrue(sum == full_sum, "FastFFT constant image round trip for size " + std::to_string(dims_in.x)); } // loop over sizes
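For the fp16 round trip in constant_image_test.cu, the revised test copies the __half output buffer straight back on cudaStreamPerThread and then widens to fp32 on the host (in the test itself this happens in place via data_is_fp16 and ConvertFP16ToFP32). A simplified, self-contained version of that readback, with placeholder names rather than the test's buffers:

```cuda
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <vector>

// Simplified sketch of the fp16 readback path used by the revised test.
std::vector<float> fetch_fp16_result(const __half* d_output, const size_t n_elements) {
    std::vector<__half> packed(n_elements);
    cudaMemcpyAsync(packed.data(), d_output, sizeof(__half) * n_elements,
                    cudaMemcpyDeviceToHost, cudaStreamPerThread);
    cudaStreamSynchronize(cudaStreamPerThread); // the copy must finish before the host reads

    std::vector<float> as_float(n_elements);
    for ( size_t i = 0; i < n_elements; ++i )
        as_float[i] = __half2float(packed[i]);
    return as_float;
}
```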