diff --git a/docs/_docs/references/development_tools.md b/docs/_docs/references/development_tools.md index 3139cd2..d714fe6 100644 --- a/docs/_docs/references/development_tools.md +++ b/docs/_docs/references/development_tools.md @@ -105,7 +105,7 @@ FastFFT::PrintState() std::cout << "is_in_buffer_memory " << is_in_buffer_memory << std::endl; std::cout << "is_fftw_padded_input " << is_fftw_padded_input << std::endl; std::cout << "is_fftw_padded_output " << is_fftw_padded_output << std::endl; - std::cout << "is_real_valued_input " << IsAllowedRealType << std::endl; + std::cout << "is_real_valued_input " << IsAllowedRealType << std::endl; std::cout << "is_set_input_params " << is_set_input_params << std::endl; std::cout << "is_set_output_params " << is_set_output_params << std::endl; std::cout << std::endl; @@ -145,7 +145,7 @@ FastFFT::PrintLaunchParameters() PrintVectorType(LP.threadsPerBlock); std::cout << " Grid dimensions: "; PrintVectorType(LP.gridDims); - std::cout << " Q: " << LP.Q << std::endl; + std::cout << " Q: " << LP.transform_size.Q << std::endl; std::cout << " shared input: " << LP.mem_offsets.shared_input << std::endl; std::cout << " shared output (memlimit in r2c): " << LP.mem_offsets.shared_output << std::endl; std::cout << " physical_x_input: " << LP.mem_offsets.physical_x_input << std::endl; diff --git a/docs/_docs/references/usage.md b/docs/_docs/references/usage.md index de1596f..6ed8f04 100644 --- a/docs/_docs/references/usage.md +++ b/docs/_docs/references/usage.md @@ -20,7 +20,7 @@ In cufft, the first step to library access is to create a "handle" to a plan, *i cufftSetStream(cuda_plan_inverse, cudaStreamPerThread); // The parallel in Fast FFT would be to create an empty FourierTransformer object, e.g. - // The template arguments are: ComputeBaseType, InputType, OtherImageType, Rank + // The template arguments are: ComputeBaseType, PositionSpaceType, OtherImageType, Rank FastFFT::FourierTransformer FT; ``` diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh index 4804d93..99eb26e 100644 --- a/include/FastFFT.cuh +++ b/include/FastFFT.cuh @@ -5,8 +5,8 @@ #define __INCLUDE_FAST_FFT_CUH__ // #define USE_FOLDED_R2C_C2R -#define USE_FOLDED_C2R -#define C2R_BUFFER_LINES +// #define USE_FOLDED_C2R +// #define C2R_BUFFER_LINES // cudaErr(cudaFuncSetSharedMemConfig((const void*)block_fft_kernel_C2R_DECREASE_XY, cudaSharedMemBankSizeEightByte)); @@ -77,7 +77,8 @@ __launch_bounds__(FFT::max_threads_per_block) __global__ OutputData_t* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, - int Q, + const int Q, + const int SignalLength, typename FFT::workspace_type workspace); // XZ_STRIDE ffts/block via threadIdx.x, notice launch bounds. Creates partial coalescing. @@ -110,6 +111,7 @@ __launch_bounds__(FFT::max_threads_per_block) __global__ Offsets mem_offsets, float twiddle_in, int Q, + const unsigned int SignalLength, typename FFT::workspace_type workspace); // __launch_bounds__(FFT::max_threads_per_block) we don't know this because it is threadDim.x * threadDim.z - this could be templated if it affects performance significantly @@ -210,19 +212,19 @@ __global__ void block_fft_kernel_C2R_DECREASE_XY(const InputData_t* __restrict__ int Q, typename FFT::workspace_type workspace); -template -__global__ void clip_into_top_left_kernel(InputType* input_values, - OutputBaseType* output_values, - const short4 dims); +template +__global__ void clip_into_top_left_kernel(PositionSpaceType* input_values, + OutputBaseType* output_values, + const short4 dims); // Modified from GpuImage::ClipIntoRealKernel -template -__global__ void clip_into_real_kernel(InputType* real_values_gpu, - OutputBaseType* other_image_real_values_gpu, - short4 dims, - short4 other_dims, - int3 wanted_coordinate_of_box_center, - OutputBaseType wanted_padding_value); +template +__global__ void clip_into_real_kernel(PositionSpaceType* real_values_gpu, + OutputBaseType* other_image_real_values_gpu, + short4 dims, + short4 other_dims, + int3 wanted_coordinate_of_box_center, + OutputBaseType wanted_padding_value); template __device__ __forceinline__ T load_with_hint(const T* ptr, const int idx) { @@ -618,13 +620,17 @@ struct io { complex_compute_t* __restrict__ shared_input, complex_compute_t* __restrict__ thread_data, float* __restrict__ twiddle_factor_args, - float twiddle_in) { + float twiddle_in, + const unsigned int SignalLength) { unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { twiddle_factor_args[i] = twiddle_in * index; - thread_data[i] = convert_if_needed(input, index); - shared_input[index] = thread_data[i]; + if ( index < SignalLength ) + thread_data[i] = convert_if_needed(input, index); + else + thread_data[i] = complex_compute_t{0, 0}; + shared_input[index] = thread_data[i]; index += FFT::stride; } } @@ -666,16 +672,23 @@ struct io { float twiddle_in, int* __restrict__ input_map, int* __restrict__ output_map, - int Q) { + int Q, + int SignalLength) { unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // if (blockIdx.y == 0) ("blck %i index %i \n", Q*index, index); + input_map[i] = index; output_map[i] = Q * index; twiddle_factor_args[i] = twiddle_in * index; - thread_data[i] = convert_if_needed(input, index); - shared_input[index] = thread_data[i].x; + if ( index < SignalLength ) { + thread_data[i] = convert_if_needed(input, index); + } + else { + thread_data[i] = complex_compute_t{0, 0}; + } + shared_input[index] = thread_data[i].x; + index += FFT::stride; } } @@ -688,13 +701,17 @@ struct io { scalar_compute_t* __restrict__ shared_input, complex_compute_t* __restrict__ thread_data, float* __restrict__ twiddle_factor_args, - float twiddle_in) { + float twiddle_in, + const int SignalLength) { unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { twiddle_factor_args[i] = twiddle_in * index; - thread_data[i] = convert_if_needed(input, index); - shared_input[index] = thread_data[i].x; + if ( index < SignalLength ) + thread_data[i] = convert_if_needed(input, index); + else + thread_data[i] = complex_compute_t{0, 0}; + shared_input[index] = thread_data[i].x; index += FFT::stride; } } diff --git a/include/FastFFT.h b/include/FastFFT.h index 73b6740..c57e43b 100644 --- a/include/FastFFT.h +++ b/include/FastFFT.h @@ -18,7 +18,7 @@ // #define FastFFT_build_sizes 32, 64, 128, 256, 512, 1024, 2048, 4096 // #define FastFFT_build_sizes 16, 4, 32, 8, 64, 8, 128, 8, 256, 8, 512, 8, 1024, 8, 2048, 8, 4096, 16, 8192, 16 -#define FastFFT_build_sizes 512, 4096 +#define FastFFT_build_sizes 64, 128 namespace FastFFT { @@ -88,11 +88,11 @@ struct DevicePointers { * * * @tparam ComputeBaseType - float. Support for ieee half precision is not yet implemented. - * @tparam InputType - __half or float for real valued input, __half2 or float2 for complex valued input images. + * @tparam PositionSpaceType - __half or float for real valued input, __half2 or float2 for complex valued input images. * @tparam OtherImageType - __half or float. Actual type depends on position/momentum space representation. * @tparam Rank - only 2,3 supported. Support for 3d is partial */ -template +template class FourierTransformer { public: @@ -140,14 +140,14 @@ class FourierTransformer { return d_ptr.buffer_1; } - void CopyHostToDeviceAndSynchronize(InputType* input_pointer, int n_elements_to_copy = 0); - void CopyHostToDevice(InputType* input_pointer, int n_elements_to_copy = 0); + void CopyHostToDeviceAndSynchronize(PositionSpaceType* input_pointer, int n_elements_to_copy = 0); + void CopyHostToDevice(PositionSpaceType* input_pointer, int n_elements_to_copy = 0); - void CopyDeviceToHostAndSynchronize(InputType* input_pointer, int n_elements_to_copy = 0) { CopyAndSynchronize(true, input_pointer, n_elements_to_copy); }; + void CopyDeviceToHostAndSynchronize(PositionSpaceType* input_pointer, int n_elements_to_copy = 0) { CopyAndSynchronize(true, input_pointer, n_elements_to_copy); }; - void CopyDeviceToDeviceAndSynchronize(InputType* input_pointer, int n_elements_to_copy = 0) { CopyAndSynchronize(false, input_pointer, n_elements_to_copy); }; + void CopyDeviceToDeviceAndSynchronize(PositionSpaceType* input_pointer, int n_elements_to_copy = 0) { CopyAndSynchronize(false, input_pointer, n_elements_to_copy); }; - void CopyAndSynchronize(bool to_host, InputType* input_pointer, int n_elements_to_copy = 0); + void CopyAndSynchronize(bool to_host, PositionSpaceType* input_pointer, int n_elements_to_copy = 0); // Using the enum directly from python is not something I've figured out yet. Just make simple methods. // FIXME: these are not currently used, and perhaps are not needed. @@ -173,38 +173,39 @@ class FourierTransformer { } // FFT calls + // NOTE: if writing or adding to these, transform stage, current buffer and external in/output pointers must be set. - // TODO: when picking up tomorrow, remove default values for input pointers and move EnableIf to the declarations for the generic functions and instantiate these from fastFFT.cus + // TODO: make sure these are all that are accessed from the user end. Re-set private and start encapsulating more thoroughly. // Following this // Alias for FwdFFT, is there any overhead? template - void FwdFFT(InputType* input_ptr, - InputType* output_ptr = nullptr, - PreOpType pre_op = nullptr, - IntraOpType intra_op = nullptr); + void FwdFFT(PositionSpaceType* input_ptr, + PositionSpaceType* output_ptr = nullptr, + PreOpType pre_op = nullptr, + IntraOpType intra_op = nullptr); template - void InvFFT(InputType* input_ptr, - InputType* output_ptr = nullptr, - IntraOpType intra_op = nullptr, - PostOpType post_op = nullptr); + void InvFFT(PositionSpaceType* input_ptr, + PositionSpaceType* output_ptr = nullptr, + IntraOpType intra_op = nullptr, + PostOpType post_op = nullptr); template - void FwdImageInvFFT(InputType* input_ptr, - OtherImageType* image_to_search, - InputType* output_ptr = nullptr, - PreOpType pre_op = nullptr, - IntraOpType intra_op = nullptr, - PostOpType post_op = nullptr); + void FwdImageInvFFT(PositionSpaceType* input_ptr, + OtherImageType* image_to_search, + PositionSpaceType* output_ptr = nullptr, + PreOpType pre_op = nullptr, + IntraOpType intra_op = nullptr, + PostOpType post_op = nullptr); - void ClipIntoTopLeft(InputType* input_ptr); - void ClipIntoReal(InputType*, int wanted_coordinate_of_box_center_x, int wanted_coordinate_of_box_center_y, int wanted_coordinate_of_box_center_z); + void ClipIntoTopLeft(PositionSpaceType* input_ptr); + void ClipIntoReal(PositionSpaceType*, int wanted_coordinate_of_box_center_x, int wanted_coordinate_of_box_center_y, int wanted_coordinate_of_box_center_z); - // For all real valued inputs, assumed for any InputType that is not float2 or __half2 + // For all real valued inputs, assumed for any PositionSpaceType that is not float2 or __half2 int inline ReturnInputMemorySize( ) { return input_memory_wanted_; @@ -234,6 +235,7 @@ class FourierTransformer { return inv_dims_out; } + // FIXME: these are convenience functinos for testing and should be moved accordingly template void SetToConstant(T* input_pointer, int N_values, const T& wanted_value) { if ( is_on_host ) { @@ -275,6 +277,7 @@ class FourierTransformer { buffer_location current_buffer; + // FIXME: move to checks and debug void PrintState( ) { std::cerr << "================================================================" << std::endl; std::cerr << "Device Properties: " << std::endl; @@ -293,7 +296,7 @@ class FourierTransformer { std::cerr << "in buffer " << buffer_name[current_buffer] << std::endl; std::cerr << "is_fftw_padded_input " << is_fftw_padded_input << std::endl; std::cerr << "is_fftw_padded_output " << is_fftw_padded_output << std::endl; - std::cerr << "is_real_valued_input " << IsAllowedRealType << std::endl; + std::cerr << "is_real_valued_input " << IsAllowedRealType << std::endl; std::cerr << "is_set_input_params " << is_set_input_params << std::endl; std::cerr << "is_set_output_params " << is_set_output_params << std::endl; std::cerr << std::endl; @@ -344,7 +347,6 @@ class FourierTransformer { bool is_set_input_params; // Yes, yes, "are" set. bool is_set_output_params; - int transform_dimension; // 1,2,3d. FFT_Size transform_size; std::vector SizeChangeName{"increase", "decrease", "no_change"}; @@ -379,7 +381,7 @@ class FourierTransformer { int wanted_memory_n_elements = 0; constexpr int scale_compute_base_type_to_full_type = 2; // The odd sized block is probably not needed. - if constexpr ( IsAllowedRealType ) { + if constexpr ( IsAllowedRealType ) { if ( wanted_dims.x % 2 == 0 ) { padding_jump_val_ = 2; wanted_memory_n_elements = wanted_dims.x / 2 + 1; @@ -392,12 +394,12 @@ class FourierTransformer { wanted_memory_n_elements *= wanted_dims.y * wanted_dims.z; // other dimensions wanted_dims.w = (wanted_dims.x + padding_jump_val_) / 2; // number of complex elements in the X dimesnions after FFT. } - else if constexpr ( IsAllowedComplexType ) { + else if constexpr ( IsAllowedComplexType ) { wanted_memory_n_elements = wanted_dims.x * wanted_dims.y * wanted_dims.z; wanted_dims.w = wanted_dims.x; // pitch is constant } else { - constexpr InputType a; + constexpr PositionSpaceType a; static_assert_type_name(a); } @@ -426,7 +428,7 @@ class FourierTransformer { PrintVectorType(LP.threadsPerBlock); std::cerr << " Grid dimensions: "; PrintVectorType(LP.gridDims); - std::cerr << " Q: " << LP.Q << std::endl; + std::cerr << " Q: " << LP.transform_size.Q << std::endl; std::cerr << " shared input: " << LP.mem_offsets.shared_input << std::endl; std::cerr << " shared output (memlimit in r2c): " << LP.mem_offsets.shared_output << std::endl; std::cerr << " physical_x_input: " << LP.mem_offsets.physical_x_input << std::endl; @@ -476,10 +478,10 @@ class FourierTransformer { std::vector sizes_in_this_build = {FastFFT_build_sizes}; enum KernelType { r2c_none_XY, // 1d fwd // 2d fwd 1st stage - r2c_none_XZ, // 3d fwd 1st stage + r2c_none_XZ, // 3d fwd 1st stage FIXME: partial coalescing set in LaunchParams r2c_decrease_XY, r2c_increase_XY, - r2c_increase_XZ, + r2c_increase_XZ, // FIXME: partial coalescing set in LaunchParams c2c_fwd_none, // 1d complex valued input, or final stage of Fwd 2d or 3d c2c_fwd_none_XYZ, c2c_fwd_decrease, @@ -497,7 +499,7 @@ class FourierTransformer { xcorr_fwd_increase_inv_none, // (e.g. template matching) xcorr_fwd_decrease_inv_none, // (e.g. Fourier cropping) xcorr_fwd_none_inv_decrease, // (e.g. movie/particle translational search) - xcorr_fwd_decrease_inv_decrease, // (e.g. bandlimit, xcorr, translational search) + xcorr_fwd_decrease_inv_decrease, // (e.g. bandlimit, xcorr, translational search) not implemented yet generic_fwd_increase_op_inv_none, COUNT }; @@ -585,16 +587,22 @@ class FourierTransformer { return false; } - inline bool IsTransormAlongZ(KernelType kernel_type) { - if ( KernelName.at(kernel_type).find("_XYZ") != KernelName.at(kernel_type).npos ) + inline bool Is_XY_Transposed(KernelType kernel_type) { + if ( KernelName.at(kernel_type).find("_XY") != KernelName.at(kernel_type).npos ) return true; else return false; } - inline bool IsRank3(KernelType kernel_type) { - if ( KernelName.at(kernel_type).find("_XZ") != KernelName.at(kernel_type).npos || - KernelName.at(kernel_type).find("_XYZ") != KernelName.at(kernel_type).npos ) + inline bool Is_XZ_Transposed(KernelType kernel_type) { + if ( KernelName.at(kernel_type).find("_XZ") != KernelName.at(kernel_type).npos ) + return true; + else + return false; + } + + inline bool IsTransormAlongZ(KernelType kernel_type) { + if ( KernelName.at(kernel_type).find("_XYZ") != KernelName.at(kernel_type).npos ) return true; else return false; @@ -604,17 +612,19 @@ class FourierTransformer { GetTransformSize(KernelType kernel_type); void GetTransformSize_thread(KernelType kernel_type, int thread_fft_size); - LaunchParams SetLaunchParameters(KernelType kernel_type, const int ept); + LaunchParams SetLaunchParameters(KernelType kernel_type, + const unsigned int ept, + const unsigned int stride_y, + const unsigned int stride_z); // 1. // First call passed from a public transform function, selects block or thread and the transform precision. template - EnableIf> - SetPrecisionAndExectutionMethod(OtherImageType* other_image_ptr, - KernelType kernel_type, - PreOpType pre_op_functor = nullptr, - IntraOpType intra_op_functor = nullptr, - PostOpType post_op_functor = nullptr); + EnableIf> SetPrecisionAndExectutionMethod(OtherImageType* other_image_ptr, + KernelType kernel_type, + PreOpType pre_op_functor = nullptr, + IntraOpType intra_op_functor = nullptr, + PostOpType post_op_functor = nullptr); // 2. // TODO: remove this now that the functors are working // Check to see if any intra kernel functions are wanted, and if so set the appropriate device pointers. @@ -656,7 +666,7 @@ class FourierTransformer { template - EnableIf && IsAllowedInputType> + EnableIf && IsAllowedPositionSpaceType> Generic_Fwd_Image_Inv(OtherImageType* image_to_search, PreOpType pre_op, IntraOpType intra_op, @@ -664,13 +674,13 @@ class FourierTransformer { template - EnableIf> + EnableIf> Generic_Fwd(PreOpType pre_op, IntraOpType intra_op); template - EnableIf> + EnableIf> Generic_Inv(IntraOpType intra_op, PostOpType post_op); @@ -700,14 +710,13 @@ class FourierTransformer { // The size we would need to use with a general purpose library, eg. FFTW // Note: this is not limited by the power of 2 restriction as we can compose this with sub ffts that are power of 2 transform_size.N = full_size_transform; - // The input/output size we care about. non-zero comes from zero padding, but probably doesn't make sense + // The input/output SignalLength we care about. non-zero comes from zero padding, but probably doesn't make sense // for a size reduction algo e.g. TODO: rename transform_size.L = number_non_zero_inputs_or_outputs; // Get the closest >= power of 2 - transform_size.P = 1; - while ( transform_size.P < number_non_zero_inputs_or_outputs ) - transform_size.P = transform_size.P << 1; + transform_size.P = GetNextPowerOfTwo(number_non_zero_inputs_or_outputs); + // FIXME: debug assert, but we need to define max size somewhere else MyFFTDebugAssertFalse(transform_size.P > transform_size.N, "transform_size.P > tranform_size.N"); @@ -715,15 +724,13 @@ class FourierTransformer { MyFFTDebugAssertTrue(transform_size.N % transform_size.P == 0, "transform_size.N % tranform_size.P != 0"); transform_size.Q = transform_size.N / transform_size.P; - // FIXME: there is a bug in cuda that crashes for thread block size > 64 in the Z dimension. - // Note: for size increase or rount trip transforms, we can use on chip explicit padding, so this bug - // does not apply. + // FIXME: thread block size > 64 in the Z dimension is not allowed in CUDA // if ( IsDecreaseSizeType(kernel_type) ) // MyFFTRunTimeAssertFalse(transform_size.Q > 64, "transform_size.Q > 64, see Nvidia bug report 4417253"); } - // Input is real or complex inferred from InputType - DevicePointers d_ptr; + // Input is real or complex inferred from PositionSpaceType + DevicePointers d_ptr; // Check to make sure we haven't fouled up the explicit instantiation of DevicePointers }; // class Fourier Transformer diff --git a/include/detail/checks_and_debug.h b/include/detail/checks_and_debug.h index 8ba32f3..646992f 100644 --- a/include/detail/checks_and_debug.h +++ b/include/detail/checks_and_debug.h @@ -6,12 +6,15 @@ // During testing we may be intenting to capture std::err to see if some particular assert is being triggered // if we are, we do not want to abort. -// #ifdef FastFFT_captureStdErr -#define FastFFT_useAbort -// #else -// #define FastFFT_useAbort std::abort( ); -// #endif +// TODO = make configurable so we can run tests/test_run_time_asserts.cu +// #define FastFFT_captureStdErr + +#ifdef FastFFT_captureStdErr +#define FastFFT_useAbort +#else +#define FastFFT_useAbort std::abort( ); +#endif namespace FastFFT { // hacky and non-conclusive way to trouble shoot mismatched types in function calls diff --git a/include/detail/concepts.h b/include/detail/concepts.h index 02885fa..ae32a8b 100644 --- a/include/detail/concepts.h +++ b/include/detail/concepts.h @@ -66,7 +66,7 @@ template constexpr bool IsAllowedComplexType = (... && (std::is_same_v || std::is_same_v)); template -constexpr bool IsAllowedInputType = (... && (std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v)); +constexpr bool IsAllowedPositionSpaceType = (... && (std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v)); template constexpr bool CheckPointerTypesForMatch = (std::is_same_v && std::is_same_v); diff --git a/include/detail/fft_properties.h b/include/detail/fft_properties.h index 279b9e1..cd770b7 100644 --- a/include/detail/fft_properties.h +++ b/include/detail/fft_properties.h @@ -3,31 +3,30 @@ namespace FastFFT { -typedef struct __align__(8) _FFT_Size { +typedef struct __align__(16) _FFT_Size { // Following Sorensen & Burrus 1993 for clarity - short N; // N : 1d FFT size - short L; // L : number of non-zero output/input points - short P; // P >= L && N % P == 0 : The size of the sub-FFT used to compute the full transform. Currently also must be a power of 2. - short Q; // Q = N/P : The number of sub-FFTs used to compute the full transform + unsigned int N; // N : 1d FFT size + unsigned int L; // L : number of non-zero output/input points + unsigned int P; // P >= L && N % P == 0 : The size of the sub-FFT used to compute the full transform. Currently also must be a power of 2. + unsigned int Q; // Q = N/P : The number of sub-FFTs used to compute the full transform } FFT_Size; -typedef struct __align__(8) _Offsets { - unsigned short shared_input; - unsigned short shared_output; - unsigned short physical_x_input; - unsigned short physical_x_output; +typedef struct __align__(16) _Offsets { + unsigned int shared_input; + unsigned int shared_output; + unsigned int physical_x_input; + unsigned int physical_x_output; } Offsets; typedef struct __align__(64) _LaunchParams { - int Q; - float twiddle_in; - dim3 gridDims; - dim3 threadsPerBlock; - Offsets mem_offsets; + FFT_Size transform_size; + Offsets mem_offsets; + dim3 gridDims; + dim3 threadsPerBlock; } LaunchParams; diff --git a/include/detail/functors.h b/include/detail/functors.h index a319b27..636e196 100644 --- a/include/detail/functors.h +++ b/include/detail/functors.h @@ -59,7 +59,7 @@ struct my_functor -struct my_functor>> final { +struct my_functor>> final { const T scale_factor; diff --git a/src/fastfft/FastFFT.cu b/src/fastfft/FastFFT.cu index d98811b..0fc90cc 100644 --- a/src/fastfft/FastFFT.cu +++ b/src/fastfft/FastFFT.cu @@ -109,19 +109,17 @@ __launch_bounds__(FFT::max_threads_per_block) __global__ #endif } -template -FourierTransformer::FourierTransformer( ) { +template +FourierTransformer::FourierTransformer( ) { SetDefaults( ); GetCudaDeviceProps(device_properties); // FIXME: assert on OtherImageType being a complex type static_assert(std::is_same_v, "Compute base type must be float"); static_assert(Rank == 2 || Rank == 3, "Only 2D and 3D FFTs are supported"); - static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, - "Input base type must be either __half or float"); // exit(0); // This assumption precludes the use of a packed _half2 that is really RRII layout for two arrays of __half. - static_assert(IsAllowedRealType || IsAllowedComplexType, "Input type must be either float or __half"); + static_assert(IsAllowedRealType || IsAllowedComplexType, "Input type must be either float or __half, support for complex input types is not yet implemented."); // Make sure an explicit specializtion for the device pointers is available static_assert(! std::is_same_v, "Device pointer type not specialized"); @@ -130,14 +128,14 @@ FourierTransformer::FourierTra #endif } -template -FourierTransformer::~FourierTransformer( ) { +template +FourierTransformer::~FourierTransformer( ) { Deallocate( ); SetDefaults( ); } -template -void FourierTransformer::SetDefaults( ) { +template +void FourierTransformer::SetDefaults( ) { // booleans to track state, could be bit fields but that seem opaque to me. current_buffer = fastfft_external_input; @@ -159,8 +157,8 @@ void FourierTransformer::SetDe compute_memory_wanted_ = 0; } -template -void FourierTransformer::Deallocate( ) { +template +void FourierTransformer::Deallocate( ) { if ( is_pointer_in_device_memory(d_ptr.buffer_1) ) { precheck; @@ -179,7 +177,7 @@ void FourierTransformer::Deall * Data may be copied to this buffer and used directly * * @tparam ComputeBaseType - * @tparam InputType + * @tparam PositionSpaceType * @tparam OtherImageType * @tparam Rank * @param input_logical_x_dimension @@ -190,19 +188,21 @@ void FourierTransformer::Deall * @param output_logical_z_dimension * @param is_padded_input */ -template -void FourierTransformer::SetForwardFFTPlan(size_t input_logical_x_dimension, - size_t input_logical_y_dimension, - size_t input_logical_z_dimension, - size_t output_logical_x_dimension, - size_t output_logical_y_dimension, - size_t output_logical_z_dimension, - bool is_padded_input) { + +template +void FourierTransformer::SetForwardFFTPlan(size_t input_logical_x_dimension, + size_t input_logical_y_dimension, + size_t input_logical_z_dimension, + size_t output_logical_x_dimension, + size_t output_logical_y_dimension, + size_t output_logical_z_dimension, + bool is_padded_input) { fwd_dims_in = make_short4(input_logical_x_dimension, input_logical_y_dimension, input_logical_z_dimension, 0); fwd_dims_out = make_short4(output_logical_x_dimension, output_logical_y_dimension, output_logical_z_dimension, 0); ValidateDimensions(fwd_dims_in, fwd_dims_out, true); + is_fftw_padded_input = is_padded_input; MyFFTRunTimeAssertTrue(is_fftw_padded_input, "Support for input arrays that are not FFTW padded needs to be implemented."); // FIXME // ReturnPaddedMemorySize also sets FFTW padding etc. @@ -227,7 +227,7 @@ void FourierTransformer::SetFo * Data may be copied to this buffer and used directly * * @tparam ComputeBaseType - * @tparam InputType + * @tparam PositionSpaceType * @tparam OtherImageType * @tparam Rank * @param input_logical_x_dimension @@ -238,14 +238,14 @@ void FourierTransformer::SetFo * @param output_logical_z_dimension * @param is_padded_output */ -template -void FourierTransformer::SetInverseFFTPlan(size_t input_logical_x_dimension, - size_t input_logical_y_dimension, - size_t input_logical_z_dimension, - size_t output_logical_x_dimension, - size_t output_logical_y_dimension, - size_t output_logical_z_dimension, - bool is_padded_output) { +template +void FourierTransformer::SetInverseFFTPlan(size_t input_logical_x_dimension, + size_t input_logical_y_dimension, + size_t input_logical_z_dimension, + size_t output_logical_x_dimension, + size_t output_logical_y_dimension, + size_t output_logical_z_dimension, + bool is_padded_output) { MyFFTDebugAssertTrue(is_fftw_padded_input == is_padded_output, "If the input data are FFTW padded, so must the output."); @@ -269,12 +269,12 @@ void FourierTransformer::SetIn * @brief Private method to allocate memory for the internal FastFFT buffer. * * @tparam ComputeBaseType - * @tparam InputType + * @tparam PositionSpaceType * @tparam OtherImageType * @tparam Rank */ -template -void FourierTransformer::AllocateBufferMemory( ) { +template +void FourierTransformer::AllocateBufferMemory( ) { MyFFTDebugAssertTrue(is_set_input_params && is_set_output_params, "Input and output parameters must be set before allocating buffer memory"); MyFFTDebugAssertTrue(compute_memory_wanted_ > 0, "Compute memory already allocated"); @@ -292,8 +292,8 @@ void FourierTransformer::Alloc d_ptr.buffer_2 = &d_ptr.buffer_1[compute_memory_wanted_ / buffer_address_scalar]; } -template -void FourierTransformer::ZeroBufferMemory( ) { +template +void FourierTransformer::ZeroBufferMemory( ) { MyFFTDebugAssertTrue(is_set_input_params && is_set_output_params, "Input and output parameters must be set before allocating buffer memory"); MyFFTDebugAssertTrue(n_bytes_allocated > 0, "No memory has been allocated"); // Allocate enough for the out of place buffer as well. @@ -303,8 +303,8 @@ void FourierTransformer::ZeroB postcheck; } -template -void FourierTransformer::SetInputPointerFromPython(long input_pointer) { +template +void FourierTransformer::SetInputPointerFromPython(long input_pointer) { MyFFTRunTimeAssertFalse(true, "This needs to be re-implemented."); // MyFFTDebugAssertTrue(is_set_input_params, "Input parameters not set"); @@ -312,38 +312,38 @@ void FourierTransformer::SetIn // // The assumption for now is that access from python wrappers have taken care of device/host xfer // // and the passed pointer is in device memory. // // TODO: I should probably have a state variable to track is_python_call - // d_ptr.position_space = reinterpret_cast(input_pointer); + // d_ptr.position_space = reinterpret_cast(input_pointer); // // These are normally set on CopyHostToDevice // SetDevicePointers( ); } // FIXME: see header file for comments -template -void FourierTransformer::CopyHostToDeviceAndSynchronize(InputType* input_pointer, int n_elements_to_copy) { +template +void FourierTransformer::CopyHostToDeviceAndSynchronize(PositionSpaceType* input_pointer, int n_elements_to_copy) { CopyHostToDevice(input_pointer, n_elements_to_copy); cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); } // FIXME: see header file for comments -template -void FourierTransformer::CopyHostToDevice(InputType* input_pointer, int n_elements_to_copy) { +template +void FourierTransformer::CopyHostToDevice(PositionSpaceType* input_pointer, int n_elements_to_copy) { MyFFTDebugAssertFalse(input_data_is_on_device, "External input pointer is on device, cannot copy from host"); MyFFTRunTimeAssertTrue(false, "This method is being removed."); SetDimensions(DimensionCheckType::CopyFromHost); precheck; - cudaErr(cudaMemcpyAsync(d_ptr.buffer_1, input_pointer, memory_size_to_copy_ * sizeof(InputType), cudaMemcpyHostToDevice, cudaStreamPerThread)); + cudaErr(cudaMemcpyAsync(d_ptr.buffer_1, input_pointer, memory_size_to_copy_ * sizeof(PositionSpaceType), cudaMemcpyHostToDevice, cudaStreamPerThread)); postcheck; } -template +template template -void FourierTransformer::FwdFFT(InputType* input_ptr, - InputType* output_ptr, - PreOpType pre_op, - IntraOpType intra_op) { +void FourierTransformer::FwdFFT(PositionSpaceType* input_ptr, + PositionSpaceType* output_ptr, + PreOpType pre_op, + IntraOpType intra_op) { transform_stage_completed = 0; current_buffer = fastfft_external_input; @@ -353,13 +353,13 @@ void FourierTransformer::FwdFF Generic_Fwd(pre_op, intra_op); } -template +template template -void FourierTransformer::InvFFT(InputType* input_ptr, - InputType* output_ptr, - IntraOpType intra_op, - PostOpType post_op) { +void FourierTransformer::InvFFT(PositionSpaceType* input_ptr, + PositionSpaceType* output_ptr, + IntraOpType intra_op, + PostOpType post_op) { transform_stage_completed = 4; current_buffer = fastfft_external_input; // Keep track of the device side pointer used when called @@ -369,17 +369,17 @@ void FourierTransformer::InvFF Generic_Inv(intra_op, post_op); } -template +template template -void FourierTransformer::FwdImageInvFFT(InputType* input_ptr, - OtherImageType* image_to_search, - InputType* output_ptr, - PreOpType pre_op, - IntraOpType intra_op, - PostOpType post_op) { +void FourierTransformer::FwdImageInvFFT(PositionSpaceType* input_ptr, + OtherImageType* image_to_search, + PositionSpaceType* output_ptr, + PreOpType pre_op, + IntraOpType intra_op, + PostOpType post_op) { transform_stage_completed = 0; current_buffer = fastfft_external_input; // Keep track of the device side pointer used when called @@ -389,12 +389,12 @@ void FourierTransformer::FwdIm Generic_Fwd_Image_Inv(image_to_search, pre_op, intra_op, post_op); } -template +template template -EnableIf> -FourierTransformer::Generic_Fwd(PreOpType pre_op_functor, - IntraOpType intra_op_functor) { +EnableIf> +FourierTransformer::Generic_Fwd(PreOpType pre_op_functor, + IntraOpType intra_op_functor) { SetDimensions(DimensionCheckType::FwdTransform); @@ -405,7 +405,7 @@ FourierTransformer::Generic_Fw // SetPrecisionAndExectutionMethod( ) { + if constexpr ( IsAllowedRealType ) { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { SetPrecisionAndExectutionMethod(nullptr, r2c_none_XY, pre_op_functor, intra_op_functor); @@ -508,21 +508,21 @@ FourierTransformer::Generic_Fw } } -template +template template -EnableIf> -FourierTransformer::Generic_Inv(IntraOpType intra_op, - PostOpType post_op) { +EnableIf> +FourierTransformer::Generic_Inv(IntraOpType intra_op, + PostOpType post_op) { SetDimensions(DimensionCheckType::InvTransform); MyFFTRunTimeAssertFalse(fwd_implicit_dimension_change, "Implicit dimension change not yet supported for InvFFT"); MyFFTRunTimeAssertFalse(inv_implicit_dimension_change, "Implicit dimension change not yet supported for InvFFT"); - switch ( transform_dimension ) { + switch ( Rank ) { case 1: { - if constexpr ( IsAllowedRealType ) { + if constexpr ( IsAllowedRealType ) { switch ( inv_size_change_type ) { case SizeChangeType::no_change: { SetPrecisionAndExectutionMethod(nullptr, c2r_none_XY, intra_op, post_op); @@ -629,20 +629,20 @@ FourierTransformer::Generic_In } } -template +template template -EnableIf && IsAllowedInputType> -FourierTransformer::Generic_Fwd_Image_Inv(OtherImageType* image_to_search_ptr, - PreOpType pre_op_functor, - IntraOpType intra_op_functor, - PostOpType post_op_functor) { +EnableIf && IsAllowedPositionSpaceType> +FourierTransformer::Generic_Fwd_Image_Inv(OtherImageType* image_to_search_ptr, + PreOpType pre_op_functor, + IntraOpType intra_op_functor, + PostOpType post_op_functor) { // Set the member pointer to the passed pointer SetDimensions(DimensionCheckType::FwdTransform); - switch ( transform_dimension ) { + switch ( Rank ) { case 1: { MyFFTRunTimeAssertTrue(false, "1D FFT Cross correlation not yet supported"); break; @@ -825,8 +825,8 @@ FourierTransformer::Generic_Fw //////////////////////////////////////////////////// /// END PUBLIC METHODS //////////////////////////////////////////////////// -template -void FourierTransformer::ValidateDimensions(short4& dims_in, short4& dims_out, const bool is_fwd_not_inv) { +template +void FourierTransformer::ValidateDimensions(short4& dims_in, short4& dims_out, const bool is_fwd_not_inv) { /* Basic conditions on allowed (supported sizes) - Transforms must be power of 2 @@ -836,21 +836,25 @@ void FourierTransformer::Valid - Currently only allowing square or cubic images */ - const bool params_set = is_fwd_not_inv ? is_set_input_params : is_set_output_params; - MyFFTRunTimeAssertTrue(params_set, "Parameters not set"); - // Validate the forward transform - if ( fwd_dims_out.x > fwd_dims_in.x || fwd_dims_out.y > fwd_dims_in.y || fwd_dims_out.z > fwd_dims_in.z ) { + if ( dims_out.x > dims_in.x || dims_out.y > dims_in.y || dims_out.z > dims_in.z ) { // For now we must pad in all dimensions, this is not needed and should be lifted. FIXME MyFFTRunTimeAssertTrue(dims_out.x >= dims_in.x, "If padding, all dimensions must be >=, x out < x in"); MyFFTRunTimeAssertTrue(dims_out.y >= dims_in.y, "If padding, all dimensions must be >=, y out < y in"); MyFFTRunTimeAssertTrue(dims_out.z >= dims_in.z, "If padding, all dimensions must be >=, z out < z in"); MyFFTRunTimeAssertTrue(IsAPowerOfTwo(dims_out.x) && IsAPowerOfTwo(dims_out.y) && IsAPowerOfTwo(dims_out.z), "Output dimensions must be a power of 2"); - fwd_size_change_type = SizeChangeType::increase; - fwd_implicit_dimension_change = ! (IsAPowerOfTwo(dims_out.x) && IsAPowerOfTwo(dims_out.y) && IsAPowerOfTwo(dims_out.z)); + if ( is_fwd_not_inv ) { + fwd_size_change_type = SizeChangeType::increase; + fwd_implicit_dimension_change = ! (IsAPowerOfTwo(dims_out.x) && IsAPowerOfTwo(dims_out.y) && IsAPowerOfTwo(dims_out.z)); + } + else { + inv_size_change_type = SizeChangeType::increase; + MyFFTRunTimeAssertTrue(IsAPowerOfTwo(dims_in.x) && IsAPowerOfTwo(dims_in.y) && IsAPowerOfTwo(dims_in.z), "Input dimensions must be a power of 2"); + } } - else if ( fwd_dims_out.x < fwd_dims_in.x || fwd_dims_out.y < fwd_dims_in.y || fwd_dims_out.z < fwd_dims_in.z ) { + + else if ( dims_out.x < dims_in.x || dims_out.y < dims_in.y || dims_out.z < dims_in.z ) { // For now we must pad in all dimensions, this is not needed and should be lifted. FIXME MyFFTRunTimeAssertTrue(dims_out.x <= dims_in.x, "If padding, all dimensions must be <=, x out > x in"); MyFFTRunTimeAssertTrue(dims_out.y <= dims_in.y, "If padding, all dimensions must be <=, y out > y in"); @@ -859,13 +863,19 @@ void FourierTransformer::Valid MyFFTRunTimeAssertTrue(IsAPowerOfTwo(dims_in.x) && IsAPowerOfTwo(dims_in.y) && IsAPowerOfTwo(dims_in.z), "Input dimensions must be a power of 2"); MyFFTRunTimeAssertTrue(IsAPowerOfTwo(dims_out.x) && IsAPowerOfTwo(dims_out.y) && IsAPowerOfTwo(dims_out.z), "Output dimensions must be a power of 2"); - fwd_size_change_type = SizeChangeType::decrease; + if ( is_fwd_not_inv ) + fwd_size_change_type = SizeChangeType::decrease; + else + inv_size_change_type = SizeChangeType::decrease; } - else if ( fwd_dims_out.x == fwd_dims_in.x && fwd_dims_out.y == fwd_dims_in.y && fwd_dims_out.z == fwd_dims_in.z ) { + else if ( dims_out.x == dims_in.x && dims_out.y == dims_in.y && dims_out.z == dims_in.z ) { MyFFTRunTimeAssertTrue(IsAPowerOfTwo(dims_in.x) && IsAPowerOfTwo(dims_in.y) && IsAPowerOfTwo(dims_in.z), "Input dimensions must be a power of 2"); MyFFTRunTimeAssertTrue(IsAPowerOfTwo(dims_out.x) && IsAPowerOfTwo(dims_out.y) && IsAPowerOfTwo(dims_out.z), "Output dimensions must be a power of 2"); - fwd_size_change_type = SizeChangeType::no_change; + if ( is_fwd_not_inv ) + fwd_size_change_type = SizeChangeType::no_change; + else + inv_size_change_type = SizeChangeType::no_change; } else { // TODO: if this is relaxed, the dimensionality check below will be invalid. @@ -898,33 +908,21 @@ void FourierTransformer::Valid } } - // We can only run this onces both input and output are set - if ( is_set_input_params && is_set_output_params ) { + // We can only run this onces both input and output are set, right now we are setting after... FIXME + if ( is_set_input_params || is_set_output_params ) { MyFFTRunTimeAssertTrue(fwd_dims_out.x == inv_dims_in.x && fwd_dims_out.y == inv_dims_in.y && fwd_dims_out.z == inv_dims_in.z, "Error in validating the dimension: Currently all fwd out should match inv in."); - // check for dimensionality - // Note: this is predicated on the else clause ensuring all dimensions behave the same way w.r.t. size change. - if ( fwd_dims_in.z == 1 || fwd_dims_out.z == 1 || inv_dims_in.z == 1 || inv_dims_out.z == 1 ) { - MyFFTRunTimeAssertTrue(fwd_dims_in.z == 1 && fwd_dims_out.z == 1 && inv_dims_in.z == 1 && inv_dims_out.z == 1, "Fwd/Inv dimensionality may not change from 1d,2d,3d (z dimension)"); - if ( fwd_dims_in.y == 1 || fwd_dims_out.y == 1 || inv_dims_in.y == 1 || inv_dims_out.y == 1 ) { - - transform_dimension = 1; - } - else { - // We need to enforce square dimensions currently - MyFFTRunTimeAssertTrue(fwd_dims_in.y == fwd_dims_in.x && fwd_dims_out.y == fwd_dims_out.x && inv_dims_in.y == inv_dims_in.x && inv_dims_out.y == inv_dims_out.x, "Only square images are supported currently (y dimension)"); - transform_dimension = 2; - } + if ( Rank == 2 ) { + MyFFTRunTimeAssertTrue(fwd_dims_in.y == fwd_dims_in.x && fwd_dims_out.y == fwd_dims_out.x && inv_dims_in.y == inv_dims_in.x && inv_dims_out.y == inv_dims_out.x, "Only square images are supported currently (y dimension)"); } - else { + else if ( Rank == 3 ) { // We need to enforce cubic dimensions currently MyFFTRunTimeAssertTrue(fwd_dims_in.y == fwd_dims_in.x && fwd_dims_out.y == fwd_dims_out.x && fwd_dims_in.z == fwd_dims_in.x && fwd_dims_out.z == fwd_dims_out.x && inv_dims_in.y == inv_dims_in.x && inv_dims_out.y == inv_dims_out.x && inv_dims_in.z == inv_dims_in.x && inv_dims_out.z == inv_dims_out.x, "Only cubic images are supported currently (z dimension)"); - transform_dimension = 3; constexpr unsigned int max_3d_size = 512; MyFFTRunTimeAssertFalse(fwd_dims_in.z > max_3d_size || fwd_dims_out.z > max_3d_size || @@ -943,8 +941,8 @@ void FourierTransformer::Valid } } -template -void FourierTransformer::SetDimensions(DimensionCheckType::Enum check_op_type) { +template +void FourierTransformer::SetDimensions(DimensionCheckType::Enum check_op_type) { switch ( check_op_type ) { case DimensionCheckType::CopyFromHost: { @@ -974,10 +972,10 @@ void FourierTransformer::SetDi /// Transform kernels //////////////////////////////////////////////////// -template -void FourierTransformer::CopyAndSynchronize(bool to_host, - InputType* input_pointer, - int n_elements_to_copy) { +template +void FourierTransformer::CopyAndSynchronize(bool to_host, + PositionSpaceType* input_pointer, + int n_elements_to_copy) { SetDimensions(DimensionCheckType::CopyToHost); int n_to_actually_copy = (n_elements_to_copy > 0) ? n_elements_to_copy : memory_size_to_copy_; @@ -990,26 +988,26 @@ void FourierTransformer::CopyA switch ( current_buffer ) { case fastfft_external_input: { MyFFTDebugAssertTrue(is_pointer_in_device_memory(d_ptr.external_input), "Error in CopyDeviceToHostAndSynchronize: input_pointer must be in device memory"); - cudaErr(cudaMemcpyAsync(input_pointer, d_ptr.external_input, n_to_actually_copy * sizeof(InputType), copy_kind, cudaStreamPerThread)); + cudaErr(cudaMemcpyAsync(input_pointer, d_ptr.external_input, n_to_actually_copy * sizeof(PositionSpaceType), copy_kind, cudaStreamPerThread)); break; } case fastfft_external_output: { MyFFTDebugAssertTrue(is_pointer_in_device_memory(d_ptr.external_input), "Error in CopyDeviceToHostAndSynchronize: input_pointer must be in device memory"); - cudaErr(cudaMemcpyAsync(input_pointer, d_ptr.external_output, n_to_actually_copy * sizeof(InputType), copy_kind, cudaStreamPerThread)); + cudaErr(cudaMemcpyAsync(input_pointer, d_ptr.external_output, n_to_actually_copy * sizeof(PositionSpaceType), copy_kind, cudaStreamPerThread)); break; } // If we are in the internal buffers, our data is ComputeBaseType case fastfft_internal_buffer_1: { MyFFTDebugAssertTrue(is_pointer_in_device_memory(d_ptr.buffer_1), "Error in CopyDeviceToHostAndSynchronize: input_pointer must be in device memory"); - if ( sizeof(ComputeBaseType) != sizeof(InputType) ) - std::cerr << "\n\tWarning: CopyDeviceToHostAndSynchronize: sizeof(ComputeBaseType) != sizeof(InputType) - this may be a problem\n\n"; + if ( sizeof(ComputeBaseType) != sizeof(PositionSpaceType) ) + std::cerr << "\n\tWarning: CopyDeviceToHostAndSynchronize: sizeof(ComputeBaseType) != sizeof(PositionSpaceType) - this may be a problem\n\n"; cudaErr(cudaMemcpyAsync(input_pointer, d_ptr.buffer_1, n_to_actually_copy * sizeof(ComputeBaseType), copy_kind, cudaStreamPerThread)); break; } case fastfft_internal_buffer_2: { MyFFTDebugAssertTrue(is_pointer_in_device_memory(d_ptr.buffer_2), "Error in CopyDeviceToHostAndSynchronize: input_pointer must be in device memory"); - if ( sizeof(ComputeBaseType) != sizeof(InputType) ) - std::cerr << "\n\tWarning: CopyDeviceToHostAndSynchronize: sizeof(ComputeBaseType) != sizeof(InputType) - this may be a problem\n\n"; + if ( sizeof(ComputeBaseType) != sizeof(PositionSpaceType) ) + std::cerr << "\n\tWarning: CopyDeviceToHostAndSynchronize: sizeof(ComputeBaseType) != sizeof(PositionSpaceType) - this may be a problem\n\n"; cudaErr(cudaMemcpyAsync(input_pointer, d_ptr.buffer_2, n_to_actually_copy * sizeof(ComputeBaseType), copy_kind, cudaStreamPerThread)); break; } @@ -1085,7 +1083,8 @@ __launch_bounds__(FFT::max_threads_per_block) __global__ OutputData_t* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, - int Q, + const int Q, + const int SignalLength, typename FFT::workspace_type workspace) { // Initialize the shared memory, assuming everyting matches the input data X size in using complex_compute_t = typename FFT::value_type; @@ -1114,7 +1113,8 @@ __launch_bounds__(FFT::max_threads_per_block) __global__ twiddle_in, input_MAP, output_MAP, - Q); + Q, + SignalLength); // We unroll the first and last loops. // In the first FFT the modifying twiddle factor is 1 so the data are real @@ -1434,6 +1434,7 @@ __global__ void block_fft_kernel_C2C_INCREASE(const ComplexData_t* __restrict__ Offsets mem_offsets, float twiddle_in, int Q, + const unsigned int SignalLength, typename FFT::workspace_type workspace) { // Initialize the shared memory, assuming everyting matches the input data X size in @@ -1441,8 +1442,10 @@ __global__ void block_fft_kernel_C2C_INCREASE(const ComplexData_t* __restrict__ using scalar_compute_t = typename complex_compute_t::value_type; FastFFT_SMEM complex_compute_t shared_input_complex[]; // Storage for the input data that is re-used each blcok - complex_compute_t* shared_output = (complex_compute_t*)&shared_input_complex[mem_offsets.shared_input]; // storage for the coalesced output data. This may grow too large, - complex_compute_t* shared_mem = (complex_compute_t*)&shared_output[mem_offsets.shared_output]; + // storage for the coalesced output data. This may grow too large, + // FIXME: could be signal length, but that that also might interfere with coalescing + complex_compute_t* shared_output = (complex_compute_t*)&shared_input_complex[mem_offsets.shared_input]; + complex_compute_t* shared_mem = (complex_compute_t*)&shared_output[mem_offsets.shared_output]; // Memory used by FFT complex_compute_t thread_data[FFT::storage_size]; @@ -1454,7 +1457,8 @@ __global__ void block_fft_kernel_C2C_INCREASE(const ComplexData_t* __restrict__ shared_input_complex, thread_data, twiddle_factor_args, - twiddle_in); + twiddle_in, + SignalLength); // FIXME: currently zero padding to size_of::value on load, but could use less shared at SignalLenght, if so, would need to modify copy_from_shared. FFT( ).execute(thread_data, shared_mem, workspace); io::store(thread_data, shared_output, Q, 0); @@ -1660,10 +1664,10 @@ __global__ void block_fft_kernel_C2R_DECREASE_XY(const InputData_t* __restrict__ } // FIXME assumed FWD -template -__global__ void clip_into_top_left_kernel(InputType* input_values, - InputType* output_values, - const short4 dims) { +template +__global__ void clip_into_top_left_kernel(PositionSpaceType* input_values, + PositionSpaceType* output_values, + const short4 dims) { int x = blockIdx.x * blockDim.x + threadIdx.x; if ( x > dims.w ) @@ -1686,8 +1690,8 @@ __global__ void clip_into_top_left_kernel(InputType* input_values, } } -template -void FourierTransformer::ClipIntoTopLeft(InputType* input_ptr) { +template +void FourierTransformer::ClipIntoTopLeft(PositionSpaceType* input_ptr) { // TODO add some checks and logic. // Assuming we are calling this from R2C_Transposed and that the launch bounds are not set. @@ -1697,19 +1701,19 @@ void FourierTransformer::ClipI const short4 area_to_clip_from = make_short4(fwd_dims_in.x, fwd_dims_in.y, fwd_dims_in.w * 2, fwd_dims_out.w * 2); precheck; - clip_into_top_left_kernel<<>>(input_ptr, (InputType*)d_ptr.buffer_1, area_to_clip_from); + clip_into_top_left_kernel<<>>(input_ptr, (PositionSpaceType*)d_ptr.buffer_1, area_to_clip_from); postcheck; current_buffer = fastfft_internal_buffer_1; } // Modified from GpuImage::ClipIntoRealKernel -template -__global__ void clip_into_real_kernel(InputType* real_values_gpu, - InputType* other_image_real_values_gpu, - short4 dims, - short4 other_dims, - int3 wanted_coordinate_of_box_center, - InputType wanted_padding_value) { +template +__global__ void clip_into_real_kernel(PositionSpaceType* real_values_gpu, + PositionSpaceType* other_image_real_values_gpu, + short4 dims, + short4 other_dims, + int3 wanted_coordinate_of_box_center, + PositionSpaceType wanted_padding_value) { int3 other_coord = make_int3(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * gridDim.y + threadIdx.y, blockIdx.z); @@ -1742,11 +1746,11 @@ __global__ void clip_into_real_kernel(InputType* real_values_gpu, } // end of bounds check } -template -void FourierTransformer::ClipIntoReal(InputType* input_ptr, - int wanted_coordinate_of_box_center_x, - int wanted_coordinate_of_box_center_y, - int wanted_coordinate_of_box_center_z) { +template +void FourierTransformer::ClipIntoReal(PositionSpaceType* input_ptr, + int wanted_coordinate_of_box_center_x, + int wanted_coordinate_of_box_center_y, + int wanted_coordinate_of_box_center_z) { // TODO add some checks and logic. // Assuming we are calling this from R2C_Transposed and that the launch bounds are not set. @@ -1762,19 +1766,19 @@ void FourierTransformer::ClipI float wanted_padding_value = 0.f; precheck; - clip_into_real_kernel<<>>(input_ptr, (InputType*)d_ptr.buffer_1, fwd_dims_in, fwd_dims_out, wanted_center, wanted_padding_value); + clip_into_real_kernel<<>>(input_ptr, (PositionSpaceType*)d_ptr.buffer_1, fwd_dims_in, fwd_dims_out, wanted_center, wanted_padding_value); postcheck; current_buffer = fastfft_internal_buffer_1; } -template +template template EnableIf> -FourierTransformer::SetPrecisionAndExectutionMethod(OtherImageType* other_image_ptr, - KernelType kernel_type, - PreOpType pre_op_functor, - IntraOpType intra_op_functor, - PostOpType post_op_functor) { +FourierTransformer::SetPrecisionAndExectutionMethod(OtherImageType* other_image_ptr, + KernelType kernel_type, + PreOpType pre_op_functor, + IntraOpType intra_op_functor, + PostOpType post_op_functor) { // For kernels with fwd and inv transforms, we want to not set the direction yet. static const bool is_half = std::is_same_v; // FIXME: This should be done in the constructor @@ -1788,35 +1792,35 @@ FourierTransformer::SetPrecisi SetIntraKernelFunctions(other_image_ptr, kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } -template +template template -void FourierTransformer::SetIntraKernelFunctions(OtherImageType* other_image_ptr, - KernelType kernel_type, - PreOpType pre_op_functor, - IntraOpType intra_op_functor, - PostOpType post_op_functor) { +void FourierTransformer::SetIntraKernelFunctions(OtherImageType* other_image_ptr, + KernelType kernel_type, + PreOpType pre_op_functor, + IntraOpType intra_op_functor, + PostOpType post_op_functor) { SelectSizeAndType(other_image_ptr, kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } -template +template template -void FourierTransformer::SelectSizeAndType(OtherImageType* other_image_ptr, - KernelType kernel_type, - PreOpType pre_op_functor, - IntraOpType intra_op_functor, - PostOpType post_op_functor) { +void FourierTransformer::SelectSizeAndType(OtherImageType* other_image_ptr, + KernelType kernel_type, + PreOpType pre_op_functor, + IntraOpType intra_op_functor, + PostOpType post_op_functor) { (SelectSizeAndTypeWithFold(other_image_ptr, kernel_type, pre_op_functor, intra_op_functor, post_op_functor), ...); } -template +template template -void FourierTransformer::SelectSizeAndTypeWithFold(OtherImageType* other_image_ptr, - KernelType kernel_type, - PreOpType pre_op_functor, - IntraOpType intra_op_functor, - PostOpType post_op_functor) { +void FourierTransformer::SelectSizeAndTypeWithFold(OtherImageType* other_image_ptr, + KernelType kernel_type, + PreOpType pre_op_functor, + IntraOpType intra_op_functor, + PostOpType post_op_functor) { // Use recursion to step through the allowed sizes. GetTransformSize(kernel_type); @@ -1871,18 +1875,18 @@ void FourierTransformer::Selec } } -template +template template -void FourierTransformer::SetAndLaunchKernel(OtherImageType* other_image_ptr, - KernelType kernel_type, - PreOpType pre_op_functor, - IntraOpType intra_op_functor, - PostOpType post_op_functor) { +void FourierTransformer::SetAndLaunchKernel(OtherImageType* other_image_ptr, + KernelType kernel_type, + PreOpType pre_op_functor, + IntraOpType intra_op_functor, + PostOpType post_op_functor) { // Used to determine shared memory requirements using complex_compute_t = typename FFT_base_arch::value_type; using scalar_compute_t = typename complex_compute_t::value_type; - // Determined by InputType as complex version, i.e., half half2 or float float2 + // Determined by PositionSpaceType as complex version, i.e., half half2 or float float2 using data_buffer_t = std::remove_pointer_t; // Allowed half, float (real type image) half2 float2 (complex type image) so with typical case // as real valued image, data_io_t != data_buffer_t @@ -1913,9 +1917,9 @@ void FourierTransformer::SetAn #else using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); #endif - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); - LaunchParams LP = SetLaunchParameters(r2c_none_XY, FFT::elements_per_thread); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + const LaunchParams LP = SetLaunchParameters(r2c_none_XY, FFT::elements_per_thread, 1, 1); int shared_memory = FFT::shared_memory_size; CheckSharedMemory(shared_memory, device_properties); @@ -1961,9 +1965,9 @@ void FourierTransformer::SetAn #else using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); #endif - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); - LaunchParams LP = SetLaunchParameters(r2c_none_XZ, FFT::elements_per_thread); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + const LaunchParams LP = SetLaunchParameters(r2c_none_XZ, FFT::elements_per_thread, 1, XZ_STRIDE); int shared_memory = std::max(LP.threadsPerBlock.y * FFT::shared_memory_size, LP.threadsPerBlock.y * LP.mem_offsets.physical_x_output * (unsigned int)sizeof(complex_compute_t)); CheckSharedMemory(shared_memory, device_properties); @@ -1992,9 +1996,9 @@ void FourierTransformer::SetAn using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); #endif - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); - LaunchParams LP = SetLaunchParameters(r2c_decrease_XY, FFT::elements_per_thread); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + const LaunchParams LP = SetLaunchParameters(r2c_decrease_XY, FFT::elements_per_thread, 1, 1); // the shared mem is mixed between storage, shuffling and FFT. For this kernel we need to add padding to avoid bank conlicts (N/32) int shared_memory = std::max(FFT::shared_memory_size * LP.threadsPerBlock.y, (LP.mem_offsets.shared_input + LP.mem_offsets.shared_input / 32) * (unsigned int)sizeof(complex_compute_t)); @@ -2010,8 +2014,8 @@ void FourierTransformer::SetAn d_ptr.external_input, reinterpret_cast(external_output_ptr), LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; current_buffer = aliased_output_buffer; @@ -2023,8 +2027,8 @@ void FourierTransformer::SetAn d_ptr.external_input, d_ptr.buffer_1, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; current_buffer = fastfft_internal_buffer_1; @@ -2041,12 +2045,17 @@ void FourierTransformer::SetAn #else using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); #endif - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); - LaunchParams LP = SetLaunchParameters(r2c_increase_XY, FFT::elements_per_thread); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + const LaunchParams LP = SetLaunchParameters(r2c_increase_XY, FFT::elements_per_thread, 1, 1); int shared_memory = LP.mem_offsets.shared_input * sizeof(scalar_compute_t) + FFT::shared_memory_size; + std::cerr << "Shared Mem from r2c_increase_XY " << shared_memory << std::endl; // revert + + PrintLaunchParameters(LP); + PrintState( ); + CheckSharedMemory(shared_memory, device_properties); #if FFT_DEBUG_STAGE > 0 cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_R2C_INCREASE_XY, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); @@ -2057,8 +2066,9 @@ void FourierTransformer::SetAn d_ptr.external_input, reinterpret_cast(external_output_ptr), LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, + transform_size.L, workspace); postcheck; current_buffer = aliased_output_buffer; @@ -2070,8 +2080,9 @@ void FourierTransformer::SetAn d_ptr.external_input, d_ptr.buffer_1, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, // TODO: transform_size.Q + transform_size.L, workspace); postcheck; current_buffer = fastfft_internal_buffer_1; @@ -2090,13 +2101,13 @@ void FourierTransformer::SetAn #else using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); #endif - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); // FIXME: I don't think this is right when XZ_STRIDE is used - LaunchParams LP = SetLaunchParameters(r2c_increase_XZ, FFT::elements_per_thread); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); // FIXME: I don't think this is right when XZ_STRIDE is used + const LaunchParams LP = SetLaunchParameters(r2c_increase_XZ, FFT::elements_per_thread, 1, XZ_STRIDE); // We need shared memory to hold the input array(s) that is const through the kernel. // We alternate using additional shared memory for the computation and the transposition of the data. - int shared_memory = std::max(XZ_STRIDE * FFT::shared_memory_size, LP.mem_offsets.physical_x_output / LP.Q * (unsigned int)sizeof(complex_compute_t)); + int shared_memory = std::max(XZ_STRIDE * FFT::shared_memory_size, LP.mem_offsets.physical_x_output / LP.transform_size.Q * (unsigned int)sizeof(complex_compute_t)); shared_memory += XZ_STRIDE * LP.mem_offsets.shared_input * (unsigned int)sizeof(scalar_compute_t); CheckSharedMemory(shared_memory, device_properties); @@ -2107,8 +2118,8 @@ void FourierTransformer::SetAn d_ptr.external_input, d_ptr.buffer_1, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; current_buffer = fastfft_internal_buffer_1; @@ -2122,7 +2133,7 @@ void FourierTransformer::SetAn if constexpr ( FFT_ALGO_t == Generic_Fwd_FFT ) { using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); - LaunchParams LP = SetLaunchParameters(c2c_fwd_none, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2c_fwd_none, FFT::elements_per_thread, 1, 1); cudaError_t error_code = cudaSuccess; DebugUnused auto workspace = make_workspace(error_code); @@ -2174,7 +2185,7 @@ void FourierTransformer::SetAn using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); - LaunchParams LP = SetLaunchParameters(c2c_fwd_none_XYZ, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2c_fwd_none_XYZ, FFT::elements_per_thread, 1, XZ_STRIDE); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); @@ -2200,10 +2211,10 @@ void FourierTransformer::SetAn case c2c_fwd_decrease: { if constexpr ( FFT_ALGO_t == Generic_Fwd_FFT ) { - using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); - LaunchParams LP = SetLaunchParameters(c2c_fwd_decrease, FFT::elements_per_thread); + using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + const LaunchParams LP = SetLaunchParameters(c2c_fwd_decrease, FFT::elements_per_thread, 1, 1); #if FFT_DEBUG_STAGE > 2 // the shared mem is mixed between storage, shuffling and FFT. For this kernel we need to add padding to avoid bank conlicts (N/32) @@ -2220,8 +2231,8 @@ void FourierTransformer::SetAn d_ptr.external_input, reinterpret_cast(external_output_ptr), LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; } @@ -2232,8 +2243,8 @@ void FourierTransformer::SetAn d_ptr.buffer_1, reinterpret_cast(external_output_ptr), LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; } @@ -2250,14 +2261,14 @@ void FourierTransformer::SetAn MyFFTDebugAssertTrue(current_buffer == fastfft_internal_buffer_1, "current_buffer != fastfft_internal_buffer_1"); using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); - LaunchParams LP = SetLaunchParameters(c2c_fwd_increase_XYZ, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2c_fwd_increase_XYZ, FFT::elements_per_thread, 1, XZ_STRIDE); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); // We need shared memory to hold the input array(s) that is const through the kernel. // We alternate using additional shared memory for the computation and the transposition of the data. - int shared_memory = std::max(XZ_STRIDE * FFT::shared_memory_size, XZ_STRIDE * LP.mem_offsets.physical_x_output / LP.Q * (unsigned int)sizeof(complex_compute_t)); + int shared_memory = std::max(XZ_STRIDE * FFT::shared_memory_size, XZ_STRIDE * LP.mem_offsets.physical_x_output / LP.transform_size.Q * (unsigned int)sizeof(complex_compute_t)); shared_memory += XZ_STRIDE * LP.mem_offsets.shared_input * (unsigned int)sizeof(complex_compute_t); #if FFT_DEBUG_STAGE > 1 @@ -2269,8 +2280,8 @@ void FourierTransformer::SetAn d_ptr.buffer_1, d_ptr.buffer_2, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; current_buffer = fastfft_internal_buffer_2; @@ -2284,11 +2295,14 @@ void FourierTransformer::SetAn if constexpr ( FFT_ALGO_t == Generic_Fwd_FFT ) { using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); - LaunchParams LP = SetLaunchParameters(c2c_fwd_increase, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2c_fwd_increase, FFT::elements_per_thread, 1, 1); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); int shared_memory = FFT::shared_memory_size + (unsigned int)sizeof(complex_compute_t) * (LP.mem_offsets.shared_input + LP.mem_offsets.shared_output); + std::cerr << "\n\nShared Mem from c2c fwd increase " << shared_memory << std::endl; // revert + PrintLaunchParameters(LP); + PrintState( ); #if FFT_DEBUG_STAGE > 2 CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_INCREASE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); @@ -2299,8 +2313,9 @@ void FourierTransformer::SetAn block_fft_kernel_C2C_INCREASE<<>>( d_ptr.external_input, reinterpret_cast(external_output_ptr), LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, + LP.transform_size.L, workspace); postcheck; } @@ -2311,8 +2326,9 @@ void FourierTransformer::SetAn d_ptr.buffer_1, reinterpret_cast(external_output_ptr), LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, + LP.transform_size.L, workspace); postcheck; } @@ -2323,8 +2339,9 @@ void FourierTransformer::SetAn d_ptr.buffer_2, reinterpret_cast(external_output_ptr), LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, + LP.transform_size.L, workspace); postcheck; } @@ -2338,13 +2355,15 @@ void FourierTransformer::SetAn if constexpr ( FFT_ALGO_t == Generic_Inv_FFT ) { using FFT = decltype(FFT_base_arch( ) + Type( ) + Direction( )); - LaunchParams LP = SetLaunchParameters(c2c_inv_none, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2c_inv_none, FFT::elements_per_thread, 1, 1); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); int shared_memory = FFT::shared_memory_size; - + std::cerr << "\n\nShared Mem from c2c_inv_none " << shared_memory << std::endl; // revert + PrintLaunchParameters(LP); + PrintState( ); CheckSharedMemory(shared_memory, device_properties); #if FFT_DEBUG_STAGE > 4 cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); @@ -2383,7 +2402,7 @@ void FourierTransformer::SetAn MyFFTDebugAssertTrue(current_buffer == fastfft_external_input, "current_buffer != fastfft_external_input"); using FFT = decltype(FFT_base_arch( ) + Type( ) + Direction( )); - LaunchParams LP = SetLaunchParameters(c2c_inv_none_XZ, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2c_inv_none_XZ, FFT::elements_per_thread, 1, XZ_STRIDE); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); @@ -2414,7 +2433,7 @@ void FourierTransformer::SetAn MyFFTDebugAssertTrue(current_buffer == fastfft_internal_buffer_1, "current_buffer != fastfft_internal_buffer_1"); using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); - LaunchParams LP = SetLaunchParameters(c2c_inv_none_XYZ, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2c_inv_none_XYZ, FFT::elements_per_thread, 1, XZ_STRIDE); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); @@ -2439,10 +2458,10 @@ void FourierTransformer::SetAn case c2c_inv_decrease: { if constexpr ( FFT_ALGO_t == Generic_Inv_FFT ) { - using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); - LaunchParams LP = SetLaunchParameters(c2c_inv_decrease, FFT::elements_per_thread); + using FFT = decltype(FFT_base_arch( ) + Direction( ) + Type( )); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + const LaunchParams LP = SetLaunchParameters(c2c_inv_decrease, FFT::elements_per_thread, 1, 1); #if FFT_DEBUG_STAGE > 4 // the shared mem is mixed between storage, shuffling and FFT. For this kernel we need to add padding to avoid bank conlicts (N/32) @@ -2458,8 +2477,8 @@ void FourierTransformer::SetAn block_fft_kernel_C2C_DECREASE<<>>( reinterpret_cast(d_ptr.external_input), external_output_ptr, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; current_buffer = aliased_output_buffer; @@ -2471,8 +2490,8 @@ void FourierTransformer::SetAn reinterpret_cast(d_ptr.external_input), d_ptr.buffer_1, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; current_buffer = fastfft_internal_buffer_1; @@ -2505,7 +2524,7 @@ void FourierTransformer::SetAn using FFT = check_ept_t( ) + Type( ))>; #endif - LaunchParams LP = SetLaunchParameters(c2r_none, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2r_none, FFT::elements_per_thread, 1, 1); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; cudaErr(error_code); @@ -2557,9 +2576,8 @@ void FourierTransformer::SetAn using FFT = check_ept_t( ) + Type( ))>; #endif - LaunchParams LP = SetLaunchParameters(c2r_none_XY, FFT::elements_per_thread); - cudaError_t error_code = cudaSuccess; - auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; cudaErr(error_code); + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; cudaErr(error_code); int shared_memory = FFT::shared_memory_size; #ifdef C2R_BUFFER_LINES @@ -2568,14 +2586,9 @@ void FourierTransformer::SetAn constexpr unsigned int n_buffer_lines = size_of::value < 32u ? 1 : size_of::value < 128u ? std::max(min_buffer_lines, 2u) : size_of::value < 512 ? std::max(min_buffer_lines, 4u) : 8; //std::max(min_buffer_lines, 4u); - LP.gridDims.y /= n_buffer_lines; // For the shared impl, we need more threads in y each will workin on the subfft constexpr unsigned int max_threads_per_block = FFT::max_threads_per_block * n_buffer_lines; static_assert(max_threads_per_block < 1025, "C2R_BUFFER_LINES resulting in too many threads per block."); - if ( n_buffer_lines > 1 ) { - LP.threadsPerBlock.y = n_buffer_lines; - // TODO: other asserts, but basically, for the buffering we want to have at least 32 threads per block, and we can't modify x - } // Add enough shared mem to swap on each read // revert shared_memory = std::max(size_t(FFT::shared_memory_size * n_buffer_lines / 2), size_t(FFT::stride * n_buffer_lines * sizeof(complex_compute_t))); @@ -2585,6 +2598,15 @@ void FourierTransformer::SetAn constexpr unsigned int n_buffer_lines = 1; constexpr unsigned int max_threads_per_block = FFT::max_threads_per_block; #endif + + const LaunchParams LP = SetLaunchParameters(c2r_none_XY, FFT::elements_per_thread, n_buffer_lines, 1); + + std::cerr << "Shared Mem from c2r_none XY " << shared_memory << std::endl; // revert + + PrintLaunchParameters(LP); + PrintState( ); + std::cerr << "n_buffer_lines " << n_buffer_lines << std::endl; + CheckSharedMemory(shared_memory, device_properties); // PrintLaunchParameters(LP); @@ -2652,7 +2674,7 @@ void FourierTransformer::SetAn using FFT = check_ept_t( ) + Type( ))>; #endif - LaunchParams LP = SetLaunchParameters(c2r_decrease_XY, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(c2r_decrease_XY, FFT::elements_per_thread, 1, 1); cudaError_t error_code = cudaSuccess; auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; cudaErr(error_code); @@ -2684,8 +2706,8 @@ void FourierTransformer::SetAn d_ptr.external_input, external_output_ptr, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; } @@ -2701,8 +2723,8 @@ void FourierTransformer::SetAn d_ptr.buffer_1, external_output_ptr, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; } @@ -2714,8 +2736,8 @@ void FourierTransformer::SetAn d_ptr.buffer_2, external_output_ptr, LP.mem_offsets, - _i2pi_div_N(LP.Q), - LP.Q, + _i2pi_div_N(LP.transform_size.Q), + LP.transform_size.Q, workspace); postcheck; } @@ -2744,7 +2766,7 @@ void FourierTransformer::SetAn using FFT = decltype(FFT_base_arch( ) + Type( ) + Direction( )); using invFFT = decltype(FFT_base_arch( ) + Type( ) + Direction( )); - LaunchParams LP = SetLaunchParameters(xcorr_fwd_none_inv_decrease, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(xcorr_fwd_none_inv_decrease, FFT::elements_per_thread, 1, 1); cudaError_t error_code = cudaSuccess; auto workspace_fwd = make_workspace(error_code); // presumably larger of the two @@ -2770,7 +2792,7 @@ void FourierTransformer::SetAn d_ptr.buffer_1, d_ptr.buffer_2, LP.mem_offsets, - _i2pi_div_N(LP.Q), + _i2pi_div_N(LP.transform_size.Q), apparent_Q, workspace_fwd, workspace_inv); @@ -2791,7 +2813,7 @@ void FourierTransformer::SetAn using FFT = decltype(mod_base( ) + Type( ) + Direction( )); using invFFT = decltype(mod_base( ) + Type( ) + Direction( )); - LaunchParams LP = SetLaunchParameters(generic_fwd_increase_op_inv_none, FFT::elements_per_thread); + const LaunchParams LP = SetLaunchParameters(generic_fwd_increase_op_inv_none, FFT::elements_per_thread, 1, 1); cudaError_t error_code = cudaSuccess; auto workspace_fwd = make_workspace(error_code); // presumably larger of the two @@ -2871,8 +2893,8 @@ void FourierTransformer::SetAn ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Some helper functions that are annoyingly long to have in the header. ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -void FourierTransformer::GetTransformSize(KernelType kernel_type) { +template +void FourierTransformer::GetTransformSize(KernelType kernel_type) { // Set member variable transform_size.N (.P .L .Q) if ( IsR2CType(kernel_type) ) { @@ -2890,7 +2912,7 @@ void FourierTransformer::GetTr else { // C2C type if ( IsForwardType(kernel_type) ) { - switch ( transform_dimension ) { + switch ( Rank ) { case 1: { AssertDivisibleAndFactorOf2(kernel_type, std::max(fwd_dims_in.x, fwd_dims_out.x), std::min(fwd_dims_in.x, fwd_dims_out.x)); break; @@ -2917,12 +2939,13 @@ void FourierTransformer::GetTr } default: { + std::cerr << "Transform dimension: " << Rank << std::endl; MyFFTDebugAssertTrue(false, "ERROR: Invalid transform dimension for c2c fwd type.\n"); } } } else { - switch ( transform_dimension ) { + switch ( Rank ) { case 1: { AssertDivisibleAndFactorOf2(kernel_type, std::max(inv_dims_in.x, inv_dims_out.x), std::min(inv_dims_in.x, inv_dims_out.x)); break; @@ -2957,8 +2980,11 @@ void FourierTransformer::GetTr } // end GetTransformSize function -template -LaunchParams FourierTransformer::SetLaunchParameters(KernelType kernel_type, const int ept) { +template +LaunchParams FourierTransformer::SetLaunchParameters(KernelType kernel_type, + const unsigned int ept, + const unsigned int stride_y, + const unsigned int stride_z) { /* Assuming: 1) r2c/c2r imply forward/inverse transform. @@ -2975,14 +3001,19 @@ LaunchParams FourierTransformer::max( ); + L.mem_offsets.shared_input = unset_value; + L.mem_offsets.shared_output = unset_value; + L.mem_offsets.physical_x_input = unset_value; + L.mem_offsets.physical_x_output = unset_value; + L.gridDims = dim3(0, 0, 0); + L.threadsPerBlock = dim3(0, 0, 0); - // Set the twiddle factor, only differ in sign between fwd/inv transforms. - // For mixed kernels (eg. xcorr_* the size type is defined by where the size change happens. - // FIXME fwd_increase (oversampling) xcorr -> inv decrease (peak search) is a likely algorithm, that will not fit with this logic. SizeChangeType::Enum size_change_type; if ( IsForwardType(kernel_type) ) { size_change_type = fwd_size_change_type; @@ -2991,15 +3022,9 @@ LaunchParams FourierTransformer || std::is_same_v, "PositionSpaceType must be real valued"); + switch ( transform_stage_completed ) { + case 0: { + // We currently are only supporting real valued input/output so this should not happen. + MyFFTDebugAssertTrue(false, "ERROR: transform_stage_completed == 0 should not happen."); + break; + } case 1: { - L.gridDims = dim3(1, 1, 1); + // if 2d XY are transposed, if 3d XZ are transposed. + if constexpr ( Rank == 2 ) { + L.gridDims = dim3(1, fwd_dims_out.w, 1); + L.mem_offsets.physical_x_input = fwd_dims_in.y; + L.mem_offsets.physical_x_output = fwd_dims_out.y; + } + else { + MyFFTDebugAssertTrue(IsTransormAlongZ(kernel_type), "ERROR: transform_stage_completed == 1 should only happen for 3d transforms."); + L.gridDims = dim3(1, fwd_dims_in.y, fwd_dims_out.w); + L.mem_offsets.physical_x_input = fwd_dims_in.z; + L.mem_offsets.physical_x_output = fwd_dims_in.y; + } + break; } case 2: { - if ( IsForwardType(kernel_type) ) { - L.gridDims = dim3(1, fwd_dims_out.w, fwd_dims_out.z); + // This should only ever be set for a 3d transform + MyFFTDebugAssertTrue(Rank == 3, "ERROR: transform_stage_completed == 2 should only happen for 3d transforms."); + MyFFTDebugAssertTrue(! Is_XY_Transposed(kernel_type) && ! IsTransormAlongZ(kernel_type), "ERROR: transform_stage_completed == 2 should only happen for 3d transforms."); + L.gridDims = dim3(1, fwd_dims_out.w, fwd_dims_out.z); + L.mem_offsets.physical_x_input = fwd_dims_in.y; + L.mem_offsets.physical_x_output = fwd_dims_out.y; + break; + } + case 3: { + // There are no cases currently where this should happen here (but is used in copy ops) + MyFFTDebugAssertTrue(false, "ERROR: transform_stage_completed == 3 should not happen."); + break; + } + case 4: { + // This should be set only when calling InvFFT and in all cases we should have the y dim input + L.mem_offsets.physical_x_input = inv_dims_in.y; + // For 2d we handle the transpose on C2R, for 3d we should be transposing physical XZ, logical y z + if constexpr ( Rank == 2 ) { + L.mem_offsets.physical_x_output = inv_dims_out.y; + L.gridDims = dim3(1, inv_dims_in.w, 1); } else { - L.gridDims = dim3(1, inv_dims_out.w, inv_dims_out.z); + MyFFTDebugAssertTrue(Is_XZ_Transposed(kernel_type), "ERROR: transform_stage_completed == 4 should only happen for 3d transforms."); + L.mem_offsets.physical_x_output = inv_dims_in.z; // note dims in as we haven't done anything to the logical z dimension yet + L.gridDims = dim3(1, inv_dims_in.w, inv_dims_in.z); } break; } - case 3: { - if ( IsTransormAlongZ(kernel_type) ) { - // When transforming along the (logical) Z-dimension, The Z grid dimensions for a 3d kernel are used to indicate the transposed x coordinate. (physical Z) - if ( IsForwardType(kernel_type) ) { - // Always 1 | index into physical y, yet to be expanded | logical x (physical Z) is already expanded (dims_out) - L.gridDims = dim3(1, fwd_dims_in.y, fwd_dims_out.w); - } - else { - // FIXME only tested for NONE size change type, dims_in might be correct, not stopping to think about it now. - L.gridDims = dim3(1, inv_dims_out.y, inv_dims_out.w); - } + case 5: { + // + if constexpr ( Rank == 2 ) { + // Currently with real valued input/output this should not happen. + MyFFTDebugAssertTrue(false, "ERROR: transform_stage_completed == 5 should not happen for Rank 2."); } else { - if ( IsForwardType(kernel_type) ) { - L.gridDims = dim3(1, fwd_dims_out.w, fwd_dims_out.z); - } - else { - L.gridDims = dim3(1, inv_dims_out.w, inv_dims_out.z); - } + MyFFTDebugAssertTrue(IsTransormAlongZ(kernel_type), "ERROR: transform_stage_completed == 5 should only happen for 3d transforms."); + L.gridDims = dim3(1, inv_dims_in.w, inv_dims_out.y); + L.mem_offsets.physical_x_input = inv_dims_in.z; + L.mem_offsets.physical_x_output = inv_dims_in.w; // we write out the logical x to the physical N/2+1 } break; - } // 3 dimensional case + } + case 6: { + + // Currently with real valued input/output this should not happen. + MyFFTDebugAssertTrue(false, "ERROR: transform_stage_completed == 6 should not happen for Rank 2 or 3 c2c with real valued input/output."); + break; + } + case 7: { + // Currently with real valued input/output this should not happen. + MyFFTDebugAssertTrue(false, "ERROR: transform_stage_completed == 7 should not happen for Rank 2 or 3 c2c with real valued input/output."); + break; + } default: { - MyFFTDebugAssertTrue(false, "Unknown transform_dimension ( " + std::to_string(transform_dimension) + " )"); + MyFFTDebugAssertTrue(false, "ERROR: transform_stage_completed is not a valid value."); } } + } - // Over ride for partial output coalescing - if ( kernel_type == c2c_inv_none_XZ ) { - L.threadsPerBlock.y = XZ_STRIDE; - L.gridDims.z /= XZ_STRIDE; + // Now handle setting block dim + if ( IsRoundTripType(kernel_type) ) { + L.threadsPerBlock = dim3(transform_size.N / ept, 1, 1); + if ( IsDecreaseSizeType(kernel_type) ) { + // FIXME: I'm not sure what was supposed to be happending here and it needs to be reviewd. + L.gridDims = dim3(1, fwd_dims_out.w, 1); + L.mem_offsets.physical_x_input = inv_dims_in.y; + L.mem_offsets.physical_x_output = inv_dims_out.y; + } + } + else { + if ( size_change_type == SizeChangeType::decrease ) { + L.threadsPerBlock = dim3(transform_size.P / ept, transform_size.Q, 1); } - if ( kernel_type == c2c_fwd_none_XYZ || kernel_type == c2c_inv_none_XYZ || kernel_type == c2c_fwd_increase_XYZ ) { - L.threadsPerBlock.y = XZ_STRIDE; - L.gridDims.y /= XZ_STRIDE; + else { + // In the current xcorr methods that have INCREASE, explicit zero padding is used, so this will be overridden (overrode?) with transform_size.N + L.threadsPerBlock = dim3(transform_size.P / ept, 1, 1); } } + + if ( stride_y > 1 ) { + L.threadsPerBlock.y = stride_y; + L.gridDims.y /= stride_y; + } + + if ( stride_z > 1 ) { + L.threadsPerBlock.y = stride_z; + L.gridDims.z /= stride_z; + } + // FIXME // Some shared memory over-rides if ( kernel_type == c2c_inv_decrease || kernel_type == c2c_inv_increase ) { L.mem_offsets.shared_output = inv_dims_out.y; } - // FIXME - // Some xcorr overrides TODO try the DECREASE approcae - if ( kernel_type == generic_fwd_increase_op_inv_none ) { - // FIXME not correct for 3D - L.threadsPerBlock = dim3(transform_size.N / ept, 1, 1); - } - - if ( kernel_type == xcorr_fwd_none_inv_decrease ) { - // FIXME not correct for 3D + // Maybe these could be debug asserts, but I think the cost is minimal. TOOD: see if there is any measurable performance hit. + MyFFTRunTimeAssertFalse(L.mem_offsets.physical_x_input == unset_value, "physical_x_input not set"); + MyFFTRunTimeAssertFalse(L.mem_offsets.physical_x_output == unset_value, "physical_x_output not set"); + MyFFTRunTimeAssertFalse(L.mem_offsets.shared_input == unset_value, "shared_input not set"); + MyFFTRunTimeAssertFalse(L.mem_offsets.shared_output == unset_value, "shared_output not set"); + MyFFTRunTimeAssertFalse(L.threadsPerBlock.x == 0, "threadsPerBlock.x not set"); + MyFFTRunTimeAssertFalse(L.threadsPerBlock.y == 0, "threadsPerBlock.y not set"); + MyFFTRunTimeAssertFalse(L.threadsPerBlock.z == 0, "threadsPerBlock.z not set"); + MyFFTRunTimeAssertFalse(L.gridDims.x == 0, "gridDims.x not set"); + MyFFTRunTimeAssertFalse(L.gridDims.y == 0, "gridDims.y not set"); + MyFFTRunTimeAssertFalse(L.gridDims.z == 0, "gridDims.z not set"); - L.threadsPerBlock = dim3(transform_size.N / ept, 1, 1); - // FIXME - L.gridDims = dim3(1, fwd_dims_out.w, 1); - L.mem_offsets.physical_x_input = inv_dims_in.y; - L.mem_offsets.physical_x_output = inv_dims_out.y; - } return L; } diff --git a/src/python/FastFFT_binding/FastFFT_python_binding.cu b/src/python/FastFFT_binding/FastFFT_python_binding.cu index 3575092..bb8694c 100644 --- a/src/python/FastFFT_binding/FastFFT_python_binding.cu +++ b/src/python/FastFFT_binding/FastFFT_python_binding.cu @@ -18,10 +18,10 @@ namespace py = pybind11; -template +template void declare_array(py::module& m, const std::string& typestr) { - using FT_t = FastFFT::FourierTransformer; + using FT_t = FastFFT::FourierTransformer; std::string pyclass_name = std::string("FourierTransformer") + typestr; py::class_(m, pyclass_name.c_str( )) diff --git a/src/tests/debug_with_index_values.cu b/src/tests/debug_with_index_values.cu index 11ddabe..58270e2 100644 --- a/src/tests/debug_with_index_values.cu +++ b/src/tests/debug_with_index_values.cu @@ -116,10 +116,10 @@ void compare_libraries(std::vector size, FastFFT::SizeChangeType::Enum size FT.SetToConstant(FT_output.real_values, FT_output.real_memory_allocated, 0.0f); // Place these values at the origin of the image and after convolution, should be at 0,0,0. - float divide_by = FFT_DEBUG_STAGE == 0 ? 1.f : FFT_DEBUG_STAGE < 4 ? sqrtf(16.f) - : FFT_DEBUG_STAGE == 4 ? sqrtf(16.f * 16.f) - : FFT_DEBUG_STAGE < 7 ? sqrtf(16.f * 16.f * 16.f) - : 32. * 32.; + float divide_by = FFT_DEBUG_STAGE == 0 ? 1.f : FFT_DEBUG_STAGE < 4 ? sqrtf(powf(size[oSize], 1)) + : FFT_DEBUG_STAGE == 4 ? sqrtf(powf(size[oSize], 2)) + : FFT_DEBUG_STAGE < 7 ? sqrtf(powf(size[oSize], 3)) + : sqrtf(powf(size[oSize], 4)); float counter = 0.f; int pixel_counter = 0; @@ -129,7 +129,7 @@ void compare_libraries(std::vector size, FastFFT::SizeChangeType::Enum size counter += 1.f; pixel_counter++; } - pixel_counter += FT_input.padding_jump_value; + pixel_counter += (FT_input.padding_jump_value + input_size.x - 16); } // Transform the target on the host prior to transfer. diff --git a/src/tests/test_runtime_asserts.cu b/src/tests/test_runtime_asserts.cu index 1e78acb..70ac1c4 100644 --- a/src/tests/test_runtime_asserts.cu +++ b/src/tests/test_runtime_asserts.cu @@ -1,8 +1,11 @@ -#define FastFFT_captureStdErr + #include #include "../../include/FastFFT.h" +// FastFFT_captureStdErr must be defined so that the asserts do not call std::abort +// It is not clear yet how useful or not this is to check that runtime asserts are working, vs. hope and pray + struct FastFFTCaptureStdErr { // This can be an ofstream as well or any other ostream std::stringstream buffer; @@ -43,6 +46,11 @@ struct FastFFTCaptureStdErr { int main(int argc, char** argv) { +#ifndef FastFFT_captureStdErr + std::cout << "Error: FastFFT_captureStdErr must be defined when running this test." << std::endl; + return 1; +#endif + const int input_size = 64; FastFFT::FourierTransformer FT;