Commit
wip
bHimes committed Nov 23, 2023
1 parent 332f77b commit e8962da
Showing 15 changed files with 670 additions and 461 deletions.
7 changes: 6 additions & 1 deletion .vscode_shared/BenHimes/settings.json
@@ -79,7 +79,12 @@
"target": "cpp",
"ios": "cpp",
"__pragma_push": "cpp",
"__locale": "cpp"
"__locale": "cpp",
"__bit_reference": "cpp",
"__functional_base": "cpp",
"__node_handle": "cpp",
"__memory": "cpp",
"locale": "cpp"
},
"C_Cpp.clang_format_path": "/usr/bin/clang-format-14",
"editor.formatOnSave": true,
309 changes: 6 additions & 303 deletions include/FastFFT.cuh
@@ -1,319 +1,22 @@
// Utilities for FastFFT.cu that we don't need the host (FastFFT.h) to know about
#include "FastFFT.h"

#ifndef Fast_FFT_cuh_
#define Fast_FFT_cuh_
#ifndef __INCLUDE_FAST_FFT_CUH__
#define __INCLUDE_FAST_FFT_CUH__

#include <cuda_fp16.h>
#include "cufftdx/include/cufftdx.hpp"
#include "detail/detail.cuh"

// clang-format off

// “This software contains source code provided by NVIDIA Corporation.” Much of it is modified, as noted at the relevant function definitions.

// When defined, turns on synchronization-based checking for all FFT kernels as well as the cudaErr macros.
// Defined in the Makefile when DEBUG_STAGE is not equal to 8 (the default, i.e. no partial transforms).
// #define HEAVYERRORCHECKING_FFT

// Various levels of debugging conditions and prints
// #define FFT_DEBUG_LEVEL 0
// “This software contains source code provided by NVIDIA Corporation.”
// This is located in include/cufftdx*
// Please review the license in the cufftdx directory.

// #define forceforce( type ) __nv_is_extended_device_lambda_closure_type( type )
//FIXME: change to constexpr func







#if FFT_DEBUG_LEVEL < 1

#define MyFFTDebugPrintWithDetails(...)
#define MyFFTDebugAssertTrue(cond, msg, ...)
#define MyFFTDebugAssertFalse(cond, msg, ...)
#define MyFFTDebugAssertTestTrue(cond, msg, ...)
#define MyFFTDebugAssertTestFalse(cond, msg, ...)

#else
// Minimally define asserts that check state variables and setup.
#define MyFFTDebugAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; std::abort(); } }
#define MyFFTDebugAssertFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; std::abort(); } }

#endif

#if FFT_DEBUG_LEVEL > 1
// Turn on checkpoints in the testing functions.
#define MyFFTDebugAssertTestTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;std::abort(); } else { std::cerr << " Test " << msg << " passed!" << std::endl; }}
#define MyFFTDebugAssertTestFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; std::abort(); } else { std::cerr << " Test " << msg << " passed!" << std::endl; } }

#endif

#if FFT_DEBUG_LEVEL == 2
#define MyFFTDebugPrintWithDetails(...)
#endif

#if FFT_DEBUG_LEVEL == 3
// More verbose debug info
#define MyFFTDebugPrint(...) { std::cerr << __VA_ARGS__ << std::endl; }
#define MyFFTDebugPrintWithDetails(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; }
#endif

#if FFT_DEBUG_LEVEL == 4
// More verbose debug info + state info
#define MyFFTDebugPrint(...) { FastFFT::FourierTransformer::PrintState( ); std::cerr << __VA_ARGS__ << std::endl; }
#define MyFFTDebugPrintWithDetails(...) { FastFFT::FourierTransformer::PrintState( ); std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; }

#endif

// Always in use
#define MyFFTPrint(...) { std::cerr << __VA_ARGS__ << std::endl; }
#define MyFFTPrintWithDetails(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; }
#define MyFFTRunTimeAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;std::abort(); } }
#define MyFFTRunTimeAssertFalse(cond, msg, ...) { if ( (cond) == true ) {std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;std::abort(); } }



// I use the same things in cisTEM, so check for them. FIXME, get rid of defines and also find a better sharing mechanism.
#ifndef cudaErr
// Note we are using std::cerr b/c the wxWidgets apps running in cisTEM are capturing std::cout
// If I leave cudaErr blank when HEAVYERRORCHECKING_FFT is not defined, I get some reports/warnings about unused or unreferenced variables. I suspect the performance hit is very small so just leave this on.
// The real cost is in the synchronization in pre/postcheck.
#define cudaErr(error) { auto status = static_cast<cudaError_t>(error); if ( status != cudaSuccess ) { std::cerr << cudaGetErrorString(status) << " :-> "; MyFFTPrintWithDetails(""); } };
#endif

#ifndef postcheck
#ifndef precheck
#ifndef HEAVYERRORCHECKING_FFT
#define postcheck
#define precheck
#else
#define postcheck { cudaErr(cudaPeekAtLastError( )); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error) }
#define precheck { cudaErr(cudaGetLastError( )) }
#endif
#endif
#endif
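// Illustrative usage of the error-checking macros above (not part of this header; the kernel
// name and launch configuration are placeholders):
//
//   cudaErr(cudaMemcpyAsync(d_ptr, h_ptr, n_bytes, cudaMemcpyHostToDevice, cudaStreamPerThread));
//
//   precheck;
//   my_kernel<<<grid_dims, threads_per_block, shared_mem_bytes, cudaStreamPerThread>>>(d_ptr);
//   postcheck;
//
// With HEAVYERRORCHECKING_FFT defined, precheck clears any pending error before the launch and
// postcheck synchronizes cudaStreamPerThread and reports launch or runtime failures.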


inline void checkCudaErr(cudaError_t err) {
if ( err != cudaSuccess ) {
std::cerr << cudaGetErrorString(err) << " :-> " << std::endl;
MyFFTPrintWithDetails(" ");
}
};

#define USEFASTSINCOS
// __sincosf doesn't appear to be the source of the accuracy loss, likely just the extra additions, but it is probably also less flexible with other types. I don't see a half-precision equivalent.
#ifdef USEFASTSINCOS
__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) {
__sincosf(arg, s, c);
}
#else
__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) {
sincos(arg, s, c);
}
#endif

// clang-format on

namespace FastFFT {

template <typename T>
inline bool pointer_is_in_memory_and_registered(T ptr) {
// FIXME: I don't think this is thread safe, add a mutex as in cistem::GpuImage
cudaPointerAttributes attr;
cudaErr(cudaPointerGetAttributes(&attr, ptr));

if ( attr.type == cudaMemoryTypeHost && attr.devicePointer == attr.hostPointer ) {
return true;
}
else {
return false;
}
}

template <typename T>
inline bool pointer_is_in_device_memory(T ptr) {
// FIXME: I don't think this is thread safe, add a mutex as in cistem::GpuImage
cudaPointerAttributes attr;
cudaErr(cudaPointerGetAttributes(&attr, ptr));

if ( attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged ) {
return true;
}
else {
return false;
}
}

__device__ __forceinline__ int
d_ReturnReal1DAddressFromPhysicalCoord(int3 coords, short4 img_dims) {
return ((((int)coords.z * (int)img_dims.y + coords.y) * (int)img_dims.w * 2) + (int)coords.x);
}

static constexpr const int XZ_STRIDE = 16;

static constexpr const int bank_size = 32;
static constexpr const int bank_padded = bank_size + 1;
static constexpr const unsigned int ubank_size = 32;
static constexpr const unsigned int ubank_padded = ubank_size + 1;

__device__ __forceinline__ int GetSharedMemPaddedIndex(const int index) {
return (index % bank_size) + ((index / bank_size) * bank_padded);
}

__device__ __forceinline__ int GetSharedMemPaddedIndex(const unsigned int index) {
return (index % ubank_size) + ((index / ubank_size) * ubank_padded);
}
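// Worked example of the padding above (illustrative): with bank_size = 32 and bank_padded = 33,
// every run of 32 consecutive logical indices is shifted by one extra element, so accesses with
// a stride of 32 land in different shared memory banks instead of all mapping to the same bank:
//   GetSharedMemPaddedIndex(31) == 31
//   GetSharedMemPaddedIndex(32) == 33
//   GetSharedMemPaddedIndex(64) == 66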

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress(const unsigned int pixel_pitch) {
return pixel_pitch * (blockIdx.y + blockIdx.z * gridDim.y);
}

// Return the address of the 1D transform index 0. Right now testing for a stride of 2, but this could be modifiable if it works.
static __device__ __forceinline__ unsigned int Return1DFFTAddress_strided_Z(const unsigned int pixel_pitch) {
// In the current condition, threadIdx.z is either 0 or 1, and gridDim.z = size_z / 2
// index into a 2D tile in the XZ plane, for output in the ZX transposed plane (for coalesced writes).
return pixel_pitch * (blockIdx.y + (XZ_STRIDE * blockIdx.z + threadIdx.z) * gridDim.y);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int ReturnZplane(const unsigned int NX, const unsigned int NY) {
return (blockIdx.z * NX * NY);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_Z(const unsigned int NY) {
return blockIdx.y + (blockIdx.z * NY);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTColumn_XYZ_transpose(const unsigned int NX) {
// NX should be size_of<FFT>::value for this method. Should this be templated?
// Presumably the XZ axis is already transposed on the forward, used to index into this state. Indexes in the (ZY)' plane for input, to be transposed and permuted to output.
return NX * (XZ_STRIDE * (blockIdx.y + gridDim.y * blockIdx.z) + threadIdx.z);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose(const unsigned int X) {
return blockIdx.z + gridDim.z * (blockIdx.y + X * gridDim.y);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
// (IDX % XZ_STRIDE) -> transposed x coordinate in tile
// ((blockIdx.z*XZ_STRIDE) -> tile offset in physical X (with the above, gives physical X out (transposed Z))
// (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z)
// above * blockIdx.y -> offset in physical Y (transposed Y)
// (IDX / XZ_STRIDE) -> n elements physical Z (transposed X)
return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + (IDX / XZ_STRIDE) * gridDim.y);
}
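// Worked example of the decomposition above (illustrative numbers, XZ_STRIDE = 16): for IDX = 35,
// blockIdx.z = 2, blockIdx.y = 1, gridDim.z = 4, gridDim.y = 8:
//   transposed x in tile: 35 % 16 = 3, plus tile offset 2 * 16 -> physical X = 35
//   row length in physical X: 16 * 4 = 64
//   row index: 1 + (35 / 16) * 8 = 17
//   returned address: 35 + 64 * 17 = 1123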

static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
// (IDX % XZ_STRIDE) -> transposed x coordinate in tile
// ((blockIdx.z*XZ_STRIDE) -> tile offset in physical X (with the above, gives physical X out (transposed Z))
// (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z)
// above * blockIdx.y -> offset in physical Y (transposed Y)
// (IDX / XZ_STRIDE) -> n elements physical Z (transposed X)
return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.y);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + (IDX / XZ_STRIDE) * gridDim.z);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.z);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTColumn_XZ_to_XY( ) {
// return blockIdx.y + gridDim.y * ( blockIdx.z + gridDim.z * X);
return blockIdx.y + gridDim.y * blockIdx.z;
}

static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX_to_XY( ) {
return blockIdx.z + gridDim.z * blockIdx.y;
}

static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX( ) {
return Return1DFFTColumn_XZ_to_XY( );
}

// Complex a * conj b multiplication
template <typename ComplexType, typename ScalarType>
static __device__ __host__ inline auto ComplexConjMulAndScale(const ComplexType a, const ComplexType b, ScalarType s) -> decltype(b) {
ComplexType c;
c.x = s * (a.x * b.x + a.y * b.y);
c.y = s * (a.y * b.x - a.x * b.y);
return c;
}
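// The expansion above follows from a * conj(b) = (a.x + i a.y)(b.x - i b.y)
//                                              = (a.x*b.x + a.y*b.y) + i (a.y*b.x - a.x*b.y),
// then scaled by s. For example, a = (1, 2), b = (3, 4), s = 1 gives c = (11, 2).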

// GetCudaDeviceArch from https://github.com/mnicely/cufft_examples/blob/master/Common/cuda_helper.h
void GetCudaDeviceProps(DeviceProps& dp);

void CheckSharedMemory(int& memory_requested, DeviceProps& dp);
void CheckSharedMemory(unsigned int& memory_requested, DeviceProps& dp);
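// A minimal sketch of what CheckSharedMemory could do (assumptions: DeviceProps exposes a field
// like max_shared_bytes filled in by GetCudaDeviceProps; the field name here is hypothetical and
// the real definitions live in the .cu translation unit):
//
//   void CheckSharedMemory(int& memory_requested, DeviceProps& dp) {
//       MyFFTRunTimeAssertFalse(memory_requested > dp.max_shared_bytes,
//                               "Requested shared memory exceeds the device limit");
//   }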

using namespace cufftdx;

// TODO this probably needs to depend on the size of the xform, at least small vs large.
constexpr const int elements_per_thread_16 = 4;
constexpr const int elements_per_thread_32 = 8;
constexpr const int elements_per_thread_64 = 8;
constexpr const int elements_per_thread_128 = 8;
constexpr const int elements_per_thread_256 = 8;
constexpr const int elements_per_thread_512 = 8;
constexpr const int elements_per_thread_1024 = 8;
constexpr const int elements_per_thread_2048 = 8;
constexpr const int elements_per_thread_4096 = 8;
constexpr const int elements_per_thread_8192 = 16;
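// These constants are meant to feed cufftdx's ElementsPerThread operator when a block FFT type is
// assembled elsewhere. A sketch of how one of them could be used (the surrounding FFT description
// is assumed here, not taken from this header):
//
//   using FFT_64 = decltype(Size<64>( ) + Precision<float>( ) + Type<fft_type::c2c>( ) +
//                           Direction<fft_direction::forward>( ) +
//                           ElementsPerThread<elements_per_thread_64>( ) +
//                           FFTsPerBlock<1>( ) + SM<700>( ) + Block( ));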

namespace KernelFunction {

// Define an enum for different functors
// Intra Kernel Function Type
enum IKF_t { NOOP,
CONJ_MUL };

// Maybe there is a better way to check, but using the keyword final to statically check for non-NOOP types
template <class T, int N_ARGS, IKF_t U>
class my_functor {};

template <class T>
class my_functor<T, 0, IKF_t::NOOP> {
public:
__device__ __forceinline__
T
operator( )( ) {
printf("really specific NOOP\n");
return 0;
}
};

template <class T>
class my_functor<T, 2, IKF_t::CONJ_MUL> final {
public:
__device__ __forceinline__
void
operator( )(float& template_fft_x, float& template_fft_y, const float& target_fft_x, const float& target_fft_y) {
// Is there a better way than declaring this variable each time?
// This is target * conj(template)
float tmp = (template_fft_x * target_fft_x + template_fft_y * target_fft_y);
template_fft_y = (template_fft_x * target_fft_y - template_fft_y * target_fft_x);
template_fft_x = tmp;
}
};

} // namespace KernelFunction
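// Illustrative instantiation of the functors above (the kernels that consume them are presumably
// defined in FastFFT.cu, not in this header; the type aliases are placeholders):
//
//   using noop_t     = KernelFunction::my_functor<float, 0, KernelFunction::IKF_t::NOOP>;
//   using conj_mul_t = KernelFunction::my_functor<float, 2, KernelFunction::IKF_t::CONJ_MUL>;
//
//   conj_mul_t conj_mul;
//   // In device code: conj_mul(template_x, template_y, target_x, target_y) overwrites the
//   // template value with target * conj(template), matching the specialization above.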

// constexpr const std::map<unsigned int, unsigned int> elements_per_thread = {
// {16, 4}, {"GPU", 15}, {"RAM", 20},
// };