Commit
wip
bHimes committed Nov 23, 2023
1 parent 332f77b commit e8962da
Showing 15 changed files with 670 additions and 461 deletions.
7 changes: 6 additions & 1 deletion .vscode_shared/BenHimes/settings.json
@@ -79,7 +79,12 @@
"target": "cpp",
"ios": "cpp",
"__pragma_push": "cpp",
"__locale": "cpp"
"__locale": "cpp",
"__bit_reference": "cpp",
"__functional_base": "cpp",
"__node_handle": "cpp",
"__memory": "cpp",
"locale": "cpp"
},
"C_Cpp.clang_format_path": "/usr/bin/clang-format-14",
"editor.formatOnSave": true,
309 changes: 6 additions & 303 deletions include/FastFFT.cuh
@@ -1,319 +1,22 @@
// Utilities for FastFFT.cu that we don't need the host (FastFFT.h) to know about
#include "FastFFT.h"

#ifndef Fast_FFT_cuh_
#define Fast_FFT_cuh_
#ifndef __INCLUDE_FAST_FFT_CUH__
#define __INCLUDE_FAST_FFT_CUH__

#include <cuda_fp16.h>
#include "cufftdx/include/cufftdx.hpp"
#include "detail/detail.cuh"

// clang-format off

// “This software contains source code provided by NVIDIA Corporation.” Much of it is modified, as noted at the relevant function definitions.

// When defined, turns on synchronization-based checking for all FFT kernels as well as the cudaErr macros.
// Defined in the Makefile when DEBUG_STAGE is not equal to 8 (the default, i.e. no partial transforms).
// #define HEAVYERRORCHECKING_FFT

// Various levels of debugging conditions and prints
// #define FFT_DEBUG_LEVEL 0
// “This software contains source code provided by NVIDIA Corporation.”
// This is located in include/cufftdx*
// Please review the license in the cufftdx directory.

// #define forceforce( type ) __nv_is_extended_device_lambda_closure_type( type )
//FIXME: change to constexpr func







#if FFT_DEBUG_LEVEL < 1

#define MyFFTDebugPrintWithDetails(...)
#define MyFFTDebugAssertTrue(cond, msg, ...)
#define MyFFTDebugAssertFalse(cond, msg, ...)
#define MyFFTDebugAssertTestTrue(cond, msg, ...)
#define MyFFTDebugAssertTestFalse(cond, msg, ...)

#else
// Minimally define asserts that check state variables and setup.
#define MyFFTDebugAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; std::abort(); } }
#define MyFFTDebugAssertFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; std::abort(); } }

#endif

#if FFT_DEBUG_LEVEL > 1
// Turn on checkpoints in the testing functions.
#define MyFFTDebugAssertTestTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;std::abort(); } else { std::cerr << " Test " << msg << " passed!" << std::endl; }}
#define MyFFTDebugAssertTestFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; std::abort(); } else { std::cerr << " Test " << msg << " passed!" << std::endl; } }

#endif

#if FFT_DEBUG_LEVEL == 2
#define MyFFTDebugPrintWithDetails(...)
#endif

#if FFT_DEBUG_LEVEL == 3
// More verbose debug info
#define MyFFTDebugPrint(...) { std::cerr << __VA_ARGS__ << std::endl; }
#define MyFFTDebugPrintWithDetails(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; }
#endif

#if FFT_DEBUG_LEVEL == 4
// More verbose debug info + state info
#define MyFFTDebugPrint(...) { FastFFT::FourierTransformer::PrintState( ); std::cerr << __VA_ARGS__ << std::endl; }
#define MyFFTDebugPrintWithDetails(...) { FastFFT::FourierTransformer::PrintState( ); std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; }

#endif

// Always in use
#define MyFFTPrint(...) { std::cerr << __VA_ARGS__ << std::endl; }
#define MyFFTPrintWithDetails(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; }
#define MyFFTRunTimeAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;std::abort(); } }
#define MyFFTRunTimeAssertFalse(cond, msg, ...) { if ( (cond) == true ) {std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;std::abort(); } }



// I use the same things in cisTEM, so check for them. FIXME, get rid of defines and also find a better sharing mechanism.
#ifndef cudaErr
// Note we are using std::cerr b/c the wxWidgets apps running in cisTEM are capturing std::cout
// If I leave cudaErr blank when HEAVYERRORCHECKING_FFT is not defined, I get some reports/warnings about unused or unreferenced variables. I suspect the performance hit is very small so just leave this on.
// The real cost is in the synchronization in pre/postcheck.
#define cudaErr(error) { auto status = static_cast<cudaError_t>(error); if ( status != cudaSuccess ) { std::cerr << cudaGetErrorString(status) << " :-> "; MyFFTPrintWithDetails(""); } };
#endif

#ifndef postcheck
#ifndef precheck
#ifndef HEAVYERRORCHECKING_FFT
#define postcheck
#define precheck
#else
#define postcheck { cudaErr(cudaPeekAtLastError( )); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error) }
#define precheck { cudaErr(cudaGetLastError( )) }
#endif
#endif
#endif
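// Illustrative usage of the error-checking macros above (not part of this header; the kernel
// name and launch configuration are placeholders):
//
//   cudaErr(cudaMemcpyAsync(d_ptr, h_ptr, n_bytes, cudaMemcpyHostToDevice, cudaStreamPerThread));
//
//   precheck;
//   my_kernel<<<grid_dims, threads_per_block, shared_mem_bytes, cudaStreamPerThread>>>(d_ptr);
//   postcheck;
//
// With HEAVYERRORCHECKING_FFT defined, precheck clears any pending error before the launch and
// postcheck synchronizes cudaStreamPerThread and reports launch or runtime failures.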


inline void checkCudaErr(cudaError_t err) {
if ( err != cudaSuccess ) {
std::cerr << cudaGetErrorString(err) << " :-> " << std::endl;
MyFFTPrintWithDetails(" ");
}
};

#define USEFASTSINCOS
// __sincosf doesn't appear to be the source of the accuracy loss, likely just the extra additions, but it is probably also less flexible with other types. I don't see a half-precision equivalent.
#ifdef USEFASTSINCOS
__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) {
__sincosf(arg, s, c);
}
#else
__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) {
sincos(arg, s, c);
}
#endif

// clang-format on

namespace FastFFT {

template <typename T>
inline bool pointer_is_in_memory_and_registered(T ptr) {
// FIXME: I don't think this is thread safe, add a mutex as in cistem::GpuImage
cudaPointerAttributes attr;
cudaErr(cudaPointerGetAttributes(&attr, ptr));

if ( attr.type == cudaMemoryTypeHost && attr.devicePointer == attr.hostPointer ) {
return true;
}
else {
return false;
}
}

template <typename T>
inline bool pointer_is_in_device_memory(T ptr) {
// FIXME: I don't think this is thread safe, add a mutex as in cistem::GpuImage
cudaPointerAttributes attr;
cudaErr(cudaPointerGetAttributes(&attr, ptr));

if ( attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged ) {
return true;
}
else {
return false;
}
}

__device__ __forceinline__ int
d_ReturnReal1DAddressFromPhysicalCoord(int3 coords, short4 img_dims) {
return ((((int)coords.z * (int)img_dims.y + coords.y) * (int)img_dims.w * 2) + (int)coords.x);
}

static constexpr const int XZ_STRIDE = 16;

static constexpr const int bank_size = 32;
static constexpr const int bank_padded = bank_size + 1;
static constexpr const unsigned int ubank_size = 32;
static constexpr const unsigned int ubank_padded = ubank_size + 1;

__device__ __forceinline__ int GetSharedMemPaddedIndex(const int index) {
return (index % bank_size) + ((index / bank_size) * bank_padded);
}

__device__ __forceinline__ int GetSharedMemPaddedIndex(const unsigned int index) {
return (index % ubank_size) + ((index / ubank_size) * ubank_padded);
}
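// Worked example of the padding above (illustrative): with bank_size = 32 and bank_padded = 33,
// every run of 32 consecutive logical indices is shifted by one extra element, so accesses with
// a stride of 32 land in different shared memory banks instead of all mapping to the same bank:
//   GetSharedMemPaddedIndex(31) == 31
//   GetSharedMemPaddedIndex(32) == 33
//   GetSharedMemPaddedIndex(64) == 66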

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress(const unsigned int pixel_pitch) {
return pixel_pitch * (blockIdx.y + blockIdx.z * gridDim.y);
}

// Return the address of the 1D transform index 0. Right now testing for a stride of 2, but this could be modifiable if it works.
static __device__ __forceinline__ unsigned int Return1DFFTAddress_strided_Z(const unsigned int pixel_pitch) {
// In the current condition, threadIdx.z is either 0 or 1, and gridDim.z = size_z / 2
// index into a 2D tile in the XZ plane, for output in the ZX transposed plane (for coalesced writes).
return pixel_pitch * (blockIdx.y + (XZ_STRIDE * blockIdx.z + threadIdx.z) * gridDim.y);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int ReturnZplane(const unsigned int NX, const unsigned int NY) {
return (blockIdx.z * NX * NY);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_Z(const unsigned int NY) {
return blockIdx.y + (blockIdx.z * NY);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTColumn_XYZ_transpose(const unsigned int NX) {
// NX should be size_of<FFT>::value for this method. Should this be templated?
// Presumably the XZ axis is already transposed on the forward, used to index into this state. Indexes in the (ZY)' plane for input, to be transposed and permuted to output.
return NX * (XZ_STRIDE * (blockIdx.y + gridDim.y * blockIdx.z) + threadIdx.z);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose(const unsigned int X) {
return blockIdx.z + gridDim.z * (blockIdx.y + X * gridDim.y);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
// (IDX % XZ_STRIDE) -> transposed x coordinate in tile
// ((blockIdx.z*XZ_STRIDE) -> tile offset in physical X (with the above, gives physical X out (transposed Z))
// (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z)
// above * blockIdx.y -> offset in physical Y (transposed Y)
// (IDX / XZ_STRIDE) -> n elements physical Z (transposed X)
return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + (IDX / XZ_STRIDE) * gridDim.y);
}
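// Worked example of the decomposition above (illustrative numbers, XZ_STRIDE = 16): for IDX = 35,
// blockIdx.z = 2, blockIdx.y = 1, gridDim.z = 4, gridDim.y = 8:
//   transposed x in tile: 35 % 16 = 3, plus tile offset 2 * 16 -> physical X = 35
//   row length in physical X: 16 * 4 = 64
//   row index: 1 + (35 / 16) * 8 = 17
//   returned address: 35 + 64 * 17 = 1123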

static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
// (IDX % XZ_STRIDE) -> transposed x coordinate in tile
// ((blockIdx.z*XZ_STRIDE) -> tile offset in physical X (with the above, gives physical X out (transposed Z))
// (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z)
// above * blockIdx.y -> offset in physical Y (transposed Y)
// (IDX / XZ_STRIDE) -> n elements physical Z (transposed X)
return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.y);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + (IDX / XZ_STRIDE) * gridDim.z);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) {
// return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y );
return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.z);
}

// Return the address of the 1D transform index 0
static __device__ __forceinline__ unsigned int Return1DFFTColumn_XZ_to_XY( ) {
// return blockIdx.y + gridDim.y * ( blockIdx.z + gridDim.z * X);
return blockIdx.y + gridDim.y * blockIdx.z;
}

static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX_to_XY( ) {
return blockIdx.z + gridDim.z * blockIdx.y;
}

static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX( ) {
return Return1DFFTColumn_XZ_to_XY( );
}

// Complex a * conj b multiplication
template <typename ComplexType, typename ScalarType>
static __device__ __host__ inline auto ComplexConjMulAndScale(const ComplexType a, const ComplexType b, ScalarType s) -> decltype(b) {
ComplexType c;
c.x = s * (a.x * b.x + a.y * b.y);
c.y = s * (a.y * b.x - a.x * b.y);
return c;
}
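// The expansion above follows from a * conj(b) = (a.x + i a.y)(b.x - i b.y)
//                                              = (a.x*b.x + a.y*b.y) + i (a.y*b.x - a.x*b.y),
// then scaled by s. For example, a = (1, 2), b = (3, 4), s = 1 gives c = (11, 2).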

// GetCudaDeviceArch from https://github.com/mnicely/cufft_examples/blob/master/Common/cuda_helper.h
void GetCudaDeviceProps(DeviceProps& dp);

void CheckSharedMemory(int& memory_requested, DeviceProps& dp);
void CheckSharedMemory(unsigned int& memory_requested, DeviceProps& dp);
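// A minimal sketch of what CheckSharedMemory could do (assumptions: DeviceProps exposes a field
// like max_shared_bytes filled in by GetCudaDeviceProps; the field name here is hypothetical and
// the real definitions live in the .cu translation unit):
//
//   void CheckSharedMemory(int& memory_requested, DeviceProps& dp) {
//       MyFFTRunTimeAssertFalse(memory_requested > dp.max_shared_bytes,
//                               "Requested shared memory exceeds the device limit");
//   }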

using namespace cufftdx;

// TODO this probably needs to depend on the size of the xform, at least small vs large.
constexpr const int elements_per_thread_16 = 4;
constexpr const int elements_per_thread_32 = 8;
constexpr const int elements_per_thread_64 = 8;
constexpr const int elements_per_thread_128 = 8;
constexpr const int elements_per_thread_256 = 8;
constexpr const int elements_per_thread_512 = 8;
constexpr const int elements_per_thread_1024 = 8;
constexpr const int elements_per_thread_2048 = 8;
constexpr const int elements_per_thread_4096 = 8;
constexpr const int elements_per_thread_8192 = 16;
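// These constants are meant to feed cufftdx's ElementsPerThread operator when a block FFT type is
// assembled elsewhere. A sketch of how one of them could be used (the surrounding FFT description
// is assumed here, not taken from this header):
//
//   using FFT_64 = decltype(Size<64>( ) + Precision<float>( ) + Type<fft_type::c2c>( ) +
//                           Direction<fft_direction::forward>( ) +
//                           ElementsPerThread<elements_per_thread_64>( ) +
//                           FFTsPerBlock<1>( ) + SM<700>( ) + Block( ));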

namespace KernelFunction {

// Define an enum for different functors
// Intra Kernel Function Type
enum IKF_t { NOOP,
CONJ_MUL };

// Maybe there is a better way to check, but using the keyword final to statically check for non-NOOP types
template <class T, int N_ARGS, IKF_t U>
class my_functor {};

template <class T>
class my_functor<T, 0, IKF_t::NOOP> {
public:
__device__ __forceinline__
T
operator( )( ) {
printf("really specific NOOP\n");
return 0;
}
};

template <class T>
class my_functor<T, 2, IKF_t::CONJ_MUL> final {
public:
__device__ __forceinline__
void
operator( )(float& template_fft_x, float& template_fft_y, const float& target_fft_x, const float& target_fft_y) {
// Is there a better way than declaring this variable each time?
// This is target * conj(template)
float tmp = (template_fft_x * target_fft_x + template_fft_y * target_fft_y);
template_fft_y = (template_fft_x * target_fft_y - template_fft_y * target_fft_x);
template_fft_x = tmp;
}
};

} // namespace KernelFunction
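// Illustrative instantiation of the functors above (the kernels that consume them are presumably
// defined in FastFFT.cu, not in this header; the type aliases are placeholders):
//
//   using noop_t     = KernelFunction::my_functor<float, 0, KernelFunction::IKF_t::NOOP>;
//   using conj_mul_t = KernelFunction::my_functor<float, 2, KernelFunction::IKF_t::CONJ_MUL>;
//
//   conj_mul_t conj_mul;
//   // In device code: conj_mul(template_x, template_y, target_x, target_y) overwrites the
//   // template value with target * conj(template), matching the specialization above.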

// constexpr const std::map<unsigned int, unsigned int> elements_per_thread = {
// {16, 4}, {"GPU", 15}, {"RAM", 20},
// };