diff --git a/.vscode_shared/BenHimes/settings.json b/.vscode_shared/BenHimes/settings.json index 5eb033f..f774bbd 100644 --- a/.vscode_shared/BenHimes/settings.json +++ b/.vscode_shared/BenHimes/settings.json @@ -77,7 +77,9 @@ "unordered_set": "cpp", "__config": "cpp", "target": "cpp", - "ios": "cpp" + "ios": "cpp", + "__pragma_push": "cpp", + "__locale": "cpp" }, "C_Cpp.clang_format_path": "/usr/bin/clang-format-14", "editor.formatOnSave": true, diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh index 17d094d..3c6e323 100644 --- a/include/FastFFT.cuh +++ b/include/FastFFT.cuh @@ -23,15 +23,7 @@ -template -constexpr inline bool IS_IKF_t( ) { - if constexpr ( std::is_final_v ) { - return true; - } - else { - return false; - } -}; + @@ -392,24 +384,24 @@ template __launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel_C2C_WithPadding_SwapRealSpaceQuadrants(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); -template +template __launch_bounds__(invFFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); -template +template __launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); -template +template __launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); -template -__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, +template +__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type 
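The kernel declarations in FastFFT.cuh above are reworked so the search-image argument is templated on its element type (ExternalImage_t) rather than on a pointer type, with const, * and __restrict__ spelled out once in the signature. A minimal sketch of the before/after pattern, using placeholder kernel names rather than the library's full parameter lists:

// Before: the template parameter is itself a pointer type (e.g. __half2* or float2*).
template <class FFT, class ComplexType, class ExternalImagePtr_t>
__global__ void kernel_before(const ExternalImagePtr_t __restrict__ image_to_search,
                              const ComplexType* __restrict__ input_values);

// After: the template parameter is the element type; pointer-ness, const and
// __restrict__ are written in the signature, matching the other arguments.
template <class FFT, class ComplexType, class ExternalImage_t>
__global__ void kernel_after(const ExternalImage_t* __restrict__ image_to_search,
                             const ComplexType* __restrict__ input_values);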
workspace_inv); template @@ -464,8 +456,8 @@ __global__ void thread_fft_kernel_R2C_decomposed_transposed(const ScalarType* __ template __global__ void thread_fft_kernel_C2C_decomposed(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); -template -__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); +template +__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); ///////////// // C2R @@ -493,6 +485,39 @@ __global__ void clip_into_real_kernel(InputType* real_values_gpu, int3 wanted_coordinate_of_box_center, OutputBaseType wanted_padding_value); +// TODO: this may be expanded, for now it is to be used in the case where we have +// packed the real values of a c2r into the first half of the complex array. +// The output type pointer needs to be cast to the correct type AND posibly converted +template +inline __device__ WantedType convert_if_needed(const GivenType* __restrict__ ptr, const int idx) { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + WantedType ret; + if constexpr ( std::is_same_v || std::is_same_v ) { + if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { + ret = __float2half_rn(reinterpret_cast(ptr)[idx]); + return std::move(ret); + } + if constexpr ( std::is_same_v ) { + ret = reinterpret_cast(ptr)[idx]; + return std::move(ret); + } + else { + static_assert_type_name(ret); + } + } + else { + // Currently only setup for the above + static_assert_type_name(ptr); + } + } + else { + // Currently only setup for the above + static_assert_type_name(ptr); + } +} + ////////////////////////////////////////////// // IO functions adapted from the cufftdx examples /////////////////////////////// @@ -502,7 +527,9 @@ struct io { using complex_type = typename FFT::value_type; using scalar_type = typename complex_type::value_type; - static inline __device__ unsigned int stride_size( ) { + static inline __device__ unsigned int + stride_size( ) { + return cufftdx::size_of::value / FFT::elements_per_thread; } @@ -517,9 +544,9 @@ struct io { } } - static inline __device__ void store_r2c(const complex_type* thread_data, - complex_type* output, - int offset) { + static inline __device__ void store_r2c(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int offset) { const unsigned int stride = stride_size( ); unsigned int index = offset + threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -539,14 +566,14 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q) { + static inline __device__ void load_shared(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + 
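The new convert_if_needed() helper added above reads element idx from a buffer that may hold the packed real output of a c2r transform and converts it to the requested output type at compile time. A minimal standalone sketch of the if-constexpr dispatch it relies on, with assumed template parameters (the real helper is additionally keyed off FFT::value_type and handles more type combinations):

#include <cuda_fp16.h>
#include <type_traits>

template <typename WantedType, typename GivenType>
inline __device__ WantedType convert_if_needed_sketch(const GivenType* __restrict__ ptr, const int idx) {
    if constexpr ( std::is_same_v<WantedType, __half> && std::is_same_v<GivenType, float> ) {
        return __float2half_rn(ptr[idx]); // round-to-nearest float -> half conversion
    }
    else if constexpr ( std::is_same_v<WantedType, GivenType> ) {
        return ptr[idx]; // types already match, plain copy
    }
    else {
        static_assert(std::is_same_v<WantedType, GivenType>, "conversion not handled in this sketch");
    }
}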
float* __restrict__ twiddle_factor_args, + float twiddle_in, + int* input_map, + int* __restrict__ output_map, + int Q) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -562,11 +589,11 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in) { + static inline __device__ void load_shared(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -577,15 +604,15 @@ struct io { } } - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q, - int number_of_elements) { + static inline __device__ void load_shared(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in, + int* __restrict__ input_map, + int* __restrict__ output_map, + int Q, + int number_of_elements) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -606,14 +633,14 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - alternatively, load to registers then copy but leave in register for firt compute - static inline __device__ void load_r2c_shared(const scalar_type* input, - scalar_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q) { + static inline __device__ void load_r2c_shared(const scalar_type* __restrict__ input, + scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in, + int* __restrict__ input_map, + int* __restrict__ output_map, + int Q) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -631,11 +658,11 @@ struct io { // Since we can make repeated use of the same shared memory for each sub-fft // we use this method to load into shared mem instead of directly to registers // TODO set this up for async mem load - alternatively, load to registers then copy but leave in register for firt compute - static inline __device__ void load_r2c_shared(const scalar_type* input, - scalar_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in) { + static inline __device__ void load_r2c_shared(const scalar_type* __restrict__ input, + scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + float* __restrict__ twiddle_factor_args, + float twiddle_in) { const 
unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -647,8 +674,8 @@ struct io { } } - static inline __device__ void load_r2c_shared_and_pad(const scalar_type* input, - complex_type* shared_mem) { + static inline __device__ void load_r2c_shared_and_pad(const scalar_type* __restrict__ input, + complex_type* __restrict__ shared_mem) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -658,9 +685,9 @@ struct io { __syncthreads( ); } - static inline __device__ void copy_from_shared(const complex_type* shared_mem, - complex_type* thread_data, - const unsigned int Q) { + static inline __device__ void copy_from_shared(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data, + const unsigned int Q) { const unsigned int stride = stride_size( ) * Q; // I think the Q is needed, but double check me TODO unsigned int index = (threadIdx.x * Q) + threadIdx.z; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -673,8 +700,8 @@ struct io { // Note that unlike most functions in this file, this one does not have a // const decorator on the thread mem, as we want to modify it with the twiddle factors // before reducing the full shared mem space. - static inline __device__ void reduce_block_fft(complex_type* thread_data, - complex_type* shared_mem, + static inline __device__ void reduce_block_fft(complex_type* __restrict__ thread_data, + complex_type* __restrict__ shared_mem, const float twiddle_in, const unsigned int Q) { const unsigned int stride = stride_size( ); @@ -706,10 +733,10 @@ struct io { } } - static inline __device__ void store_r2c_reduced(const complex_type* thread_data, - complex_type* output, - const unsigned int pixel_pitch, - const unsigned int memory_limit) { + static inline __device__ void store_r2c_reduced(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + const unsigned int pixel_pitch, + const unsigned int memory_limit) { if ( threadIdx.z == 0 ) { // Finally we write out the first size_of::values to global const unsigned int stride = stride_size( ); @@ -726,25 +753,25 @@ struct io { // when using load_shared || load_r2c_shared, we need then copy from shared mem into the registers. // notice we still need the packed complex values for the xform. 
- static inline __device__ void copy_from_shared(const scalar_type* shared_input, - complex_type* thread_data, - int* input_map) { + static inline __device__ void copy_from_shared(const scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data, + int* input_map) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { thread_data[i].x = shared_input[input_map[i]]; thread_data[i].y = 0.0f; } } - static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, - complex_type* thread_data, - int* input_map) { + static inline __device__ void copy_from_shared(const complex_type* __restrict__ shared_input_complex, + complex_type* __restrict__ thread_data, + int* __restrict__ input_map) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { thread_data[i] = shared_input_complex[input_map[i]]; } } - static inline __device__ void copy_from_shared(const scalar_type* shared_input, - complex_type* thread_data) { + static inline __device__ void copy_from_shared(const scalar_type* __restrict__ shared_input, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -754,8 +781,8 @@ struct io { } } - static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, - complex_type* thread_data) { + static inline __device__ void copy_from_shared(const complex_type* __restrict__ shared_input_complex, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -764,13 +791,13 @@ struct io { } } - template - static inline __device__ void load_shared_and_conj_multiply(const ExternalImagePtr_t* image_to_search, - complex_type* thread_data) { + template + static inline __device__ void load_shared_and_conj_multiply(ExternalImage_t* __restrict__ image_to_search, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; complex_type c; - if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { c.x = (thread_data[i].x * __low2float(image_to_search[index]) + thread_data[i].y * __high2float(image_to_search[index].y)); c.y = (thread_data[i].y * __low2float(image_to_search[index]) - thread_data[i].x * __high2float(image_to_search[index].y)); @@ -778,7 +805,7 @@ struct io { index += stride; } } - else if constexpr ( std::is_same_v ) { + else if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { c.x = (thread_data[i].x * image_to_search[index].x + thread_data[i].y * image_to_search[index].y); c.y = (thread_data[i].y * image_to_search[index].x - thread_data[i].x * image_to_search[index].y); @@ -792,14 +819,14 @@ struct io { } // TODO: set user lambda to default = false, then get rid of other load_shared - template - static inline __device__ void load_shared(const ExternalImagePtr_t image_to_search, - complex_type* thread_data, - FunctionType intra_op_functor = nullptr) { + template + static inline __device__ void load_shared(const ExternalImage_t* __restrict__ image_to_search, + complex_type* __restrict__ thread_data, + FunctionType intra_op_functor = nullptr) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; if constexpr ( IS_IKF_t( ) ) { - if constexpr ( 
std::is_same_v ) { + if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { intra_op_functor(thread_data[i].x, thread_data[i].y, __low2float(image_to_search[index]), __high2float(image_to_search[index])); //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); index += stride; @@ -823,8 +850,8 @@ struct io { // Now we need send to shared mem and transpose on the way // TODO: fix bank conflicts later. - static inline __device__ void transpose_r2c_in_shared_XZ(complex_type* shared_mem, - complex_type* thread_data) { + static inline __device__ void transpose_r2c_in_shared_XZ(complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -842,8 +869,8 @@ struct io { // Now we need send to shared mem and transpose on the way // TODO: fix bank conflicts later. - static inline __device__ void transpose_in_shared_XZ(complex_type* shared_mem, - complex_type* thread_data) { + static inline __device__ void transpose_in_shared_XZ(complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data) { const unsigned int stride = io::stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -855,8 +882,8 @@ struct io { __syncthreads( ); } - static inline __device__ void store_r2c_transposed_xz(const complex_type* thread_data, - complex_type* output) { + static inline __device__ void store_r2c_transposed_xz(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -873,8 +900,8 @@ struct io { } // Store a transposed tile, made up of contiguous (full) FFTS - static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output) { + static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; @@ -892,10 +919,10 @@ struct io { // Store a transposed tile, made up of non-contiguous (strided partial) FFTS // - static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { + static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output, + const unsigned int Q, + const unsigned int sub_fft) { const unsigned int stride = stride_size( ); constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; @@ -911,8 +938,8 @@ struct io { __syncthreads( ); } - static inline __device__ void store_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output) { + static inline __device__ void store_transposed_xz_strided_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + threadIdx.z * cufftdx::size_of::value; for ( 
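load_shared_and_conj_multiply() above correlates the thread's transform values against the search image by multiplying with the complex conjugate of the image value, with a separate branch when the image is stored as __half2. A minimal per-element sketch of that conjugate multiply, assuming a float2 accumulator (the kernels themselves work on FFT::value_type):

#include <cuda_fp16.h>

__device__ inline float2 conj_mul_sketch(const float2 a, const __half2 b) {
    const float br = __low2float(b);   // real part of the image value
    const float bi = __high2float(b);  // imaginary part of the image value
    float2 c;
    c.x = a.x * br + a.y * bi; // Re(a * conj(b))
    c.y = a.y * br - a.x * bi; // Im(a * conj(b))
    return c;
}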
unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -922,9 +949,9 @@ struct io { __syncthreads( ); } - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int pixel_pitch) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int pixel_pitch) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -940,10 +967,10 @@ struct io { } } - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int* output_MAP, - int pixel_pitch) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* output_MAP, + int pixel_pitch) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index @@ -958,11 +985,11 @@ struct io { } } - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int* output_MAP, - int pixel_pitch, - int memory_limit) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ output_MAP, + int pixel_pitch, + int memory_limit) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i <= FFT::elements_per_thread / 2; i++ ) { // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index @@ -982,8 +1009,8 @@ struct io { // } } - static inline __device__ void load_c2r(const complex_type* input, - complex_type* thread_data) { + static inline __device__ void load_c2r(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -999,9 +1026,9 @@ struct io { } } - static inline __device__ void load_c2r_transposed(const complex_type* input, - complex_type* thread_data, - unsigned int pixel_pitch) { + static inline __device__ void load_c2r_transposed(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + unsigned int pixel_pitch) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -1017,9 +1044,9 @@ struct io { } } - static inline __device__ void load_c2r_shared_and_pad(const complex_type* input, - complex_type* shared_mem, - const unsigned int pixel_pitch) { + static inline __device__ void load_c2r_shared_and_pad(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_mem, + const unsigned int pixel_pitch) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { @@ -1037,8 +1064,8 @@ struct io { } // this may benefit from asynchronous execution - static inline __device__ void load(const complex_type* input, - complex_type* thread_data) { + static inline __device__ void load(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data) { 
const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1050,9 +1077,9 @@ struct io { } // this may benefit from asynchronous execution - static inline __device__ void load(const complex_type* input, - complex_type* thread_data, - int last_index_to_load) { + static inline __device__ void load(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + int last_index_to_load) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1066,10 +1093,10 @@ struct io { // TODO: set pre_op_functor to default=false and get rid of other load template - static inline __device__ void load(const complex_type* input, - complex_type* thread_data, - int last_index_to_load, - FunctionType pre_op_functor = nullptr) { + static inline __device__ void load(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + int last_index_to_load, + FunctionType pre_op_functor = nullptr) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; if constexpr ( IS_IKF_t( ) ) { @@ -1092,9 +1119,9 @@ struct io { } } - static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, - complex_type* output, - int first_negative_index) { + static inline __device__ void store_and_swap_quadrants(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int first_negative_index) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; complex_type phase_shift; @@ -1112,10 +1139,10 @@ struct io { } } - static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, - complex_type* output, - int* source_idx, - int first_negative_index) { + static inline __device__ void store_and_swap_quadrants(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ source_idx, + int first_negative_index) { const unsigned int stride = stride_size( ); complex_type phase_shift; int logical_y; @@ -1132,9 +1159,9 @@ struct io { } template - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - FunctionType post_op_functor = nullptr) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + FunctionType post_op_functor = nullptr) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; if constexpr ( IS_IKF_t( ) ) { @@ -1151,10 +1178,10 @@ struct io { } } - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + const unsigned int Q, + const unsigned int sub_fft) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1163,8 +1190,8 @@ struct io { } } - static inline __device__ void store_Z(const complex_type* shared_mem, - complex_type* output) { + static inline __device__ void store_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + threadIdx.z * size_of::value; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ 
) { @@ -1174,10 +1201,10 @@ struct io { } } - static inline __device__ void store_Z(const complex_type* shared_mem, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { + static inline __device__ void store_Z(const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ output, + const unsigned int Q, + const unsigned int sub_fft) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + threadIdx.z * size_of::value; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1187,9 +1214,9 @@ struct io { __syncthreads( ); } - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - unsigned int memory_limit) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + unsigned int memory_limit) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1199,9 +1226,9 @@ struct io { } } - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - int* source_idx) { + static inline __device__ void store(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ source_idx) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // If no kernel based changes are made to source_idx, this will be the same as the original index value @@ -1209,9 +1236,9 @@ struct io { } } - static inline __device__ void store_subset(const complex_type* thread_data, - complex_type* output, - int* source_idx) { + static inline __device__ void store_subset(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ source_idx) { const unsigned int stride = stride_size( ); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // If no kernel based changes are made to source_idx, this will be the same as the original index value @@ -1220,9 +1247,9 @@ struct io { } } - static inline __device__ void store_coalesced(const complex_type* shared_output, - complex_type* global_output, - int offset) { + static inline __device__ void store_coalesced(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ global_output, + int offset) { const unsigned int stride = stride_size( ); unsigned int index = offset + threadIdx.x; for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1231,8 +1258,8 @@ struct io { } } - static inline __device__ void load_c2c_shared_and_pad(const complex_type* input, - complex_type* shared_mem) { + static inline __device__ void load_c2c_shared_and_pad(const complex_type* __restrict__ input, + complex_type* __restrict__ shared_mem) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { @@ -1242,8 +1269,8 @@ struct io { __syncthreads( ); } - static inline __device__ void store_c2c_reduced(const complex_type* thread_data, - complex_type* output) { + static inline __device__ void store_c2c_reduced(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output) { if ( threadIdx.z == 0 ) { // Finally we write out the first size_of::values to global const unsigned int stride = stride_size( ); @@ -1258,27 +1285,28 @@ struct io { } } - static inline __device__ void store_c2r_reduced(const complex_type* 
thread_data, - scalar_type* output) { + static inline __device__ void store_c2r_reduced(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ output) { if ( threadIdx.z == 0 ) { // Finally we write out the first size_of::values to global const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { if ( index < size_of::value ) { // transposed index. - output[index] = reinterpret_cast(thread_data)[i]; + output[index] = convert_if_needed(thread_data, i); } index += stride; } } } - static inline __device__ void store_transposed(const complex_type* thread_data, - complex_type* output, - int* output_map, - int* rotated_offset, - int memory_limit) { + static inline __device__ void store_transposed(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ output, + int* __restrict__ output_map, + int* __restrict__ rotated_offset, + int memory_limit) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // If no kernel based changes are made to source_idx, this will be the same as the original index value if ( output_map[i] < memory_limit ) @@ -1286,25 +1314,27 @@ struct io { } } - static inline __device__ void store_c2r(const complex_type* thread_data, - scalar_type* output) { + static inline __device__ void store_c2r(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ output) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[index] = reinterpret_cast(thread_data)[i]; + output[index] = convert_if_needed(thread_data, i); index += stride; } } - static inline __device__ void store_c2r(const complex_type* thread_data, - scalar_type* output, - unsigned int memory_limit) { + static inline __device__ void store_c2r(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ output, + unsigned int memory_limit) { const unsigned int stride = stride_size( ); unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { // TODO: does reinterpret_cast(thread_data)[i] make more sense than just thread_data[i].x?? 
if ( index < memory_limit ) - output[index] = reinterpret_cast(thread_data)[i]; + output[index] = convert_if_needed(thread_data, i); index += stride; } } @@ -1315,9 +1345,9 @@ struct io_thread { using complex_type = typename FFT::value_type; using scalar_type = typename complex_type::value_type; - static inline __device__ void load_r2c(const scalar_type* input, - complex_type* thread_data, - const int stride) { + static inline __device__ void load_r2c(const scalar_type* __restrict__ input, + complex_type* __restrict__ thread_data, + const int stride) { unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value; i++ ) { thread_data[i].x = input[index]; @@ -1326,10 +1356,10 @@ struct io_thread { } } - static inline __device__ void store_r2c(const complex_type* shared_output, - complex_type* output, - const int stride, - const int memory_limit) { + static inline __device__ void store_r2c(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ output, + const int stride, + const int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { @@ -1341,11 +1371,11 @@ struct io_thread { } } - static inline __device__ void store_r2c_transposed_xy(const complex_type* shared_output, - complex_type* output, - int stride, - int pixel_pitch, - int memory_limit) { + static inline __device__ void store_r2c_transposed_xy(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ output, + int stride, + int pixel_pitch, + int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { @@ -1357,11 +1387,11 @@ struct io_thread { } } - static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, - complex_type* shared_output, - float twiddle_in, - int Q, - int memory_limit) { + static inline __device__ void remap_decomposed_segments(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ shared_output, + float twiddle_in, + int Q, + int memory_limit) { // Unroll the first loop and initialize the shared mem. 
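store_c2r() and store_c2r_reduced() above write out the c2r result, whose real samples sit packed two per complex register; the old reinterpret_cast read is replaced by convert_if_needed() so the write can also change precision. A sketch of the packing assumption, using float2 as a stand-in for the complex register type:

__device__ inline float read_packed_real_sketch(const float2* __restrict__ thread_data, const int i) {
    // After the c2r stage, real sample 2k sits in thread_data[k].x and
    // sample 2k+1 in thread_data[k].y, so a scalar view indexes them linearly.
    return reinterpret_cast<const float*>(thread_data)[i];
}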
complex_type twiddle; int index = threadIdx.x * size_of::value; @@ -1389,9 +1419,9 @@ struct io_thread { __syncthreads( ); } - static inline __device__ void load_c2c(const complex_type* input, - complex_type* thread_data, - const int stride) { + static inline __device__ void load_c2c(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + const int stride) { unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value; i++ ) { thread_data[i] = input[index]; @@ -1399,9 +1429,9 @@ struct io_thread { } } - static inline __device__ void store_c2c(const complex_type* shared_output, - complex_type* output, - const int stride) { + static inline __device__ void store_c2c(const complex_type* __restrict__ shared_output, + complex_type* __restrict__ output, + const int stride) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; for ( unsigned int i = 0; i < size_of::value; i++ ) { @@ -1410,10 +1440,10 @@ struct io_thread { } } - static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, - complex_type* shared_output, - float twiddle_in, - int Q) { + static inline __device__ void remap_decomposed_segments(const complex_type* __restrict__ thread_data, + complex_type* __restrict__ shared_output, + float twiddle_in, + int Q) { // Unroll the first loop and initialize the shared mem. complex_type twiddle; int index = threadIdx.x * size_of::value; @@ -1438,10 +1468,10 @@ struct io_thread { __syncthreads( ); } - static inline __device__ void load_c2r(const complex_type* input, - complex_type* thread_data, - const int stride, - const int memory_limit) { + static inline __device__ void load_c2r(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + const int stride, + const int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; unsigned int offset = 2 * memory_limit - 2; @@ -1460,11 +1490,11 @@ struct io_thread { } // FIXME as above - static inline __device__ void load_c2r_transposed(const complex_type* input, - complex_type* thread_data, - int stride, - int pixel_pitch, - int memory_limit) { + static inline __device__ void load_c2r_transposed(const complex_type* __restrict__ input, + complex_type* __restrict__ thread_data, + int stride, + int pixel_pitch, + int memory_limit) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; // unsigned int offset = 2*memory_limit - 2; @@ -1483,10 +1513,10 @@ struct io_thread { } } - static inline __device__ void remap_decomposed_segments_c2r(const complex_type* thread_data, - scalar_type* shared_output, - scalar_type twiddle_in, - int Q) { + static inline __device__ void remap_decomposed_segments_c2r(const complex_type* __restrict__ thread_data, + scalar_type* __restrict__ shared_output, + scalar_type twiddle_in, + int Q) { // Unroll the first loop and initialize the shared mem. 
complex_type twiddle; int index = threadIdx.x * size_of::value; @@ -1510,25 +1540,26 @@ struct io_thread { __syncthreads( ); } - static inline __device__ void store_c2r(const scalar_type* shared_output, - scalar_type* output, - const int stride) { + static inline __device__ void store_c2r(const scalar_type* __restrict__ shared_output, + scalar_type* __restrict__ output, + const int stride) { // Each thread reads in the input data at stride = mem_offsets.Q unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value; i++ ) { - output[index] = shared_output[index]; + output[index] = convert_if_needed(shared_output, index); index += stride; } } - template - static inline __device__ void load_shared_and_conj_multiply(const ExternalImagePtr_t image_to_search, - const complex_type* shared_mem, - complex_type* thread_data, - const int stride) { + template + static inline __device__ void load_shared_and_conj_multiply(const ExternalImage_t* __restrict__ image_to_search, + const complex_type* __restrict__ shared_mem, + complex_type* __restrict__ thread_data, + const int stride) { unsigned int index = threadIdx.x; complex_type c; - if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { c.x = (shared_mem[index].x * __low2float(image_to_search[index]) + shared_mem[index].y * __high2float(image_to_search[index])); diff --git a/include/FastFFT.h b/include/FastFFT.h index ffd574f..25aaf72 100644 --- a/include/FastFFT.h +++ b/include/FastFFT.h @@ -30,6 +30,16 @@ inline constexpr _Tp pi_v = 3.141592653589793238462643383279502884L; #include "../src/fastfft/types.cuh" +template +constexpr inline bool IS_IKF_t( ) { + if constexpr ( std::is_final_v ) { + return true; + } + else { + return false; + } +}; + template __device__ __host__ inline void static_assert_type_name(T v) { @@ -45,6 +55,7 @@ __device__ __host__ inline void static_assert_type_name(T v) { static_assert(! std::is_same_v, "double*"); static_assert(! std::is_same_v, "__half*"); static_assert(! std::is_same_v, "__half2*"); + static_assert(! std::is_same_v, "nullptr_t"); } else { static_assert(! std::is_same_v, "int"); @@ -297,11 +308,11 @@ class FourierTransformer { // For the time being, the caller is responsible for having the memory allocated for any of these input/output pointers. void SetInputPointer(InputType* input_pointer); - template - void SetOutputPointer(ExternalImagePtr_t* output_pointer); + template + void SetOutputPointer(ExternalImage_t* output_pointer); - template - void SetExternalImagePointer(ExternalImagePtr_t* output_pointer); + template + void SetExternalImagePointer(ExternalImage_t* output_pointer); // When passing in a pointer from python (cupy or pytorch) it is a long, and needs to be cast to input type. // For now, we are assuming memory ops are all handled in the python code. void SetInputPointerFromPython(long input_pointer); @@ -562,6 +573,8 @@ class FourierTransformer { */ + // FIXME: in the execution blocks, we should have some check that the correct direction is implemented. 
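IS_IKF_t(), moved into FastFFT.h above, is the compile-time test for whether an intra-kernel functor was actually supplied: the default arguments are nullptr (type nullptr_t), which is not a final class type, whereas the lambda closure types passed in practice are what the std::is_final_v check keys on. A minimal sketch of that gate and of how a loader can use it (names assumed):

#include <type_traits>

template <class FunctorType>
constexpr inline bool is_intra_kernel_functor_sketch( ) {
    // nullptr_t and other non-class types report false; a supplied functor/lambda reports true.
    return std::is_final_v<FunctorType>;
}

template <class PreOpType>
__device__ void maybe_apply_pre_op_sketch(float& x, PreOpType pre_op) {
    if constexpr ( is_intra_kernel_functor_sketch<PreOpType>( ) ) {
        x = pre_op(x); // only instantiated when a callable was actually passed
    }
}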
+ // Or better yet, have this templated and enum KernelType { r2c_decomposed, // 1D fwd r2c_decomposed_transposed, // 2d fwd 1st stage r2c_none_XY, // 1d fwd // 2d fwd 1st stage @@ -579,7 +592,8 @@ class FourierTransformer { c2c_inv_none_XYZ, c2c_inv_decrease, c2c_inv_increase, - c2c_decomposed, + c2c_fwd_decomposed, + c2c_inv_decomposed, c2r_decomposed, c2r_decomposed_transposed, c2r_none, @@ -601,7 +615,8 @@ class FourierTransformer { "r2c_decrease_XY", "r2c_increase_XY", "r2c_increase_XZ", "c2c_fwd_none", "c2c_fwd_none_XYZ", "c2c_fwd_increase", "c2c_fwd_increase", "c2c_fwd_increase_XYZ", "c2c_inv_none", "c2c_inv_none_XZ", "c2c_inv_none_XYZ", "c2c_inv_increase", "c2c_inv_increase", - "c2c_decomposed", + "c2c_fwd_decomposed", + "c2c_inv_decomposed", "c2r_decomposed", "c2r_decomposed_transposed", "c2r_none", "c2r_none_XY", "c2r_decrease_XY", "c2r_increase", @@ -614,7 +629,7 @@ class FourierTransformer { inline bool IsThreadType(KernelType kernel_type) { if ( kernel_type == r2c_decomposed || kernel_type == r2c_decomposed_transposed || - kernel_type == c2c_decomposed || + kernel_type == c2c_fwd_decomposed || c2c_inv_decomposed || kernel_type == c2r_decomposed || kernel_type == c2r_decomposed_transposed || kernel_type == xcorr_decomposed ) { return true; } @@ -717,7 +732,7 @@ class FourierTransformer { void GetTransformSize(KernelType kernel_type); void GetTransformSize_thread(KernelType kernel_type, int thread_fft_size); - LaunchParams SetLaunchParameters(const int& ept, KernelType kernel_type, bool do_forward_transform = true); + LaunchParams SetLaunchParameters(const int& ept, KernelType kernel_type); inline int ReturnPaddedMemorySize(short4& wanted_dims) { // Assumes a) SetInputDimensionsAndType has been called and is_fftw_padded is set before this call. (Currently RuntimeAssert to die if false) FIXME @@ -756,12 +771,13 @@ class FourierTransformer { // 1. // First call passed from a public transform function, selects block or thread and the transform precision. template // bool is just used as a dummy type - void SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform = true, PreOpType pre_op_functor = nullptr, IntraOpType intra_op_functor = nullptr, PostOpType post_op_functor = nullptr); + typename std::enable_if_t( ))> + SetPrecisionAndExectutionMethod(KernelType kernel_type, PreOpType pre_op_functor = nullptr, IntraOpType intra_op_functor = nullptr, PostOpType post_op_functor = nullptr); // 2. // TODO: remove this now that the functors are working // Check to see if any intra kernel functions are wanted, and if so set the appropriate device pointers. template - void SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SetIntraKernelFunctions(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); // 3. // Second call, sets size of the transform kernel, selects the appropriate GPU arch @@ -771,15 +787,15 @@ class FourierTransformer { // This allows us to iterate through a set of constexpr sizes passed as a template parameter pack. The value is in providing a means to have different size packs // for different fft configurations, eg. 
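With c2c_decomposed split into the fwd/inv pair, each value in IsThreadType() presumably needs its own kernel_type == comparison; a sketch of the intended check (assumes the KernelType enum declared above):

inline bool IsThreadType_sketch(KernelType kernel_type) {
    return ( kernel_type == r2c_decomposed || kernel_type == r2c_decomposed_transposed ||
             kernel_type == c2c_fwd_decomposed || kernel_type == c2c_inv_decomposed ||
             kernel_type == c2r_decomposed || kernel_type == c2r_decomposed_transposed ||
             kernel_type == xcorr_decomposed );
}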
2d vs 3d template - void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); template - void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); // 3. // Third call, sets the input and output dimensions and type template - void SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + void SetAndLaunchKernel(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); void PrintLaunchParameters(LaunchParams LP) { std::cerr << "Launch parameters: " << std::endl; @@ -806,7 +822,8 @@ class FourierTransformer { // If the user doesn't specify input/output pointers, assume the are copied into the FastFFT bufferspace. // TODO: this will only work for 2d as the output in 3d should be in d_ptr.position_space_buffer template - void Generic_Fwd_Image_Inv(PreOpType pre_op = nullptr, IntraOpType intra_op = nullptr, PostOpType post_op = nullptr); + typename std::enable_if_t( )> + Generic_Fwd_Image_Inv(PreOpType pre_op = nullptr, IntraOpType intra_op = nullptr, PostOpType post_op = nullptr); template void Generic_Fwd(PreOpType pre_op = nullptr, IntraOpType intra_op = nullptr); diff --git a/src/fastfft/FastFFT.cu b/src/fastfft/FastFFT.cu index 2efc577..c4c6fcd 100644 --- a/src/fastfft/FastFFT.cu +++ b/src/fastfft/FastFFT.cu @@ -9,9 +9,9 @@ namespace FastFFT { -template +template __launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int apparent_Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { @@ -254,8 +254,8 @@ void FourierTransformer::SetIn } template -template -void FourierTransformer::SetOutputPointer(ExternalImagePtr_t* output_pointer) { +template +void FourierTransformer::SetOutputPointer(ExternalImage_t* output_pointer) { MyFFTDebugAssertTrue(is_set_input_params && is_set_output_params, "Input and output parameters not set"); if ( output_pointer == nullptr ) { @@ -263,11 +263,11 @@ void FourierTransformer::SetOu d_ptr.external_output_complex = nullptr; } else { - if constexpr ( std::is_same_v ) { + if constexpr ( std::is_same_v ) { d_ptr.external_output = output_pointer; d_ptr.external_output_complex = nullptr; } - else if constexpr ( std::is_same_v ) { + else if constexpr ( std::is_same_v ) { d_ptr.external_output_complex = output_pointer; d_ptr.external_output = nullptr; } @@ -295,8 +295,8 @@ void FourierTransformer::SetOu } template -template -void FourierTransformer::SetExternalImagePointer(ExternalImagePtr_t* ptr) { +template +void 
FourierTransformer::SetExternalImagePointer(ExternalImage_t* ptr) { MyFFTDebugAssertTrue(is_set_input_params && is_set_output_params, "Input and output parameters not set"); d_ptr.image_to_search = ptr; @@ -482,35 +482,34 @@ void FourierTransformer::Gener SetDimensions(DimensionCheckType::FwdTransform); // All placeholders - constexpr bool use_thread_method = false; - const bool do_forward_transform = true; + constexpr bool use_thread_method = false; // const bool swap_real_space_quadrants = false; // const bool transpose_output = true; - // SetPrecisionAndExectutionMethod(((r2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); //FFT_R2C_decomposed(transpose_output); + SetPrecisionAndExectutionMethod(r2c_decomposed, pre_op_functor, intra_op_functor); //FFT_R2C_decomposed(transpose_output); else - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decomposed, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; } else { if ( is_real_valued_input ) { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_none_XY, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_decrease_XY, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_increase_XY, pre_op_functor, intra_op_functor); break; } default: { @@ -522,15 +521,15 @@ void FourierTransformer::Gener switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { MyFFTDebugAssertTrue(false, "Complex input images are not yet supported"); // FIXME: - SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2c_fwd_decrease, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decrease, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, pre_op_functor, intra_op_functor); break; } default: { @@ -547,27 +546,27 @@ void FourierTransformer::Gener // FIXME there is some redundancy in specifying _decomposed and use_thread_method // Note: the only time the non-transposed method should be used is for 1d data. if ( use_thread_method ) { - SetPrecisionAndExectutionMethod(r2c_decomposed_transposed, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_decomposed_transposed, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decomposed, pre_op_functor, intra_op_functor); } else { - SetPrecisionAndExectutionMethod(r2c_none_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_none_XY, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none, pre_op_functor, intra_op_functor); } break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_increase_XY, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease_XY, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_decrease_XY, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_decrease, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_decrease, pre_op_functor, intra_op_functor); break; } } @@ -575,17 +574,17 @@ void FourierTransformer::Gener else if constexpr ( Rank == 3 ) { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XZ, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_none_XZ, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_none_XYZ, do_forward_transform, pre_op_functor, intra_op_functor); - SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none_XYZ, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XZ, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(r2c_increase_XZ, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ, do_forward_transform, pre_op_functor, intra_op_functor); - SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, pre_op_functor, intra_op_functor); // SetPrecisionAndExectutionMethod(::Gener SetDimensions(DimensionCheckType::InvTransform); // All placeholders - constexpr bool use_thread_method = false; - const bool do_forward_transform = false; + constexpr bool use_thread_method = false; // const bool swap_real_space_quadrants = false; // const bool transpose_output = true; @@ -619,24 +617,24 @@ void FourierTransformer::Gener // Note: the only time the non-transposed method should be used is for 1d data. if constexpr ( use_thread_method ) { if ( is_real_valued_input ) - SetPrecisionAndExectutionMethod(c2r_decomposed, do_forward_transform); //FFT_R2C_decomposed(transpose_output); + SetPrecisionAndExectutionMethod(c2r_decomposed, intra_op, post_op); //FFT_R2C_decomposed(transpose_output); else - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2c_inv_decomposed, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; } else { if ( is_real_valued_input ) { switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(c2r_none_XY); + SetPrecisionAndExectutionMethod(c2r_none_XY, intra_op, post_op); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2r_decrease_XY); + SetPrecisionAndExectutionMethod(c2r_decrease_XY, intra_op, post_op); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2r_increase); + SetPrecisionAndExectutionMethod(c2r_increase, intra_op, post_op); break; } default: { @@ -647,15 +645,15 @@ void FourierTransformer::Gener else { switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(c2c_inv_none); + SetPrecisionAndExectutionMethod(c2c_inv_none, intra_op, post_op); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2c_inv_decrease); + SetPrecisionAndExectutionMethod(c2c_inv_decrease, intra_op, post_op); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2c_inv_increase); + SetPrecisionAndExectutionMethod(c2c_inv_increase, intra_op, post_op); break; } default: { @@ -673,27 +671,27 @@ void FourierTransformer::Gener // FIXME there is some redundancy in specifying _decomposed and use_thread_method // Note: the only time the non-transposed method should be used is for 1d data. if ( use_thread_method ) { - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2c_inv_decomposed, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2r_decomposed_transposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2r_decomposed_transposed, intra_op, post_op); } else { - SetPrecisionAndExectutionMethod(c2c_inv_none); + SetPrecisionAndExectutionMethod(c2c_inv_none, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2r_none_XY); + SetPrecisionAndExectutionMethod(c2r_none_XY, intra_op, post_op); } break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2c_inv_increase); + SetPrecisionAndExectutionMethod(c2c_inv_increase, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2r_increase); + SetPrecisionAndExectutionMethod(c2r_increase, intra_op, post_op); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2c_inv_decrease); + SetPrecisionAndExectutionMethod(c2c_inv_decrease, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2r_decrease_XY); + SetPrecisionAndExectutionMethod(c2r_decrease_XY, intra_op, post_op); break; } default: { @@ -706,14 +704,14 @@ void FourierTransformer::Gener case 3: { switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(c2c_inv_none_XZ); + SetPrecisionAndExectutionMethod(c2c_inv_none_XZ, intra_op, post_op); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ); - SetPrecisionAndExectutionMethod(c2r_none); + SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ, intra_op, post_op); + SetPrecisionAndExectutionMethod(c2r_none, intra_op, post_op); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY); + SetPrecisionAndExectutionMethod(r2c_increase_XY, intra_op, post_op); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
// SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ); break; @@ -734,7 +732,8 @@ void FourierTransformer::Gener template template -void FourierTransformer::Generic_Fwd_Image_Inv(PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +typename std::enable_if_t( )> +FourierTransformer::Generic_Fwd_Image_Inv(PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // Set the member pointer to the passed pointer SetDimensions(DimensionCheckType::FwdTransform); @@ -747,7 +746,7 @@ void FourierTransformer::Gener case 2: { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XY, true); + SetPrecisionAndExectutionMethod(r2c_none_XY, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { MyFFTRunTimeAssertTrue(false, "2D FFT generic lambda no change/nochange not yet supported"); @@ -758,8 +757,8 @@ void FourierTransformer::Gener break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(xcorr_fwd_none_inv_decrease, true); - SetPrecisionAndExectutionMethod(c2r_decrease_XY, false); + SetPrecisionAndExectutionMethod(xcorr_fwd_none_inv_decrease, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_decrease_XY, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -770,11 +769,11 @@ void FourierTransformer::Gener break; } // case fwd no change case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XY, true); + SetPrecisionAndExectutionMethod(r2c_increase_XY, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, true, pre_op_functor, intra_op_functor, post_op_functor); - SetPrecisionAndExectutionMethod(c2r_none_XY, false); + SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_none_XY, pre_op_functor, intra_op_functor, post_op_functor); transform_stage_completed = TransformStageCompleted::inv; break; } @@ -801,11 +800,11 @@ void FourierTransformer::Gener } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease_XY, true); + SetPrecisionAndExectutionMethod(r2c_decrease_XY, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(xcorr_fwd_increase_inv_none, true); - SetPrecisionAndExectutionMethod(c2r_none_XY, false); // TODO the output could be smaller + SetPrecisionAndExectutionMethod(xcorr_fwd_increase_inv_none, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_none_XY, pre_op_functor, intra_op_functor, post_op_functor); transform_stage_completed = TransformStageCompleted::inv; break; } @@ -852,17 +851,17 @@ void FourierTransformer::Gener break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XZ); + SetPrecisionAndExectutionMethod(r2c_increase_XZ, pre_op_functor, intra_op_functor, post_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
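The new declaration of Generic_Fwd_Image_Inv above returns typename std::enable_if_t<...> (the angle-bracket arguments have been lost in this paste), which removes the function from consideration unless the compile-time predicate on the functor types holds, instead of failing later inside the body. A self-contained sketch of that shape, with a hypothetical predicate standing in for the real one:

```cuda
#include <type_traits>

// Hypothetical predicate and class, shown only to illustrate the
// enable_if_t-as-return-type pattern; FastFFT's actual predicate differs.
template <typename T>
constexpr bool is_valid_intra_op() {
    return std::is_invocable_v<T, float&, const float&>;
}

struct TransformerSketch {
    template <typename IntraOpType>
    std::enable_if_t<is_valid_intra_op<IntraOpType>(), void>
    FwdImageInv(IntraOpType intra_op) {
        float value = 1.0f;
        intra_op(value, 2.0f); // only well-formed for functors with this call shape
    }
};
```

Calling FwdImageInv with a lambda such as [](float& a, const float& b) { a *= b; } compiles; a functor with the wrong call signature is rejected at overload resolution rather than deep inside the kernel launch code.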
- SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ); + SetPrecisionAndExectutionMethod(c2c_fwd_increase_XYZ, pre_op_functor, intra_op_functor, post_op_functor); switch ( inv_size_change_type ) { case SizeChangeType::no_change: { // TODO: will need a kernel for generic_fwd_increase_op_inv_none_XZ - SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none); + SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, pre_op_functor, intra_op_functor, post_op_functor); // SetPrecisionAndExectutionMethod(c2c_inv_none_XZ); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ); - SetPrecisionAndExectutionMethod(c2r_none); + SetPrecisionAndExectutionMethod(c2c_inv_none_XYZ, pre_op_functor, intra_op_functor, post_op_functor); + SetPrecisionAndExectutionMethod(c2r_none, pre_op_functor, intra_op_functor, post_op_functor); break; } case SizeChangeType::increase: { @@ -1326,8 +1325,8 @@ __global__ void block_fft_kernel_R2C_DECREASE_XY(const ScalarType* __restrict__ // decomposed with conj multiplication -template -__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q) { +template +__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q) { using complex_type = ComplexType; // The data store is non-coalesced, so don't aggregate the data in shared mem. 
@@ -1359,9 +1358,9 @@ __global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ExternalImagePtr_ // C2C with conj multiplication -template +template __launch_bounds__(invFFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, int apparent_Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { // // Initialize the shared memory, assuming everyting matches the input data X size in @@ -1393,9 +1392,9 @@ __launch_bounds__(invFFT::max_threads_per_block) __global__ io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)]); } -template +template __launch_bounds__(invFFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImagePtr_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { // // Initialize the shared memory, assuming everyting matches the input data X size in using complex_type = ComplexType; @@ -1434,8 +1433,8 @@ __launch_bounds__(invFFT::max_threads_per_block) __global__ } // -template -__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImagePtr_t __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, +template +__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ExternalImage_t* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int apparent_Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv) { using complex_type = ComplexType; @@ -1973,44 +1972,48 @@ __global__ void clip_into_real_kernel(InputType* real_values_gpu, template template -void FourierTransformer::SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +typename std::enable_if_t( ))> +FourierTransformer::SetPrecisionAndExectutionMethod(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // For kernels with fwd and inv transforms, we want to not set the direction yet. 
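The *_ConjMul kernels above fuse the forward transform, a conjugate multiplication against image_to_search, and the start of the inverse transform, which is the Fourier-space form of the cross-correlation driven by CrossCorrelate. Their signatures now take const ExternalImage_t* __restrict__, where ExternalImage_t is the pointee type (derived further down with std::remove_pointer_t), so const and __restrict__ apply to the pointed-to data rather than to a pointer typedef. A sketch of the per-element product, with float2 standing in for the library's complex type:

```cuda
// a * conj(b): the correlation product applied between the forward and
// inverse stages. float2 stands in for FastFFT's complex_type here.
__device__ __forceinline__ float2 complex_conj_mul(const float2 a, const float2 b) {
    return make_float2(a.x * b.x + a.y * b.y,
                       a.y * b.x - a.x * b.y);
}
```

Whether the conjugate is taken on the search image or on the transformed data is a library detail; the arithmetic shape of the fused step is the same.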
static const bool is_half = std::is_same_v; static const bool is_float = std::is_same_v; static_assert(is_half || is_float, "FourierTransformer::SetPrecisionAndExectutionMethod: Unsupported ComputeBaseType"); + if constexpr ( FFT_ALGO_t == Generic_Fwd_Image_Inv_FFT ) { + static_assert(IS_IKF_t( ), "FourierTransformer::SetPrecisionAndExectutionMethod: Unsupported IntraOpType"); + } if constexpr ( use_thread_method ) { using FFT = decltype(Thread( ) + Size<32>( ) + Precision( )); - SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetIntraKernelFunctions(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } else { using FFT = decltype(Block( ) + Precision( ) + FFTsPerBlock<1>( )); - SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetIntraKernelFunctions(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } } template template -void FourierTransformer::SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SetIntraKernelFunctions(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { if constexpr ( ! detail::has_any_block_operator::value ) { - // SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + // SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } else { if constexpr ( Rank == 3 ) { - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } else { // TODO: 8192 will fail for sm75 if wanted need some extra logic ... , 8192, 16 - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } } } template template -void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // This provides both a termination point for the recursive version needed for the block transform case as well as the actual function for thread transform with fixed size 32 GetTransformSize(kernel_type); if constexpr ( ! 
detail::has_any_block_operator::value ) { @@ -2018,22 +2021,22 @@ void FourierTransformer::Selec switch ( device_properties.device_arch ) { case 700: { using FFT = decltype(FFT_base( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 750: { using FFT = decltype(FFT_base( ) + SM<750>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 800: { using FFT = decltype(FFT_base( ) + SM<800>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 860: { using FFT = decltype(FFT_base( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -2046,7 +2049,7 @@ void FourierTransformer::Selec template template -void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SelectSizeAndType(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // Use recursion to step through the allowed sizes. GetTransformSize(kernel_type); @@ -2055,24 +2058,24 @@ void FourierTransformer::Selec switch ( device_properties.device_arch ) { case 700: { using FFT = decltype(FFT_base( ) + Size( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 750: { if constexpr ( SizeValue <= 4096 ) { using FFT = decltype(FFT_base( ) + Size( ) + SM<750>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } break; } case 800: { using FFT = decltype(FFT_base( ) + Size( ) + SM<800>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } case 860: { using FFT = decltype(FFT_base( ) + Size( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SetAndLaunchKernel(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -2082,12 +2085,12 @@ void FourierTransformer::Selec } } - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); + SelectSizeAndType(kernel_type, pre_op_functor, intra_op_functor, post_op_functor); } template template -void FourierTransformer::SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType 
intra_op_functor, PostOpType post_op_functor) { +void FourierTransformer::SetAndLaunchKernel(KernelType kernel_type, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { using complex_type = typename FFT_base_arch::value_type; using scalar_type = typename complex_type::value_type; @@ -2130,7 +2133,7 @@ void FourierTransformer::SetAn } // FIXME: this only works if the only intraop functor is conj mul (which it currently is.) - if constexpr ( IS_IKF_t ) { + if constexpr ( IS_IKF_t( ) ) { if constexpr ( FFT_ALGO_t == Generic_Fwd_Image_Inv_FFT ) { MyFFTDebugAssertTrue(d_ptr.image_to_search != nullptr, "SetExternalImagePointer has not been called."); } @@ -2138,7 +2141,7 @@ void FourierTransformer::SetAn MyFFTDebugAssertTrue(d_ptr.image_to_search == nullptr, "SetExternalImagePointer has been called twice."); } } - using ExternalImagePtr_t = decltype(d_ptr.image_to_search); + using ExternalImage_t = std::remove_pointer_t; if ( d_ptr.external_output ) { MyFFTDebugAssertTrue(d_ptr.external_output_complex == nullptr, "SetExternalImagePointer has been called twice."); @@ -2276,20 +2279,20 @@ void FourierTransformer::SetAn if ( swap_real_space_quadrants ) { MyFFTRunTimeAssertTrue(false, "decomposed xcorr with swap real space quadrants is not implemented."); - // cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + // cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); // precheck; - // block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants<< > > - // ( (ExternalImagePtr_t*) image_to_search, (complex_type*) d_ptr.momentum_space_buffer, (complex_type*) d_ptr.momentum_space, LP.mem_offsets, LP.twiddle_in,LP.Q, workspace_fwd, workspace_inv); + // block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants<< > > + // ( (ExternalImage_t*) image_to_search, (complex_type*) d_ptr.momentum_space_buffer, (complex_type*) d_ptr.momentum_space, LP.mem_offsets, LP.twiddle_in,LP.Q, workspace_fwd, workspace_inv); // postcheck; } else { - cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); #if FFT_DEBUG_STAGE > 2 // the image_to_search pointer is set during call to CrossCorrelate, precheck; - thread_fft_kernel_C2C_decomposed_ConjMul<<>>((ExternalImagePtr_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + thread_fft_kernel_C2C_decomposed_ConjMul<<>>((ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); postcheck; is_in_second_buffer_partition = ! 
is_in_second_buffer_partition; #endif @@ -2297,36 +2300,40 @@ void FourierTransformer::SetAn break; } - case c2c_decomposed: { + case c2c_fwd_decomposed: { // TODO: FFT_ALGO_t using FFT_nodir = decltype(FFT_base_arch( ) + Type( )); - LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_decomposed, do_forward_transform); + LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_fwd_decomposed); - if ( do_forward_transform ) { - using FFT = decltype(FFT_nodir( ) + Direction( )); - int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); - CheckSharedMemory(shared_memory, device_properties); - cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + using FFT = decltype(FFT_nodir( ) + Direction( )); + int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); + CheckSharedMemory(shared_memory, device_properties); + cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); #if FFT_DEBUG_STAGE > 2 - precheck; - thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck; - is_in_second_buffer_partition = ! is_in_second_buffer_partition; + precheck; + thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; + is_in_second_buffer_partition = ! is_in_second_buffer_partition; #endif - } - else { - using FFT = decltype(FFT_nodir( ) + Direction( )); - int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); - CheckSharedMemory(shared_memory, device_properties); - cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + break; + } + case c2c_inv_decomposed: { + + using FFT_nodir = decltype(FFT_base_arch( ) + Type( )); + LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_inv_decomposed); + + using FFT = decltype(FFT_nodir( ) + Direction( )); + int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); + CheckSharedMemory(shared_memory, device_properties); + cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); #if FFT_DEBUG_STAGE > 4 - precheck; - thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck; - is_in_second_buffer_partition = ! is_in_second_buffer_partition; + precheck; + thread_fft_kernel_C2C_decomposed<<>>(intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; + is_in_second_buffer_partition = ! is_in_second_buffer_partition; #endif - } + break; } } // switch on (thread) kernel_type @@ -2874,15 +2881,15 @@ void FourierTransformer::SetAn } else { - cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); precheck; // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. 
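The apparent_Q computed just below is the ratio of the padded transform length (size_of<FFT>::value; the angle-bracket arguments are stripped in this paste) to the unpadded line length, as the surrounding comments describe. A worked example with illustrative numbers:

```cuda
// Illustrative numbers only: a 512-point kernel instantiation working on
// lines that were 128 points long before explicit zero padding.
constexpr int padded_transform_size = 512;  // size_of<FFT>::value
constexpr int unpadded_line_length  = 128;  // e.g. fwd_dims_in.y or inv_dims_out.y
constexpr int apparent_Q            = padded_transform_size / unpadded_line_length;
static_assert(apparent_Q == 4, "each input line fills 1/Q of the transform length");
```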
// Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / fwd_dims_in.y; - block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul<<>>( - (ExternalImagePtr_t)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv); + block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul<<>>( + (ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv); postcheck; } is_in_second_buffer_partition = ! is_in_second_buffer_partition; @@ -2926,13 +2933,13 @@ void FourierTransformer::SetAn // postcheck; } else { - cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / inv_dims_out.y; precheck; - block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul<<>>( - (ExternalImagePtr_t)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, apparent_Q, workspace_fwd, workspace_inv); + block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul<<>>( + (ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, LP.twiddle_in, apparent_Q, workspace_fwd, workspace_inv); postcheck; } is_in_second_buffer_partition = ! is_in_second_buffer_partition; @@ -2971,10 +2978,10 @@ void FourierTransformer::SetAn // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / fwd_dims_in.y; - cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); + cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); precheck; - block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE<<>>( - (ExternalImagePtr_t)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv, pre_op_functor, intra_op_functor, post_op_functor); + block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE<<>>( + (ExternalImage_t*)d_ptr.image_to_search, intra_complex_input, intra_complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv, pre_op_functor, intra_op_functor, post_op_functor); postcheck; // FIXME: this is set in the public method calls for other functions. Since it will be changed to 0-7 to match FFT_DEBUG_STAGE, fix it then. 
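Each of these block-kernel launches follows the same CUDA pattern: compute the dynamic shared-memory requirement, raise the kernel's limit with cudaFuncSetAttribute (required for anything above the default 48 KiB opt-in), then pass the byte count as the third launch-configuration argument. A stand-alone sketch of that pattern, independent of the FastFFT kernels:

```cuda
#include <cuda_runtime.h>

// Minimal dynamic shared-memory example (assumes n_threads <= 1024 and a
// device that supports the requested size).
__global__ void staging_kernel_sketch(float* data) {
    extern __shared__ float smem[]; // sized by the 3rd <<< >>> argument
    smem[threadIdx.x] = data[threadIdx.x];
    __syncthreads();
    data[threadIdx.x] = smem[threadIdx.x];
}

void launch_sketch(float* d_data, const int n_threads, cudaStream_t stream) {
    const int shared_bytes = n_threads * int(sizeof(float));
    // Opt in to more than the default dynamic shared-memory limit.
    cudaFuncSetAttribute((void*)staging_kernel_sketch,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes);
    staging_kernel_sketch<<<1, n_threads, shared_bytes, stream>>>(d_data);
}
```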
@@ -3099,7 +3106,14 @@ void FourierTransformer::GetTr case r2c_decomposed_transposed: transform_size.N = fwd_dims_in.x; break; - case c2c_decomposed: + case c2c_fwd_decomposed: + // FIXME fwd vs inv + if ( fwd_dims_in.y == 1 ) + transform_size.N = fwd_dims_in.x; + else + transform_size.N = fwd_dims_in.y; + break; + case c2c_inv_decomposed: // FIXME fwd vs inv if ( fwd_dims_in.y == 1 ) transform_size.N = fwd_dims_in.x; @@ -3133,7 +3147,7 @@ void FourierTransformer::GetTr } // end GetTransformSize_thread function template -LaunchParams FourierTransformer::SetLaunchParameters(const int& ept, KernelType kernel_type, bool do_forward_transform) { +LaunchParams FourierTransformer::SetLaunchParameters(const int& ept, KernelType kernel_type) { /* Assuming: 1) r2c/c2r imply forward/inverse transform. diff --git a/src/tests/constant_image_test.cu b/src/tests/constant_image_test.cu index a3ca709..4650215 100644 --- a/src/tests/constant_image_test.cu +++ b/src/tests/constant_image_test.cu @@ -167,11 +167,11 @@ bool const_image_test(std::vector& size) { FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 2.0f); if constexpr ( use_fp16_io_buffers ) { - FT_fp16.SetInputPointer(output_buffer_fp16); + FT_fp16.SetInputPointer(output_buffer_fp16); // FIXME: this has no effect for this kernel which is using intra_complex_ptr __half* dummy_ptr = nullptr; // FIXME: - FT_fp16.SetOutputPointer(dummy_ptr); + FT_fp16.SetOutputPointer(output_buffer_fp16); FT_fp16.InvFFT( ); - FT_fp16.CopyDeviceToHostAndSynchronize(reinterpret_cast<__half*>(host_output.real_values), true); + cudaErr(cudaMemcpyAsync(host_output.real_values, output_buffer_fp16, sizeof(__half) * host_output.real_memory_allocated, cudaMemcpyDeviceToHost, cudaStreamPerThread)); host_output.data_is_fp16 = true; // we need to over-ride this as we already convertted but are overwriting. host_output.ConvertFP16ToFP32( ); } @@ -193,7 +193,7 @@ bool const_image_test(std::vector& size) { all_passed = false; FastFFT_roundTrip_passed[n] = false; } - std::cerr << "sum" << sum << "full sum " << full_sum << std::endl; + std::cerr << "sum" << sum << " full sum " << full_sum << std::endl; MyFFTDebugAssertTestTrue(sum == full_sum, "FastFFT constant image round trip for size " + std::to_string(dims_in.x)); } // loop over sizes
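For the fp16 round trip in constant_image_test.cu, the revised test copies the __half output buffer straight back on cudaStreamPerThread and then widens to fp32 on the host (in the test itself this happens in place via data_is_fp16 and ConvertFP16ToFP32). A simplified, self-contained version of that readback, with placeholder names rather than the test's buffers:

```cuda
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <vector>

// Simplified sketch of the fp16 readback path used by the revised test.
std::vector<float> fetch_fp16_result(const __half* d_output, const size_t n_elements) {
    std::vector<__half> packed(n_elements);
    cudaMemcpyAsync(packed.data(), d_output, sizeof(__half) * n_elements,
                    cudaMemcpyDeviceToHost, cudaStreamPerThread);
    cudaStreamSynchronize(cudaStreamPerThread); // the copy must finish before the host reads

    std::vector<float> as_float(n_elements);
    for ( size_t i = 0; i < n_elements; ++i )
        as_float[i] = __half2float(packed[i]);
    return as_float;
}
```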