diff --git a/.gitignore b/.gitignore index e5ca254..cb49002 100644 --- a/.gitignore +++ b/.gitignore @@ -45,4 +45,5 @@ build/* !build/build include/nvidia-mathdx-* +include/cufftdx/ diff --git a/.vscode_shared/BenHimes/c_cpp_properties.json b/.vscode_shared/BenHimes/c_cpp_properties.json index b5691db..e0cc884 100644 --- a/.vscode_shared/BenHimes/c_cpp_properties.json +++ b/.vscode_shared/BenHimes/c_cpp_properties.json @@ -4,7 +4,7 @@ "name": "Linux", "includePath": [ "${workspaceFolder}/**", - "/groups/himesb/git/gemmi/include/**" + "/usr/local/cuda/include/**" ], "defines": [ "_FILE_OFFSET_BITS=64", @@ -12,7 +12,6 @@ "__WXGTK__", "DEBUG" ], - "compilerPath": "/groups/himesb/intel/compilers_and_libraries_2020.2.254/linux/bin/intel64/icpc", "cStandard": "c17", "cppStandard": "c++17", "browse": { @@ -21,4 +20,4 @@ } ], "version": 4 -} +} \ No newline at end of file diff --git a/.vscode_shared/BenHimes/settings.json b/.vscode_shared/BenHimes/settings.json index 98c1f59..5c7d1d8 100644 --- a/.vscode_shared/BenHimes/settings.json +++ b/.vscode_shared/BenHimes/settings.json @@ -1,26 +1,6 @@ { "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "C_Cpp.errorSquiggles": "Enabled", - "cmake.configureOnOpen": false, - "cmake.sourceDirectory": "${workspaceFolder}", - "cmake.buildDirectory": "${workspaceFolder}/build/${buildKit}/${buildType}", - "cmake.skipConfigureIfCachePresent": false, - "cmake.configureSettings": { - "BUILD_EXPERIMENTAL_FEATURES":true, - "BUILD_GPU_FEATURES":true, - "BUILD_STATIC_BINARIES":true, - "BUILD_OpenMP":true, - "cisTEM_CUDA_TOOLKIT_PATH":"/groups/cryoadmin/software/CUDA-TOOLKIT/cuda_11.3.1/" - - }, - "cmake.configureEnvironment": { - "WX_CONFIG":"/admin/software/wxWidgets3_static_GNU/bin/wx-config", - "CUDACXX":"/groups/cryoadmin/software/CUDA-TOOLKIT/cuda_11.3.1/bin/nvcc", - "CUDAARCHS":"70;75", - "CUDAFLAGS":" --default-stream per-thread -m64 --use_fast_math -Xptxas --warn-on-local-memory-usage,--warn-on-spills, --generate-line-info -Xcompiler= -DGPU -DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_DLFCN_H=1" - - - }, "C_Cpp.dimInactiveRegions": true, "files.associations": { "string": "cpp", @@ -74,11 +54,30 @@ "typeinfo": "cpp", "bit": "cpp", "any": "cpp", - "map": "cpp" + "map": "cpp", + "bitset": "cpp", + "__hash_table": "cpp", + "__split_buffer": "cpp", + "queue": "cpp", + "stack": "cpp", + "codecvt": "cpp", + "condition_variable": "cpp", + "iomanip": "cpp", + "mutex": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "__nullptr": "cpp", + "compare": "cpp", + "concepts": "cpp", + "set": "cpp", + "numbers": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "complex": "cpp", + "unordered_set": "cpp" }, "C_Cpp.clang_format_path": "/usr/bin/clang-format-14", "editor.formatOnSave": true, "DockerRun.DisableDockerrc": true, "html.format.endWithNewline": true -} - +} \ No newline at end of file diff --git a/.vscode_shared/BenHimes/tasks.json b/.vscode_shared/BenHimes/tasks.json index 8a3f786..bc2e846 100644 --- a/.vscode_shared/BenHimes/tasks.json +++ b/.vscode_shared/BenHimes/tasks.json @@ -1,101 +1,101 @@ { - // See https://go.microsoft.com/fwlink/?LinkId=733558 - // for the documentation about the tasks.json format - "version": "2.0.0", - "options": { - "env": { - "cuda_dir": "/groups/cryoadmin/software/CUDA-TOOLKIT/cuda_11.3.1", - "wx_dir":"/groups/cryoadmin/software/WX/wx_static_3.05_", - "build_dir": 
"${workspaceFolder}/build", - "compile_cores": "48" - } - }, - "tasks": [ - { - "label": "Book_build", - "type": "shell", - "command": "/groups/himesb/.FastFFTDocs/bin/jupyter-book build --all ./docs && firefox ./docs/_build/html/index.html" - }, - { - "label": "Book_publish", - "type": "shell", - "command": "cd docs && /groups/himesb/.FastFFTDocs/bin/ghp-import -n -p -f _build/html ; cd .." - }, - { - "label": "echo", - "type": "shell", - "command": "echo --with-wx-config=${wx_dir}" - }, - { - "label": "CONFIG intel,gpu,debug", - "type": "shell", - "command": "mkdir -p ${build_dir}/Intel-gpu-debug && cd ${build_dir}/Intel-gpu-debug && CC=icc CXX=icpc ../../configure --enable-debugmode --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" - }, - { - "label": "BUILD intel,gpu,debug", - "type": "shell", - "command": "cd ${build_dir}/Intel-gpu-debug && make -j${compile_cores}" - }, - { - "label": "CONFIG intel,gpu,debug,noexp", - "type": "shell", - "command": "mkdir -p ${build_dir}/Intel-gpu-debug-noexp && cd ${build_dir}/Intel-gpu-debug-noexp && CC=icc CXX=icpc ../../configure --enable-debugmode --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" - }, - { - "label": "BUILD intel,gpu,debug,noexp", - "type": "shell", - "command": "cd ${build_dir}/Intel-gpu-debug-noexp && make -j${compile_cores}" - }, - { - "label": "CONFIG intel,gpu,debug,rotate", - "type": "shell", - "command": "mkdir -p ${build_dir}/Intel-gpu-debug-rotate && cd ${build_dir}/Intel-gpu-debug-rotate && CC=icc CXX=icpc ../../configure --enable-rotated-tm --enable-debugmode --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" - }, - { - "label": "BUILD intel,gpu,debug,rotate", - "type": "shell", - "command": "cd ${build_dir}/Intel-gpu-debug-rotate && make -j${compile_cores}" - }, - { - "label": "CONFIG intel,gpu", - "type": "shell", - "command": "mkdir -p ${build_dir}/Intel-gpu && cd ${build_dir}/Intel-gpu && CC=icc CXX=icpc ../../configure --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" - }, - { - "label": "BUILD intel,gpu", - "type": "shell", - "command": "cd ${build_dir}/Intel-gpu && make -j${compile_cores}" - }, - { - "label": "CONFIG intel,gpu,samples,debug", - "type": "shell", - "command": "mkdir -p ${build_dir}/Intel-gpu-samples-debug && cd ${build_dir}/Intel-gpu-samples-debug && CC=icc CXX=icpc ../../configure --enable-samples --enable-debugmode --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" - }, - { - "label": "BUILD intel,gpu,samples,debug", - "type": "shell", - "command": "cd ${build_dir}/Intel-gpu-samples-debug && make -j${compile_cores}" - }, - { - "label": "CONFIG intel,gpu,device-lto", - "type": "shell", - "command": "mkdir -p ${build_dir}/Intel-gpu-lto && cd ${build_dir}/Intel-gpu-lto && CC=icc CXX=icpc ../../configure --with-oldest-gpu-arch=80 --with-target-gpu-arch=80 --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" - }, - { - "label": "BUILD intel,gpu,device-lto", - "type": "shell", - "command": "cd ${build_dir}/Intel-gpu-lto && make -j${compile_cores}" - }, - { - "label": "CONFIG GNU ,gpu", - "type": "shell", - "command": "mkdir -p ${build_dir}/GNU-gpu && cd 
${build_dir}/GNU-gpu && CC=gcc CXX=g++ ../../configure --disable-mkl --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}GNU/bin/wx-config" - }, - { - "label": "BUILD GNU,gpu", - "type": "shell", - "command": "cd ${build_dir}/GNU-gpu && make -j${compile_cores}", - "problemMatcher": [] - } - ] -} + // See https://go.microsoft.com/fwlink/?LinkId=733558 + // for the documentation about the tasks.json format + "version": "2.0.0", + "options": { + "env": { + "cuda_dir": "/groups/cryoadmin/software/CUDA-TOOLKIT/cuda_11.3.1", + "wx_dir": "/groups/cryoadmin/software/WX/wx_static_3.05_", + "build_dir": "${workspaceFolder}/build", + "compile_cores": "48" + } + }, + "tasks": [ + { + "label": "Book_build", + "type": "shell", + "command": "${HOME}/.FastFFTDocs/bin/jupyter-book build --all ./docs && firefox ./docs/_build/html/index.html" + }, + { + "label": "Book_publish", + "type": "shell", + "command": "cd docs && ${HOME}/.FastFFTDocs/bin/ghp-import -n -p -f _build/html ; cd .." + }, + { + "label": "echo", + "type": "shell", + "command": "echo --with-wx-config=${wx_dir}" + }, + { + "label": "CONFIG intel,gpu,debug", + "type": "shell", + "command": "mkdir -p ${build_dir}/Intel-gpu-debug && cd ${build_dir}/Intel-gpu-debug && CC=icc CXX=icpc ../../configure --enable-debugmode --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" + }, + { + "label": "BUILD intel,gpu,debug", + "type": "shell", + "command": "cd ${build_dir}/Intel-gpu-debug && make -j${compile_cores}" + }, + { + "label": "CONFIG intel,gpu,debug,noexp", + "type": "shell", + "command": "mkdir -p ${build_dir}/Intel-gpu-debug-noexp && cd ${build_dir}/Intel-gpu-debug-noexp && CC=icc CXX=icpc ../../configure --enable-debugmode --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" + }, + { + "label": "BUILD intel,gpu,debug,noexp", + "type": "shell", + "command": "cd ${build_dir}/Intel-gpu-debug-noexp && make -j${compile_cores}" + }, + { + "label": "CONFIG intel,gpu,debug,rotate", + "type": "shell", + "command": "mkdir -p ${build_dir}/Intel-gpu-debug-rotate && cd ${build_dir}/Intel-gpu-debug-rotate && CC=icc CXX=icpc ../../configure --enable-rotated-tm --enable-debugmode --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" + }, + { + "label": "BUILD intel,gpu,debug,rotate", + "type": "shell", + "command": "cd ${build_dir}/Intel-gpu-debug-rotate && make -j${compile_cores}" + }, + { + "label": "CONFIG intel,gpu", + "type": "shell", + "command": "mkdir -p ${build_dir}/Intel-gpu && cd ${build_dir}/Intel-gpu && CC=icc CXX=icpc ../../configure --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" + }, + { + "label": "BUILD intel,gpu", + "type": "shell", + "command": "cd ${build_dir}/Intel-gpu && make -j${compile_cores}" + }, + { + "label": "CONFIG intel,gpu,samples,debug", + "type": "shell", + "command": "mkdir -p ${build_dir}/Intel-gpu-samples-debug && cd ${build_dir}/Intel-gpu-samples-debug && CC=icc CXX=icpc ../../configure --enable-samples --enable-debugmode --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" + }, + { + "label": "BUILD intel,gpu,samples,debug", + "type": "shell", + "command": "cd ${build_dir}/Intel-gpu-samples-debug && make 
-j${compile_cores}" + }, + { + "label": "CONFIG intel,gpu,device-lto", + "type": "shell", + "command": "mkdir -p ${build_dir}/Intel-gpu-lto && cd ${build_dir}/Intel-gpu-lto && CC=icc CXX=icpc ../../configure --with-oldest-gpu-arch=80 --with-target-gpu-arch=80 --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}INTEL/bin/wx-config" + }, + { + "label": "BUILD intel,gpu,device-lto", + "type": "shell", + "command": "cd ${build_dir}/Intel-gpu-lto && make -j${compile_cores}" + }, + { + "label": "CONFIG GNU ,gpu", + "type": "shell", + "command": "mkdir -p ${build_dir}/GNU-gpu && cd ${build_dir}/GNU-gpu && CC=gcc CXX=g++ ../../configure --disable-mkl --enable-experimental --with-cuda=${cuda_dir} --enable-staticmode --enable-openmp --with-wx-config=${wx_dir}GNU/bin/wx-config" + }, + { + "label": "BUILD GNU,gpu", + "type": "shell", + "command": "cd ${build_dir}/GNU-gpu && make -j${compile_cores}", + "problemMatcher": [] + } + ] +} \ No newline at end of file diff --git a/build/Makefile b/build/Makefile index f99926a..6c46363 100644 --- a/build/Makefile +++ b/build/Makefile @@ -1,4 +1,4 @@ -# TODO: nvc++ ? + NVCC=nvcc # TODO: test with gcc and clang NVCC_FLAGS=-ccbin=g++ -t 8 @@ -14,26 +14,24 @@ NVCC_FLAGS+=-DCUFFTDX_DISABLE_RUNTIME_ASSERTS # Gencode arguments, only supporting Volta or newer # SMS ?= 70 75 80 86 # In initial dev, only compile for 70 or 86 depending on which workstation I'm on, b/c it is faster. -SMS ?= 70 86 - -# Need to -# GENCODE_FLAGS := -gencode arch=compute_86,code=lto_86 -# GENCODE_FLAGS := -gencode arch=compute_86,code=lto_86 +SMS ?= 86 -ifeq ($(GENCODE_FLAGS),) -# Generate SASS code for each SM architecture listed in $(SMS) -$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) +ifeq (${FastFFT_lto},1) +$(info "Building with LTO") +GENCODE_FLAGS := -gencode arch=compute_86,code=lto_86 +else +$(info "Building without LTO") +GENCODE_FLAGS := -gencode arch=compute_86,code=sm_86 +endif -# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility -HIGHEST_SM := $(lastword $(sort $(SMS))) -ifneq ($(HIGHEST_SM),) -GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=sm_$(HIGHEST_SM) -endif -endif # GENCODE_FLAGS +# TODO: +# Lets first clean this up to have several targets, one for the library, one for the tests and move the helper Image object to the tests. +# Lets then get set lto as an option using an environment (or cli) variable. Print this out and alter the binary name. +# For now, just build with 86 for speed. # FIXME: this is set in my own bashrc. Until cufftdx comes out of early release, this is fine. -CUFFTDX_INCLUDE_DIR := $(FastFFT_cufftdx_dir)/include +CUFFTDX_INCLUDE_DIR := ../include $(info $$CUFFTDX_INCLUDE_DIR is [${CUFFTDX_INCLUDE_DIR}]) # TODO: check on cuda version (building with 11.3.1 r/n) @@ -41,9 +39,7 @@ CUDA_BIN_DIR=$(shell dirname `which $(NVCC)`) CUDA_INCLUDE_DIR=$(CUDA_BIN_DIR)/../include -SRCS=test.cu -TARGETS=test - +DEBUG_FLAGS := # Debug level determines various asserts and print macros defined in FastFFT.cuh These should only be set when building tests and developing. ifeq (${FFT_DEBUG_LEVEL},) debug_level=0 @@ -52,7 +48,7 @@ else debug_level=${FFT_DEBUG_LEVEL} $(info $$debug_level is [${FFT_DEBUG_LEVEL}]) endif -NVCC_FLAGS+=-DFFT_DEBUG_LEVEL=${debug_level} +DEBUG_FLAGS+=-DFFT_DEBUG_LEVEL=${debug_level} # For testing/debugging it is convenient to execute and have print functions for partial transforms. 
# These will go directly in the kernels and also in the helper Image.cuh definitions for PrintArray. @@ -61,17 +57,14 @@ NVCC_FLAGS+=-DFFT_DEBUG_LEVEL=${debug_level} # 4 intermediate ops, like conj multiplication # Inv 5, 6, 7 ( original y, z, x) -ifeq (${DEBUG_STAGE},) +ifeq (${FFT_DEBUG_STAGE},) debug_stage=8 -$(info $$DEBUG_STAGE is not defined, setting to 8) +$(info $$FFT_DEBUG_STAGE is not defined, setting to 8) else -debug_stage=${DEBUG_STAGE} -$(info $$debug_stage is [${DEBUG_STAGE}]) -endif - -ifneq (${HEAVYERRORCHECKING_FFT},) -NVCC_FLAGS+=-DHEAVYERRORCHECKING_FFT +debug_stage=${FFT_DEBUG_STAGE} +$(info $$debug_stage is [${FFT_DEBUG_STAGE}]) endif +DEBUG_FLAGS+=-DFFT_DEBUG_STAGE=${debug_stage} # Track what the last build did so that the pre-commit hook can skip re-building if there is no debug included. remove_debug_file=0 @@ -81,41 +74,27 @@ ifeq (${debug_stage}, 8) endif else remove_debug_file=1 -ifeq (${HEAVYERRORCHECKING_FFT},) -# If HEAVYERRORCHECKING_FFT is not already asked for, then add it anytime debug_stage < 8 (partial FFTs) - NVCC_FLAGS+=-DHEAVYERRORCHECKING_FFT endif -endif - -# Assuming make is being run from build -TESTS_BUILD_DIR := ./tests -EXAMPLE_BUILD_DIR := ./examples -BENCHMARK_BUILD_DIR := ./benchmarks - -SRC_DIR := ../src - -#TEST_SRCS=$(shell find $(SRC_DIR)/tests -name '*.cu' -or -name '*.cpp') -# Because I am linking test.cu.cpp -> test.cu as a cheap trick to get auto formatting in vs code, we can't include cpp in the src list -TEST_SRCS=$(shell find $(SRC_DIR)/tests -name '*.cu') -EXAMPLE_SRCS=$(shell find $(SRC_DIR)/cpp -name '*.cu' -or -name '*.cpp') -BENCHMARK_SRCS=$(shell find $(SRC_DIR)/benchmarks -name '*.cu' -or -name '*.cpp') +ifeq (${FastFFT_sync_checking},1) +# If HEAVYERRORCHECKING_FFT is not already asked for, then add it anytime debug_stage < 8 (partial FFTs) + DEBUG_FLAGS+=-DHEAVYERRORCHECKING_FFT +endif -TEST_TARGETS=$(patsubst $(SRC_DIR)/tests/%.cu,$(TESTS_BUILD_DIR)/%,$(TEST_SRCS)) -EXAMPLE_TARGETS=$(patsubst %.cpp,%,$(EXAMPLE_SRCS)) -BENCHMARK_TARGETS=$(patsubst %.cu,%,$(BENCHMARK_SRCS)) -TEST_OBJS=$(patsubst %,%.o,$(TEST_TARGETS)) -EXAMPLE_OBJS=$(patsubst %,%.o,$(EXAMPLE_TARGETS)) -BENCHMARK_OBJS=$(patsubst %,%.o,$(BENCHMARK_TARGETS)) +# External libraries used for testing +EXTERNAL_LIBS= -lfftw3f -lcufft_static -lculibos -lcudart_static -lrt -printit: - echo This is $(TEST_SRCS) - echo $(TEST_TARGETS) - echo $(TEST_OBJS) +TEST_BUILD_DIR=tests +TEST_SRC_DIR=../src/tests +# Get all the test source files and remove cu extension +TEST_TARGETS=$(patsubst %.cu,$(TEST_BUILD_DIR)/%,$(notdir $(wildcard $(TEST_SRC_DIR)/*.cu))) +TEST_DEPS=$(wildcard $(TEST_SRC_DIR)/*.cuh) +$(info $$TEST_TARGETS is [${TEST_TARGETS}]) +$(info ) all: $(TEST_TARGETS) ifeq (${remove_debug_file}, 0) touch built_without_debug.txt @@ -123,63 +102,54 @@ else rm -f built_without_debug.txt endif -$(TEST_TARGETS): $(TEST_OBJS) - $(NVCC) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $< -lfftw3f -lcufft_static -lculibos -lcudart_static -lrt - -$(TEST_OBJS): $(TEST_SRCS) - mkdir -p $(TESTS_BUILD_DIR) - $(NVCC) -dc $(NVCC_FLAGS) $(GENCODE_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) -DDEBUG_FFT_STAGE=${debug_stage} -o $@ $< +$(TEST_BUILD_DIR)/%: $(TEST_BUILD_DIR)/%.o lib/Image.o libFastFFT.a + $(NVCC) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $< lib/Image.o -L$(PWD) -lFastFFT $(EXTERNAL_LIBS) +# $(NVCC) $(NVCC_FLAGS) -dlto -arch=sm_86 -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) -o $@ $^ -lFastFFT $(EXTERNAL_LIBS) -# .PHONY: all examples tests benchmarks clean -# all: build +$(TEST_BUILD_DIR)/%.o: 
$(TEST_SRC_DIR)/%.cu $(TEST_DEPS) + @mkdir -p tests + $(NVCC) -dc $(NVCC_FLAGS) $(GENCODE_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) -o $@ $< -# build: simpleCUFFT_callback +# tests/test1: tests/linkedTest.o tests/test.o lib/Image.o libFastFFT.a +# $(NVCC) $(NVCC_FLAGS) $(GENCODE_FLAGS) -o $@ $^ -L$(PWD) -lFastFFT $(EXTERNAL_LIBS) +# # $(NVCC) $(NVCC_FLAGS) -dlto -arch=sm_86 -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) -o $@ $^ -lFastFFT $(EXTERNAL_LIBS) -# check.deps: -# ifeq ($(SAMPLE_ENABLED),0) -# @echo "Sample will be waived due to the above missing dependencies" -# else -# @echo "Sample is ready - all dependencies have been met" -# endif +# tests/linkedTest.o: tests/test.o lib/Image.o +# $(NVCC) $(NVCC_FLAGS) $(GENCODE_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) --device-link -o $@ $^ +# # $(NVCC) $(NVCC_FLAGS) -dlto -arch=sm_86 -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) --device-link -o $@ $^ -# simpleCUFFT_callback.o:simpleCUFFT_callback.cu -# $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< +# tests/test.o: ../src/tests/test.cu +# @mkdir -p tests +# $(NVCC) -dc $(NVCC_FLAGS) $(GENCODE_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) -o $@ $< -# simpleCUFFT_callback: simpleCUFFT_callback.o -# $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) -# $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) -# $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) +lib/Image.o: ../src/fastfft/Image.cu ../src/fastfft/Image.cuh + @mkdir -p lib + $(NVCC) -dc $(NVCC_FLAGS) $(GENCODE_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) -o $@ $< + -# all: -# $(NVCC) -dc $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) -DDEBUG_FFT_STAGE=${debug_stage} -o test.o -c ../cpp/test.cu -# $(NVCC) $(NVCC_FLAGS) -o test.app test.o -lfftw3f -lcufft_static -lculibos -lcudart_static -lrt +# Build the library, using static linking for now to make sure we're set to go with cisTEM +libFastFFT.a: lib/FastFFT.o + $(NVCC) $(NVCC_FLAGS) --lib $(GENCODE_FLAGS) $(DEBUG_FLAGS) -o $@ $^ $(EXTERNAL_LIBS) -# SRCS=$(filter-out nvrtc_*.cu, $(wildcard *.cu)) -# TARGETS=$(patsubst %.cu,%,$(SRCS)) - -# NVRTC_SRCS=$(wildcard nvrtc_*.cu) -# NVRTC_TARGETS=$(patsubst %.cu,%,$(NVRTC_SRCS)) - -# $(TARGETS): %: %.cu -# $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) - -# $(NVRTC_TARGETS): %: %.cu -# $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(NVRTC_DEFINES) -lnvrtc -lcuda +lib/FastFFT.o: ../src/fastfft/FastFFT.cu ../include/FastFFT.cuh + @mkdir -p lib + $(NVCC) -dc $(NVCC_FLAGS) $(GENCODE_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(DEBUG_FLAGS) -o $@ $< + # remove all but the executables. +.PHONY: clean clean: - rm -f $(TESTS_BUILD_DIR)/*.* + rm -f tests/*.o lib/*.o *.a *.o rm -f $(TEST_TARGETS) # clean out everything. -clobber: -ifneq ($(TESTS_BUILD_DIR),) - rm -f $(TESTS_BUILD_DIR)/* - rmdir $(TESTS_BUILD_DIR) -endif +.PHONY: clobber +clobber: clean + rm -f libFastFFT.a + .DEFAULT_GOAL := all diff --git a/docs/_docs/references/development_tools.md b/docs/_docs/references/development_tools.md index 8132b28..56d25e6 100644 --- a/docs/_docs/references/development_tools.md +++ b/docs/_docs/references/development_tools.md @@ -10,7 +10,6 @@ In src/FastFFT.cuh the HEAVYERRORCHECKING_FFT should be defined whenever definin Without these guards, it can be very difficult to know where errors are actually coming from! 
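As a hedged illustration (not part of this diff), the intended usage is to bracket every kernel launch with the guards and to wrap runtime API calls in cudaErr(); the kernel, buffer names, and include path below are placeholders, and the macro definitions follow in the next code block:

```c++
// Illustrative usage sketch only: MyTestKernel and its launch configuration are hypothetical.
#include <cuda_runtime.h>
#include "FastFFT.cuh" // assumed to provide precheck / postcheck / cudaErr

__global__ void MyTestKernel(float* data) { data[threadIdx.x] *= 2.0f; }

void DoubleOnDevice(float* d_data, const float* h_data) {
    cudaErr(cudaMemcpyAsync(d_data, h_data, 32 * sizeof(float),
                            cudaMemcpyHostToDevice, cudaStreamPerThread));
    precheck;
    MyTestKernel<<<1, 32, 0, cudaStreamPerThread>>>(d_data);
    postcheck;
}
```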
``` - ```c++ // When defined Turns on synchronization based checking for all FFT kernels as well as cudaErr macros #define HEAVYERRORCHECKING_FFT @@ -24,8 +23,8 @@ Without these guards, it can be very difficult to know where errors are actually #define postcheck #define precheck #else -#define postcheck { cudaErr(cudaPeekAtLastError()); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error); }; -#define precheck { cudaErr(cudaGetLastError()); } +#define postcheck { cudaErr(cudaPeekAtLastError()); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error) } +#define precheck { cudaErr(cudaGetLastError()) } #endif inline void checkCudaErr(cudaError_t err) @@ -37,6 +36,7 @@ inline void checkCudaErr(cudaError_t err) } }; ``` + ## Cuda error checking (API) calls to any cuda or cuda library API should be enclosed with cudaErr(), which will check for errors and print them to std::cerr. @@ -46,7 +46,7 @@ calls to any cuda or cuda library API should be enclosed with cudaErr(), which w Old school, yes. Effective, also yes. Print out a message, what line and file the statement came from. ```c++ -#define MyFFTPrintWithDetails(...) {std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;} +#define MyFFTPrintWithDetails(...) {std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;} ``` @@ -62,8 +62,7 @@ As much as possible is statically checked. This is not always possible, so runti ## Partial transform checkpoints -Because the size and order of the data are changed inbetween different steps of a multi-dimensional trasnform, isolating bugs requires some method to do so. Right now, this is achieved by setting the debug_stage manually in build/Makefile. - +Because the size and order of the data are changed inbetween different steps of a multi-dimensional trasnform, isolating bugs requires some method to do so. Right now, this is achieved by setting the debug_stage manually in build/Makefile. ```c++ # For testing/debugging it is convenient to execute and have print functions for partial transforms. 
@@ -81,6 +80,7 @@ debug_stage=3 ## Other printing based debug tools FastFFT::PrintVector() + - print vectors of different type and number of elements - can be used directly, however, it is generally used indirectly via either @@ -104,7 +104,6 @@ FastFFT::PrintState() std::cout << "is_in_memory_host_pointer " << is_in_memory_host_pointer << std::endl; std::cout << "is_in_memory_device_pointer " << is_in_memory_device_pointer << std::endl; std::cout << "is_in_buffer_memory " << is_in_buffer_memory << std::endl; - std::cout << "is_host_memory_pinned " << is_host_memory_pinned << std::endl; std::cout << "is_fftw_padded_input " << is_fftw_padded_input << std::endl; std::cout << "is_fftw_padded_output " << is_fftw_padded_output << std::endl; std::cout << "is_real_valued_input " << is_real_valued_input << std::endl; @@ -139,7 +138,6 @@ FastFFT::PrintState() or - FastFFT::PrintLaunchParameters() ```c++ @@ -158,4 +156,4 @@ FastFFT::PrintLaunchParameters() std::cout << " physical_x_output: " << LP.mem_offsets.physical_x_output << std::endl; }; -``` \ No newline at end of file +``` diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh deleted file mode 120000 index e4d9bec..0000000 --- a/include/FastFFT.cuh +++ /dev/null @@ -1 +0,0 @@ -FastFFT.cuh.h \ No newline at end of file diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh new file mode 100644 index 0000000..e8deafb --- /dev/null +++ b/include/FastFFT.cuh @@ -0,0 +1,1507 @@ +// Utilites for FastFFT.cu that we don't need the host to know about (FastFFT.h) +#include "FastFFT.h" + +#ifndef Fast_FFT_cuh_ +#define Fast_FFT_cuh_ + +#include +#include "cufftdx/include/cufftdx.hpp" + +// clang-format off + +// “This software contains source code provided by NVIDIA Corporation.” Much of it is modfied as noted at relevant function definitions. + +// When defined Turns on synchronization based checking for all FFT kernels as well as cudaErr macros +// Defined in the Makefile when DEBUG_STAGE is not equal 8 (the default, not partial transforms.) +// #define HEAVYERRORCHECKING_FFT + +// Various levels of debuging conditions and prints +// #define FFT_DEBUG_LEVEL 0 + +// #define forceforce( type ) __nv_is_extended_device_lambda_closure_type( type ) +//FIXME: change to constexpr func + + + +template +constexpr inline bool IS_IKF_t( ) { + if constexpr ( std::is_final_v ) { + return true; + } + else { + return false; + } +}; + + + +#if FFT_DEBUG_LEVEL < 1 + +#define MyFFTDebugPrintWithDetails(...) +#define MyFFTDebugAssertTrue(cond, msg, ...) +#define MyFFTDebugAssertFalse(cond, msg, ...) +#define MyFFTDebugAssertTestTrue(cond, msg, ...) +#define MyFFTDebugAssertTestFalse(cond, msg, ...) + +#else +// Minimally define asserts that check state variables and setup. +#define MyFFTDebugAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; exit(-1); } } +#define MyFFTDebugAssertFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; exit(-1); } } + +#endif + +#if FFT_DEBUG_LEVEL > 1 +// Turn on checkpoints in the testing functions. +#define MyFFTDebugAssertTestTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;exit(-1); } else { std::cerr << " Test " << msg << " passed!" 
<< std::endl; }} +#define MyFFTDebugAssertTestFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; exit(-1); } else { std::cerr << " Test " << msg << " passed!" << std::endl; } } + +#endif + +#if FFT_DEBUG_LEVEL == 2 +#define MyFFTDebugPrintWithDetails(...) +#endif + +#if FFT_DEBUG_LEVEL == 3 +// More verbose debug info +#define MyFFTDebugPrint(...) { std::cerr << __VA_ARGS__ << std::endl; } +#define MyFFTDebugPrintWithDetails(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; } +#endif + +#if FFT_DEBUG_LEVEL == 4 +// More verbose debug info + state info +#define MyFFTDebugPrint(...) { FastFFT::FourierTransformer::PrintState( ); std::cerr << __VA_ARGS__ << std::endl; } +#define MyFFTDebugPrintWithDetails(...) { FastFFT::FourierTransformer::PrintState( ); std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; } + +#endif + +// Always in use +#define MyFFTPrint(...) { std::cerr << __VA_ARGS__ << std::endl; } +#define MyFFTPrintWithDetails(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; } +#define MyFFTRunTimeAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;exit(-1); } } +#define MyFFTRunTimeAssertFalse(cond, msg, ...) { if ( (cond) == true ) {std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;exit(-1); } } + + + +// I use the same things in cisTEM, so check for them. FIXME, get rid of defines and also find a better sharing mechanism. +#ifndef cudaErr +// Note we are using std::cerr b/c the wxWidgets apps running in cisTEM are capturing std::cout +// If I leave cudaErr blank when HEAVYERRORCHECKING_FFT is not defined, I get some reports/warnings about unused or unreferenced variables. I suspect the performance hit is very small so just leave this on. +// The real cost is in the synchronization of in pre/postcheck. +#define cudaErr(error) { auto status = static_cast(error); if ( status != cudaSuccess ) { std::cerr << cudaGetErrorString(status) << " :-> "; MyFFTPrintWithDetails(""); } }; +#endif + +#ifndef postcheck + #ifndef precheck + #ifndef HEAVYERRORCHECKING_FFT + #define postcheck + #define precheck + #else + #define postcheck { cudaErr(cudaPeekAtLastError( )); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error) } + #define precheck { cudaErr(cudaGetLastError( )) } + #endif + #endif +#endif + + +inline void checkCudaErr(cudaError_t err) { + if ( err != cudaSuccess ) { + std::cerr << cudaGetErrorString(err) << " :-> " << std::endl; + MyFFTPrintWithDetails(" "); + } +}; + +#define USEFASTSINCOS +// The __sincosf doesn't appear to be the problem with accuracy, likely just the extra additions, but it probably also is less flexible with other types. I don't see a half precision equivalent. 
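A hedged sketch of how the SINCOS wrapper defined just below is used when building twiddle factors; the kernel and buffer names are placeholders, and the argument order (sine written through the second argument) follows the calls in the io helpers later in this header:

```c++
// Illustrative only: fill a table of e^{i * theta_k} values using the SINCOS wrapper.
#include <cuda_runtime.h>
#include "FastFFT.cuh" // assumed to provide SINCOS

__global__ void FillTwiddles(float2* twiddles, float twiddle_in, int n) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if ( k < n ) {
        float2 w;
        SINCOS(twiddle_in * k, &w.y, &w.x); // w.x = cos(arg), w.y = sin(arg)
        twiddles[k] = w;
    }
}
```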
+#ifdef USEFASTSINCOS +__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) { + __sincosf(arg, s, c); +} +#else +__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) { + sincos(arg, s, c); +} +#endif + +namespace FastFFT { + +template +bool pointer_is_in_memory_and_registered(T ptr, const char* ptr_name = nullptr) { + cudaPointerAttributes attr; + cudaErr(cudaPointerGetAttributes(&attr, ptr)); + + if ( attr.type == 1 && attr.devicePointer == attr.hostPointer ) { + return true; + } + else { + return false; + } +} + +__device__ __forceinline__ int +d_ReturnReal1DAddressFromPhysicalCoord(int3 coords, short4 img_dims) { + return ((((int)coords.z * (int)img_dims.y + coords.y) * (int)img_dims.w * 2) + (int)coords.x); +} + +static constexpr const int XZ_STRIDE = 16; + +static constexpr const int bank_size = 32; +static constexpr const int bank_padded = bank_size + 1; +static constexpr const unsigned int ubank_size = 32; +static constexpr const unsigned int ubank_padded = ubank_size + 1; + +__device__ __forceinline__ int GetSharedMemPaddedIndex(const int index) { + return (index % bank_size) + ((index / bank_size) * bank_padded); +} + +__device__ __forceinline__ int GetSharedMemPaddedIndex(const unsigned int index) { + return (index % ubank_size) + ((index / ubank_size) * ubank_padded); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTAddress(const unsigned int pixel_pitch) { + return pixel_pitch * (blockIdx.y + blockIdx.z * gridDim.y); +} + +// Return the address of the 1D transform index 0. Right now testing for a stride of 2, but this could be modifiable if it works. +static __device__ __forceinline__ unsigned int Return1DFFTAddress_strided_Z(const unsigned int pixel_pitch) { + // In the current condition, threadIdx.z is either 0 || 1, and gridDim.z = size_z / 2 + // index into a 2D tile in the XZ plane, for output in the ZX transposed plane (for coalsced write.) + return pixel_pitch * (blockIdx.y + (XZ_STRIDE * blockIdx.z + threadIdx.z) * gridDim.y); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int ReturnZplane(const unsigned int NX, const unsigned int NY) { + return (blockIdx.z * NX * NY); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTAddress_Z(const unsigned int NY) { + return blockIdx.y + (blockIdx.z * NY); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTColumn_XYZ_transpose(const unsigned int NX) { + // NX should be size_of::value for this method. Should this be templated? + // presumably the XZ axis is alread transposed on the forward, used to index into this state. Indexs in (ZY)' plane for input, to be transposed and permuted to output.' 
+ return NX * (XZ_STRIDE * (blockIdx.y + gridDim.y * blockIdx.z) + threadIdx.z); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose(const unsigned int X) { + return blockIdx.z + gridDim.z * (blockIdx.y + X * gridDim.y); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX) { + // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); + // (IDX % XZ_STRIDE) -> transposed x coordinate in tile + // ((blockIdx.z*XZ_STRIDE) -> tile offest in physical X (with above gives physical X out (transposed Z)) + // (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z) + // above * blockIdx.y -> offset in physical Y (transposed Y) + // (IDX / XZ_STRIDE) -> n elements physical Z (transposed X) + return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + (IDX / XZ_STRIDE) * gridDim.y); +} + +static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) { + // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); + // (IDX % XZ_STRIDE) -> transposed x coordinate in tile + // ((blockIdx.z*XZ_STRIDE) -> tile offest in physical X (with above gives physical X out (transposed Z)) + // (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z) + // above * blockIdx.y -> offset in physical Y (transposed Y) + // (IDX / XZ_STRIDE) -> n elements physical Z (transposed X) + return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.y); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX) { + // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); + return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + (IDX / XZ_STRIDE) * gridDim.z); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) { + // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); + return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.z); +} + +// Return the address of the 1D transform index 0 +static __device__ __forceinline__ unsigned int Return1DFFTColumn_XZ_to_XY( ) { + // return blockIdx.y + gridDim.y * ( blockIdx.z + gridDim.z * X); + return blockIdx.y + gridDim.y * blockIdx.z; +} + +static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX_to_XY( ) { + return blockIdx.z + gridDim.z * blockIdx.y; +} + +static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX( ) { + return Return1DFFTColumn_XZ_to_XY( ); +} + +// Complex a * conj b multiplication +template +static __device__ __host__ inline auto ComplexConjMulAndScale(const ComplexType a, const ComplexType b, ScalarType s) -> decltype(b) { + ComplexType c; + c.x = s * (a.x * b.x + a.y * 
b.y); + c.y = s * (a.y * b.x - a.x * b.y); + return c; +} + +// GetCudaDeviceArch from https://github.com/mnicely/cufft_examples/blob/master/Common/cuda_helper.h +void GetCudaDeviceProps(DeviceProps& dp); + +void CheckSharedMemory(int& memory_requested, DeviceProps& dp); +void CheckSharedMemory(unsigned int& memory_requested, DeviceProps& dp); + +using namespace cufftdx; + +// TODO this probably needs to depend on the size of the xform, at least small vs large. +constexpr const int elements_per_thread_16 = 4; +constexpr const int elements_per_thread_32 = 8; +constexpr const int elements_per_thread_64 = 8; +constexpr const int elements_per_thread_128 = 8; +constexpr const int elements_per_thread_256 = 8; +constexpr const int elements_per_thread_512 = 8; +constexpr const int elements_per_thread_1024 = 8; +constexpr const int elements_per_thread_2048 = 8; +constexpr const int elements_per_thread_4096 = 8; +constexpr const int elements_per_thread_8192 = 16; + +namespace KernelFunction { + +// Define an enum for different functors +// Intra Kernel Function Type +enum IKF_t { NOOP, + CONJ_MUL }; + +// Maybe a better way to check , but using keyword final to statically check for non NONE types +template +class my_functor {}; + +template +class my_functor { + public: + __device__ __forceinline__ + T + operator( )( ) { + printf("really specific NOOP\n"); + return 0; + } +}; + +template +class my_functor final { + public: + __device__ __forceinline__ + T + operator( )(float& template_fft_x, float& template_fft_y, const float& target_fft_x, const float& target_fft_y) { + // Is there a better way than declaring this variable each time? + // This is target * conj(template) + float tmp = (template_fft_x * target_fft_x + template_fft_y * target_fft_y); + template_fft_y = (template_fft_x * target_fft_y - template_fft_y * target_fft_x); + template_fft_x = tmp; + } +}; + +} // namespace KernelFunction + +// constexpr const std::map elements_per_thread = { +// {16, 4}, {"GPU", 15}, {"RAM", 20}, +// }; +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// FFT kernels +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////// +// BLOCK FFT based Kernel definitions +//////////////////////////////////////// + +/* + +transpose definitions in the kernel names refer to the physical axes in memory, which may not match the logical axes if following a previous transpose. + 2 letters indicate a swap of the axes specified + 3 letters indicate a permutation. E.g./ XZY, x -> Z, z -> Y, y -> X +R2C and C2R kernels are named as: +_fft_kernel_< fft type >_< size change >_< transpose axes > + +C2C additionally specify direction and may specify an operation. +_fft_kernel_< fft type >_< direction >_< size change >_< transpose axes >_< operation in between round trip kernels > + +*/ + +///////////// +// R2C +///////////// + +/* + For these kernels the XY transpose is intended for 2d transforms, while the XZ is for 3d transforms. +*/ + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_R2C_NONE_XY(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); + +// XZ_STRIDE ffts/block via threadIdx.x, notice launch bounds. Creates partial coalescing. 
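As an editorial aside (not part of the patch), a small host-side check of the a * conj(b) convention used by ComplexConjMulAndScale and the CONJ_MUL functor above; the snippet restates the arithmetic with a local struct rather than instantiating the header templates, whose parameter lists are not visible in this diff:

```c++
#include <cstdio>

struct c32 { float x, y; }; // stand-in for the float2-style complex type used above

// Same arithmetic as ComplexConjMulAndScale: s * (a * conj(b))
static c32 ConjMulAndScale(c32 a, c32 b, float s) {
    return { s * (a.x * b.x + a.y * b.y),
             s * (a.y * b.x - a.x * b.y) };
}

int main( ) {
    c32 a{1.0f, 2.0f}, b{3.0f, -4.0f};
    c32 r = ConjMulAndScale(a, b, 1.0f); // (1+2i) * conj(3-4i) = (1+2i)(3+4i) = -5 + 10i
    std::printf("%.1f %+.1fi\n", r.x, r.y);
    return 0;
}
```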
+template +__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ + void block_fft_kernel_R2C_NONE_XZ(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_R2C_INCREASE_XY(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); + +// XZ_STRIDE ffts/block via threadIdx.x, notice launch bounds. Creates partial coalescing. +template +__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ + void block_fft_kernel_R2C_INCREASE_XZ(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); + +// __launch_bounds__(FFT::max_threads_per_block) we don't know this because it is threadDim.x * threadDim.z - this could be templated if it affects performance significantly +template +__global__ void block_fft_kernel_R2C_DECREASE_XY(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); + +///////////// +// C2C +///////////// + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_INCREASE(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); + +// __launch_bounds__(FFT::max_threads_per_block) we don't know this because it is threadDim.x * threadDim.z - this could be templated if it affects performance significantly +template +__global__ void block_fft_kernel_C2C_DECREASE(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_WithPadding_SwapRealSpaceQuadrants(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv, + PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + Offsets mem_offsets, float 
twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); + +template +__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, + Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_NONE(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); + +template +__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_NONE_XZ(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); + +template +__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_NONE_XYZ(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); + +template +__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2C_INCREASE_XYZ(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); +///////////// +// C2R +///////////// + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2R_NONE(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_C2R_NONE_XY(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); + +// __launch_bounds__(FFT::max_threads_per_block) we don't know this because it is threadDim.x * threadDim.z - this could be templated if it affects performance significantly +template +__global__ void block_fft_kernel_C2R_DECREASE_XY(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, const float twiddle_in, const unsigned int Q, typename FFT::workspace_type workspace); + +////////////////////////////// +// Thread FFT based Kernel definitions +////////////////////////////// + +///////////// +// R2C +///////////// + +template +__global__ void thread_fft_kernel_R2C_decomposed(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); + +template +__global__ void thread_fft_kernel_R2C_decomposed_transposed(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); + +///////////// +// C2C +///////////// + +template +__global__ void thread_fft_kernel_C2C_decomposed(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); + +template +__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int 
Q); + +///////////// +// C2R +///////////// + +template +__global__ void thread_fft_kernel_C2R_decomposed(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); + +template +__global__ void thread_fft_kernel_C2R_decomposed_transposed(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// End FFT Kernels +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__global__ void clip_into_top_left_kernel(InputType* input_values, OutputType* output_values, const short4 dims); + +// Modified from GpuImage::ClipIntoRealKernel +template +__global__ void clip_into_real_kernel(InputType* real_values_gpu, + OutputType* other_image_real_values_gpu, + short4 dims, + short4 other_dims, + int3 wanted_coordinate_of_box_center, + OutputType wanted_padding_value); + +////////////////////////////////////////////// +// IO functions adapted from the cufftdx examples +/////////////////////////////// + +template +struct io { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + static inline __device__ unsigned int stride_size( ) { + return cufftdx::size_of::value / FFT::elements_per_thread; + } + + static inline __device__ void load_r2c(const scalar_type* input, + complex_type* thread_data) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i].x = input[index]; + thread_data[i].y = 0.0f; + index += stride; + } + } + + static inline __device__ void store_r2c(const complex_type* thread_data, + complex_type* output, + int offset) { + const unsigned int stride = stride_size( ); + unsigned int index = offset + threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + output[index] = thread_data[i]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to store one more element + constexpr unsigned int values_left_to_store = + threads_per_fft == 1 ? 
1 : (output_values_to_store % threads_per_fft); + if ( threadIdx.x < values_left_to_store ) { + output[index] = thread_data[FFT::elements_per_thread / 2]; + } + } + + // Since we can make repeated use of the same shared memory for each sub-fft + // we use this method to load into shared mem instead of directly to registers + // TODO set this up for async mem load + static inline __device__ void load_shared(const complex_type* input, + complex_type* shared_input, + complex_type* thread_data, + float* twiddle_factor_args, + float twiddle_in, + int* input_map, + int* output_map, + int Q) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + input_map[i] = index; + output_map[i] = Q * index; + twiddle_factor_args[i] = twiddle_in * index; + thread_data[i] = input[index]; + shared_input[index] = thread_data[i]; + index += stride; + } + } + + // Since we can make repeated use of the same shared memory for each sub-fft + // we use this method to load into shared mem instead of directly to registers + // TODO set this up for async mem load + static inline __device__ void load_shared(const complex_type* input, + complex_type* shared_input, + complex_type* thread_data, + float* twiddle_factor_args, + float twiddle_in) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + twiddle_factor_args[i] = twiddle_in * index; + thread_data[i] = input[index]; + shared_input[index] = thread_data[i]; + index += stride; + } + } + + static inline __device__ void load_shared(const complex_type* input, + complex_type* shared_input, + complex_type* thread_data, + float* twiddle_factor_args, + float twiddle_in, + int* input_map, + int* output_map, + int Q, + int number_of_elements) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + if ( index < number_of_elements ) { + input_map[i] = index; + output_map[i] = Q * index; + twiddle_factor_args[i] = twiddle_in * index; + thread_data[i] = input[index]; + shared_input[index] = thread_data[i]; + index += stride; + } + else { + input_map[i] = -9999; // ignore this in subsequent ops + } + } + } + + // Since we can make repeated use of the same shared memory for each sub-fft + // we use this method to load into shared mem instead of directly to registers + // TODO set this up for async mem load - alternatively, load to registers then copy but leave in register for firt compute + static inline __device__ void load_r2c_shared(const scalar_type* input, + scalar_type* shared_input, + complex_type* thread_data, + float* twiddle_factor_args, + float twiddle_in, + int* input_map, + int* output_map, + int Q) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // if (blockIdx.y == 0) ("blck %i index %i \n", Q*index, index); + input_map[i] = index; + output_map[i] = Q * index; + twiddle_factor_args[i] = twiddle_in * index; + thread_data[i].x = input[index]; + thread_data[i].y = 0.0f; + shared_input[index] = thread_data[i].x; + index += stride; + } + } + + // Since we can make repeated use of the same shared memory for each sub-fft + // we use this method to load into shared mem instead of directly to registers + // TODO set this up for async mem load - alternatively, load to registers then copy but 
leave in register for firt compute + static inline __device__ void load_r2c_shared(const scalar_type* input, + scalar_type* shared_input, + complex_type* thread_data, + float* twiddle_factor_args, + float twiddle_in) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + twiddle_factor_args[i] = twiddle_in * index; + thread_data[i].x = input[index]; + thread_data[i].y = 0.0f; + shared_input[index] = thread_data[i].x; + index += stride; + } + } + + static inline __device__ void load_r2c_shared_and_pad(const scalar_type* input, + complex_type* shared_mem) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + shared_mem[GetSharedMemPaddedIndex(index)] = complex_type(input[index], 0.f); + index += stride; + } + __syncthreads( ); + } + + static inline __device__ void copy_from_shared(const complex_type* shared_mem, + complex_type* thread_data, + const unsigned int Q) { + const unsigned int stride = stride_size( ) * Q; // I think the Q is needed, but double check me TODO + unsigned int index = (threadIdx.x * Q) + threadIdx.z; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i] = shared_mem[GetSharedMemPaddedIndex(index)]; + index += stride; + } + __syncthreads( ); // FFT().execute is setup to reuse the shared mem, so we need to sync here. Optionally, we could allocate more shared mem and remove this sync + } + + // Note that unlike most functions in this file, this one does not have a + // const decorator on the thread mem, as we want to modify it with the twiddle factors + // before reducing the full shared mem space. + static inline __device__ void reduce_block_fft(complex_type* thread_data, + complex_type* shared_mem, + const float twiddle_in, + const unsigned int Q) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + complex_type twiddle; + // In the first loop, all threads participate and write back to natural order in shared memory + // while also updating with the full size twiddle factor. + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // ( index * threadIdx.z) == ( k % P * n2 ) + SINCOS(twiddle_in * (index * threadIdx.z), &twiddle.y, &twiddle.x); + thread_data[i] *= twiddle; + + shared_mem[GetSharedMemPaddedIndex(index)] = thread_data[i]; + index += stride; + } + __syncthreads( ); + + // Now we reduce the shared memory into the first block of size P + // Reuse index + for ( index = 2; index <= Q; index *= 2 ) { + // Some threads drop out each loop + if ( threadIdx.z % index == 0 ) { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i] += shared_mem[GetSharedMemPaddedIndex(threadIdx.x + (i * stride) + (index / 2 * size_of::value))]; + } + } // end if condition + // All threads can reach this point + __syncthreads( ); + } + } + + static inline __device__ void store_r2c_reduced(const complex_type* thread_data, + complex_type* output, + const unsigned int pixel_pitch, + const unsigned int memory_limit) { + if ( threadIdx.z == 0 ) { + // Finally we write out the first size_of::values to global + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i <= FFT::elements_per_thread / 2; i++ ) { + if ( index < memory_limit ) { + // transposed index. 
+ output[index * pixel_pitch + blockIdx.y] = thread_data[i]; + } + index += stride; + } + } + } + + // when using load_shared || load_r2c_shared, we need then copy from shared mem into the registers. + // notice we still need the packed complex values for the xform. + static inline __device__ void copy_from_shared(const scalar_type* shared_input, + complex_type* thread_data, + int* input_map) { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i].x = shared_input[input_map[i]]; + thread_data[i].y = 0.0f; + } + } + + static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, + complex_type* thread_data, + int* input_map) { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i] = shared_input_complex[input_map[i]]; + } + } + + static inline __device__ void copy_from_shared(const scalar_type* shared_input, + complex_type* thread_data) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i].x = shared_input[index]; + thread_data[i].y = 0.0f; + index += stride; + } + } + + static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, + complex_type* thread_data) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i] = shared_input_complex[index]; + index += stride; + } + } + + static inline __device__ void load_shared_and_conj_multiply(const complex_type* image_to_search, + complex_type* thread_data) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + complex_type c; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + c.x = (thread_data[i].x * image_to_search[index].x + thread_data[i].y * image_to_search[index].y); + c.y = (thread_data[i].y * image_to_search[index].x - thread_data[i].x * image_to_search[index].y); + // a * conj b + thread_data[i] = c; //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); + index += stride; + } + } + + // TODO: set user lambda to default = false, then get rid of other load_shared + template + static inline __device__ void load_shared(const complex_type* image_to_search, + complex_type* thread_data, + FunctionType intra_op_functor = nullptr) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + if constexpr ( IS_IKF_t( ) ) { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + intra_op_functor(thread_data[i].x, thread_data[i].y, image_to_search[index].x, image_to_search[index].y); //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); + index += stride; + } + } + else { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // a * conj b + thread_data[i] = thread_data[i], image_to_search[index]; //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); + index += stride; + } + } + } + + // Now we need send to shared mem and transpose on the way + // TODO: fix bank conflicts later. 
+ static inline __device__ void transpose_r2c_in_shared_XZ(complex_type* shared_mem, + complex_type* thread_data) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + shared_mem[threadIdx.z + index * XZ_STRIDE] = thread_data[i]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if ( threadIdx.x < values_left_to_store ) { + shared_mem[threadIdx.z + index * XZ_STRIDE] = thread_data[FFT::elements_per_thread / 2]; + } + __syncthreads( ); + } + + // Now we need send to shared mem and transpose on the way + // TODO: fix bank conflicts later. + static inline __device__ void transpose_in_shared_XZ(complex_type* shared_mem, + complex_type* thread_data) { + const unsigned int stride = io::stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // return (XZ_STRIDE*blockIdx.z + threadIdx.z) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + X * gridDim.y ); + // XZ_STRIDE == XZ_STRIDE + shared_mem[threadIdx.z + index * XZ_STRIDE] = thread_data[i]; + index += stride; + } + __syncthreads( ); + } + + static inline __device__ void store_r2c_transposed_xz(const complex_type* thread_data, + complex_type* output) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + output[Return1DFFTAddress_XZ_transpose(index)] = thread_data[i]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if ( threadIdx.x < values_left_to_store ) { + output[Return1DFFTAddress_XZ_transpose(index)] = thread_data[FFT::elements_per_thread / 2]; + } + __syncthreads( ); + } + + // Store a transposed tile, made up of contiguous (full) FFTS + static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, + complex_type* output) { + const unsigned int stride = stride_size( ); + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + output[Return1DFFTAddress_XZ_transpose_strided_Z(index)] = shared_mem[index]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 
1 : (output_values_to_store % threads_per_fft); + if ( threadIdx.x < values_left_to_store ) { + output[Return1DFFTAddress_XZ_transpose_strided_Z(index)] = shared_mem[index]; + } + __syncthreads( ); + } + + // Store a transposed tile, made up of non-contiguous (strided partial) FFTS + // + static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, + complex_type* output, + const unsigned int Q, + const unsigned int sub_fft) { + const unsigned int stride = stride_size( ); + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + output[Return1DFFTAddress_XZ_transpose_strided_Z(index, Q, sub_fft)] = shared_mem[index]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if ( threadIdx.x < values_left_to_store ) { + output[Return1DFFTAddress_XZ_transpose_strided_Z(index, Q, sub_fft)] = shared_mem[index]; + } + __syncthreads( ); + } + + static inline __device__ void store_transposed_xz_strided_Z(const complex_type* shared_mem, + complex_type* output) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + threadIdx.z * cufftdx::size_of::value; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + output[Return1DFFTAddress_XZ_transpose_strided_Z(index)] = shared_mem[index]; + index += stride; + } + __syncthreads( ); + } + + static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, + complex_type* output, + int pixel_pitch) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index + output[index * pixel_pitch + blockIdx.y] = thread_data[i]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if ( threadIdx.x < values_left_to_store ) { + output[index * pixel_pitch + blockIdx.y] = thread_data[FFT::elements_per_thread / 2]; + } + } + + static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, + complex_type* output, + int* output_MAP, + int pixel_pitch) { + const unsigned int stride = stride_size( ); + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index + output[output_MAP[i] * pixel_pitch + blockIdx.y] = thread_data[i]; + // if (blockIdx.y == 32) printf("from store transposed %i , val %f %f\n", output_MAP[i], thread_data[i].x, thread_data[i].y); + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 
1 : (output_values_to_store % threads_per_fft); + if ( threadIdx.x < values_left_to_store ) { + output[output_MAP[FFT::elements_per_thread / 2] * pixel_pitch + blockIdx.y] = thread_data[FFT::elements_per_thread / 2]; + } + } + + static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, + complex_type* output, + int* output_MAP, + int pixel_pitch, + int memory_limit) { + const unsigned int stride = stride_size( ); + for ( unsigned int i = 0; i <= FFT::elements_per_thread / 2; i++ ) { + // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index + // if (blockIdx.y == 1) printf("index, pitch, blcok, address %i, %i, %i, %i\n", output_MAP[i], pixel_pitch, memory_limit, output_MAP[i]*pixel_pitch + blockIdx.y); + + if ( output_MAP[i] < memory_limit ) + output[output_MAP[i] * pixel_pitch + blockIdx.y] = thread_data[i]; + // if (blockIdx.y == 32) printf("from store transposed %i , val %f %f\n", output_MAP[i], thread_data[i].x, thread_data[i].y); + } + // constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + // constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + // constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + // if (threadIdx.x < values_left_to_store) + // { + // printf("index, pitch, blcok, address %i, %i, %i, %i\n", output_MAP[FFT::elements_per_thread / 2], pixel_pitch, blockIdx.y, output_MAP[FFT::elements_per_thread / 2]*pixel_pitch + blockIdx.y); + // if (output_MAP[FFT::elements_per_thread / 2] < memory_limit) output[output_MAP[FFT::elements_per_thread / 2]*pixel_pitch + blockIdx.y] = thread_data[FFT::elements_per_thread / 2]; + // } + } + + static inline __device__ void load_c2r(const complex_type* input, + complex_type* thread_data) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + thread_data[i] = input[index]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element + constexpr unsigned int values_left_to_load = threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); + if ( threadIdx.x < values_left_to_load ) { + thread_data[FFT::elements_per_thread / 2] = input[index]; + } + } + + static inline __device__ void load_c2r_transposed(const complex_type* input, + complex_type* thread_data, + unsigned int pixel_pitch) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + thread_data[i] = input[(pixel_pitch * index) + blockIdx.y]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element + constexpr unsigned int values_left_to_load = threads_per_fft == 1 ? 
1 : (output_values_to_load % threads_per_fft); + if ( threadIdx.x < values_left_to_load ) { + thread_data[FFT::elements_per_thread / 2] = input[(pixel_pitch * index) + blockIdx.y]; + } + } + + static inline __device__ void load_c2r_shared_and_pad(const complex_type* input, + complex_type* shared_mem, + const unsigned int pixel_pitch) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { + shared_mem[GetSharedMemPaddedIndex(index)] = input[pixel_pitch * index]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element + constexpr unsigned int values_left_to_load = threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); + if ( threadIdx.x < values_left_to_load ) { + shared_mem[GetSharedMemPaddedIndex(index)] = input[pixel_pitch * index]; + } + __syncthreads( ); + } + + // this may benefit from asynchronous execution + static inline __device__ void load(const complex_type* input, + complex_type* thread_data) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + thread_data[i] = input[index]; + // if (blockIdx.y == 0) printf("block %i , val %f %f\n", index, input[index].x, input[index].y); + + index += stride; + } + } + + // this may benefit from asynchronous execution + static inline __device__ void load(const complex_type* input, + complex_type* thread_data, + int last_index_to_load) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + if ( index < last_index_to_load ) + thread_data[i] = input[index]; + else + thread_data[i] = complex_type(0.0f, 0.0f); + index += stride; + } + } + + // TODO: set pre_op_functor to default=false and get rid of other load + template + static inline __device__ void load(const complex_type* input, + complex_type* thread_data, + int last_index_to_load, + FunctionType pre_op_functor = nullptr) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + if constexpr ( IS_IKF_t( ) ) { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + if ( index < last_index_to_load ) + thread_data[i] = pre_op_functor(input[index]); + else + thread_data[i] = pre_op_functor(complex_type(0.0f, 0.0f)); + index += stride; + } + } + else { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + if ( index < last_index_to_load ) + thread_data[i] = input[index]; + else + thread_data[i] = complex_type(0.0f, 0.0f); + index += stride; + } + } + } + + static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, + complex_type* output, + int first_negative_index) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + complex_type phase_shift; + int logical_y; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // If no kernel based changes are made to source_idx, this will be the same as the original index value + phase_shift = thread_data[i]; + logical_y = index; + if ( logical_y >= first_negative_index ) + logical_y -= 2 * first_negative_index; + if ( (int(blockIdx.y) + logical_y) % 2 != 0 ) + 
phase_shift *= -1.f; + output[index] = phase_shift; + index += stride; + } + } + + static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, + complex_type* output, + int* source_idx, + int first_negative_index) { + const unsigned int stride = stride_size( ); + complex_type phase_shift; + int logical_y; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // If no kernel based changes are made to source_idx, this will be the same as the original index value + phase_shift = thread_data[i]; + logical_y = source_idx[i]; + if ( logical_y >= first_negative_index ) + logical_y -= 2 * first_negative_index; + if ( (int(blockIdx.y) + logical_y) % 2 != 0 ) + phase_shift *= -1.f; + output[source_idx[i]] = phase_shift; + } + } + + template + static inline __device__ void store(const complex_type* thread_data, + complex_type* output, + FunctionType post_op_functor = nullptr) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + if constexpr ( IS_IKF_t( ) ) { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + output[index] = post_op_functor(thread_data[i]); + index += stride; + } + } + else { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + output[index] = thread_data[i]; + index += stride; + } + } + } + + static inline __device__ void store(const complex_type* thread_data, + complex_type* output, + const unsigned int Q, + const unsigned int sub_fft) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + output[index * Q + sub_fft] = thread_data[i]; + index += stride; + } + } + + static inline __device__ void store_Z(const complex_type* shared_mem, + complex_type* output) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + threadIdx.z * size_of::value; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + output[Return1DFFTAddress_YZ_transpose_strided_Z(index)] = shared_mem[index]; + + index += stride; + } + } + + static inline __device__ void store_Z(const complex_type* shared_mem, + complex_type* output, + const unsigned int Q, + const unsigned int sub_fft) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + threadIdx.z * size_of::value; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + output[Return1DFFTAddress_YZ_transpose_strided_Z(index, Q, sub_fft)] = shared_mem[index]; + index += stride; + } + __syncthreads( ); + } + + static inline __device__ void store(const complex_type* thread_data, + complex_type* output, + unsigned int memory_limit) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + if ( index < memory_limit ) + output[index] = thread_data[i]; + index += stride; + } + } + + static inline __device__ void store(const complex_type* thread_data, + complex_type* output, + int* source_idx) { + const unsigned int stride = stride_size( ); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // If no kernel based changes are made to source_idx, this will be the same as the original index value + output[source_idx[i]] = thread_data[i]; + } + } + + static inline __device__ void store_subset(const complex_type* thread_data, + complex_type* output, + int* source_idx) { + const unsigned int stride = stride_size( ); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // If 
no kernel based changes are made to source_idx, this will be the same as the original index value + if ( source_idx[i] >= 0 ) + output[source_idx[i]] = thread_data[i]; + } + } + + static inline __device__ void store_coalesced(const complex_type* shared_output, + complex_type* global_output, + int offset) { + const unsigned int stride = stride_size( ); + unsigned int index = offset + threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + global_output[index] = shared_output[index]; + index += stride; + } + } + + static inline __device__ void load_c2c_shared_and_pad(const complex_type* input, + complex_type* shared_mem) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + shared_mem[GetSharedMemPaddedIndex(index)] = input[index]; + index += stride; + } + __syncthreads( ); + } + + static inline __device__ void store_c2c_reduced(const complex_type* thread_data, + complex_type* output) { + if ( threadIdx.z == 0 ) { + // Finally we write out the first size_of::values to global + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + if ( index < size_of::value ) { + // transposed index. + output[index] = thread_data[i]; + } + index += stride; + } + } + } + + static inline __device__ void store_c2r_reduced(const complex_type* thread_data, + scalar_type* output) { + if ( threadIdx.z == 0 ) { + // Finally we write out the first size_of::values to global + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + if ( index < size_of::value ) { + // transposed index. + output[index] = reinterpret_cast(thread_data)[i]; + } + index += stride; + } + } + } + + static inline __device__ void store_transposed(const complex_type* thread_data, + complex_type* output, + int* output_map, + int* rotated_offset, + int memory_limit) { + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // If no kernel based changes are made to source_idx, this will be the same as the original index value + if ( output_map[i] < memory_limit ) + output[rotated_offset[1] * output_map[i] + rotated_offset[0]] = thread_data[i]; + } + } + + static inline __device__ void store_c2r(const complex_type* thread_data, + scalar_type* output) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + output[index] = reinterpret_cast(thread_data)[i]; + index += stride; + } + } + + static inline __device__ void store_c2r(const complex_type* thread_data, + scalar_type* output, + unsigned int memory_limit) { + const unsigned int stride = stride_size( ); + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + // TODO: does reinterpret_cast(thread_data)[i] make more sense than just thread_data[i].x?? 
+ if ( index < memory_limit ) + output[index] = reinterpret_cast(thread_data)[i]; + index += stride; + } + } +}; // struct io} + +template +struct io_thread { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + static inline __device__ void load_r2c(const scalar_type* input, + complex_type* thread_data, + const int stride) { + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value; i++ ) { + thread_data[i].x = input[index]; + thread_data[i].y = scalar_type(0); + index += stride; + } + } + + static inline __device__ void store_r2c(const complex_type* shared_output, + complex_type* output, + const int stride, + const int memory_limit) { + // Each thread reads in the input data at stride = mem_offsets.Q + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { + output[index] = shared_output[index]; + index += stride; + } + if ( index < memory_limit ) { + output[index] = shared_output[index]; + } + } + + static inline __device__ void store_r2c_transposed_xy(const complex_type* shared_output, + complex_type* output, + int stride, + int pixel_pitch, + int memory_limit) { + // Each thread reads in the input data at stride = mem_offsets.Q + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { + output[index * pixel_pitch] = shared_output[index]; + index += stride; + } + if ( index < memory_limit ) { + output[index * pixel_pitch] = shared_output[index]; + } + } + + static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, + complex_type* shared_output, + float twiddle_in, + int Q, + int memory_limit) { + // Unroll the first loop and initialize the shared mem. + complex_type twiddle; + int index = threadIdx.x * size_of::value; + twiddle_in *= threadIdx.x; // twiddle factor arg now just needs to multiplied by K = (index + i) + for ( unsigned int i = 0; i < size_of::value; i++ ) { + SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); + twiddle *= thread_data[i]; + if ( index + i < memory_limit ) + shared_output[index + i] = twiddle; + } + __syncthreads( ); // make sure all the shared mem is initialized to the starting value. There should be no contention as every thread is working on its own block of memory. + + for ( unsigned int sub_fft = 1; sub_fft < Q; sub_fft++ ) { + // wrap around, 0 --> 1, Q-1 --> 0 etc. 
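+            // e.g. with Q = 4, thread 0 visits segments 1, 2, 3 for sub_fft = 1, 2, 3 while thread 1 visits 2, 3, 0; threads
+            // that differ mod Q therefore hit different segments on the same iteration, presumably to spread out the
+            // atomicAdd_block updates below.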
+ index = ((threadIdx.x + sub_fft) % Q) * size_of::value; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); + twiddle *= thread_data[i]; + if ( index + i < memory_limit ) { + atomicAdd_block(&shared_output[index + i].x, twiddle.x); + atomicAdd_block(&shared_output[index + i].y, twiddle.y); + } + } + } + __syncthreads( ); + } + + static inline __device__ void load_c2c(const complex_type* input, + complex_type* thread_data, + const int stride) { + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value; i++ ) { + thread_data[i] = input[index]; + index += stride; + } + } + + static inline __device__ void store_c2c(const complex_type* shared_output, + complex_type* output, + const int stride) { + // Each thread reads in the input data at stride = mem_offsets.Q + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value; i++ ) { + output[index] = shared_output[index]; + index += stride; + } + } + + static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, + complex_type* shared_output, + float twiddle_in, + int Q) { + // Unroll the first loop and initialize the shared mem. + complex_type twiddle; + int index = threadIdx.x * size_of::value; + twiddle_in *= threadIdx.x; // twiddle factor arg now just needs to multiplied by K = (index + i) + for ( unsigned int i = 0; i < size_of::value; i++ ) { + SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); + twiddle *= thread_data[i]; + shared_output[index + i] = twiddle; + } + __syncthreads( ); // make sure all the shared mem is initialized to the starting value. There should be no contention as every thread is working on its own block of memory. + + for ( unsigned int sub_fft = 1; sub_fft < Q; sub_fft++ ) { + // wrap around, 0 --> 1, Q-1 --> 0 etc. 
+ index = ((threadIdx.x + sub_fft) % Q) * size_of::value; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); + twiddle *= thread_data[i]; + atomicAdd_block(&shared_output[index + i].x, twiddle.x); + atomicAdd_block(&shared_output[index + i].y, twiddle.y); + } + } + __syncthreads( ); + } + + static inline __device__ void load_c2r(const complex_type* input, + complex_type* thread_data, + const int stride, + const int memory_limit) { + // Each thread reads in the input data at stride = mem_offsets.Q + unsigned int index = threadIdx.x; + unsigned int offset = 2 * memory_limit - 2; + for ( unsigned int i = 0; i < size_of::value; i++ ) { + if ( index < memory_limit ) { + thread_data[i] = input[index]; + } + else { + // assuming even dimension + // FIXME shouldn't need to read in from global for an even stride + thread_data[i] = input[offset - index]; + thread_data[i].y = -thread_data[i].y; // conjugate + } + index += stride; + } + } + + // FIXME as above + static inline __device__ void load_c2r_transposed(const complex_type* input, + complex_type* thread_data, + int stride, + int pixel_pitch, + int memory_limit) { + // Each thread reads in the input data at stride = mem_offsets.Q + unsigned int index = threadIdx.x; + // unsigned int offset = 2*memory_limit - 2; + for ( unsigned int i = 0; i < size_of::value; i++ ) { + if ( index < memory_limit ) { + thread_data[i] = input[index * pixel_pitch]; + } + else { + // input[2*memory_limit - index - 2]; + // assuming even dimension + // FIXME shouldn't need to read in from global for an even stride + thread_data[i] = input[(2 * memory_limit - index) * pixel_pitch]; + thread_data[i].y = -thread_data[i].y; // conjugate + } + index += stride; + } + } + + static inline __device__ void remap_decomposed_segments_c2r(const complex_type* thread_data, + scalar_type* shared_output, + scalar_type twiddle_in, + int Q) { + // Unroll the first loop and initialize the shared mem. + complex_type twiddle; + int index = threadIdx.x * size_of::value; + twiddle_in *= threadIdx.x; // twiddle factor arg now just needs to multiplied by K = (index + i) + for ( unsigned int i = 0; i < size_of::value; i++ ) { + SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); + shared_output[index + i] = (twiddle.x * thread_data[i].x - twiddle.y * thread_data[i].y); // assuming the output is real, only the real parts add, so don't bother with the complex + } + __syncthreads( ); // make sure all the shared mem is initialized to the starting value. There should be no contention as every thread is working on its own block of memory. + + for ( unsigned int sub_fft = 1; sub_fft < Q; sub_fft++ ) { + // wrap around, 0 --> 1, Q-1 --> 0 etc. 
+ index = ((threadIdx.x + sub_fft) % Q) * size_of::value; + + for ( unsigned int i = 0; i < size_of::value; i++ ) { + // if (threadIdx.x == 32) printf("remap tid, subfft, q, index + i %i %i %i %i\n", threadIdx.x,sub_fft, Q, index+i); + SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); + atomicAdd_block(&shared_output[index + i], twiddle.x * thread_data[i].x - twiddle.y * thread_data[i].y); + } + } + __syncthreads( ); + } + + static inline __device__ void store_c2r(const scalar_type* shared_output, + scalar_type* output, + const int stride) { + // Each thread reads in the input data at stride = mem_offsets.Q + unsigned int index = threadIdx.x; + for ( unsigned int i = 0; i < size_of::value; i++ ) { + output[index] = shared_output[index]; + index += stride; + } + } + + static inline __device__ void load_shared_and_conj_multiply(const complex_type* image_to_search, + const complex_type* shared_mem, + complex_type* thread_data, + const int stride) { + unsigned int index = threadIdx.x; + complex_type c; + for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { + c.x = (shared_mem[index].x * image_to_search[index].x + shared_mem[index].y * image_to_search[index].y); + c.y = (shared_mem[index].y * image_to_search[index].x - shared_mem[index].x * image_to_search[index].y); + // a * conj b + thread_data[i] = c; //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); + index += stride; + } + __syncthreads( ); + } +}; // struct thread_io + +} // namespace FastFFT + +// clang-format on + +#endif // Fast_FFT_cuh_ diff --git a/include/FastFFT.cuh.h b/include/FastFFT.cuh.h deleted file mode 100644 index 7bc46ca..0000000 --- a/include/FastFFT.cuh.h +++ /dev/null @@ -1,1488 +0,0 @@ -// Utilites for FastFFT.cu that we don't need the host to know about (FastFFT.h) -#include "FastFFT.h" - -#ifndef Fast_FFT_cuh_ -#define Fast_FFT_cuh_ - -#include "cufftdx/include/cufftdx.hpp" - -// “This software contains source code provided by NVIDIA Corporation.” Much of it is modfied as noted at relevant function definitions. - -// When defined Turns on synchronization based checking for all FFT kernels as well as cudaErr macros -// Defined in the Makefile when DEBUG_STAGE is not equal 8 (the default, not partial transforms.) -// #define HEAVYERRORCHECKING_FFT - -// Various levels of debuging conditions and prints -// #define FFT_DEBUG_LEVEL 0 - -// #define forceforce( type ) __nv_is_extended_device_lambda_closure_type( type ) -//FIXME: change to constexpr func -template -constexpr inline bool IS_IKF_t( ) { - if constexpr ( std::is_final_v ) { - return true; - } - else { - return false; - } -}; - -// clang-format off - -#if FFT_DEBUG_LEVEL < 1 - -#define MyFFTDebugPrintWithDetails(...) -#define MyFFTDebugAssertTrue(cond, msg, ...) -#define MyFFTDebugAssertFalse(cond, msg, ...) -#define MyFFTDebugAssertTestTrue(cond, msg, ...) -#define MyFFTDebugAssertTestFalse(cond, msg, ...) - -#else -// Minimally define asserts that check state variables and setup. -#define MyFFTDebugAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; exit(-1); } } -#define MyFFTDebugAssertFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; exit(-1); } } - -#endif - -#if FFT_DEBUG_LEVEL > 1 -// Turn on checkpoints in the testing functions. 
-#define MyFFTDebugAssertTestTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;exit(-1); } else { std::cerr << " Test " << msg << " passed!" << std::endl; }} -#define MyFFTDebugAssertTestFalse(cond, msg, ...) { if ( (cond) == true ) { std::cerr << " Test " << msg << " FAILED!" << std::endl << " at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; exit(-1); } else { std::cerr << " Test " << msg << " passed!" << std::endl; } } - -#endif - -#if FFT_DEBUG_LEVEL == 2 -#define MyFFTDebugPrintWithDetails(...) -#endif - -#if FFT_DEBUG_LEVEL == 3 -// More verbose debug info -#define MyFFTDebugPrint(...) \ - { std::cerr << __VA_ARGS__ << std::endl; } -#define MyFFTDebugPrintWithDetails(...) \ - { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; } - -#endif - -#if FFT_DEBUG_LEVEL == 4 -// More verbose debug info + state info -#define MyFFTDebugPrint(...) { PrintState( ); std::cerr << __VA_ARGS__ << std::endl; } -#define MyFFTDebugPrintWithDetails(...) { PrintState( ); std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; } - -#endif - -// Always in use -#define MyFFTPrint(...) { std::cerr << __VA_ARGS__ << std::endl; } -#define MyFFTPrintWithDetails(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; } -#define MyFFTRunTimeAssertTrue(cond, msg, ...) { if ( (cond) != true ) { std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;exit(-1); } } -#define MyFFTRunTimeAssertFalse(cond, msg, ...) { if ( (cond) == true ) {std::cerr << msg << std::endl << " Failed Assert at " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl;exit(-1); } } - - - -// I use the same things in cisTEM, so check for them. FIXME, get rid of defines and also find a better sharing mechanism. -#ifndef cudaErr -// Note we are using std::cerr b/c the wxWidgets apps running in cisTEM are capturing std::cout -// If I leave cudaErr blank when HEAVYERRORCHECKING_FFT is not defined, I get some reports/warnings about unused or unreferenced variables. I suspect the performance hit is very small so just leave this on. -// The real cost is in the synchronization of in pre/postcheck. -#define cudaErr(error) { auto status = static_cast(error); if ( status != cudaSuccess ) { std::cerr << cudaGetErrorString(status) << " :-> "; MyFFTPrintWithDetails(""); } }; -#endif - -#ifndef postcheck - #ifndef precheck - #ifndef HEAVYERRORCHECKING_FFT - #define postcheck - #define precheck - #else - #define postcheck { cudaErr(cudaPeekAtLastError( )); cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); cudaErr(error); }; - #define precheck { cudaErr(cudaGetLastError( )); } - #endif - #endif -#endif - -// clang-format on - -inline void checkCudaErr(cudaError_t err) { - if ( err != cudaSuccess ) { - std::cerr << cudaGetErrorString(err) << " :-> " << std::endl; - MyFFTPrintWithDetails(" "); - } -}; - -#define USEFASTSINCOS -// The __sincosf doesn't appear to be the problem with accuracy, likely just the extra additions, but it probably also is less flexible with other types. I don't see a half precision equivalent. 
-#ifdef USEFASTSINCOS -__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) { - __sincosf(arg, s, c); -} -#else -__device__ __forceinline__ void SINCOS(float arg, float* s, float* c) { - sincos(arg, s, c); -} -#endif - -namespace FastFFT { - -__device__ __forceinline__ int -d_ReturnReal1DAddressFromPhysicalCoord(int3 coords, short4 img_dims) { - return ((((int)coords.z * (int)img_dims.y + coords.y) * (int)img_dims.w * 2) + (int)coords.x); -} - -static constexpr const int XZ_STRIDE = 16; - -static constexpr const int bank_size = 32; -static constexpr const int bank_padded = bank_size + 1; -static constexpr const unsigned int ubank_size = 32; -static constexpr const unsigned int ubank_padded = ubank_size + 1; - -__device__ __forceinline__ int GetSharedMemPaddedIndex(const int index) { - return (index % bank_size) + ((index / bank_size) * bank_padded); -} - -__device__ __forceinline__ int GetSharedMemPaddedIndex(const unsigned int index) { - return (index % ubank_size) + ((index / ubank_size) * ubank_padded); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTAddress(const unsigned int pixel_pitch) { - return pixel_pitch * (blockIdx.y + blockIdx.z * gridDim.y); -} - -// Return the address of the 1D transform index 0. Right now testing for a stride of 2, but this could be modifiable if it works. -static __device__ __forceinline__ unsigned int Return1DFFTAddress_strided_Z(const unsigned int pixel_pitch) { - // In the current condition, threadIdx.z is either 0 || 1, and gridDim.z = size_z / 2 - // index into a 2D tile in the XZ plane, for output in the ZX transposed plane (for coalsced write.) - return pixel_pitch * (blockIdx.y + (XZ_STRIDE * blockIdx.z + threadIdx.z) * gridDim.y); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int ReturnZplane(const unsigned int NX, const unsigned int NY) { - return (blockIdx.z * NX * NY); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTAddress_Z(const unsigned int NY) { - return blockIdx.y + (blockIdx.z * NY); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTColumn_XYZ_transpose(const unsigned int NX) { - // NX should be size_of::value for this method. Should this be templated? - // presumably the XZ axis is alread transposed on the forward, used to index into this state. Indexs in (ZY)' plane for input, to be transposed and permuted to output.' 
- return NX * (XZ_STRIDE * (blockIdx.y + gridDim.y * blockIdx.z) + threadIdx.z); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose(const unsigned int X) { - return blockIdx.z + gridDim.z * (blockIdx.y + X * gridDim.y); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX) { - // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); - // (IDX % XZ_STRIDE) -> transposed x coordinate in tile - // ((blockIdx.z*XZ_STRIDE) -> tile offest in physical X (with above gives physical X out (transposed Z)) - // (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z) - // above * blockIdx.y -> offset in physical Y (transposed Y) - // (IDX / XZ_STRIDE) -> n elements physical Z (transposed X) - return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + (IDX / XZ_STRIDE) * gridDim.y); -} - -static __device__ __forceinline__ unsigned int Return1DFFTAddress_XZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) { - // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); - // (IDX % XZ_STRIDE) -> transposed x coordinate in tile - // ((blockIdx.z*XZ_STRIDE) -> tile offest in physical X (with above gives physical X out (transposed Z)) - // (XZ_STRIDE*gridDim.z) -> n elements in physical X (transposed Z) - // above * blockIdx.y -> offset in physical Y (transposed Y) - // (IDX / XZ_STRIDE) -> n elements physical Z (transposed X) - return ((IDX % XZ_STRIDE) + (blockIdx.z * XZ_STRIDE)) + (XZ_STRIDE * gridDim.z) * (blockIdx.y + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.y); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX) { - // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); - return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + (IDX / XZ_STRIDE) * gridDim.z); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTAddress_YZ_transpose_strided_Z(const unsigned int IDX, const unsigned int Q, const unsigned int sub_fft) { - // return (XZ_STRIDE*blockIdx.z + (X % XZ_STRIDE)) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + (X / XZ_STRIDE) * gridDim.y ); - return ((IDX % XZ_STRIDE) + (blockIdx.y * XZ_STRIDE)) + (gridDim.y * XZ_STRIDE) * (blockIdx.z + ((IDX / XZ_STRIDE) * Q + sub_fft) * gridDim.z); -} - -// Return the address of the 1D transform index 0 -static __device__ __forceinline__ unsigned int Return1DFFTColumn_XZ_to_XY( ) { - // return blockIdx.y + gridDim.y * ( blockIdx.z + gridDim.z * X); - return blockIdx.y + gridDim.y * blockIdx.z; -} - -static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX_to_XY( ) { - return blockIdx.z + gridDim.z * blockIdx.y; -} - -static __device__ __forceinline__ unsigned int Return1DFFTAddress_YX( ) { - return Return1DFFTColumn_XZ_to_XY( ); -} - -// Complex a * conj b multiplication -template -static __device__ __host__ inline auto ComplexConjMulAndScale(const ComplexType a, const ComplexType b, ScalarType s) -> decltype(b) { - ComplexType c; - c.x = s * (a.x * b.x + a.y * 
b.y); - c.y = s * (a.y * b.x - a.x * b.y); - return c; -} - -// GetCudaDeviceArch from https://github.com/mnicely/cufft_examples/blob/master/Common/cuda_helper.h -void GetCudaDeviceProps(DeviceProps& dp); - -void CheckSharedMemory(int& memory_requested, DeviceProps& dp); -void CheckSharedMemory(unsigned int& memory_requested, DeviceProps& dp); - -using namespace cufftdx; - -// TODO this probably needs to depend on the size of the xform, at least small vs large. -constexpr const int elements_per_thread_16 = 4; -constexpr const int elements_per_thread_32 = 8; -constexpr const int elements_per_thread_64 = 8; -constexpr const int elements_per_thread_128 = 8; -constexpr const int elements_per_thread_256 = 8; -constexpr const int elements_per_thread_512 = 8; -constexpr const int elements_per_thread_1024 = 8; -constexpr const int elements_per_thread_2048 = 8; -constexpr const int elements_per_thread_4096 = 8; -constexpr const int elements_per_thread_8192 = 16; - -namespace KernelFunction { - - // Define an enum for different functors - // Intra Kernel Function Type - enum IKF_t { NOOP, CONJ_MUL}; - -// Maybe a better way to check , but using keyword final to statically check for non NONE types -template -class my_functor {}; - -template -class my_functor { - public: - __device__ __forceinline__ - T - operator( )( ) { - printf("really specific NOOP\n"); - return 0; - } -}; - -template -class my_functor final { - public: - __device__ __forceinline__ - T - operator( )(float& template_fft_x, float& template_fft_y, const float& target_fft_x, const float& target_fft_y) { - // Is there a better way than declaring this variable each time? - float tmp = (template_fft_x * target_fft_x + template_fft_y * target_fft_y); - template_fft_y = (template_fft_y * target_fft_x - template_fft_x * target_fft_y); - template_fft_x = tmp; - } -}; - -} // namespace KernelFunction - -// constexpr const std::map elements_per_thread = { -// {16, 4}, {"GPU", 15}, {"RAM", 20}, -// }; -////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// FFT kernels -////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////// -// BLOCK FFT based Kernel definitions -//////////////////////////////////////// - -/* - -transpose definitions in the kernel names refer to the physical axes in memory, which may not match the logical axes if following a previous transpose. - 2 letters indicate a swap of the axes specified - 3 letters indicate a permutation. E.g./ XZY, x -> Z, z -> Y, y -> X -R2C and C2R kernels are named as: -_fft_kernel_< fft type >_< size change >_< transpose axes > - -C2C additionally specify direction and may specify an operation. -_fft_kernel_< fft type >_< direction >_< size change >_< transpose axes >_< operation in between round trip kernels > - -*/ - -///////////// -// R2C -///////////// - -/* - For these kernels the XY transpose is intended for 2d transforms, while the XZ is for 3d transforms. -*/ - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_R2C_NONE_XY(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); - -// XZ_STRIDE ffts/block via threadIdx.x, notice launch bounds. Creates partial coalescing. 
-template -__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ - void block_fft_kernel_R2C_NONE_XZ(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_R2C_INCREASE_XY(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); - -// XZ_STRIDE ffts/block via threadIdx.x, notice launch bounds. Creates partial coalescing. -template -__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ - void block_fft_kernel_R2C_INCREASE_XZ(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); - -// __launch_bounds__(FFT::max_threads_per_block) we don't know this because it is threadDim.x * threadDim.z - this could be templated if it affects performance significantly -template -__global__ void block_fft_kernel_R2C_DECREASE_XY(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); - -///////////// -// C2C -///////////// - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_INCREASE(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); - -// __launch_bounds__(FFT::max_threads_per_block) we don't know this because it is threadDim.x * threadDim.z - this could be templated if it affects performance significantly -template -__global__ void block_fft_kernel_C2C_DECREASE(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_WithPadding_SwapRealSpaceQuadrants(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, - Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, - Offsets mem_offsets, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv, - PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda); - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, - Offsets mem_offsets, float 
twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); - -template -__global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, - Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace_fwd, typename invFFT::workspace_type workspace_inv); - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_NONE(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); - -template -__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_NONE_XZ(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); - -template -__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_NONE_XYZ(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); - -template -__launch_bounds__(XZ_STRIDE* FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2C_INCREASE_XYZ(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q, typename FFT::workspace_type workspace); -///////////// -// C2R -///////////// - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2R_NONE(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); - -template -__launch_bounds__(FFT::max_threads_per_block) __global__ - void block_fft_kernel_C2R_NONE_XY(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, typename FFT::workspace_type workspace); - -// __launch_bounds__(FFT::max_threads_per_block) we don't know this because it is threadDim.x * threadDim.z - this could be templated if it affects performance significantly -template -__global__ void block_fft_kernel_C2R_DECREASE_XY(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, const float twiddle_in, const unsigned int Q, typename FFT::workspace_type workspace); - -////////////////////////////// -// Thread FFT based Kernel definitions -////////////////////////////// - -///////////// -// R2C -///////////// - -template -__global__ void thread_fft_kernel_R2C_decomposed(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); - -template -__global__ void thread_fft_kernel_R2C_decomposed_transposed(const ScalarType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); - -///////////// -// C2C -///////////// - -template -__global__ void thread_fft_kernel_C2C_decomposed(const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); - -template -__global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ComplexType* __restrict__ image_to_search, const ComplexType* __restrict__ input_values, ComplexType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int 
Q); - -///////////// -// C2R -///////////// - -template -__global__ void thread_fft_kernel_C2R_decomposed(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); - -template -__global__ void thread_fft_kernel_C2R_decomposed_transposed(const ComplexType* __restrict__ input_values, ScalarType* __restrict__ output_values, Offsets mem_offsets, float twiddle_in, int Q); - -////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// End FFT Kernels -////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -template -__global__ void clip_into_top_left_kernel(InputType* input_values, OutputType* output_values, const short4 dims); - -// Modified from GpuImage::ClipIntoRealKernel -template -__global__ void clip_into_real_kernel(InputType* real_values_gpu, - OutputType* other_image_real_values_gpu, - short4 dims, - short4 other_dims, - int3 wanted_coordinate_of_box_center, - OutputType wanted_padding_value); - -////////////////////////////////////////////// -// IO functions adapted from the cufftdx examples -/////////////////////////////// - -template -struct io { - using complex_type = typename FFT::value_type; - using scalar_type = typename complex_type::value_type; - - static inline __device__ unsigned int stride_size( ) { - return cufftdx::size_of::value / FFT::elements_per_thread; - } - - static inline __device__ void load_r2c(const scalar_type* input, - complex_type* thread_data) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i].x = input[index]; - thread_data[i].y = 0.0f; - index += stride; - } - } - - static inline __device__ void store_r2c(const complex_type* thread_data, - complex_type* output, - int offset) { - const unsigned int stride = stride_size( ); - unsigned int index = offset + threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - output[index] = thread_data[i]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - // threads_per_fft == 1 means that EPT == SIZE, so we need to store one more element - constexpr unsigned int values_left_to_store = - threads_per_fft == 1 ? 
1 : (output_values_to_store % threads_per_fft); - if ( threadIdx.x < values_left_to_store ) { - output[index] = thread_data[FFT::elements_per_thread / 2]; - } - } - - // Since we can make repeated use of the same shared memory for each sub-fft - // we use this method to load into shared mem instead of directly to registers - // TODO set this up for async mem load - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - input_map[i] = index; - output_map[i] = Q * index; - twiddle_factor_args[i] = twiddle_in * index; - thread_data[i] = input[index]; - shared_input[index] = thread_data[i]; - index += stride; - } - } - - // Since we can make repeated use of the same shared memory for each sub-fft - // we use this method to load into shared mem instead of directly to registers - // TODO set this up for async mem load - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - twiddle_factor_args[i] = twiddle_in * index; - thread_data[i] = input[index]; - shared_input[index] = thread_data[i]; - index += stride; - } - } - - static inline __device__ void load_shared(const complex_type* input, - complex_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q, - int number_of_elements) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - if ( index < number_of_elements ) { - input_map[i] = index; - output_map[i] = Q * index; - twiddle_factor_args[i] = twiddle_in * index; - thread_data[i] = input[index]; - shared_input[index] = thread_data[i]; - index += stride; - } - else { - input_map[i] = -9999; // ignore this in subsequent ops - } - } - } - - // Since we can make repeated use of the same shared memory for each sub-fft - // we use this method to load into shared mem instead of directly to registers - // TODO set this up for async mem load - alternatively, load to registers then copy but leave in register for firt compute - static inline __device__ void load_r2c_shared(const scalar_type* input, - scalar_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in, - int* input_map, - int* output_map, - int Q) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // if (blockIdx.y == 0) ("blck %i index %i \n", Q*index, index); - input_map[i] = index; - output_map[i] = Q * index; - twiddle_factor_args[i] = twiddle_in * index; - thread_data[i].x = input[index]; - thread_data[i].y = 0.0f; - shared_input[index] = thread_data[i].x; - index += stride; - } - } - - // Since we can make repeated use of the same shared memory for each sub-fft - // we use this method to load into shared mem instead of directly to registers - // TODO set this up for async mem load - alternatively, load to registers then copy but 
leave in register for firt compute - static inline __device__ void load_r2c_shared(const scalar_type* input, - scalar_type* shared_input, - complex_type* thread_data, - float* twiddle_factor_args, - float twiddle_in) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - twiddle_factor_args[i] = twiddle_in * index; - thread_data[i].x = input[index]; - thread_data[i].y = 0.0f; - shared_input[index] = thread_data[i].x; - index += stride; - } - } - - static inline __device__ void load_r2c_shared_and_pad(const scalar_type* input, - complex_type* shared_mem) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - shared_mem[GetSharedMemPaddedIndex(index)] = complex_type(input[index], 0.f); - index += stride; - } - __syncthreads( ); - } - - static inline __device__ void copy_from_shared(const complex_type* shared_mem, - complex_type* thread_data, - const unsigned int Q) { - const unsigned int stride = stride_size( ) * Q; // I think the Q is needed, but double check me TODO - unsigned int index = (threadIdx.x * Q) + threadIdx.z; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i] = shared_mem[GetSharedMemPaddedIndex(index)]; - index += stride; - } - __syncthreads( ); // FFT().execute is setup to reuse the shared mem, so we need to sync here. Optionally, we could allocate more shared mem and remove this sync - } - - // Note that unlike most functions in this file, this one does not have a - // const decorator on the thread mem, as we want to modify it with the twiddle factors - // before reducing the full shared mem space. - static inline __device__ void reduce_block_fft(complex_type* thread_data, - complex_type* shared_mem, - const float twiddle_in, - const unsigned int Q) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); - complex_type twiddle; - // In the first loop, all threads participate and write back to natural order in shared memory - // while also updating with the full size twiddle factor. - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // ( index * threadIdx.z) == ( k % P * n2 ) - SINCOS(twiddle_in * (index * threadIdx.z), &twiddle.y, &twiddle.x); - thread_data[i] *= twiddle; - - shared_mem[GetSharedMemPaddedIndex(index)] = thread_data[i]; - index += stride; - } - __syncthreads( ); - - // Now we reduce the shared memory into the first block of size P - // Reuse index - for ( index = 2; index <= Q; index *= 2 ) { - // Some threads drop out each loop - if ( threadIdx.z % index == 0 ) { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i] += shared_mem[GetSharedMemPaddedIndex(threadIdx.x + (i * stride) + (index / 2 * size_of::value))]; - } - } // end if condition - // All threads can reach this point - __syncthreads( ); - } - } - - static inline __device__ void store_r2c_reduced(const complex_type* thread_data, - complex_type* output, - const unsigned int pixel_pitch, - const unsigned int memory_limit) { - if ( threadIdx.z == 0 ) { - // Finally we write out the first size_of::values to global - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i <= FFT::elements_per_thread / 2; i++ ) { - if ( index < memory_limit ) { - // transposed index. 
- output[index * pixel_pitch + blockIdx.y] = thread_data[i]; - } - index += stride; - } - } - } - - // when using load_shared || load_r2c_shared, we need then copy from shared mem into the registers. - // notice we still need the packed complex values for the xform. - static inline __device__ void copy_from_shared(const scalar_type* shared_input, - complex_type* thread_data, - int* input_map) { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i].x = shared_input[input_map[i]]; - thread_data[i].y = 0.0f; - } - } - - static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, - complex_type* thread_data, - int* input_map) { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i] = shared_input_complex[input_map[i]]; - } - } - - static inline __device__ void copy_from_shared(const scalar_type* shared_input, - complex_type* thread_data) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i].x = shared_input[index]; - thread_data[i].y = 0.0f; - index += stride; - } - } - - static inline __device__ void copy_from_shared(const complex_type* shared_input_complex, - complex_type* thread_data) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i] = shared_input_complex[index]; - index += stride; - } - } - - static inline __device__ void load_shared_and_conj_multiply(const complex_type* image_to_search, - complex_type* thread_data) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - complex_type c; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - c.x = (thread_data[i].x * image_to_search[index].x + thread_data[i].y * image_to_search[index].y); - c.y = (thread_data[i].y * image_to_search[index].x - thread_data[i].x * image_to_search[index].y); - // a * conj b - thread_data[i] = c; //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); - index += stride; - } - } - - // TODO: set user lambda to default = false, then get rid of other load_shared - template - static inline __device__ void load_shared(const complex_type* image_to_search, - complex_type* thread_data, - FunctionType intra_op_lambda = nullptr) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - if constexpr ( IS_IKF_t( ) ) { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - intra_op_lambda(thread_data[i].x, thread_data[i].y, image_to_search[index].x, image_to_search[index].y); //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); - index += stride; - } - } - else { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // a * conj b - thread_data[i] = thread_data[i], image_to_search[index]; //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); - index += stride; - } - } - } - - // Now we need send to shared mem and transpose on the way - // TODO: fix bank conflicts later. 
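// The bank-conflict TODO above is usually addressed by padding the leading dimension of
// the shared-memory tile so that a column of the tile no longer maps onto a single bank.
// A generic, hedged sketch of that fix follows (TILE_DIM, the kernel name, and the plain
// float tile are illustrative and not the FastFFT transpose):
#define TILE_DIM 32

__global__ void transpose_padded(float* out, const float* in, int width, int height) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; // +1 column of padding staggers the banks

    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    if ( x < width && y < height )
        tile[threadIdx.y][threadIdx.x] = in[y * width + x];
    __syncthreads( );

    x = blockIdx.y * TILE_DIM + threadIdx.x; // swap the block offsets for the transposed write
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    if ( x < height && y < width )
        out[y * height + x] = tile[threadIdx.x][threadIdx.y];
}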
- static inline __device__ void transpose_r2c_in_shared_XZ(complex_type* shared_mem, - complex_type* thread_data) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - shared_mem[threadIdx.z + index * XZ_STRIDE] = thread_data[i]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); - if ( threadIdx.x < values_left_to_store ) { - shared_mem[threadIdx.z + index * XZ_STRIDE] = thread_data[FFT::elements_per_thread / 2]; - } - __syncthreads( ); - } - - // Now we need send to shared mem and transpose on the way - // TODO: fix bank conflicts later. - static inline __device__ void transpose_in_shared_XZ(complex_type* shared_mem, - complex_type* thread_data) { - const unsigned int stride = io::stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // return (XZ_STRIDE*blockIdx.z + threadIdx.z) + (XZ_STRIDE*gridDim.z) * ( blockIdx.y + X * gridDim.y ); - // XZ_STRIDE == XZ_STRIDE - shared_mem[threadIdx.z + index * XZ_STRIDE] = thread_data[i]; - index += stride; - } - __syncthreads( ); - } - - static inline __device__ void store_r2c_transposed_xz(const complex_type* thread_data, - complex_type* output) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - output[Return1DFFTAddress_XZ_transpose(index)] = thread_data[i]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); - if ( threadIdx.x < values_left_to_store ) { - output[Return1DFFTAddress_XZ_transpose(index)] = thread_data[FFT::elements_per_thread / 2]; - } - __syncthreads( ); - } - - // Store a transposed tile, made up of contiguous (full) FFTS - static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output) { - const unsigned int stride = stride_size( ); - constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - output[Return1DFFTAddress_XZ_transpose_strided_Z(index)] = shared_mem[index]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 
1 : (output_values_to_store % threads_per_fft); - if ( threadIdx.x < values_left_to_store ) { - output[Return1DFFTAddress_XZ_transpose_strided_Z(index)] = shared_mem[index]; - } - __syncthreads( ); - } - - // Store a transposed tile, made up of non-contiguous (strided partial) FFTS - // - static inline __device__ void store_r2c_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { - const unsigned int stride = stride_size( ); - constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - unsigned int index = threadIdx.x + threadIdx.z * output_values_to_store; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - output[Return1DFFTAddress_XZ_transpose_strided_Z(index, Q, sub_fft)] = shared_mem[index]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); - if ( threadIdx.x < values_left_to_store ) { - output[Return1DFFTAddress_XZ_transpose_strided_Z(index, Q, sub_fft)] = shared_mem[index]; - } - __syncthreads( ); - } - - static inline __device__ void store_transposed_xz_strided_Z(const complex_type* shared_mem, - complex_type* output) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + threadIdx.z * cufftdx::size_of::value; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[Return1DFFTAddress_XZ_transpose_strided_Z(index)] = shared_mem[index]; - index += stride; - } - __syncthreads( ); - } - - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int pixel_pitch) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index - output[index * pixel_pitch + blockIdx.y] = thread_data[i]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); - if ( threadIdx.x < values_left_to_store ) { - output[index * pixel_pitch + blockIdx.y] = thread_data[FFT::elements_per_thread / 2]; - } - } - - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int* output_MAP, - int pixel_pitch) { - const unsigned int stride = stride_size( ); - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index - output[output_MAP[i] * pixel_pitch + blockIdx.y] = thread_data[i]; - // if (blockIdx.y == 32) printf("from store transposed %i , val %f %f\n", output_MAP[i], thread_data[i].x, thread_data[i].y); - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 
1 : (output_values_to_store % threads_per_fft); - if ( threadIdx.x < values_left_to_store ) { - output[output_MAP[FFT::elements_per_thread / 2] * pixel_pitch + blockIdx.y] = thread_data[FFT::elements_per_thread / 2]; - } - } - - static inline __device__ void store_r2c_transposed_xy(const complex_type* thread_data, - complex_type* output, - int* output_MAP, - int pixel_pitch, - int memory_limit) { - const unsigned int stride = stride_size( ); - for ( unsigned int i = 0; i <= FFT::elements_per_thread / 2; i++ ) { - // output map is thread local, so output_MAP[i] gives the x-index in the non-transposed array and blockIdx.y gives the y-index - // if (blockIdx.y == 1) printf("index, pitch, blcok, address %i, %i, %i, %i\n", output_MAP[i], pixel_pitch, memory_limit, output_MAP[i]*pixel_pitch + blockIdx.y); - - if ( output_MAP[i] < memory_limit ) - output[output_MAP[i] * pixel_pitch + blockIdx.y] = thread_data[i]; - // if (blockIdx.y == 32) printf("from store transposed %i , val %f %f\n", output_MAP[i], thread_data[i].x, thread_data[i].y); - } - // constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - // constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; - // constexpr unsigned int values_left_to_store = threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); - // if (threadIdx.x < values_left_to_store) - // { - // printf("index, pitch, blcok, address %i, %i, %i, %i\n", output_MAP[FFT::elements_per_thread / 2], pixel_pitch, blockIdx.y, output_MAP[FFT::elements_per_thread / 2]*pixel_pitch + blockIdx.y); - // if (output_MAP[FFT::elements_per_thread / 2] < memory_limit) output[output_MAP[FFT::elements_per_thread / 2]*pixel_pitch + blockIdx.y] = thread_data[FFT::elements_per_thread / 2]; - // } - } - - static inline __device__ void load_c2r(const complex_type* input, - complex_type* thread_data) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - thread_data[i] = input[index]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; - // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element - constexpr unsigned int values_left_to_load = threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); - if ( threadIdx.x < values_left_to_load ) { - thread_data[FFT::elements_per_thread / 2] = input[index]; - } - } - - static inline __device__ void load_c2r_transposed(const complex_type* input, - complex_type* thread_data, - unsigned int pixel_pitch) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - thread_data[i] = input[(pixel_pitch * index) + blockIdx.y]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; - // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element - constexpr unsigned int values_left_to_load = threads_per_fft == 1 ? 
1 : (output_values_to_load % threads_per_fft); - if ( threadIdx.x < values_left_to_load ) { - thread_data[FFT::elements_per_thread / 2] = input[(pixel_pitch * index) + blockIdx.y]; - } - } - - static inline __device__ void load_c2r_shared_and_pad(const complex_type* input, - complex_type* shared_mem, - const unsigned int pixel_pitch) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); - for ( unsigned int i = 0; i < FFT::elements_per_thread / 2; i++ ) { - shared_mem[GetSharedMemPaddedIndex(index)] = input[pixel_pitch * index]; - index += stride; - } - constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; - constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; - // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element - constexpr unsigned int values_left_to_load = threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); - if ( threadIdx.x < values_left_to_load ) { - shared_mem[GetSharedMemPaddedIndex(index)] = input[pixel_pitch * index]; - } - __syncthreads( ); - } - - // this may benefit from asynchronous execution - static inline __device__ void load(const complex_type* input, - complex_type* thread_data) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - thread_data[i] = input[index]; - // if (blockIdx.y == 0) printf("block %i , val %f %f\n", index, input[index].x, input[index].y); - - index += stride; - } - } - - // this may benefit from asynchronous execution - static inline __device__ void load(const complex_type* input, - complex_type* thread_data, - int last_index_to_load) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - if ( index < last_index_to_load ) - thread_data[i] = input[index]; - else - thread_data[i] = complex_type(0.0f, 0.0f); - index += stride; - } - } - - // TODO: set pre_op_lambda to default=false and get rid of other load - template - static inline __device__ void load(const complex_type* input, - complex_type* thread_data, - int last_index_to_load, - FunctionType pre_op_lambda = nullptr) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - if constexpr ( IS_IKF_t( ) ) { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - if ( index < last_index_to_load ) - thread_data[i] = pre_op_lambda(input[index]); - else - thread_data[i] = pre_op_lambda(complex_type(0.0f, 0.0f)); - index += stride; - } - } - else { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - if ( index < last_index_to_load ) - thread_data[i] = input[index]; - else - thread_data[i] = complex_type(0.0f, 0.0f); - index += stride; - } - } - } - - static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, - complex_type* output, - int first_negative_index) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - complex_type phase_shift; - int logical_y; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // If no kernel based changes are made to source_idx, this will be the same as the original index value - phase_shift = thread_data[i]; - logical_y = index; - if ( logical_y >= first_negative_index ) - logical_y -= 2 * first_negative_index; - if ( (int(blockIdx.y) + logical_y) % 2 != 0 ) - phase_shift *= 
-1.f; - output[index] = phase_shift; - index += stride; - } - } - - static inline __device__ void store_and_swap_quadrants(const complex_type* thread_data, - complex_type* output, - int* source_idx, - int first_negative_index) { - const unsigned int stride = stride_size( ); - complex_type phase_shift; - int logical_y; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // If no kernel based changes are made to source_idx, this will be the same as the original index value - phase_shift = thread_data[i]; - logical_y = source_idx[i]; - if ( logical_y >= first_negative_index ) - logical_y -= 2 * first_negative_index; - if ( (int(blockIdx.y) + logical_y) % 2 != 0 ) - phase_shift *= -1.f; - output[source_idx[i]] = phase_shift; - } - } - - template - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - FunctionType post_op_lambda = nullptr) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - if constexpr ( IS_IKF_t( ) ) { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[index] = post_op_lambda(thread_data[i]); - index += stride; - } - } - else { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[index] = thread_data[i]; - index += stride; - } - } - } - - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[index * Q + sub_fft] = thread_data[i]; - index += stride; - } - } - - static inline __device__ void store_Z(const complex_type* shared_mem, - complex_type* output) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + threadIdx.z * size_of::value; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[Return1DFFTAddress_YZ_transpose_strided_Z(index)] = shared_mem[index]; - - index += stride; - } - } - - static inline __device__ void store_Z(const complex_type* shared_mem, - complex_type* output, - const unsigned int Q, - const unsigned int sub_fft) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + threadIdx.z * size_of::value; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[Return1DFFTAddress_YZ_transpose_strided_Z(index, Q, sub_fft)] = shared_mem[index]; - index += stride; - } - __syncthreads( ); - } - - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - unsigned int memory_limit) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - if ( index < memory_limit ) - output[index] = thread_data[i]; - index += stride; - } - } - - static inline __device__ void store(const complex_type* thread_data, - complex_type* output, - int* source_idx) { - const unsigned int stride = stride_size( ); - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // If no kernel based changes are made to source_idx, this will be the same as the original index value - output[source_idx[i]] = thread_data[i]; - } - } - - static inline __device__ void store_subset(const complex_type* thread_data, - complex_type* output, - int* source_idx) { - const unsigned int stride = stride_size( ); - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // If no kernel based 
changes are made to source_idx, this will be the same as the original index value - if ( source_idx[i] >= 0 ) - output[source_idx[i]] = thread_data[i]; - } - } - - static inline __device__ void store_coalesced(const complex_type* shared_output, - complex_type* global_output, - int offset) { - const unsigned int stride = stride_size( ); - unsigned int index = offset + threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - global_output[index] = shared_output[index]; - index += stride; - } - } - - static inline __device__ void load_c2c_shared_and_pad(const complex_type* input, - complex_type* shared_mem) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - shared_mem[GetSharedMemPaddedIndex(index)] = input[index]; - index += stride; - } - __syncthreads( ); - } - - static inline __device__ void store_c2c_reduced(const complex_type* thread_data, - complex_type* output) { - if ( threadIdx.z == 0 ) { - // Finally we write out the first size_of::values to global - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - if ( index < size_of::value ) { - // transposed index. - output[index] = thread_data[i]; - } - index += stride; - } - } - } - - static inline __device__ void store_c2r_reduced(const complex_type* thread_data, - scalar_type* output) { - if ( threadIdx.z == 0 ) { - // Finally we write out the first size_of::values to global - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x + (threadIdx.z * size_of::value); - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - if ( index < size_of::value ) { - // transposed index. - output[index] = reinterpret_cast(thread_data)[i]; - } - index += stride; - } - } - } - - static inline __device__ void store_transposed(const complex_type* thread_data, - complex_type* output, - int* output_map, - int* rotated_offset, - int memory_limit) { - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // If no kernel based changes are made to source_idx, this will be the same as the original index value - if ( output_map[i] < memory_limit ) - output[rotated_offset[1] * output_map[i] + rotated_offset[0]] = thread_data[i]; - } - } - - static inline __device__ void store_c2r(const complex_type* thread_data, - scalar_type* output) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - output[index] = reinterpret_cast(thread_data)[i]; - index += stride; - } - } - - static inline __device__ void store_c2r(const complex_type* thread_data, - scalar_type* output, - unsigned int memory_limit) { - const unsigned int stride = stride_size( ); - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - // TODO: does reinterpret_cast(thread_data)[i] make more sense than just thread_data[i].x?? 
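// The reinterpret_cast question in the TODO above hinges on how the C2R results are laid
// out in the register array: if cufftdx packs the real outputs contiguously across the
// complex registers, the flat view indexed by i walks them in order; if each result sat
// in the .x of its own element, thread_data[i].x would be the right read. I am not
// asserting which layout cufftdx uses here, only that the two views are not
// interchangeable for an ordinary float2 array, which this small host-side check
// (illustrative, not FastFFT code) makes explicit:
#include <cassert>
#include <cuda_runtime.h> // float2 on the host

int main( ) {
    float2       v[4] = {{0.f, 1.f}, {2.f, 3.f}, {4.f, 5.f}, {6.f, 7.f}};
    const float* flat = reinterpret_cast<const float*>(v);

    // flat walks the interleaved storage x0, y0, x1, y1, ...
    assert(flat[1] == v[0].y); // not v[1].x
    assert(flat[2] == v[1].x); // the real part of element i lives at flat[2 * i]
    return 0;
}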
- if ( index < memory_limit ) - output[index] = reinterpret_cast(thread_data)[i]; - index += stride; - } - } -}; // struct io} - -template -struct io_thread { - using complex_type = typename FFT::value_type; - using scalar_type = typename complex_type::value_type; - - static inline __device__ void load_r2c(const scalar_type* input, - complex_type* thread_data, - const int stride) { - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < size_of::value; i++ ) { - thread_data[i].x = input[index]; - thread_data[i].y = scalar_type(0); - index += stride; - } - } - - static inline __device__ void store_r2c(const complex_type* shared_output, - complex_type* output, - const int stride, - const int memory_limit) { - // Each thread reads in the input data at stride = mem_offsets.Q - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { - output[index] = shared_output[index]; - index += stride; - } - if ( index < memory_limit ) { - output[index] = shared_output[index]; - } - } - - static inline __device__ void store_r2c_transposed_xy(const complex_type* shared_output, - complex_type* output, - int stride, - int pixel_pitch, - int memory_limit) { - // Each thread reads in the input data at stride = mem_offsets.Q - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < size_of::value / 2; i++ ) { - output[index * pixel_pitch] = shared_output[index]; - index += stride; - } - if ( index < memory_limit ) { - output[index * pixel_pitch] = shared_output[index]; - } - } - - static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, - complex_type* shared_output, - float twiddle_in, - int Q, - int memory_limit) { - // Unroll the first loop and initialize the shared mem. - complex_type twiddle; - int index = threadIdx.x * size_of::value; - twiddle_in *= threadIdx.x; // twiddle factor arg now just needs to multiplied by K = (index + i) - for ( unsigned int i = 0; i < size_of::value; i++ ) { - SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); - twiddle *= thread_data[i]; - if ( index + i < memory_limit ) - shared_output[index + i] = twiddle; - } - __syncthreads( ); // make sure all the shared mem is initialized to the starting value. There should be no contention as every thread is working on its own block of memory. - - for ( unsigned int sub_fft = 1; sub_fft < Q; sub_fft++ ) { - // wrap around, 0 --> 1, Q-1 --> 0 etc. 
- index = ((threadIdx.x + sub_fft) % Q) * size_of::value; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); - twiddle *= thread_data[i]; - if ( index + i < memory_limit ) { - atomicAdd_block(&shared_output[index + i].x, twiddle.x); - atomicAdd_block(&shared_output[index + i].y, twiddle.y); - } - } - } - __syncthreads( ); - } - - static inline __device__ void load_c2c(const complex_type* input, - complex_type* thread_data, - const int stride) { - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < size_of::value; i++ ) { - thread_data[i] = input[index]; - index += stride; - } - } - - static inline __device__ void store_c2c(const complex_type* shared_output, - complex_type* output, - const int stride) { - // Each thread reads in the input data at stride = mem_offsets.Q - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < size_of::value; i++ ) { - output[index] = shared_output[index]; - index += stride; - } - } - - static inline __device__ void remap_decomposed_segments(const complex_type* thread_data, - complex_type* shared_output, - float twiddle_in, - int Q) { - // Unroll the first loop and initialize the shared mem. - complex_type twiddle; - int index = threadIdx.x * size_of::value; - twiddle_in *= threadIdx.x; // twiddle factor arg now just needs to multiplied by K = (index + i) - for ( unsigned int i = 0; i < size_of::value; i++ ) { - SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); - twiddle *= thread_data[i]; - shared_output[index + i] = twiddle; - } - __syncthreads( ); // make sure all the shared mem is initialized to the starting value. There should be no contention as every thread is working on its own block of memory. - - for ( unsigned int sub_fft = 1; sub_fft < Q; sub_fft++ ) { - // wrap around, 0 --> 1, Q-1 --> 0 etc. 
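// remap_decomposed_segments accumulates contributions from several sub-FFTs into the same
// shared-memory elements, so after the contention-free first pass it falls back to
// block-scoped atomics, adding the real and imaginary parts separately with the float
// overload of atomicAdd_block. A stripped-down sketch of that accumulation step follows
// (hypothetical kernel and bin mapping; atomicAdd_block needs compute capability 6.0+):
__global__ void accumulate_complex(float2* out, const float2* contrib, int n_bins) {
    extern __shared__ float2 acc[ ]; // n_bins accumulators, zeroed by the block

    for ( int i = threadIdx.x; i < n_bins; i += blockDim.x )
        acc[i] = make_float2(0.f, 0.f);
    __syncthreads( );

    // Arbitrary many-to-one mapping for the sketch: several threads may target one bin,
    // which is why the adds must be atomic; block scope is enough because no other block
    // ever touches this shared memory.
    const int    bin = (threadIdx.x * 7) % n_bins;
    const float2 c   = contrib[blockIdx.x * blockDim.x + threadIdx.x];
    atomicAdd_block(&acc[bin].x, c.x);
    atomicAdd_block(&acc[bin].y, c.y);
    __syncthreads( );

    for ( int i = threadIdx.x; i < n_bins; i += blockDim.x )
        out[blockIdx.x * n_bins + i] = acc[i];
}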
- index = ((threadIdx.x + sub_fft) % Q) * size_of::value; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); - twiddle *= thread_data[i]; - atomicAdd_block(&shared_output[index + i].x, twiddle.x); - atomicAdd_block(&shared_output[index + i].y, twiddle.y); - } - } - __syncthreads( ); - } - - static inline __device__ void load_c2r(const complex_type* input, - complex_type* thread_data, - const int stride, - const int memory_limit) { - // Each thread reads in the input data at stride = mem_offsets.Q - unsigned int index = threadIdx.x; - unsigned int offset = 2 * memory_limit - 2; - for ( unsigned int i = 0; i < size_of::value; i++ ) { - if ( index < memory_limit ) { - thread_data[i] = input[index]; - } - else { - // assuming even dimension - // FIXME shouldn't need to read in from global for an even stride - thread_data[i] = input[offset - index]; - thread_data[i].y = -thread_data[i].y; // conjugate - } - index += stride; - } - } - - // FIXME as above - static inline __device__ void load_c2r_transposed(const complex_type* input, - complex_type* thread_data, - int stride, - int pixel_pitch, - int memory_limit) { - // Each thread reads in the input data at stride = mem_offsets.Q - unsigned int index = threadIdx.x; - // unsigned int offset = 2*memory_limit - 2; - for ( unsigned int i = 0; i < size_of::value; i++ ) { - if ( index < memory_limit ) { - thread_data[i] = input[index * pixel_pitch]; - } - else { - // input[2*memory_limit - index - 2]; - // assuming even dimension - // FIXME shouldn't need to read in from global for an even stride - thread_data[i] = input[(2 * memory_limit - index) * pixel_pitch]; - thread_data[i].y = -thread_data[i].y; // conjugate - } - index += stride; - } - } - - static inline __device__ void remap_decomposed_segments_c2r(const complex_type* thread_data, - scalar_type* shared_output, - scalar_type twiddle_in, - int Q) { - // Unroll the first loop and initialize the shared mem. - complex_type twiddle; - int index = threadIdx.x * size_of::value; - twiddle_in *= threadIdx.x; // twiddle factor arg now just needs to multiplied by K = (index + i) - for ( unsigned int i = 0; i < size_of::value; i++ ) { - SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); - shared_output[index + i] = (twiddle.x * thread_data[i].x - twiddle.y * thread_data[i].y); // assuming the output is real, only the real parts add, so don't bother with the complex - } - __syncthreads( ); // make sure all the shared mem is initialized to the starting value. There should be no contention as every thread is working on its own block of memory. - - for ( unsigned int sub_fft = 1; sub_fft < Q; sub_fft++ ) { - // wrap around, 0 --> 1, Q-1 --> 0 etc. 
- index = ((threadIdx.x + sub_fft) % Q) * size_of::value; - - for ( unsigned int i = 0; i < size_of::value; i++ ) { - // if (threadIdx.x == 32) printf("remap tid, subfft, q, index + i %i %i %i %i\n", threadIdx.x,sub_fft, Q, index+i); - SINCOS(twiddle_in * (index + i), &twiddle.y, &twiddle.x); - atomicAdd_block(&shared_output[index + i], twiddle.x * thread_data[i].x - twiddle.y * thread_data[i].y); - } - } - __syncthreads( ); - } - - static inline __device__ void store_c2r(const scalar_type* shared_output, - scalar_type* output, - const int stride) { - // Each thread reads in the input data at stride = mem_offsets.Q - unsigned int index = threadIdx.x; - for ( unsigned int i = 0; i < size_of::value; i++ ) { - output[index] = shared_output[index]; - index += stride; - } - } - - static inline __device__ void load_shared_and_conj_multiply(const complex_type* image_to_search, - const complex_type* shared_mem, - complex_type* thread_data, - const int stride) { - unsigned int index = threadIdx.x; - complex_type c; - for ( unsigned int i = 0; i < FFT::elements_per_thread; i++ ) { - c.x = (shared_mem[index].x * image_to_search[index].x + shared_mem[index].y * image_to_search[index].y); - c.y = (shared_mem[index].y * image_to_search[index].x - shared_mem[index].x * image_to_search[index].y); - // a * conj b - thread_data[i] = c; //ComplexConjMulAndScale(thread_data[i], image_to_search[index], 1.0f); - index += stride; - } - __syncthreads( ); - } -}; // struct thread_io - -} // namespace FastFFT - -#endif // Fast_FFT_cuh_ diff --git a/include/FastFFT.h b/include/FastFFT.h index 262b57d..2efd63d 100644 --- a/include/FastFFT.h +++ b/include/FastFFT.h @@ -1,27 +1,34 @@ // Insert some license stuff here -#ifndef fast_FFT_H_ -#define fast_FFT_H_ +#ifndef _INCLUDE_FASTFFT_H +#define _INCLUDE_FASTFFT_H #include #include +#include -// #include -// #include -#include +// Forward declaration so we can leave the inclusion of cuda_fp16.h to FastFFT.cu +struct __half; +struct __half2; +// #include #ifndef ENABLE_FastFFT // ifdef being used in cisTEM that defines these -#if __cplusplus > 201703L +#if __cplusplus >= 202002L #include using namespace std::numbers; #else -// For now we do not have c++20 so we need to define this for constants. Modified from /usr/include/c++/11/numbers -/// pi +#if __cplusplus < 201703L +#message "C++ is " __cplusplus +#error "C++17 or later required" +#else template // inline constexpr _Tp pi_v = _Enable_if_floating<_Tp>(3.141592653589793238462643383279502884L); inline constexpr _Tp pi_v = 3.141592653589793238462643383279502884L; -#endif -#endif +#endif // __cplusplus require > 17 +#endif // __cplusplus 20 support +#endif // enable FastFFT + +#include "../src/fastfft/types.cuh" // For testing/debugging it is convenient to execute and have print functions for partial transforms. // These will go directly in the kernels and also in the helper Image.cuh definitions for PrintArray. 
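// The version guards above fall back to a local pi_v variable template whenever C++20's
// <numbers> is unavailable, and now reject anything older than C++17. A compressed sketch
// of that feature-test pattern, isolated from the surrounding cisTEM/FastFFT guards:
#if __cplusplus >= 202002L
#include <numbers>
using std::numbers::pi_v; // the standard variable template
#elif __cplusplus >= 201703L
// Same name and shape as the standard constant, for pre-C++20 toolchains.
template <typename T>
inline constexpr T pi_v = T(3.141592653589793238462643383279502884L);
#else
#error "C++17 or later required"
#endif

static_assert(pi_v<double> > 3.14 && pi_v<double> < 3.15);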
@@ -29,7 +36,7 @@ inline constexpr _Tp pi_v = 3.141592653589793238462643383279502884L; // Fwd 0, 1, 2, 3( none, x, z, original y) // 4 intermediate ops, like conj multiplication // Inv 5, 6, 7 ( original y, z, x) -// Defined in make by setting environmental variable DEBUG_FFT_STAGE +// Defined in make by setting environmental variable FFT_DEBUG_STAGE // #include /* @@ -170,48 +177,29 @@ struct DevicePointers<__half2*, __half*> { template class FourierTransformer { - public: - // Used to specify input/calc/output data types - enum DataType { int4_2, - uint8, - int8, - uint16, - int16, - fp16, - bf16, - tf32, - uint32, - int32, - fp32 }; - - std::vector DataTypeName{"int4_2", "uint8", "int8", "uint16", "int16", "fp16", "bf16", "tf32", "uint32", "int32", "fp32"}; - - enum OriginType { natural, - centered, - quadrant_swapped }; // Used to specify the origin of the data - - std::vector OriginTypeName{"natural", "centered", "quadrant_swapped"}; + private: + public: // Using the enum directly from python is not something I've figured out yet. Just make simple methods. inline void SetOriginTypeNatural(bool set_input_type = true) { if ( set_input_type ) - input_origin_type = natural; + input_origin_type = OriginType::natural; else - output_origin_type = natural; + output_origin_type = OriginType::natural; } inline void SetOriginTypeCentered(bool set_input_type = true) { if ( set_input_type ) - input_origin_type = centered; + input_origin_type = OriginType::centered; else - output_origin_type = centered; + output_origin_type = OriginType::centered; } inline void SetOriginTypeQuadrantSwapped(bool set_input_type = true) { if ( set_input_type ) - input_origin_type = quadrant_swapped; + input_origin_type = OriginType::quadrant_swapped; else - output_origin_type = quadrant_swapped; + output_origin_type = OriginType::quadrant_swapped; } short padding_jump_val; @@ -232,18 +220,18 @@ class FourierTransformer { // This is pretty similar to an FFT plan, I should probably make it align with CufftPlan void SetForwardFFTPlan(size_t input_logical_x_dimension, size_t input_logical_y_dimension, size_t input_logical_z_dimension, size_t output_logical_x_dimension, size_t output_logical_y_dimension, size_t output_logical_z_dimension, - bool is_padded_output, - bool is_host_memory_pinned); + bool is_padded_output = true); void SetInverseFFTPlan(size_t input_logical_x_dimension, size_t input_logical_y_dimension, size_t input_logical_z_dimension, size_t output_logical_x_dimension, size_t output_logical_y_dimension, size_t output_logical_z_dimension, - bool is_padded_output); + bool is_padded_output = true); // For the time being, the caller is responsible for having the memory allocated for any of these input/output pointers. void SetInputPointer(InputType* input_pointer, bool is_input_on_device); // When passing in a pointer from python (cupy or pytorch) it is a long, and needs to be cast to input type. // For now, we are assuming memory ops are all handled in the python code. void SetInputPointer(long input_pointer); + void SetCallerPinnedInputPointer(InputType* input_pointer); /////////////////////////////////////////////// // Public actions: @@ -252,13 +240,27 @@ class FourierTransformer { /////////////////////////////////////////////// inline void Wait( ) { cudaStreamSynchronize(cudaStreamPerThread); }; - void CopyHostToDevice( ); - // By default we are blocking with a stream sync until complete for simplicity. This is overkill and should FIXME. 
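// The FIXME above about always blocking is addressed just below by splitting the copy into
// CopyHostToDevice and CopyHostToDevceAndSynchronize: the transfer is issued
// asynchronously on cudaStreamPerThread and only the *AndSynchronize variant blocks. A
// reduced sketch of the pattern (buffer names are hypothetical; the host pointer must be
// pinned for the copy to actually overlap with other work):
#include <cuda_runtime.h>

// Enqueue the H2D copy and keep going; later kernels launched on the same stream are
// ordered after it automatically.
void enqueue_upload(float* d_buf, const float* h_pinned, size_t n_elements) {
    cudaMemcpyAsync(d_buf, h_pinned, n_elements * sizeof(float),
                    cudaMemcpyHostToDevice, cudaStreamPerThread);
}

// Blocking variant, equivalent in spirit to the *AndSynchronize methods: wait only where
// the caller genuinely needs the transfer to have finished.
void enqueue_upload_and_wait(float* d_buf, const float* h_pinned, size_t n_elements) {
    enqueue_upload(d_buf, h_pinned, n_elements);
    cudaStreamSynchronize(cudaStreamPerThread);
}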
+ void CopyHostToDevceAndSynchronize(InputType* input_pointer, int n_elements_to_copy = 0); + void CopyHostToDevice(InputType* input_pointer, int n_elements_to_copy = 0); // If int n_elements_to_copy = 0 the appropriate size will be determined by the state of the transform completed (none, fwd, inv.) // For partial increase/decrease transforms, needed for testing, this will be invalid, so specify the int n_elements_to_copy. - void CopyDeviceToHost(bool free_gpu_memory, bool unpin_host_memory, int n_elements_to_copy = 0); // When the size changes, we need a new host pointer - void CopyDeviceToHost(OutputType* output_pointer, bool free_gpu_memory = true, bool unpin_host_memory = true, int n_elements_to_copy = 0); + void CopyDeviceToHostAndSynchronize(OutputType* output_pointer, bool free_gpu_memory = true, int n_elements_to_copy = 0); + void CopyDeviceToHost(OutputType* output_pointer, bool free_gpu_memory = true, int n_elements_to_copy = 0); + + // Ideally, in addition to position/momentum space (buffer) ponters, there would also be a input pointer, which may point + // to a gpu address that is from an external process or to the FastFFT buffer space. This way, when calling CopyHostToDevice, + // that input is set to the FastFFT buffer space, data is copied and the first Fwd kernels are called as they are currently. + // This would also allow the input pointer to point to a different address than the FastFFT buffer only accessed on initial kernel + // calls and read only. In turn we could skip the device to device transfer we are doing in the following method. + void CopyDeviceToDeviceFromNonOwningAddress(InputType* input_pointer, int n_elements_to_copy = 0); + + // Here we may be copying input data type from another GPU buffer, OR output data type to another GPU buffer. 
+ // Check in these methods that the types match + template + void CopyDeviceToDeviceAndSynchronize(TransferDataType* input_pointer, bool free_gpu_memory = true, int n_elements_to_copy = 0); + template + void CopyDeviceToDevice(TransferDataType* input_pointer, bool free_gpu_memory = true, int n_elements_to_copy = 0); // FFT calls // void FwdFFT(bool swap_real_space_quadrants = false, bool transpose_output = true); @@ -351,7 +353,6 @@ class FourierTransformer { std::cerr << "is_in_memory_host_pointer " << is_in_memory_host_pointer << std::endl; std::cerr << "is_in_memory_device_pointer " << is_in_memory_device_pointer << std::endl; std::cerr << "is_in_buffer_memory " << is_in_buffer_memory << std::endl; - std::cerr << "is_host_memory_pinned " << is_host_memory_pinned << std::endl; std::cerr << "is_fftw_padded_input " << is_fftw_padded_input << std::endl; std::cerr << "is_fftw_padded_output " << is_fftw_padded_output << std::endl; std::cerr << "is_real_valued_input " << is_real_valued_input << std::endl; @@ -388,24 +389,22 @@ class FourierTransformer { std::cerr << "fwd_size_change_type " << SizeChangeName[fwd_size_change_type] << std::endl; std::cerr << "inv_size_change_type " << SizeChangeName[inv_size_change_type] << std::endl; std::cerr << "transform stage complete " << TransformStageCompletedName[transform_stage_completed] << std::endl; - std::cerr << "input_origin_type " << OriginTypeName[input_origin_type] << std::endl; - std::cerr << "output_origin_type " << OriginTypeName[output_origin_type] << std::endl; + std::cerr << "input_origin_type " << OriginType::name[input_origin_type] << std::endl; + std::cerr << "output_origin_type " << OriginType::name[output_origin_type] << std::endl; }; // PrintState() // private: - DeviceProps device_properties; - OriginType input_origin_type; - OriginType output_origin_type; + DeviceProps device_properties; + OriginType::Enum input_origin_type; + OriginType::Enum output_origin_type; // booleans to track state, could be bit fields but that seem opaque to me. bool is_in_memory_host_pointer; // To track allocation of host side memory bool is_in_memory_device_pointer; // To track allocation of device side memory. bool is_in_buffer_memory; // To track whether the current result is in dev_ptr.position_space or dev_ptr.position_space_buffer (momemtum space/ momentum space buffer respectively.) - bool is_host_memory_pinned; // Specified in the constructor. Assuming host memory won't be pinned for many applications. - bool is_fftw_padded_input; // Padding for in place r2c transforms bool is_fftw_padded_output; // Currently the output state will match the input state, otherwise it is an error. @@ -420,35 +419,21 @@ class FourierTransformer { FFT_Size transform_size; int elements_per_thread_complex; // Set depending on the kernel and size of the transform. - // FIXME this seems like a bad idea. Added due to conflicing labels in switch statements, even with explicitly scope. - enum SizeChangeType : uint8_t { increase, - decrease, - no_change }; // Assumed to be the same for all dimesnions. This may be relaxed later. 
- std::vector SizeChangeName{"increase", "decrease", "no_change"}; - enum TransformStageCompleted : uint8_t { none = 10, - fwd = 11, - inv = 12 }; // none must be greater than number of sizeChangeTypes, padding must match in TransformStageCompletedName vector - std::vector TransformStageCompletedName{"", "", "", "", "", // padding of 5 "", "", "", "", "", // padding of 5 "none", "fwd", "inv"}; - enum DimensionCheckType : uint8_t { CopyFromHost, - CopyToHost, - FwdTransform, - InvTransform }; - std::vector DimensionCheckName{"CopyFromHost", "CopyToHost", "FwdTransform", "InvTransform"}; bool is_from_python_call; bool is_owner_of_memory; - SizeChangeType fwd_size_change_type; - SizeChangeType inv_size_change_type; + SizeChangeType::Enum fwd_size_change_type; + SizeChangeType::Enum inv_size_change_type; - TransformStageCompleted transform_stage_completed; + TransformStageCompleted::Enum transform_stage_completed; // dims_in may change during calculation, depending on padding, but is reset after each call. short4 dims_in; @@ -467,7 +452,7 @@ class FourierTransformer { void SetDefaults( ); void ValidateDimensions( ); - void SetDimensions(DimensionCheckType check_op_type); + void SetDimensions(DimensionCheckType::Enum check_op_type); void SetDevicePointers(bool should_allocate_buffer_memory); @@ -482,6 +467,7 @@ class FourierTransformer { d) IsForwardType() e) IsTransformAlongZ() */ + enum KernelType { r2c_decomposed, // Thread based, full length. r2c_decomposed_transposed, // Thread based, full length, transposed. r2c_none_XY, @@ -676,30 +662,30 @@ class FourierTransformer { // 1. // First call passed from a public transform function, selects block or thread and the transform precision. template // bool is just used as a dummy type - void SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform = true, PreOpType pre_op_lambda = nullptr, IntraOpType intra_op_lambda = nullptr, PostOpType post_op_lambda = nullptr); + void SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform = true, PreOpType pre_op_functor = nullptr, IntraOpType intra_op_functor = nullptr, PostOpType post_op_functor = nullptr); // 2. // TODO: remove this now that the functors are working // Check to see if any intra kernel functions are wanted, and if so set the appropriate device pointers. template - void SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda); + void SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); // 3. // Second call, sets size of the transform kernel, selects the appropriate GPU arch // template - // void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda); + // void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); // This allows us to iterate through a set of constexpr sizes passed as a template parameter pack. The value is in providing a means to have different size packs // for different fft configurations, eg. 
2d vs 3d template - void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda); + void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); template - void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda); + void SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); // 3. // Third call, sets the input and output dimensions and type template - void SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda); + void SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor); void PrintLaunchParameters(LaunchParams LP) { std::cerr << "Launch parameters: " << std::endl; diff --git a/src/FastFFT.cu b/src/FastFFT.cu deleted file mode 120000 index 2cae54e..0000000 --- a/src/FastFFT.cu +++ /dev/null @@ -1 +0,0 @@ -FastFFT.cu.cpp \ No newline at end of file diff --git a/src/cpp/Image.cu b/src/cpp/Image.cu deleted file mode 120000 index f1f96a5..0000000 --- a/src/cpp/Image.cu +++ /dev/null @@ -1 +0,0 @@ -Image.cu.cpp \ No newline at end of file diff --git a/src/cpp/Image.cuh b/src/cpp/Image.cuh deleted file mode 120000 index bbfa7c5..0000000 --- a/src/cpp/Image.cuh +++ /dev/null @@ -1 +0,0 @@ -Image.cuh.h \ No newline at end of file diff --git a/src/cpp/Image.cuh.h b/src/cpp/Image.cuh.h deleted file mode 100644 index 1122ad8..0000000 --- a/src/cpp/Image.cuh.h +++ /dev/null @@ -1,233 +0,0 @@ -// Collection of helper functions for test.cu - -#include -#include -#include -#include -#include - -// sudo apt-get install libfftw3-dev libfftw3-doc -#include - -#include -#include "../../include/cufftdx/include/cufftdx.hpp" -#include -#include - -// A simple class to represent image objects needed for testing FastFFT. - -template -class Image { - - public: - Image( ); - Image(short4 wanted_size); - ~Image( ); - - wanted_real_type* real_values; - wanted_complex_type* complex_values; - bool* clipIntoMask; - - short4 size; - int real_memory_allocated; - int padding_jump_value; - - float fftw_epsilon; - - bool is_in_memory; - bool is_fftw_planned; - bool is_in_real_space; - bool is_cufft_planned; - - void Allocate( ); - void Allocate(bool plan_fftw); - void FwdFFT( ); - void InvFFT( ); - - // Make FFTW plans for comparing CPU to GPU xforms. - // This is nearly verbatim from cisTEM::Image::Allocate - I do not know if FFTW_ESTIMATE is the best option. - // In cisTEM we almost always use MKL, so this might be worth testing. I always used exhaustive in Matlab/emClarity. 
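// The planning comment above concerns the CPU reference path: FFTW_ESTIMATE keeps plan
// creation cheap, at the cost of possibly slower transforms than FFTW_MEASURE or
// FFTW_EXHAUSTIVE would find. A minimal single-precision, in-place r2c plan of the kind
// being compared against (function and argument names are illustrative):
#include <fftw3.h>

// Plan a 2D real-to-complex transform of an ny x nx image stored with FFTW's in-place
// padding (each row holds 2 * (nx / 2 + 1) floats). FFTW_ESTIMATE picks a plan
// heuristically instead of timing candidate algorithms.
fftwf_plan make_reference_plan(float* real_values, int nx, int ny) {
    return fftwf_plan_dft_r2c_2d(ny, nx, real_values,
                                 reinterpret_cast<fftwf_complex*>(real_values),
                                 FFTW_ESTIMATE);
}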
- fftwf_plan plan_fwd = NULL; - fftwf_plan plan_bwd = NULL; - - cufftHandle cuda_plan_forward; - cufftHandle cuda_plan_inverse; - size_t cuda_plan_worksize_forward; - size_t cuda_plan_worksize_inverse; - - cudaEvent_t startEvent{nullptr}; - cudaEvent_t stopEvent{nullptr}; - float elapsed_gpu_ms{ }; - - inline void create_timing_events( ) { - cudaEventCreate(&startEvent, cudaEventBlockingSync); - cudaEventCreate(&stopEvent, cudaEventBlockingSync); - } - - inline void record_start( ) { cudaEventRecord(startEvent); } - - inline void record_stop( ) { cudaEventRecord(stopEvent); } - - inline void synchronize( ) { cudaEventSynchronize(stopEvent); } - - inline void print_time(std::string msg, bool print_out = true) { - cudaEventElapsedTime(&elapsed_gpu_ms, startEvent, stopEvent); - if ( print_out ) { - std::cout << "Time on " << msg << " " << elapsed_gpu_ms << " ms" << std::endl; - } - } - - void MakeCufftPlan( ); - void MakeCufftPlan3d( ); - - void SetClipIntoMask(short4 input_size, short4 output_size); - bool is_set_clip_into_mask = false; - // void SetClipIntoCallback(cufftReal* image_to_insert, int image_to_insert_size_x, int image_to_insert_size_y,int image_to_insert_pitch); - void SetComplexConjMultiplyAndLoadCallBack(cufftComplex* search_image_FT, cufftReal FT_normalization_factor); - void MultiplyConjugateImage(wanted_complex_type* other_image); - - private: -}; - -// To print a message and some number n_to_print complex values to stdout -void print_values_complex(float* input, std::string msg, int n_to_print) { - for ( int i = 0; i < n_to_print * 2; i += 2 ) { - std::cout << msg << i / 2 << " " << input[i] << " " << input[i + 1] << std::endl; - } -} - -// Return sum of real values -float ReturnSumOfReal(float* input, short4 size, bool print_val = false) { - double temp_sum = 0; - long address = 0; - int padding_jump_val = size.w * 2 - size.x; - for ( int k = 0; k < size.z; k++ ) { - for ( int j = 0; j < size.y; j++ ) { - for ( int i = 0; i < size.x; i++ ) { - - temp_sum += double(input[address]); - address++; - } - address += padding_jump_val; - } - } - - return float(temp_sum); -} - -// Return the sum of the complex values -float2 ReturnSumOfComplex(float2* input, int n_to_print) { - double sum_x = 0; - double sum_y = 0; - - for ( int i = 0; i < n_to_print; i++ ) { - sum_x += input[i].x; - sum_y += input[i].y; - } - - return make_float2(float(sum_x), float(sum_y)); -} - -// Return the sum of the complex values -float ReturnSumOfComplexAmplitudes(float2* input, int n_to_print) { - // We want to asses the error in the FFT at single/half precision, but to not add - // extra error for the use double here. 
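// As the comment above says, the checksum is accumulated in double so the comparison
// reflects the single/half-precision error of the FFT itself rather than rounding added by
// the summation. The same idea reduced to a free function (illustrative, host-side):
#include <cmath>

double sum_of_amplitudes(const float* interleaved_xy, int n_complex_values) {
    double sum = 0.0;
    for ( int i = 0; i < n_complex_values; i++ ) {
        const double x = interleaved_xy[2 * i];     // real part
        const double y = interleaved_xy[2 * i + 1]; // imaginary part
        sum += std::sqrt(x * x + y * y);
    }
    return sum;
}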
- double sum = 0; - double x; - double y; - - for ( int i = 0; i < n_to_print; i++ ) { - x = double(input[i].x); - y = double(input[i].y); - sum += sqrt(x * x + y * y); - } - - return sum; -} - -void ClipInto(const float* array_to_paste, float* array_to_paste_into, short4 size_from, short4 size_into, short4 wanted_center, float wanted_padding_value) { - - long pixel_counter = 0; - - int kk; - int k; - int kk_logi; - - int jj; - int jj_logi; - int j; - - int ii; - int ii_logi; - int i; - - short4 center_to_paste_into = make_short4(size_into.x / 2, size_into.y / 2, size_into.z / 2, 0); - short4 center_to_paste = make_short4(size_from.x / 2, size_from.y / 2, size_from.z / 2, 0); - int padding_jump_value; - - if ( size_into.x % 2 == 0 ) - padding_jump_value = 2; - else - padding_jump_value = 1; - - for ( kk = 0; kk < size_into.z; kk++ ) { - kk_logi = kk - center_to_paste_into.z; - k = center_to_paste.z + wanted_center.z + kk_logi; - - for ( jj = 0; jj < size_into.y; jj++ ) { - jj_logi = jj - center_to_paste_into.y; - j = center_to_paste.y + wanted_center.y + jj_logi; - - for ( ii = 0; ii < size_into.x; ii++ ) { - ii_logi = ii - center_to_paste_into.x; - i = center_to_paste.x + wanted_center.x + ii_logi; - - if ( k < 0 || k >= size_from.z || j < 0 || j >= size_from.y || i < 0 || i >= size_from.x ) { - array_to_paste_into[pixel_counter] = wanted_padding_value; - } - else { - array_to_paste_into[pixel_counter] = array_to_paste[k * (size_from.w * 2 * size_from.y) + j * (size_from.x + padding_jump_value) + i]; - } - - pixel_counter++; - } - - pixel_counter += padding_jump_value; - } - } - -} // end of clip into - -//#define HEAVYERRORCHECKING_IMG - -// Note we are using std::cerr b/c the wxWidgets apps running in cisTEM are capturing std::cout -#ifndef HEAVYERRORCHECKING_IMG -#define postcheck_img -#define cudaErr_img(error) \ - { \ - auto status = static_cast(error); \ - { ; } \ - }; -#define precheck_img -#else -#define MyFFTPrintWithDetails(...) 
\ - { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; } -#define postcheck_img \ - { \ - cudaError_t error = cudaStreamSynchronize(cudaStreamPerThread); \ - if ( error != cudaSuccess ) { \ - std::cerr << cudaGetErrorString(error) << std::endl; \ - MyFFTPrintWithDetails(""); \ - } \ - }; -#define cudaErr_img(error) \ - { \ - auto status = static_cast(error); \ - if ( status != cudaSuccess ) { \ - std::cerr << cudaGetErrorString(status) << std::endl; \ - MyFFTPrintWithDetails(""); \ - } \ - }; -#define precheck_img \ - { cudaErr_img(cudaGetLastError( )); } -#endif \ No newline at end of file diff --git a/src/FastFFT.cu.cpp b/src/fastfft/FastFFT.cu similarity index 84% rename from src/FastFFT.cu.cpp rename to src/fastfft/FastFFT.cu index f57a7f9..f3dbd78 100644 --- a/src/FastFFT.cu.cpp +++ b/src/fastfft/FastFFT.cu @@ -5,7 +5,7 @@ #include #include -#include "../include/FastFFT.cuh" +#include "../../include/FastFFT.cuh" namespace FastFFT { @@ -13,7 +13,7 @@ template ::load(&input_values[Return1DFFTAddress(size_of::value / apparent_Q)], thread_data, size_of::value / apparent_Q, pre_op_lambda); + io::load(&input_values[Return1DFFTAddress(size_of::value / apparent_Q)], thread_data, size_of::value / apparent_Q, pre_op_functor); // In the first FFT the modifying twiddle factor is 1 so the data are reeal FFT( ).execute(thread_data, shared_mem, workspace_fwd); -#if DEBUG_FFT_STAGE > 3 +#if FFT_DEBUG_STAGE > 3 // * apparent_Q - io::load_shared(&image_to_search[Return1DFFTAddress(size_of::value)], thread_data, intra_op_lambda); + io::load_shared(&image_to_search[Return1DFFTAddress(size_of::value)], thread_data, intra_op_functor); #endif -#if DEBUG_FFT_STAGE > 4 +#if FFT_DEBUG_STAGE > 4 invFFT( ).execute(thread_data, shared_mem, workspace_inv); - io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)], post_op_lambda); + io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)], post_op_functor); #else // Do not do the post op lambda if the invFFT is not used. io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)]); @@ -63,7 +63,6 @@ FourierTransformer::FourierTransformer template FourierTransformer::~FourierTransformer( ) { Deallocate( ); - UnPinHostMemory( ); SetDefaults( ); } @@ -74,9 +73,7 @@ void FourierTransformer::SetDefaults( is_in_memory_host_pointer = false; // To track allocation of host side memory is_in_memory_device_pointer = false; // To track allocation of device side memory. is_in_buffer_memory = false; // To track whether the current result is in dev_ptr.position_space or dev_ptr.position_space_buffer (momemtum space/ momentum space buffer respectively.) - transform_stage_completed = none; - - is_host_memory_pinned = false; // Specified in the constructor. Assuming host memory won't be pinned for many applications. + transform_stage_completed = TransformStageCompleted::none; is_fftw_padded_input = false; // Padding for in place r2c transforms is_fftw_padded_output = false; // Currently the output state will match the input state, otherwise it is an error. @@ -98,36 +95,23 @@ template void FourierTransformer::Deallocate( ) { // TODO: confirm this is NOT called when memory is allocated by external process. 
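// Deallocate below now releases the device buffers with cudaFreeAsync on
// cudaStreamPerThread, so the free is ordered after any work still queued on that stream
// rather than requiring an explicit synchronization first; the "stream ordered malloc"
// FIXME further down is the matching allocation side. A hedged sketch of the paired calls
// (CUDA 11.2+; function and buffer names are illustrative, memory-pool tuning omitted):
#include <cuda_runtime.h>

void run_with_stream_ordered_scratch(size_t bytes, cudaStream_t stream) {
    void* scratch = nullptr;
    cudaMallocAsync(&scratch, bytes, stream); // becomes usable in stream order
    // ... enqueue kernels that use `scratch` on `stream` ...
    cudaFreeAsync(scratch, stream); // released only after the preceding work completes
}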
if ( is_in_memory_device_pointer && is_owner_of_memory ) { - precheck - cudaErr(cudaFree(d_ptr.position_space)); - postcheck - is_in_memory_device_pointer = false; + precheck; + cudaErr(cudaFreeAsync(d_ptr.position_space, cudaStreamPerThread)); + postcheck; + is_in_memory_device_pointer = false; } if ( is_from_python_call ) { - precheck - cudaErr(cudaFree(d_ptr.position_space_buffer)); - postcheck - } -} - -template -void FourierTransformer::UnPinHostMemory( ) { - - if ( is_host_memory_pinned ) { - precheck - cudaErr(cudaHostUnregister(host_pointer)); - postcheck - is_host_memory_pinned = false; + precheck; + cudaErr(cudaFreeAsync(d_ptr.position_space_buffer, cudaStreamPerThread)); + postcheck; } } template void FourierTransformer::SetForwardFFTPlan(size_t input_logical_x_dimension, size_t input_logical_y_dimension, size_t input_logical_z_dimension, size_t output_logical_x_dimension, size_t output_logical_y_dimension, size_t output_logical_z_dimension, - bool is_padded_input, - bool is_host_memory_pinned) { - + bool is_padded_input) { MyFFTDebugAssertTrue(input_logical_x_dimension > 0, "Input logical x dimension must be > 0"); MyFFTDebugAssertTrue(input_logical_y_dimension > 0, "Input logical y dimension must be > 0"); MyFFTDebugAssertTrue(input_logical_z_dimension > 0, "Input logical z dimension must be > 0"); @@ -141,12 +125,10 @@ void FourierTransformer::SetForwardFFT is_fftw_padded_input = is_padded_input; // Note: Must be set before ReturnPaddedMemorySize MyFFTRunTimeAssertTrue(is_fftw_padded_input, "Support for input arrays that are not FFTW padded needs to be implemented."); // FIXME - std::cerr << "In forward fft setup with dims " << fwd_dims_in.x << " " << fwd_dims_in.y << " " << fwd_dims_in.z << " " << fwd_dims_in.w << std::endl; // ReturnPaddedMemorySize also sets FFTW padding etc. input_memory_allocated = ReturnPaddedMemorySize(fwd_dims_in); fwd_output_memory_allocated = ReturnPaddedMemorySize(fwd_dims_out); // sets .w and also increases compute_memory_allocated if needed. - std::cerr << "In forward fft setup with input_memory_allocated " << input_memory_allocated << std::endl; // The compute memory allocated is the max of all possible sizes. this->input_origin_type = OriginType::natural; @@ -183,30 +165,32 @@ void FourierTransformer::SetInputPoint is_from_python_call = false; if ( is_input_on_device ) { - // We'll need a check on compute type, and a conversion if needed prior to this. - d_ptr.position_space = input_pointer; - is_owner_of_memory = false; + is_set_input_pointer = false; - // We'll assume the host memory is pinned and if not, that is the calling processes problem. We also will not unpin. + // TODO: This could be named more clearly to reflect that it is not the owner of the INPUT GPU memory + is_owner_of_memory = false; } else { host_pointer = input_pointer; // arguably this could be set when actually doing the allocation, but I think this also makes sense as there may be multiople allocation // routines, but here we already know the calling process has not done it for us. - is_owner_of_memory = true; - // Check to see if the host memory is pinned. - if ( ! 
is_host_memory_pinned ) { - precheck - cudaErr(cudaHostRegister((void*)host_pointer, sizeof(InputType) * input_memory_allocated, cudaHostRegisterDefault)); - postcheck - - precheck - cudaErr(cudaHostGetDevicePointer(&pinnedPtr, host_pointer, 0)); - postcheck - - is_host_memory_pinned = true; - } + is_owner_of_memory = true; + is_set_input_pointer = true; + is_in_memory_host_pointer = true; } +} + +template +void FourierTransformer::SetCallerPinnedInputPointer(InputType* input_pointer) { + MyFFTDebugAssertFalse(input_memory_allocated == 0, "There is no input memory allocated."); + MyFFTDebugAssertTrue(is_set_output_params, "Output parameters not set"); + MyFFTRunTimeAssertFalse(is_set_input_pointer, "The input pointer has already been set!"); + + host_pointer = input_pointer; + // arguably this could be set when actually doing the allocation, but I think this also makes sense as there may be multiople allocation + // routines, but here we already know the calling process has not done it for us. + is_owner_of_memory = false; + // Check to see if the host memory is pinned. is_in_memory_host_pointer = true; is_set_input_pointer = true; @@ -242,9 +226,10 @@ void FourierTransformer::SetDevicePoin if ( should_allocate_buffer_memory ) { // TODO: confirm this is correct for complex valued input - precheck - cudaErr(cudaMalloc(&d_ptr.position_space_buffer, buffer_address * sizeof(ComputeType))); - postcheck if constexpr ( std::is_same::value ) { + precheck; + cudaErr(cudaMalloc(&d_ptr.position_space_buffer, buffer_address * sizeof(ComputeType))); + postcheck; + if constexpr ( std::is_same::value ) { d_ptr.momentum_space = (__half2*)d_ptr.position_space; d_ptr.momentum_space_buffer = (__half2*)d_ptr.position_space_buffer; } @@ -254,7 +239,7 @@ void FourierTransformer::SetDevicePoin } } else { - SetDimensions(CopyFromHost); + SetDimensions(DimensionCheckType::CopyFromHost); if constexpr ( std::is_same::value ) { d_ptr.momentum_space = (__half2*)d_ptr.position_space; d_ptr.position_space_buffer = &d_ptr.position_space[buffer_address]; @@ -269,123 +254,136 @@ void FourierTransformer::SetDevicePoin } template -void FourierTransformer::CopyHostToDevice( ) { +void FourierTransformer::CopyHostToDevceAndSynchronize(InputType* input_pointer, int n_elements_to_copy) { + CopyHostToDevice(input_pointer, n_elements_to_copy); + cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); +} + +template +void FourierTransformer::CopyHostToDevice(InputType* input_pointer, int n_elements_to_copy) { - SetDimensions(CopyFromHost); - MyFFTDebugAssertTrue(is_in_memory_host_pointer, "Host memory not allocated"); + SetDimensions(DimensionCheckType::CopyFromHost); + MyFFTDebugAssertTrue(pointer_is_in_memory_and_registered(input_pointer), "Host memory not in memory and/or pinned"); // FIXME switch to stream ordered malloc if ( ! is_in_memory_device_pointer ) { // Allocate enough for the out of place buffer as well. 
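
Note: with UnPinHostMemory removed and SetCallerPinnedInputPointer asserting that the pointer is already registered, pinning becomes the caller's responsibility. A hedged caller-side sketch follows; the buffer name and the commented calls into the transformer are illustrative usage, not a prescribed API sequence.

#include <cstddef>
#include <cuda_runtime.h>
#include <vector>

// Sketch only: the caller pins (and later unpins) its own host allocation.
void CallerSidePinningSketch(size_t n_elements) {
    std::vector<float> host_buffer(n_elements);

    // Register (pin) the caller-owned memory so cudaMemcpyAsync on
    // cudaStreamPerThread can transfer it without staging.
    cudaHostRegister(host_buffer.data( ), n_elements * sizeof(float), cudaHostRegisterDefault);

    // ... hand host_buffer.data( ) to SetCallerPinnedInputPointer / CopyHostToDevice ...

    // The caller is now also responsible for unpinning when it is done.
    cudaHostUnregister(host_buffer.data( ));
}
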
// MyFFTPrintWithDetails("Allocating device memory for input pointer"); - precheck - cudaErr(cudaMalloc(&d_ptr.position_space, compute_memory_allocated * sizeof(ComputeType))); - postcheck + precheck; + cudaErr(cudaMalloc(&d_ptr.position_space, compute_memory_allocated * sizeof(ComputeType))); + postcheck; - SetDevicePointers(is_from_python_call); + SetDevicePointers(is_from_python_call); is_in_memory_device_pointer = true; } - std::cerr << "From inside memory to copy is" << memory_size_to_copy << std::endl; - precheck - cudaErr(cudaMemcpyAsync(d_ptr.position_space, pinnedPtr, memory_size_to_copy * sizeof(InputType), cudaMemcpyHostToDevice, cudaStreamPerThread)); - postcheck + precheck; + cudaErr(cudaMemcpyAsync(d_ptr.position_space, input_pointer, memory_size_to_copy * sizeof(InputType), cudaMemcpyHostToDevice, cudaStreamPerThread)); + postcheck; - // TODO r/n assuming InputType is _half, _half2, float, or _float2 (real, complex, real, complex) need to handle other types and convert - bool should_block_until_complete = true; // FIXME after switching to stream ordered malloc this will not be needed. - if ( should_block_until_complete ) { - cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); - } + // TODO: Not sure if this is the cleanest way to do this. Other instances tagged SET_TRANFORMANDBUFFER + transform_stage_completed = TransformStageCompleted::none; + is_in_buffer_memory = false; +} + +template +void FourierTransformer::CopyDeviceToHostAndSynchronize(OutputType* output_pointer, bool free_gpu_memory, int n_elements_to_copy) { + CopyDeviceToHost(output_pointer, free_gpu_memory, n_elements_to_copy); + cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); } template -void FourierTransformer::CopyDeviceToHost(bool free_gpu_memory, bool unpin_host_memory, int n_elements_to_copy) { +void FourierTransformer::CopyDeviceToHost(OutputType* output_pointer, bool free_gpu_memory, int n_elements_to_copy) { + MyFFTDebugAssertTrue(pointer_is_in_memory_and_registered(output_pointer), "Host memory not in memory and/or pinned"); - SetDimensions(CopyToHost); + SetDimensions(DimensionCheckType::CopyToHost); if ( n_elements_to_copy != 0 ) memory_size_to_copy = n_elements_to_copy; - - // std::cout << "N elements " << n_elements_to_copy << " memory to copy " << memory_size_to_copy << std::endl; MyFFTDebugAssertTrue(is_in_memory_device_pointer, "GPU memory not allocated"); - ComputeType* copy_pointer; - if ( is_in_buffer_memory ) - copy_pointer = d_ptr.position_space_buffer; - else - copy_pointer = d_ptr.position_space; - - // FIXME this is assuming the input type matches the compute type. - precheck - cudaErr(cudaMemcpyAsync(pinnedPtr, copy_pointer, memory_size_to_copy * sizeof(InputType), cudaMemcpyDeviceToHost, cudaStreamPerThread)); - postcheck - - // Just set true her for now - bool should_block_until_complete = true; - if ( should_block_until_complete ) - cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); + if ( is_in_buffer_memory ) { + std::cerr << "Clopying from buffer memory" << std::endl; + precheck; + cudaErr(cudaMemcpyAsync(output_pointer, d_ptr.position_space_buffer, memory_size_to_copy * sizeof(OutputType), cudaMemcpyDeviceToHost, cudaStreamPerThread)); + postcheck; + } + else { + precheck; + cudaErr(cudaMemcpyAsync(output_pointer, d_ptr.position_space, memory_size_to_copy * sizeof(OutputType), cudaMemcpyDeviceToHost, cudaStreamPerThread)); + postcheck; + } - // TODO add asserts etc. 
if ( free_gpu_memory ) { Deallocate( ); } +} - if ( unpin_host_memory ) { - UnPinHostMemory( ); +template +void FourierTransformer::CopyDeviceToDeviceFromNonOwningAddress(InputType* input_pointer, int n_elements_to_copy) { + SetDimensions(DimensionCheckType::CopyFromHost); + // FIXME switch to stream ordered malloc + if ( ! is_in_memory_device_pointer ) { + // Allocate enough for the out of place buffer as well. + // MyFFTPrintWithDetails("Allocating device memory for input pointer"); + precheck; + cudaErr(cudaMalloc(&d_ptr.position_space, compute_memory_allocated * sizeof(ComputeType))); + postcheck; + + SetDevicePointers(is_from_python_call); + + is_in_memory_device_pointer = true; } + precheck; + cudaErr(cudaMemcpyAsync(d_ptr.position_space, input_pointer, memory_size_to_copy * sizeof(InputType), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + postcheck; + // TODO: Not sure if this is the cleanest way to do this. Other instances tagged SET_TRANFORMANDBUFFER + transform_stage_completed = TransformStageCompleted::none; + is_in_buffer_memory = false; } template -void FourierTransformer::CopyDeviceToHost(OutputType* output_pointer, bool free_gpu_memory, bool unpin_host_memory, int n_elements_to_copy) { +template +void FourierTransformer::CopyDeviceToDeviceAndSynchronize(TransferDataType* output_pointer, bool free_gpu_memory, int n_elements_to_copy) { + CopyDeviceToDevice(output_pointer, free_gpu_memory, n_elements_to_copy); + cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); +} - SetDimensions(CopyToHost); +template +template +void FourierTransformer::CopyDeviceToDevice(TransferDataType* output_pointer, bool free_gpu_memory, int n_elements_to_copy) { + + // TODO: can the test for pinned pointers be used to directly assert if GPU memory is allocated rather than using a bool? + SetDimensions(DimensionCheckType::CopyDeviceToDevice); if ( n_elements_to_copy != 0 ) memory_size_to_copy = n_elements_to_copy; - MyFFTDebugAssertTrue(is_in_memory_device_pointer, "GPU memory not allocated"); - // Assuming the output is not pinned, TODO change to optionally maintain as host_input as well. - OutputType* tmpPinnedPtr; - precheck - // FIXME this is assuming output type is the same as compute type. - cudaErr(cudaHostRegister(output_pointer, sizeof(OutputType) * memory_size_to_copy, cudaHostRegisterDefault)); - postcheck - - precheck - cudaErr(cudaHostGetDevicePointer(&tmpPinnedPtr, output_pointer, 0)); - postcheck if ( is_in_buffer_memory ) { - precheck - cudaErr(cudaMemcpyAsync(tmpPinnedPtr, d_ptr.position_space_buffer, memory_size_to_copy * sizeof(OutputType), cudaMemcpyDeviceToHost, cudaStreamPerThread)); - postcheck + MyFFTDebugAssertTrue(is_in_memory_device_pointer || ! 
is_owner_of_memory, "GPU memory not allocated"); + + if ( is_in_buffer_memory ) { + precheck; + cudaErr(cudaMemcpyAsync(output_pointer, d_ptr.position_space_buffer, memory_size_to_copy * sizeof(OutputType), cudaMemcpyDeviceToHost, cudaStreamPerThread)); + postcheck; } else { - precheck - cudaErr(cudaMemcpyAsync(tmpPinnedPtr, d_ptr.position_space, memory_size_to_copy * sizeof(OutputType), cudaMemcpyDeviceToHost, cudaStreamPerThread)); - postcheck + precheck; + cudaErr(cudaMemcpyAsync(output_pointer, d_ptr.position_space, memory_size_to_copy * sizeof(OutputType), cudaMemcpyDeviceToHost, cudaStreamPerThread)); + postcheck; } - // Just set true her for now - bool should_block_until_complete = true; - if ( should_block_until_complete ) - cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); - - precheck - cudaErr(cudaHostUnregister(output_pointer)); - postcheck - - if ( free_gpu_memory ) { Deallocate( ); } - if ( unpin_host_memory ) { - UnPinHostMemory( ); + if ( free_gpu_memory ) { + Deallocate( ); } } template template -void FourierTransformer::Generic_Fwd(PreOpType pre_op, IntraOpType intra_op) { +void FourierTransformer::Generic_Fwd(PreOpType pre_op_functor, IntraOpType intra_op_functor) { - SetDimensions(FwdTransform); + SetDimensions(DimensionCheckType::FwdTransform); // All placeholders - constexpr bool use_thread_method = false; - const bool do_forward_transform = true; - const bool swap_real_space_quadrants = false; - const bool transpose_output = true; + constexpr bool use_thread_method = false; + const bool do_forward_transform = true; + // const bool swap_real_space_quadrants = false; + // const bool transpose_output = true; // SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform, bool use_thread_method) switch ( transform_dimension ) { @@ -394,24 +392,24 @@ void FourierTransformer::Generic_Fwd(P // Note: the only time the non-transposed method should be used is for 1d data. 
if constexpr ( use_thread_method ) { if ( is_real_valued_input ) - SetPrecisionAndExectutionMethod(r2c_decomposed, do_forward_transform); //FFT_R2C_decomposed(transpose_output); + SetPrecisionAndExectutionMethod(r2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); //FFT_R2C_decomposed(transpose_output); else - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; } else { if ( is_real_valued_input ) { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XY); + SetPrecisionAndExectutionMethod(r2c_none_XY, do_forward_transform, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease); + SetPrecisionAndExectutionMethod(r2c_decrease, do_forward_transform, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(r2c_increase); + SetPrecisionAndExectutionMethod(r2c_increase, do_forward_transform, pre_op_functor, intra_op_functor); break; } default: { @@ -422,15 +420,15 @@ void FourierTransformer::Generic_Fwd(P else { switch ( fwd_size_change_type ) { case SizeChangeType::no_change: { - SetPrecisionAndExectutionMethod(c2c_fwd_none); + SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); break; } case SizeChangeType::decrease: { - SetPrecisionAndExectutionMethod(c2c_fwd_decrease); + SetPrecisionAndExectutionMethod(c2c_fwd_decrease, do_forward_transform, pre_op_functor, intra_op_functor); break; } case SizeChangeType::increase: { - SetPrecisionAndExectutionMethod(c2c_fwd_increase); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); break; } default: { @@ -444,31 +442,31 @@ void FourierTransformer::Generic_Fwd(P } case 2: { switch ( fwd_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { // FIXME there is some redundancy in specifying _decomposed and use_thread_method // Note: the only time the non-transposed method should be used is for 1d data. if ( use_thread_method ) { - SetPrecisionAndExectutionMethod(r2c_decomposed_transposed, do_forward_transform); + SetPrecisionAndExectutionMethod(r2c_decomposed_transposed, do_forward_transform, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform); + SetPrecisionAndExectutionMethod(c2c_decomposed, do_forward_transform, pre_op_functor, intra_op_functor); } else { - SetPrecisionAndExectutionMethod(r2c_none_XY); + SetPrecisionAndExectutionMethod(r2c_none_XY, do_forward_transform, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2c_fwd_none); + SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); } break; } - case increase: { - SetPrecisionAndExectutionMethod(r2c_increase); + case SizeChangeType::increase: { + SetPrecisionAndExectutionMethod(r2c_increase, do_forward_transform, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_increase); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); break; } - case decrease: { - SetPrecisionAndExectutionMethod(r2c_decrease); + case SizeChangeType::decrease: { + SetPrecisionAndExectutionMethod(r2c_decrease, do_forward_transform, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_decrease); + SetPrecisionAndExectutionMethod(c2c_fwd_decrease, do_forward_transform, pre_op_functor, intra_op_functor); break; } } @@ -476,22 +474,22 @@ void FourierTransformer::Generic_Fwd(P } case 3: { switch ( fwd_size_change_type ) { - case no_change: { - SetPrecisionAndExectutionMethod(r2c_none_XZ); + case SizeChangeType::no_change: { + SetPrecisionAndExectutionMethod(r2c_none_XZ, do_forward_transform, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. - SetPrecisionAndExectutionMethod(c2c_fwd_none_Z); - SetPrecisionAndExectutionMethod(c2c_fwd_none); + SetPrecisionAndExectutionMethod(c2c_fwd_none_Z, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_none, do_forward_transform, pre_op_functor, intra_op_functor); break; } - case increase: { - SetPrecisionAndExectutionMethod(r2c_increase_XZ); + case SizeChangeType::increase: { + SetPrecisionAndExectutionMethod(r2c_increase_XZ, do_forward_transform, pre_op_functor, intra_op_functor); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
- SetPrecisionAndExectutionMethod(c2c_fwd_increase_Z); - SetPrecisionAndExectutionMethod(c2c_fwd_increase); + SetPrecisionAndExectutionMethod(c2c_fwd_increase_Z, do_forward_transform, pre_op_functor, intra_op_functor); + SetPrecisionAndExectutionMethod(c2c_fwd_increase, do_forward_transform, pre_op_functor, intra_op_functor); // SetPrecisionAndExectutionMethod(c2c_fwd_increase_Z); break; } - case decrease: { + case SizeChangeType::decrease: { // Not yet supported MyFFTRunTimeAssertTrue(false, "3D FFT fwd no change not yet supported"); break; @@ -505,13 +503,13 @@ template template void FourierTransformer::Generic_Inv(IntraOpType intra_op, PostOpType post_op) { - SetDimensions(InvTransform); + SetDimensions(DimensionCheckType::InvTransform); // All placeholders - constexpr bool use_thread_method = false; - const bool do_forward_transform = false; - const bool swap_real_space_quadrants = false; - const bool transpose_output = true; + constexpr bool use_thread_method = false; + const bool do_forward_transform = false; + // const bool swap_real_space_quadrants = false; + // const bool transpose_output = true; switch ( transform_dimension ) { case 1: { @@ -569,7 +567,7 @@ void FourierTransformer::Generic_Inv(I } case 2: { switch ( inv_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { // FIXME there is some redundancy in specifying _decomposed and use_thread_method // Note: the only time the non-transposed method should be used is for 1d data. if ( use_thread_method ) { @@ -584,13 +582,13 @@ void FourierTransformer::Generic_Inv(I } break; } - case increase: { + case SizeChangeType::increase: { SetPrecisionAndExectutionMethod(c2c_inv_increase); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. SetPrecisionAndExectutionMethod(c2r_increase); break; } - case decrease: { + case SizeChangeType::decrease: { SetPrecisionAndExectutionMethod(c2c_inv_decrease); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. SetPrecisionAndExectutionMethod(c2r_decrease); @@ -605,20 +603,20 @@ void FourierTransformer::Generic_Inv(I } case 3: { switch ( inv_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { SetPrecisionAndExectutionMethod(c2c_inv_none_XZ); transform_stage_completed = TransformStageCompleted::inv; // technically not complete, needed for copy on validation of partial fft. SetPrecisionAndExectutionMethod(c2c_inv_none_Z); SetPrecisionAndExectutionMethod(c2r_none); break; } - case increase: { + case SizeChangeType::increase: { SetPrecisionAndExectutionMethod(r2c_increase); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
// SetPrecisionAndExectutionMethod(c2c_fwd_increase_Z); break; } - case decrease: { + case SizeChangeType::decrease: { // Not yet supported MyFFTRunTimeAssertTrue(false, "3D FFT inv no decrease not yet supported"); break; @@ -813,11 +811,11 @@ void FourierTransformer::Generic_Inv(I template template -void FourierTransformer::Generic_Fwd_Image_Inv(float2* image_to_search, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda) { +void FourierTransformer::Generic_Fwd_Image_Inv(float2* image_to_search, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // Set the member pointer to the passed pointer d_ptr.image_to_search = image_to_search; - SetDimensions(FwdTransform); + SetDimensions(DimensionCheckType::FwdTransform); switch ( transform_dimension ) { case 1: { @@ -827,18 +825,18 @@ void FourierTransformer::Generic_Fwd_I case 2: { ; switch ( fwd_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { SetPrecisionAndExectutionMethod(r2c_none_XY, true); switch ( inv_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { MyFFTRunTimeAssertTrue(false, "2D FFT generic lambda no change/nochange not yet supported"); break; } - case increase: { + case SizeChangeType::increase: { MyFFTRunTimeAssertTrue(false, "2D FFT generic lambda no change/increase not yet supported"); break; } - case decrease: { + case SizeChangeType::decrease: { SetPrecisionAndExectutionMethod(xcorr_fwd_none_inv_decrease, true); SetPrecisionAndExectutionMethod(c2r_decrease, false); break; @@ -850,23 +848,23 @@ void FourierTransformer::Generic_Fwd_I } // switch on inv size change type break; } // case fwd no change - case increase: { + case SizeChangeType::increase: { SetPrecisionAndExectutionMethod(r2c_increase, true); switch ( inv_size_change_type ) { - case no_change: { - SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, true, pre_op_lambda, intra_op_lambda, post_op_lambda); + case SizeChangeType::no_change: { + SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none, true, pre_op_functor, intra_op_functor, post_op_functor); SetPrecisionAndExectutionMethod(c2r_none_XY, false); transform_stage_completed = TransformStageCompleted::inv; break; } - case increase: { + case SizeChangeType::increase: { // I don't see where increase increase makes any sense // FIXME add a check on this in the validation step. MyFFTRunTimeAssertTrue(false, "2D FFT Cross correlation with fwd and inv size increase is not supported"); break; } - case decrease: { + case SizeChangeType::decrease: { // with FwdTransform set, call c2c // Set InvTransform // Call new kernel that handles the conj mul inv c2c trimmed, and inv c2r in one go. 
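
Note: the *_ConjMul kernels in this cross-correlation path fuse a conjugate multiply of the search image against the forward transform before running the inverse FFT. A minimal per-element sketch of that operation on float2 follows; which operand gets conjugated is a convention, and FastFFT's choice inside io::load_shared_and_conj_multiply is not asserted here.

#include <cuda_runtime.h>

// Sketch only: out = conj(image_value) * fwd_value, element by element.
__device__ __forceinline__ float2 ConjMulSketch(float2 image_value, float2 fwd_value) {
    float2 out;
    out.x = image_value.x * fwd_value.x + image_value.y * fwd_value.y;
    out.y = image_value.x * fwd_value.y - image_value.y * fwd_value.x;
    return out;
}
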
@@ -880,22 +878,22 @@ void FourierTransformer::Generic_Fwd_I } // switch on inv size change type break; } - case decrease: { + case SizeChangeType::decrease: { SetPrecisionAndExectutionMethod(r2c_decrease, true); switch ( inv_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { SetPrecisionAndExectutionMethod(xcorr_fwd_increase_inv_none, true); SetPrecisionAndExectutionMethod(c2r_none_XY, false); // TODO the output could be smaller transform_stage_completed = TransformStageCompleted::inv; break; } - case increase: { + case SizeChangeType::increase: { MyFFTRunTimeAssertTrue(false, "2D FFT Cross correlation with fwd and inv size increase is not supported"); break; } - case decrease: { + case SizeChangeType::decrease: { MyFFTRunTimeAssertTrue(false, "2D FFT Cross correlation with fwd decrease and inv size decrease is a work in progress"); break; @@ -915,29 +913,29 @@ void FourierTransformer::Generic_Fwd_I } case 3: { switch ( fwd_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation fwd no change not yet supported"); switch ( inv_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { break; } - case increase: { + case SizeChangeType::increase: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation with fwd and inv size increase is not supported"); break; } - case decrease: { + case SizeChangeType::decrease: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation with fwd and inv size decrease is not supported"); break; } } break; } - case increase: { + case SizeChangeType::increase: { SetPrecisionAndExectutionMethod(r2c_increase_XZ); transform_stage_completed = TransformStageCompleted::fwd; // technically not complete, needed for copy on validation of partial fft. 
SetPrecisionAndExectutionMethod(c2c_fwd_increase_Z); switch ( inv_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { // TODO: will need a kernel for generic_fwd_increase_op_inv_none_XZ SetPrecisionAndExectutionMethod(generic_fwd_increase_op_inv_none); // SetPrecisionAndExectutionMethod(c2c_inv_none_XZ); @@ -946,11 +944,11 @@ void FourierTransformer::Generic_Fwd_I SetPrecisionAndExectutionMethod(c2r_none); break; } - case increase: { + case SizeChangeType::increase: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation with fwd and inv size increase is not supported"); break; } - case decrease: { + case SizeChangeType::decrease: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation with fwd and inv size decrease is not supported"); break; } @@ -960,17 +958,17 @@ void FourierTransformer::Generic_Fwd_I } break; } - case decrease: { + case SizeChangeType::decrease: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation fwd decrease not yet supported"); switch ( inv_size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { break; } - case increase: { + case SizeChangeType::increase: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation with fwd and inv size increase is not supported"); break; } - case decrease: { + case SizeChangeType::decrease: { MyFFTDebugAssertTrue(false, "3D FFT Cross correlation with fwd and inv size decrease is not supported"); break; } @@ -998,7 +996,6 @@ void FourierTransformer::ValidateDimen MyFFTDebugAssertTrue(is_set_input_params, "Input parameters not set"); MyFFTDebugAssertTrue(is_set_output_params, "Output parameters not set"); - MyFFTDebugAssertTrue(is_set_input_pointer, "The input data pointer is not set"); MyFFTRunTimeAssertTrue(fwd_dims_out.x == inv_dims_in.x && fwd_dims_out.y == inv_dims_in.y && @@ -1012,7 +1009,7 @@ void FourierTransformer::ValidateDimen MyFFTDebugAssertTrue(fwd_dims_out.y >= fwd_dims_in.y, "If padding, all dimensions must be >=, y out < y in"); MyFFTDebugAssertTrue(fwd_dims_out.z >= fwd_dims_in.z, "If padding, all dimensions must be >=, z out < z in"); - fwd_size_change_type = increase; + fwd_size_change_type = SizeChangeType::increase; } else if ( fwd_dims_out.x < fwd_dims_in.x || fwd_dims_out.y < fwd_dims_in.y || fwd_dims_out.z < fwd_dims_in.z ) { // For now we must pad in all dimensions, this is not needed and should be lifted. FIXME @@ -1020,10 +1017,10 @@ void FourierTransformer::ValidateDimen MyFFTDebugAssertTrue(fwd_dims_out.y <= fwd_dims_in.y, "If padding, all dimensions must be <=, y out > y in"); MyFFTDebugAssertTrue(fwd_dims_out.z <= fwd_dims_in.z, "If padding, all dimensions must be <=, z out > z in"); - fwd_size_change_type = decrease; + fwd_size_change_type = SizeChangeType::decrease; } else if ( fwd_dims_out.x == fwd_dims_in.x && fwd_dims_out.y == fwd_dims_in.y && fwd_dims_out.z == fwd_dims_in.z ) { - fwd_size_change_type = no_change; + fwd_size_change_type = SizeChangeType::no_change; } else { // TODO: if this is relaxed, the dimensionality check below will be invalid. 
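
Note: the ValidateDimensions changes below assign SizeChangeType::increase / decrease / no_change by comparing output against input dimensions, with mixed padding/cropping rejected by the asserts. The same classification can be written as a small helper; the enum and dims struct below are illustrative stand-ins, not the library's types.

#include <cassert>

// Sketch only: size-change classification in the spirit of ValidateDimensions.
enum class SizeChangeSketch { no_change, increase, decrease };

struct Dims3Sketch { int x, y, z; };

inline SizeChangeSketch ClassifySizeChange(Dims3Sketch in, Dims3Sketch out) {
    if ( out.x == in.x && out.y == in.y && out.z == in.z )
        return SizeChangeSketch::no_change;
    if ( out.x >= in.x && out.y >= in.y && out.z >= in.z )
        return SizeChangeSketch::increase; // padding, and only padding, in every dimension
    if ( out.x <= in.x && out.y <= in.y && out.z <= in.z )
        return SizeChangeSketch::decrease; // cropping, and only cropping, in every dimension
    assert(false && "mixed padding/cropping is rejected by the validation asserts");
    return SizeChangeSketch::no_change;
}
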
@@ -1037,13 +1034,13 @@ void FourierTransformer::ValidateDimen MyFFTDebugAssertTrue(inv_dims_out.y >= inv_dims_in.y, "If padding, all dimensions must be >=, y out < y in"); MyFFTDebugAssertTrue(inv_dims_out.z >= inv_dims_in.z, "If padding, all dimensions must be >=, z out < z in"); - inv_size_change_type = increase; + inv_size_change_type = SizeChangeType::increase; } else if ( inv_dims_out.x < inv_dims_in.x || inv_dims_out.y < inv_dims_in.y || inv_dims_out.z < inv_dims_in.z ) { - inv_size_change_type = decrease; + inv_size_change_type = SizeChangeType::decrease; } else if ( inv_dims_out.x == inv_dims_in.x && inv_dims_out.y == inv_dims_in.y && inv_dims_out.z == inv_dims_in.z ) { - inv_size_change_type = no_change; + inv_size_change_type = SizeChangeType::no_change; } else { // TODO: if this is relaxed, the dimensionality check below will be invalid. @@ -1075,48 +1072,66 @@ void FourierTransformer::ValidateDimen } template -void FourierTransformer::SetDimensions(DimensionCheckType check_op_type) { +void FourierTransformer::SetDimensions(DimensionCheckType::Enum check_op_type) { // This should be run inside any public method call to ensure things ar properly setup. if ( ! is_size_validated ) { ValidateDimensions( ); } switch ( check_op_type ) { - case CopyFromHost: { + case DimensionCheckType::CopyFromHost: { // MyFFTDebugAssertTrue(transform_stage_completed == none, "When copying from host, the transform stage should be none, something has gone wrong."); // FIXME: is this the right thing to do? Maybe this should be explicitly "reset" when the input image is "refereshed." - transform_stage_completed = none; - std::cerr << "input memory allocate " << input_memory_allocated << std::endl; - memory_size_to_copy = input_memory_allocated; + transform_stage_completed = TransformStageCompleted::none; + memory_size_to_copy = input_memory_allocated; break; } - case CopyToHost: { + case DimensionCheckType::CopyToHost: { + // FIXME currently there is no check that the right amount of memory is allocated on the host side array. + switch ( transform_stage_completed ) { + case SizeChangeType::no_change: { + memory_size_to_copy = input_memory_allocated; + break; + } + case TransformStageCompleted::fwd: { + memory_size_to_copy = fwd_output_memory_allocated; + break; + } + case TransformStageCompleted::inv: { + memory_size_to_copy = inv_output_memory_allocated; + break; + } + } // switch transform_stage_completed + break; + } // case CopToHost + + case DimensionCheckType::CopyDeviceToDevice: { // FIXME currently there is no check that the right amount of memory is allocated on the host side array. 
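
Note: SetDimensions now takes DimensionCheckType::Enum and call sites spell out DimensionCheckType::CopyFromHost and so on, which implies the wrapped-enum pattern sketched below. The wrapper name is illustrative; the five enumerators are the ones used in this file.

// Sketch only: a wrapped enum scopes the enumerators (DimensionCheckType::CopyFromHost)
// without switching to enum class, so existing switch statements and implicit int
// conversions keep working.
struct DimensionCheckTypeSketch {
    enum Enum { CopyFromHost, CopyToHost, CopyDeviceToDevice, FwdTransform, InvTransform };
};

inline bool IsCopyOpSketch(DimensionCheckTypeSketch::Enum op) {
    return op == DimensionCheckTypeSketch::CopyFromHost ||
           op == DimensionCheckTypeSketch::CopyToHost ||
           op == DimensionCheckTypeSketch::CopyDeviceToDevice;
}
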
switch ( transform_stage_completed ) { - case no_change: { + case SizeChangeType::no_change: { memory_size_to_copy = input_memory_allocated; break; } - case fwd: { + case TransformStageCompleted::fwd: { memory_size_to_copy = fwd_output_memory_allocated; break; } - case inv: { + case TransformStageCompleted::inv: { memory_size_to_copy = inv_output_memory_allocated; break; } } // switch transform_stage_completed break; - } // case CopToHose + } // case CopyDeviceToDevice - case FwdTransform: { - MyFFTDebugAssertTrue(transform_stage_completed == none || transform_stage_completed == inv, "When doing a forward transform, the transform stage completed should be none, something has gone wrong."); + case DimensionCheckType::FwdTransform: { + MyFFTDebugAssertTrue(transform_stage_completed == TransformStageCompleted::none || transform_stage_completed == TransformStageCompleted::inv, "When doing a forward transform, the transform stage completed should be none, something has gone wrong."); break; } - case InvTransform: { - MyFFTDebugAssertTrue(transform_stage_completed == fwd, "When doing an inverse transform, the transform stage completed should be fwd, something has gone wrong."); + case DimensionCheckType::InvTransform: { + MyFFTDebugAssertTrue(transform_stage_completed == TransformStageCompleted::fwd, "When doing an inverse transform, the transform stage completed should be fwd, something has gone wrong."); break; } } // end switch on operation type @@ -1408,11 +1423,11 @@ __global__ void thread_fft_kernel_C2C_decomposed_ConjMul(const ComplexType* __re // Now we need to aggregate each of the Q transforms into each output block of size P io_thread::remap_decomposed_segments(thread_data, shared_mem, twiddle_in, Q, size_of::value * Q); -#if DEBUG_FFT_STAGE > 3 +#if FFT_DEBUG_STAGE > 3 io_thread::load_shared_and_conj_multiply(&image_to_search[Return1DFFTAddress(size_of::value * Q)], shared_mem, thread_data, Q); #endif -#if DEBUG_FFT_STAGE > 4 +#if FFT_DEBUG_STAGE > 4 invFFT( ).execute(thread_data); // Now we need to aggregate each of the Q transforms into each output block of size P io_thread::remap_decomposed_segments(thread_data, shared_mem, -twiddle_in, Q, size_of::value * Q); @@ -1444,12 +1459,12 @@ __launch_bounds__(invFFT::max_threads_per_block) __global__ // In the first FFT the modifying twiddle factor is 1 so the data are reeal FFT( ).execute(thread_data, shared_mem, workspace_fwd); -#if DEBUG_FFT_STAGE > 3 +#if FFT_DEBUG_STAGE > 3 // * apparent_Q io::load_shared_and_conj_multiply(&image_to_search[Return1DFFTAddress(size_of::value)], thread_data); #endif -#if DEBUG_FFT_STAGE > 4 +#if FFT_DEBUG_STAGE > 4 invFFT( ).execute(thread_data, shared_mem, workspace_inv); #endif @@ -1474,7 +1489,7 @@ __launch_bounds__(invFFT::max_threads_per_block) __global__ // In the first FFT the modifying twiddle factor is 1 so the data are reeal FFT( ).execute(thread_data, shared_mem, workspace_fwd); -#if DEBUG_FFT_STAGE > 3 +#if FFT_DEBUG_STAGE > 3 // Swap real space quadrants using a phase shift by N/2 pixels const unsigned int stride = io::stride_size( ); int logical_y; @@ -1490,7 +1505,7 @@ __launch_bounds__(invFFT::max_threads_per_block) __global__ io::load_shared_and_conj_multiply(&image_to_search[Return1DFFTAddress(size_of::value * Q)], thread_data); #endif -#if DEBUG_FFT_STAGE > 4 +#if FFT_DEBUG_STAGE > 4 invFFT( ).execute(thread_data, shared_mem, workspace_inv); #endif @@ -1524,12 +1539,12 @@ __global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const Complex // // Full twiddle multiply and 
store in natural order in shared memory // io::reduce_block_fft(thread_data, shared_mem, twiddle_in, Q); -#if DEBUG_FFT_STAGE > 3 +#if FFT_DEBUG_STAGE > 3 // Load in imageFFT to search io::load_shared_and_conj_multiply(&image_to_search[Return1DFFTAddress(size_of::value)], thread_data); #endif -#if DEBUG_FFT_STAGE > 4 +#if FFT_DEBUG_STAGE > 4 // Run the inverse FFT // invFFT().execute(thread_data, &shared_mem[fft_shared_mem_num_elements * threadIdx.z], workspace_inv); invFFT( ).execute(thread_data, shared_mem, workspace_inv); @@ -1538,7 +1553,7 @@ __global__ void block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul(const Complex // // The reduced store considers threadIdx.z to ignore extra threads // io::store_c2c_reduced(thread_data, &output_values[blockIdx.y * gridDim.y]); -#if DEBUG_FFT_STAGE < 5 +#if FFT_DEBUG_STAGE < 5 // There is no size reduction for this debug stage, so we need to use the pixel_pitch of the input array. io::store(thread_data, &output_values[Return1DFFTAddress(size_of::value)]); #else @@ -1943,9 +1958,9 @@ void FourierTransformer::ClipIntoTopLe const short4 area_to_clip_from = make_short4(fwd_dims_in.x, fwd_dims_in.y, fwd_dims_in.w * 2, fwd_dims_out.w * 2); - precheck - clip_into_top_left_kernel<<>>(d_ptr.position_space, d_ptr.position_space, area_to_clip_from); - postcheck + precheck; + clip_into_top_left_kernel<<>>(d_ptr.position_space, d_ptr.position_space, area_to_clip_from); + postcheck; } // FIXME assumed FWD @@ -1989,9 +2004,9 @@ void FourierTransformer::ClipIntoReal( const short4 area_to_clip_from = make_short4(fwd_dims_in.x, fwd_dims_in.y, fwd_dims_in.w * 2, fwd_dims_out.w * 2); float wanted_padding_value = 0.f; - precheck - clip_into_real_kernel<<>>(d_ptr.position_space, d_ptr.position_space, fwd_dims_in, fwd_dims_out, wanted_center, wanted_padding_value); - postcheck + precheck; + clip_into_real_kernel<<>>(d_ptr.position_space, d_ptr.position_space, fwd_dims_in, fwd_dims_out, wanted_center, wanted_padding_value); + postcheck; } // Modified from GpuImage::ClipIntoRealKernel @@ -2037,7 +2052,7 @@ __global__ void clip_into_real_kernel(InputType* real_values_gpu, template template -void FourierTransformer::SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda) { +void FourierTransformer::SetPrecisionAndExectutionMethod(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // For kernels with fwd and inv transforms, we want to not set the direction yet. 
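
Note: every launch site in this patch now terminates precheck / postcheck with a semicolon, which is well-formed when the macros expand to a single statement. The sketch below is modelled on the *_img variants shown earlier in this diff and is not the exact FastFFT definition; the do { } while ( 0 ) wrapper is what makes the trailing semicolon idiomatic.

#include <cuda_runtime.h>
#include <iostream>

// Sketch only: launch-bracketing error checks in the spirit of precheck / postcheck.
#define PRECHECK_SKETCH                                                                     \
    do {                                                                                    \
        cudaError_t err_ = cudaGetLastError( );                                             \
        if ( err_ != cudaSuccess )                                                          \
            std::cerr << "pre-launch error: " << cudaGetErrorString(err_) << std::endl;     \
    } while ( 0 )

#define POSTCHECK_SKETCH                                                                    \
    do {                                                                                    \
        cudaError_t err_ = cudaStreamSynchronize(cudaStreamPerThread);                      \
        if ( err_ != cudaSuccess )                                                          \
            std::cerr << "post-launch error: " << cudaGetErrorString(err_) << std::endl;    \
    } while ( 0 )

// Usage mirrors the call sites in this file:
//   PRECHECK_SKETCH;
//   my_kernel<<<gridDims, threadsPerBlock, shared_mem, cudaStreamPerThread>>>(/* args */);
//   POSTCHECK_SKETCH;
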
static const bool is_half = std::is_same_v; @@ -2046,35 +2061,35 @@ void FourierTransformer::SetPrecisionA if constexpr ( use_thread_method ) { using FFT = decltype(Thread( ) + Size<32>( ) + Precision( )); - SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); } else { using FFT = decltype(Block( ) + Precision( ) + FFTsPerBlock<1>( )); - SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetIntraKernelFunctions(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); } } template template -void FourierTransformer::SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda) { +void FourierTransformer::SetIntraKernelFunctions(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { if constexpr ( ! detail::has_any_block_operator::value ) { - // SelectSizeAndType(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + // SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); } else { if constexpr ( Rank == 3 ) { - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); } else { // TODO: 8192 will fail for sm75 if wanted need some extra logic ... , 8192, 16 - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); } } } template template -void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda) { +void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // This provides both a termination point for the recursive version needed for the block transform case as well as the actual function for thread transform with fixed size 32 GetTransformSize(kernel_type); if constexpr ( ! 
detail::has_any_block_operator::value ) { @@ -2082,22 +2097,22 @@ void FourierTransformer::SelectSizeAnd switch ( device_properties.device_arch ) { case 700: { using FFT = decltype(FFT_base( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); break; } case 750: { using FFT = decltype(FFT_base( ) + SM<750>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); break; } case 800: { using FFT = decltype(FFT_base( ) + SM<800>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); break; } case 860: { using FFT = decltype(FFT_base( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -2110,7 +2125,7 @@ void FourierTransformer::SelectSizeAnd template template -void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda) { +void FourierTransformer::SelectSizeAndType(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { // Use recursion to step through the allowed sizes. 
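
Note: the recursive SelectSizeAndType overload below walks a compile-time list of allowed FFT sizes. The stand-alone sketch that follows shows only the recursion pattern; pairing each size with an elements-per-thread value is an assumption based on the ", 8192, 16" comment above, and the kernel construction is elided.

#include <cstdio>

// Sketch only: termination overload, reached when the list is exhausted.
template <int Dummy = 0>
void SelectSizeSketch(int wanted_size) {
    std::fprintf(stderr, "size %d is not in the allowed list\n", wanted_size);
}

// Sketch only: peel one (size, elements-per-thread) pair off the pack per instantiation.
template <int SizeValue, int Ept, int... OtherValues>
void SelectSizeSketch(int wanted_size) {
    if ( wanted_size == SizeValue ) {
        // ... build the size-specialized FFT description and launch the kernel ...
        return;
    }
    SelectSizeSketch<OtherValues...>(wanted_size); // recurse on the remaining pairs
}

// Usage, e.g.: SelectSizeSketch<64, 8, 128, 8, 256, 8>(wanted_fft_size);
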
GetTransformSize(kernel_type); @@ -2119,24 +2134,24 @@ void FourierTransformer::SelectSizeAnd switch ( device_properties.device_arch ) { case 700: { using FFT = decltype(FFT_base( ) + Size( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); break; } case 750: { if constexpr ( SizeValue <= 4096 ) { using FFT = decltype(FFT_base( ) + Size( ) + SM<750>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); } break; } case 800: { using FFT = decltype(FFT_base( ) + Size( ) + SM<800>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); break; } case 860: { using FFT = decltype(FFT_base( ) + Size( ) + SM<700>( ) + ElementsPerThread<8>( )); - SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SetAndLaunchKernel(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); break; } default: { @@ -2146,12 +2161,12 @@ void FourierTransformer::SelectSizeAnd } } - SelectSizeAndType(kernel_type, do_forward_transform, pre_op_lambda, intra_op_lambda, post_op_lambda); + SelectSizeAndType(kernel_type, do_forward_transform, pre_op_functor, intra_op_functor, post_op_functor); } template template -void FourierTransformer::SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_lambda, IntraOpType intra_op_lambda, PostOpType post_op_lambda) { +void FourierTransformer::SetAndLaunchKernel(KernelType kernel_type, bool do_forward_transform, PreOpType pre_op_functor, IntraOpType intra_op_functor, PostOpType post_op_functor) { using complex_type = typename FFT_base_arch::value_type; using scalar_type = typename complex_type::value_type; @@ -2193,14 +2208,14 @@ void FourierTransformer::SetAndLaunchK int shared_mem = LP.mem_offsets.shared_output * sizeof(complex_type); CheckSharedMemory(shared_mem, device_properties); cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_R2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem)); -#if DEBUG_FFT_STAGE > 0 - precheck - thread_fft_kernel_R2C_decomposed<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck +#if FFT_DEBUG_STAGE > 0 + precheck; + thread_fft_kernel_R2C_decomposed<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; #else is_in_buffer_memory = ! 
is_in_buffer_memory; #endif - break; + break; } case r2c_decomposed_transposed: { @@ -2211,15 +2226,15 @@ void FourierTransformer::SetAndLaunchK int shared_mem = LP.mem_offsets.shared_output * sizeof(complex_type); CheckSharedMemory(shared_mem, device_properties); cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_R2C_decomposed_transposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem)); -#if DEBUG_FFT_STAGE > 0 - precheck - thread_fft_kernel_R2C_decomposed_transposed<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck +#if FFT_DEBUG_STAGE > 0 + precheck; + thread_fft_kernel_R2C_decomposed_transposed<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case c2r_decomposed: { @@ -2231,15 +2246,15 @@ void FourierTransformer::SetAndLaunchK int shared_memory = LP.mem_offsets.shared_output * sizeof(scalar_type); CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2R_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 6 - precheck - thread_fft_kernel_C2R_decomposed<<>>(complex_input, scalar_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck +#if FFT_DEBUG_STAGE > 6 + precheck; + thread_fft_kernel_C2R_decomposed<<>>(complex_input, scalar_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case c2r_decomposed_transposed: { @@ -2250,15 +2265,15 @@ void FourierTransformer::SetAndLaunchK int shared_memory = LP.mem_offsets.shared_output * sizeof(scalar_type); CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2R_decomposed_transposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 6 - precheck - thread_fft_kernel_C2R_decomposed_transposed<<>>(complex_input, scalar_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck +#if FFT_DEBUG_STAGE > 6 + precheck; + thread_fft_kernel_C2R_decomposed_transposed<<>>(complex_input, scalar_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; #else is_in_buffer_memory = ! 
is_in_buffer_memory; #endif - break; + break; } case xcorr_decomposed: { @@ -2278,19 +2293,19 @@ void FourierTransformer::SetAndLaunchK MyFFTRunTimeAssertTrue(false, "decomposed xcorr with swap real space quadrants is not implemented."); // cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - // precheck + // precheck; // block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants<< > > // ( (complex_type*) image_to_search, (complex_type*) d_ptr.momentum_space_buffer, (complex_type*) d_ptr.momentum_space, LP.mem_offsets, LP.twiddle_in,LP.Q, workspace_fwd, workspace_inv); - // postcheck + // postcheck; } else { cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 2 +#if FFT_DEBUG_STAGE > 2 // the image_to_search pointer is set during call to CrossCorrelate, - precheck - thread_fft_kernel_C2C_decomposed_ConjMul<<>>((complex_type*)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck + precheck; + thread_fft_kernel_C2C_decomposed_ConjMul<<>>((complex_type*)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif @@ -2307,10 +2322,10 @@ void FourierTransformer::SetAndLaunchK int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 2 - precheck - thread_fft_kernel_C2C_decomposed<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck +#if FFT_DEBUG_STAGE > 2 + precheck; + thread_fft_kernel_C2C_decomposed<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif @@ -2321,10 +2336,10 @@ void FourierTransformer::SetAndLaunchK int shared_memory = LP.mem_offsets.shared_output * sizeof(complex_type); CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)thread_fft_kernel_C2C_decomposed, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 4 - precheck - thread_fft_kernel_C2C_decomposed<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); - postcheck +#if FFT_DEBUG_STAGE > 4 + precheck; + thread_fft_kernel_C2C_decomposed<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif @@ -2349,14 +2364,14 @@ void FourierTransformer::SetAndLaunchK // cudaErr(cudaFuncSetCacheConfig( (void*)block_fft_kernel_R2C_NONE_XY,cudaFuncCachePreferShared )); // cudaFuncSetSharedMemConfig ( (void*)block_fft_kernel_R2C_NONE_XY, cudaSharedMemBankSizeEightByte ); -#if DEBUG_FFT_STAGE > 0 - precheck - block_fft_kernel_R2C_NONE_XY<<>>(scalar_input, complex_output, LP.mem_offsets, workspace); - postcheck +#if FFT_DEBUG_STAGE > 0 + precheck; + block_fft_kernel_R2C_NONE_XY<<>>(scalar_input, complex_output, LP.mem_offsets, workspace); + postcheck; #else is_in_buffer_memory = ! 
is_in_buffer_memory; #endif - break; + break; } case r2c_none_XZ: { @@ -2371,10 +2386,10 @@ void FourierTransformer::SetAndLaunchK cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_R2C_NONE_XZ, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 0 - precheck - block_fft_kernel_R2C_NONE_XZ<<>>(scalar_input, complex_output, LP.mem_offsets, workspace); - postcheck +#if FFT_DEBUG_STAGE > 0 + precheck; + block_fft_kernel_R2C_NONE_XZ<<>>(scalar_input, complex_output, LP.mem_offsets, workspace); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif @@ -2394,14 +2409,14 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_R2C_DECREASE_XY, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 0 - precheck - block_fft_kernel_R2C_DECREASE_XY<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck +#if FFT_DEBUG_STAGE > 0 + precheck; + block_fft_kernel_R2C_DECREASE_XY<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case r2c_increase: { @@ -2415,15 +2430,15 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_R2C_INCREASE_XY, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 0 - precheck - block_fft_kernel_R2C_INCREASE_XY<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck +#if FFT_DEBUG_STAGE > 0 + precheck; + block_fft_kernel_R2C_INCREASE_XY<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case r2c_increase_XZ: { @@ -2441,10 +2456,10 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_R2C_INCREASE_XZ, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 0 - precheck - block_fft_kernel_R2C_INCREASE_XZ<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck +#if FFT_DEBUG_STAGE > 0 + precheck; + block_fft_kernel_R2C_INCREASE_XZ<<>>(scalar_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif @@ -2461,18 +2476,18 @@ void FourierTransformer::SetAndLaunchK auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; int shared_memory = FFT::shared_memory_size; -#if DEBUG_FFT_STAGE > 2 +#if FFT_DEBUG_STAGE > 2 CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_NONE<<>>(complex_input, complex_output, LP.mem_offsets, workspace); - postcheck + precheck; + block_fft_kernel_C2C_NONE<<>>(complex_input, complex_output, LP.mem_offsets, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! 
is_in_buffer_memory; #endif - break; + break; } case c2c_fwd_none_Z: { @@ -2485,13 +2500,13 @@ void FourierTransformer::SetAndLaunchK auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; int shared_memory = std::max(XZ_STRIDE * FFT::shared_memory_size, size_of::value * (unsigned int)sizeof(complex_type) * XZ_STRIDE); -#if DEBUG_FFT_STAGE > 1 +#if FFT_DEBUG_STAGE > 1 CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_NONE_XYZ, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_NONE_XYZ<<>>(complex_input, complex_output, LP.mem_offsets, workspace); - postcheck + precheck; + block_fft_kernel_C2C_NONE_XYZ<<>>(complex_input, complex_output, LP.mem_offsets, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! is_in_buffer_memory; @@ -2507,7 +2522,7 @@ void FourierTransformer::SetAndLaunchK auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_fwd_decrease); -#if DEBUG_FFT_STAGE > 2 +#if FFT_DEBUG_STAGE > 2 // the shared mem is mixed between storage, shuffling and FFT. For this kernel we need to add padding to avoid bank conlicts (N/32) // For decrease methods, the shared_input > shared_output int shared_memory = std::max(FFT::shared_memory_size * LP.threadsPerBlock.z, (LP.mem_offsets.shared_input + LP.mem_offsets.shared_input / 32) * (unsigned int)sizeof(complex_type)); @@ -2515,15 +2530,15 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_DECREASE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_DECREASE<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck + precheck; + block_fft_kernel_C2C_DECREASE<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case c2c_fwd_increase_Z: { @@ -2540,13 +2555,13 @@ void FourierTransformer::SetAndLaunchK int shared_memory = std::max(XZ_STRIDE * FFT::shared_memory_size, XZ_STRIDE * LP.mem_offsets.physical_x_output / LP.Q * (unsigned int)sizeof(complex_type)); shared_memory += XZ_STRIDE * LP.mem_offsets.shared_input * (unsigned int)sizeof(complex_type); -#if DEBUG_FFT_STAGE > 1 +#if FFT_DEBUG_STAGE > 1 CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_INCREASE_XYZ, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_INCREASE_XYZ<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck + precheck; + block_fft_kernel_C2C_INCREASE_XYZ<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! 
is_in_buffer_memory; @@ -2564,19 +2579,19 @@ void FourierTransformer::SetAndLaunchK auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; int shared_memory = FFT::shared_memory_size + (unsigned int)sizeof(complex_type) * (LP.mem_offsets.shared_input + LP.mem_offsets.shared_output); -#if DEBUG_FFT_STAGE > 2 +#if FFT_DEBUG_STAGE > 2 CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_INCREASE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_INCREASE<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck + precheck; + block_fft_kernel_C2C_INCREASE<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case c2c_inv_none: { @@ -2591,17 +2606,17 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 4 - precheck - block_fft_kernel_C2C_NONE<<>>(complex_input, complex_output, LP.mem_offsets, workspace); - postcheck +#if FFT_DEBUG_STAGE > 4 + precheck; + block_fft_kernel_C2C_NONE<<>>(complex_input, complex_output, LP.mem_offsets, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! is_in_buffer_memory; #endif - // do something - break; + // do something + break; } case c2c_inv_none_XZ: { @@ -2617,10 +2632,10 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_NONE_XZ, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 4 - precheck - block_fft_kernel_C2C_NONE_XZ<<>>(complex_input, complex_output, LP.mem_offsets, workspace); - postcheck +#if FFT_DEBUG_STAGE > 4 + precheck; + block_fft_kernel_C2C_NONE_XZ<<>>(complex_input, complex_output, LP.mem_offsets, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! is_in_buffer_memory; @@ -2640,12 +2655,12 @@ void FourierTransformer::SetAndLaunchK auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; int shared_memory = std::max(XZ_STRIDE * FFT::shared_memory_size, size_of::value * (unsigned int)sizeof(complex_type) * XZ_STRIDE); -#if DEBUG_FFT_STAGE > 5 +#if FFT_DEBUG_STAGE > 5 CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_NONE_XYZ, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_NONE_XYZ<<>>(complex_input, complex_output, LP.mem_offsets, workspace); - postcheck + precheck; + block_fft_kernel_C2C_NONE_XYZ<<>>(complex_input, complex_output, LP.mem_offsets, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! 
is_in_buffer_memory; @@ -2660,7 +2675,7 @@ void FourierTransformer::SetAndLaunchK auto workspace = make_workspace(error_code); // std::cout << " EPT: " << FFT::elements_per_thread << "kernel " << KernelName[kernel_type] << std::endl; LaunchParams LP = SetLaunchParameters(elements_per_thread_complex, c2c_inv_decrease); -#if DEBUG_FFT_STAGE > 4 +#if FFT_DEBUG_STAGE > 4 // the shared mem is mixed between storage, shuffling and FFT. For this kernel we need to add padding to avoid bank conlicts (N/32) // For decrease methods, the shared_input > shared_output int shared_memory = std::max(FFT::shared_memory_size * LP.threadsPerBlock.z, (LP.mem_offsets.shared_input + LP.mem_offsets.shared_input / 32) * (unsigned int)sizeof(complex_type)); @@ -2668,21 +2683,21 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_DECREASE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_DECREASE<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck + precheck; + block_fft_kernel_C2C_DECREASE<<>>(complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case c2c_inv_increase: { MyFFTRunTimeAssertTrue(false, "c2c_inv_increase is not yet implemented."); -#if DEBUG_FFT_STAGE > 4 +#if FFT_DEBUG_STAGE > 4 #else // Since we skip the memory ops, unlike the other kernels, we need to flip the buffer pinter is_in_buffer_memory = ! is_in_buffer_memory; @@ -2703,15 +2718,15 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2R_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 6 - precheck - block_fft_kernel_C2R_NONE<<>>(complex_input, scalar_output, LP.mem_offsets, workspace); - postcheck +#if FFT_DEBUG_STAGE > 6 + precheck; + block_fft_kernel_C2R_NONE<<>>(complex_input, scalar_output, LP.mem_offsets, workspace); + postcheck; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif - break; + break; } case c2r_none_XY: { @@ -2726,15 +2741,15 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2R_NONE_XY, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 6 - precheck - block_fft_kernel_C2R_NONE_XY<<>>(complex_input, scalar_output, LP.mem_offsets, workspace); - postcheck +#if FFT_DEBUG_STAGE > 6 + precheck; + block_fft_kernel_C2R_NONE_XY<<>>(complex_input, scalar_output, LP.mem_offsets, workspace); + postcheck; #else is_in_buffer_memory = ! 
is_in_buffer_memory; #endif - break; + break; } case c2r_decrease: { @@ -2750,12 +2765,12 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2R_DECREASE_XY, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); -#if DEBUG_FFT_STAGE > 6 - precheck - block_fft_kernel_C2R_DECREASE_XY<<>>(complex_input, scalar_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); - postcheck +#if FFT_DEBUG_STAGE > 6 + precheck; + block_fft_kernel_C2R_DECREASE_XY<<>>(complex_input, scalar_output, LP.mem_offsets, LP.twiddle_in, LP.Q, workspace); + postcheck; - transform_stage_completed = TransformStageCompleted::inv; + transform_stage_completed = TransformStageCompleted::inv; #else is_in_buffer_memory = ! is_in_buffer_memory; #endif @@ -2785,28 +2800,28 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); // FIXME -#if DEBUG_FFT_STAGE > 2 +#if FFT_DEBUG_STAGE > 2 bool swap_real_space_quadrants = false; if ( swap_real_space_quadrants ) { MyFFTRunTimeAssertTrue(false, "Swapping real space quadrants is not yet implemented."); // cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - // precheck + // precheck; // block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul_SwapRealSpaceQuadrants<< > > // ( (complex_type *)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, LP.twiddle_in,LP.Q, workspace_fwd, workspace_inv); - // postcheck + // postcheck; } else { cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck + precheck; - // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. - // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. - int apparent_Q = size_of::value / fwd_dims_in.y; + // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. + // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. + int apparent_Q = size_of::value / fwd_dims_in.y; block_fft_kernel_C2C_FWD_INCREASE_INV_NONE_ConjMul<<>>((complex_type*)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv); - postcheck + postcheck; } #else is_in_buffer_memory = ! 
is_in_buffer_memory; @@ -2836,25 +2851,25 @@ void FourierTransformer::SetAndLaunchK CheckSharedMemory(shared_memory, device_properties); // FIXME -#if DEBUG_FFT_STAGE > 2 +#if FFT_DEBUG_STAGE > 2 bool swap_real_space_quadrants = false; if ( swap_real_space_quadrants ) { // cudaErr(cudaFuncSetAttribute((void*)_INV_DECREASE_ConjMul_SwapRealSpaceQuadrants, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); MyFFTDebugAssertFalse(swap_real_space_quadrants, "Swap real space quadrants not yet implemented in xcorr_fwd_none_inv_decrease."); - // precheck + // precheck; // _INV_DECREASE_ConjMul_SwapRealSpaceQuadrants<< > > // ( (complex_type *)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, LP.twiddle_in,LP.Q, workspace_fwd, workspace_inv); - // postcheck + // postcheck; } else { cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / inv_dims_out.y; - precheck - block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul<<>>((complex_type*)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, apparent_Q, workspace_fwd, workspace_inv); - postcheck + precheck; + block_fft_kernel_C2C_FWD_NONE_INV_DECREASE_ConjMul<<>>((complex_type*)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, LP.twiddle_in, apparent_Q, workspace_fwd, workspace_inv); + postcheck; } transform_stage_completed = TransformStageCompleted::fwd; @@ -2888,18 +2903,18 @@ void FourierTransformer::SetAndLaunchK if constexpr ( IS_IKF_t( ) ) { // FIXME -#if DEBUG_FFT_STAGE > 2 +#if FFT_DEBUG_STAGE > 2 // Right now, because of the n_threads == size_of requirement, we are explicitly zero padding, so we need to send an "apparent Q" to know the input size. // Could send the actual size, but later when converting to use the transform decomp with different sized FFTs this will be a more direct conversion. int apparent_Q = size_of::value / fwd_dims_in.y; cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory)); - precheck - block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE<<>>((complex_type*)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv, pre_op_lambda, intra_op_lambda, post_op_lambda); - postcheck + precheck; + block_fft_kernel_C2C_FWD_INCREASE_OP_INV_NONE<<>>((complex_type*)d_ptr.image_to_search, complex_input, complex_output, LP.mem_offsets, apparent_Q, workspace_fwd, workspace_inv, pre_op_functor, intra_op_functor, post_op_functor); + postcheck; - // FIXME: this is set in the public method calls for other functions. Since it will be changed to 0-7 to match FFT_DEBUG_STAGE, fix it then. - transform_stage_completed = TransformStageCompleted::fwd; + // FIXME: this is set in the public method calls for other functions. Since it will be changed to 0-7 to match FFT_DEBUG_STAGE, fix it then. + transform_stage_completed = TransformStageCompleted::fwd; #else is_in_buffer_memory = ! 
is_in_buffer_memory; #endif @@ -3078,7 +3093,7 @@ LaunchParams FourierTransformer::SetLa // Set the twiddle factor, only differ in sign between fwd/inv transforms. // For mixed kernels (eg. xcorr_* the size type is defined by where the size change happens. // FIXME fwd_increase (oversampling) xcorr -> inv decrease (peak search) is a likely algorithm, that will not fit with this logic. - SizeChangeType size_change_type; + SizeChangeType::Enum size_change_type; if ( IsForwardType(kernel_type) ) { size_change_type = fwd_size_change_type; L.twiddle_in = L.twiddle_in = -2 * pi_v / transform_size.N; @@ -3093,7 +3108,7 @@ LaunchParams FourierTransformer::SetLa L.threadsPerBlock = dim3(transform_size.Q, 1, 1); } else { - if ( size_change_type == decrease ) { + if ( size_change_type == SizeChangeType::decrease ) { L.threadsPerBlock = dim3(transform_size.P / ept, 1, transform_size.Q); } else { @@ -3104,14 +3119,14 @@ LaunchParams FourierTransformer::SetLa // Set the shared mem sizes, which depend on the size_change_type switch ( size_change_type ) { - case no_change: { + case SizeChangeType::no_change: { // no shared memory is needed outside that for the FFT itself. // For C2C kernels of size_type increase, the shared output may be reset below in order to store for coalesced global writes. L.mem_offsets.shared_input = 0; L.mem_offsets.shared_output = 0; break; } - case decrease: { + case SizeChangeType::decrease: { // Prior to reduction, we must be able to store the full transform. An alternate algorithm with multiple reads would relieve this dependency and // may be worth considering if L2 cache residence on Ampere is an effective way to reduce repeated Globabl memory access. // Note: that this shared memory is not static, in the sense that it is used both for temporory fast storage, as well as the calculation of the FFT. The max of those two requirments is calculated per kernel. @@ -3124,7 +3139,7 @@ LaunchParams FourierTransformer::SetLa } // TODO this line is just from case increase, haven't thought about it. break; } - case increase: { + case SizeChangeType::increase: { // We want to re-use the input memory as we loop over construction of the full FFT. This shared memory is independent of the // memory used for the FFT itself. 
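The increase branch continuing below keeps the whole input line resident in shared memory while the larger transform is built, and the no_change and decrease branches above size it differently. A hypothetical standalone helper, not part of the patch, condensing that selection; P is the transform size and the P/32 term is the bank-conflict padding described in the kernel comments:

    // Sketch only: uses SizeChangeType::Enum from src/fastfft/types.cuh (added in this patch).
    // shared_output handling and the per-kernel max against FFT::shared_memory_size are omitted.
    inline unsigned int SharedInputElementsFor(SizeChangeType::Enum type, unsigned int P) {
        switch ( type ) {
            case SizeChangeType::no_change: return 0u;          // only the FFT's own scratch is needed
            case SizeChangeType::decrease:  return P + P / 32u; // stage the full transform, padded against bank conflicts
            case SizeChangeType::increase:  return P;           // re-read the input while looping over the larger FFT
        }
        return 0u;
    }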
L.mem_offsets.shared_input = transform_size.P; @@ -3282,27 +3297,56 @@ void CheckSharedMemory(unsigned int& memory_requested, DeviceProps& dp) { // if (memory_requested > dp.max_shared_memory_per_block) { memory_requested = dp.max_shared_memory_per_block; } } -template class FourierTransformer; +using namespace FastFFT::KernelFunction; +// my_functor, IKF_t + +// 2d explicit instantiations + +template class FourierTransformer; -template void FourierTransformer::Generic_Fwd, - FastFFT::KernelFunction::my_functor>( - FastFFT::KernelFunction::my_functor, - FastFFT::KernelFunction::my_functor); +template void FourierTransformer::CopyDeviceToDevice(float*, bool, int); +template void FourierTransformer::CopyDeviceToDevice(float2*, bool, int); +template void FourierTransformer::CopyDeviceToDeviceAndSynchronize(float*, bool, int); +template void FourierTransformer::CopyDeviceToDeviceAndSynchronize(float2*, bool, int); -template void FourierTransformer::Generic_Inv, - FastFFT::KernelFunction::my_functor>( - FastFFT::KernelFunction::my_functor, - FastFFT::KernelFunction::my_functor); +template void FourierTransformer::Generic_Fwd, + my_functor>(my_functor, + my_functor); + +template void FourierTransformer::Generic_Inv, + my_functor>(my_functor, + my_functor); template void FourierTransformer::Generic_Fwd(std::nullptr_t, std::nullptr_t); template void FourierTransformer::Generic_Inv(std::nullptr_t, std::nullptr_t); -template void FourierTransformer::Generic_Fwd_Image_Inv, - FastFFT::KernelFunction::my_functor, - FastFFT::KernelFunction::my_functor>( - float2*, - FastFFT::KernelFunction::my_functor, - FastFFT::KernelFunction::my_functor, - FastFFT::KernelFunction::my_functor); +template void FourierTransformer::Generic_Fwd_Image_Inv, + my_functor, + my_functor>(float2*, + my_functor, + my_functor, + my_functor); + +// 3d explicit instantiations + +template class FourierTransformer; + +template void FourierTransformer::Generic_Fwd, + my_functor>(my_functor, + my_functor); + +template void FourierTransformer::Generic_Inv, + my_functor>(my_functor, + my_functor); + +template void FourierTransformer::Generic_Fwd(std::nullptr_t, std::nullptr_t); +template void FourierTransformer::Generic_Inv(std::nullptr_t, std::nullptr_t); + +template void FourierTransformer::Generic_Fwd_Image_Inv, + my_functor, + my_functor>(float2*, + my_functor, + my_functor, + my_functor); } // namespace FastFFT diff --git a/src/cpp/Image.cu.cpp b/src/fastfft/Image.cu similarity index 73% rename from src/cpp/Image.cu.cpp rename to src/fastfft/Image.cu index 6ffc145..8bdaa4d 100644 --- a/src/cpp/Image.cu.cpp +++ b/src/fastfft/Image.cu @@ -54,12 +54,14 @@ void Image::SetClipIntoMask(short4 input_ int n_values = output_size.w * 2 * output_size.y; bool* tmpMask = new bool[n_values]; - precheck - cudaErr(cudaMalloc(&clipIntoMask, (n_values) * sizeof(bool))); - postcheck + precheck; + cudaErr(cudaMalloc(&clipIntoMask, (n_values) * sizeof(bool))); + postcheck; - if ( output_size.x % 2 == 0 ) pjv = 2; - else pjv = 1; + if ( output_size.x % 2 == 0 ) + pjv = 2; + else + pjv = 1; for ( int j = 0; j < output_size.y; j++ ) { for ( int i = 0; i < output_size.x; i++ ) { @@ -276,27 +278,27 @@ __device__ cufftCallbackLoadR d_realLoadAndClipInto = CB_realLoadAndClipInto; // CB_realLoadAndClipInto_params* d_params; // CB_realLoadAndClipInto_params h_params; -// precheck +// precheck; // h_params.target = (cufftReal *)image_to_insert; // h_params.mask = (bool*) clipIntoMask; // cudaErr(cudaMalloc((void 
**)&d_params,sizeof(CB_realLoadAndClipInto_params))); -// postcheck +// postcheck; -// precheck +// precheck; // cudaErr(cudaMemcpyAsync(d_params, &h_params, sizeof(CB_realLoadAndClipInto_params), cudaMemcpyHostToDevice, cudaStreamPerThread)); -// postcheck +// postcheck; -// precheck +// precheck; // cudaErr(cudaMemcpyFromSymbol(&h_realLoadAndClipInto,d_realLoadAndClipInto, sizeof(h_realLoadAndClipInto))); -// postcheck +// postcheck; -// precheck +// precheck; // cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); -// postcheck +// postcheck; -// precheck +// precheck; // cudaErr(cufftXtSetCallback(cuda_plan_forward, (void **)&h_realLoadAndClipInto, CUFFT_CB_LD_REAL, (void **)&d_params)); -// postcheck +// postcheck; // } @@ -329,3 +331,121 @@ void Image::SetComplexConjMultiplyAndLoad cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); cudaErr(cufftXtSetCallback(cuda_plan_inverse, (void**)&h_complexConjMulLoad, CUFFT_CB_LD_COMPLEX, (void**)&d_params)); } + +// To print a message and some number n_to_print complex values to stdout +template +void Image::print_values_complex(float* input, std::string msg, int n_to_print) { + for ( int i = 0; i < n_to_print * 2; i += 2 ) { + std::cout << msg << i / 2 << " " << input[i] << " " << input[i + 1] << std::endl; + } +} + +// Return sum of real values +template +float Image::ReturnSumOfReal(float* input, short4 size, bool print_val) { + double temp_sum = 0; + long address = 0; + int padding_jump_val = size.w * 2 - size.x; + for ( int k = 0; k < size.z; k++ ) { + for ( int j = 0; j < size.y; j++ ) { + for ( int i = 0; i < size.x; i++ ) { + + temp_sum += double(input[address]); + address++; + } + address += padding_jump_val; + } + } + + return float(temp_sum); +} + +// Return the sum of the complex values + +template +float2 Image::ReturnSumOfComplex(float2* input, int n_to_print) { + double sum_x = 0; + double sum_y = 0; + + for ( int i = 0; i < n_to_print; i++ ) { + sum_x += input[i].x; + sum_y += input[i].y; + } + + return make_float2(float(sum_x), float(sum_y)); +} + +// Return the sum of the complex values +template +float Image::ReturnSumOfComplexAmplitudes(float2* input, int n_to_print) { + // We want to asses the error in the FFT at single/half precision, but to not add + // extra error for the use double here. 
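The double accumulator declared just below is the point of the comment above: if the reduction ran in float, the metric would add error of its own to the FFT error it is meant to measure. A minimal, self-contained demonstration of the effect, unrelated to FastFFT and with numbers chosen only to make the drift visible:

    #include <cstdio>

    int main( ) {
        float  f_sum = 0.0f;
        double d_sum = 0.0;
        for ( int i = 0; i < 10000000; i++ ) {
            f_sum += 0.1f;         // once the sum is large, each 0.1f is rounded badly
            d_sum += double(0.1f); // the double accumulator adds the same increments with negligible rounding
        }
        printf("float accumulator : %.1f\n", f_sum); // noticeably off from the exact 1e6
        printf("double accumulator: %.1f\n", d_sum); // ~1e6, only the rounding already in 0.1f remains
        return 0;
    }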
+ double sum = 0; + double x; + double y; + + for ( int i = 0; i < n_to_print; i++ ) { + x = double(input[i].x); + y = double(input[i].y); + sum += sqrt(x * x + y * y); + } + + return sum; +} + +template +void Image::ClipInto(const float* array_to_paste, float* array_to_paste_into, short4 size_from, short4 size_into, short4 wanted_center, float wanted_padding_value) { + + long pixel_counter = 0; + + int kk; + int k; + int kk_logi; + + int jj; + int jj_logi; + int j; + + int ii; + int ii_logi; + int i; + + short4 center_to_paste_into = make_short4(size_into.x / 2, size_into.y / 2, size_into.z / 2, 0); + short4 center_to_paste = make_short4(size_from.x / 2, size_from.y / 2, size_from.z / 2, 0); + int padding_jump_value; + + if ( size_into.x % 2 == 0 ) + padding_jump_value = 2; + else + padding_jump_value = 1; + + for ( kk = 0; kk < size_into.z; kk++ ) { + kk_logi = kk - center_to_paste_into.z; + k = center_to_paste.z + wanted_center.z + kk_logi; + + for ( jj = 0; jj < size_into.y; jj++ ) { + jj_logi = jj - center_to_paste_into.y; + j = center_to_paste.y + wanted_center.y + jj_logi; + + for ( ii = 0; ii < size_into.x; ii++ ) { + ii_logi = ii - center_to_paste_into.x; + i = center_to_paste.x + wanted_center.x + ii_logi; + + if ( k < 0 || k >= size_from.z || j < 0 || j >= size_from.y || i < 0 || i >= size_from.x ) { + array_to_paste_into[pixel_counter] = wanted_padding_value; + } + else { + array_to_paste_into[pixel_counter] = array_to_paste[k * (size_from.w * 2 * size_from.y) + j * (size_from.x + padding_jump_value) + i]; + } + + pixel_counter++; + } + + pixel_counter += padding_jump_value; + } + } + +} // end of clip into + +template class Image; +// template Image::Image(short4); \ No newline at end of file diff --git a/src/fastfft/Image.cuh b/src/fastfft/Image.cuh new file mode 100644 index 0000000..b6c9275 --- /dev/null +++ b/src/fastfft/Image.cuh @@ -0,0 +1,100 @@ +// Collection of helper functions for test.cu + +#ifndef SRC_CPP_IMAGE_CUH_ +#define SRC_CPP_IMAGE_CUH_ + +#include +#include +#include +#include +#include + +// sudo apt-get install libfftw3-dev libfftw3-doc +#include + +#include +#include "../../include/cufftdx/include/cufftdx.hpp" +#include +#include + +// A simple class to represent image objects needed for testing FastFFT. + +template +class Image { + + public: + Image( ); + Image(short4 wanted_size); + ~Image( ); + + wanted_real_type* real_values; + wanted_complex_type* complex_values; + bool* clipIntoMask; + + short4 size; + int real_memory_allocated; + int padding_jump_value; + + float fftw_epsilon; + + bool is_in_memory; + bool is_fftw_planned; + bool is_in_real_space; + bool is_cufft_planned; + + void Allocate( ); + void Allocate(bool plan_fftw); + void FwdFFT( ); + void InvFFT( ); + + // Make FFTW plans for comparing CPU to GPU xforms. + // This is nearly verbatim from cisTEM::Image::Allocate - I do not know if FFTW_ESTIMATE is the best option. + // In cisTEM we almost always use MKL, so this might be worth testing. I always used exhaustive in Matlab/emClarity. 
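For reference, the FFTW handles declared next usually come from a setup along these lines; a sketch assuming the in-place r2c layout with rows padded to 2*(nx/2+1) floats used by these tests, and FFTW_ESTIMATE as in the comment above:

    #include <fftw3.h>

    // Hypothetical free function; the class keeps the same two handles as members.
    void MakeFftwPlans(float* real_values, int nx, int ny, fftwf_plan& plan_fwd, fftwf_plan& plan_bwd) {
        // FFTW takes dimensions slowest-first, so (ny, nx) for an image whose fast axis is x.
        plan_fwd = fftwf_plan_dft_r2c_2d(ny, nx, real_values,
                                         reinterpret_cast<fftwf_complex*>(real_values), FFTW_ESTIMATE);
        plan_bwd = fftwf_plan_dft_c2r_2d(ny, nx, reinterpret_cast<fftwf_complex*>(real_values),
                                         real_values, FFTW_ESTIMATE);
    }
    // Execution is fftwf_execute(plan_fwd) / fftwf_execute(plan_bwd); both plans should be
    // released with fftwf_destroy_plan() when the image is torn down.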
+ fftwf_plan plan_fwd = NULL; + fftwf_plan plan_bwd = NULL; + + cufftHandle cuda_plan_forward; + cufftHandle cuda_plan_inverse; + size_t cuda_plan_worksize_forward; + size_t cuda_plan_worksize_inverse; + + cudaEvent_t startEvent{nullptr}; + cudaEvent_t stopEvent{nullptr}; + float elapsed_gpu_ms{ }; + + inline void create_timing_events( ) { + cudaEventCreate(&startEvent, cudaEventBlockingSync); + cudaEventCreate(&stopEvent, cudaEventBlockingSync); + } + + inline void record_start( ) { cudaEventRecord(startEvent); } + + inline void record_stop( ) { cudaEventRecord(stopEvent); } + + inline void synchronize( ) { cudaEventSynchronize(stopEvent); } + + inline void print_time(std::string msg, bool print_out = true) { + cudaEventElapsedTime(&elapsed_gpu_ms, startEvent, stopEvent); + if ( print_out ) { + std::cout << "Time on " << msg << " " << elapsed_gpu_ms << " ms" << std::endl; + } + } + + void MakeCufftPlan( ); + void MakeCufftPlan3d( ); + + void SetClipIntoMask(short4 input_size, short4 output_size); + bool is_set_clip_into_mask = false; + // void SetClipIntoCallback(cufftReal* image_to_insert, int image_to_insert_size_x, int image_to_insert_size_y,int image_to_insert_pitch); + void SetComplexConjMultiplyAndLoadCallBack(cufftComplex* search_image_FT, cufftReal FT_normalization_factor); + void MultiplyConjugateImage(wanted_complex_type* other_image); + void print_values_complex(float* input, std::string msg, int n_to_print); + float ReturnSumOfReal(float* input, short4 size, bool print_val = false); + float2 ReturnSumOfComplex(float2* input, int n_to_print); + float ReturnSumOfComplexAmplitudes(float2* input, int n_to_print); + void ClipInto(const float* array_to_paste, float* array_to_paste_into, short4 size_from, short4 size_into, short4 wanted_center, float wanted_padding_value); + + private: +}; + +#endif // SRC_CPP_IMAGE_CUH_ \ No newline at end of file diff --git a/src/fastfft/types.cuh b/src/fastfft/types.cuh new file mode 100644 index 0000000..d261ca5 --- /dev/null +++ b/src/fastfft/types.cuh @@ -0,0 +1,61 @@ +#ifndef _SRC_FASTFFT_TYPES_H_ +#define _SRC_FASTFFT_TYPES_H_ + +#include +#include + +namespace FastFFT { + +namespace DataType { +// Used to specify input/calc/output data types +enum Enum { int4_2, + uint8, + int8, + uint16, + int16, + fp16, + bf16, + tf32, + uint32, + int32, + fp32 }; + +constexpr std::array name = {"int4_2", "uint8", "int8", "uint16", "int16", "fp16", "bf16", "tf32", "uint32", "int32", "fp32"}; + +} // namespace DataType + +namespace SizeChangeType { +// FIXME this seems like a bad idea. Added due to conflicing labels in switch statements, even with explicitly scope. 
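The pattern adopted throughout types.cuh, and the reason for the FIXME above, is a plain enum wrapped in a namespace: enumerators must be qualified (SizeChangeType::decrease), so identically named labels in different enums no longer collide in switch statements, while the values still convert implicitly to integers, which enum class would forbid. A self-contained illustration with invented names:

    #include <cstdint>

    namespace Fruit { enum Enum : uint8_t { apple, pear, none }; }
    namespace Motor { enum Enum : uint8_t { run, stop, none }; } // a second 'none' is fine here

    inline const char* Describe(Fruit::Enum f) {
        switch ( f ) {
            case Fruit::none: return "no fruit"; // qualified labels keep the switch unambiguous
            default: return "some fruit";
        }
    }

    static_assert(Motor::none == 2, "unscoped enums still convert implicitly to integers");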
+enum Enum : uint8_t { increase, + decrease, + no_change }; +} // namespace SizeChangeType + +namespace OriginType { +// Used to specify the origin of the data +enum Enum : int { natural, + centered, + quadrant_swapped }; + +constexpr std::array name = {"natural", "centered", "quadrant_swapped"}; + +} // namespace OriginType + +namespace TransformStageCompleted { +enum Enum : uint8_t { none = 10, + fwd = 11, + inv = 12 }; // none must be greater than number of sizeChangeTypes, padding must match in TransformStageCompletedName vector +} // namespace TransformStageCompleted + +namespace DimensionCheckType { +enum Enum : uint8_t { CopyFromHost, + CopyToHost, + CopyDeviceToDevice, + FwdTransform, + InvTransform }; + +} // namespace DimensionCheckType + +} // namespace FastFFT + +#endif /* _SRC_FASTFFT_TYPES_H_ */ \ No newline at end of file diff --git a/src/python/FastFFT_binding/test_cupy.py b/src/python/FastFFT_binding/test_cupy.py index e440bdc..cd2d965 100644 --- a/src/python/FastFFT_binding/test_cupy.py +++ b/src/python/FastFFT_binding/test_cupy.py @@ -11,7 +11,7 @@ print("Cupy array is {}".format(a)) # Setup the plans -FT.SetForwardFFTPlan(16,16,1,16,16,1, True, True) +FT.SetForwardFFTPlan(16,16,1,16,16,1, True) FT.Wait() FT.SetInverseFFTPlan(16,16,1,16,16,1, True) FT.Wait() diff --git a/src/python/test_binding/test_pybind11.cu b/src/python/test_binding/test_pybind11.cu index 6b15604..bb68339 100644 --- a/src/python/test_binding/test_pybind11.cu +++ b/src/python/test_binding/test_pybind11.cu @@ -14,83 +14,80 @@ namespace py = pybind11; -template +template __global__ void add_kernel(TypeOne one, TypeTwo two, TypeOne& retval) { // add then numbers and 1 to be sure it ran in this device code. - retval = one + TypeOne(two) + TypeOne(1.0f); + retval = one + TypeOne(two) + TypeOne(1.0f); } -template +template __global__ void sum_array(TypeOne* array_ptr, int n_elem) { - for (int i = 1; i < n_elem; i++) { + for ( int i = 1; i < n_elem; i++ ) { array_ptr[0] += array_ptr[i]; } } -template +template class TestClass { - public: - TestClass(TypeOne one, TypeTwo two) : one_(one), two_(two) {} + public: + TestClass(TypeOne one, TypeTwo two) : one_(one), two_(two) {} - TypeOne getOne() { return one_; } - TypeTwo getTwo() { return two_; } + TypeOne getOne( ) { return one_; } - TypeOne add(TypeOne i, TypeTwo j) { + TypeTwo getTwo( ) { return two_; } - TypeOne retval = TypeOne(4); - TypeOne* d_retval; - cudaErr(cudaMallocManaged(&d_retval, sizeof(TypeOne))); + TypeOne add(TypeOne i, TypeTwo j) { - std::cout << "Value pre kernel is " << retval << std::endl; - precheck - add_kernel<<<1, 1, 0, cudaStreamPerThread>>>(i, j, *d_retval); - postcheck - cudaStreamSynchronize(cudaStreamPerThread); + TypeOne retval = TypeOne(4); + TypeOne* d_retval; + cudaErr(cudaMallocManaged(&d_retval, sizeof(TypeOne))); - retval = *d_retval; - std::cout << "Value post kernel is " << retval << std::endl; + std::cout << "Value pre kernel is " << retval << std::endl; + precheck; + add_kernel<<<1, 1, 0, cudaStreamPerThread>>>(i, j, *d_retval); + postcheck; + cudaStreamSynchronize(cudaStreamPerThread); - cudaErr(cudaFree(d_retval)); - return retval; - } + retval = *d_retval; + std::cout << "Value post kernel is " << retval << std::endl; - void sum_cupy_array(long cupy_ptr, int cupy_size) { + cudaErr(cudaFree(d_retval)); + return retval; + } - // Simple test to take the pointer from a cupy array and work on it in the gpu. 
- TypeOne* d_array = reinterpret_cast(cupy_ptr); - precheck - sum_array<<<1, 1, 0, cudaStreamPerThread>>>(d_array, cupy_size); - postcheck - cudaStreamSynchronize(cudaStreamPerThread); + void sum_cupy_array(long cupy_ptr, int cupy_size) { - } - + // Simple test to take the pointer from a cupy array and work on it in the gpu. + TypeOne* d_array = reinterpret_cast(cupy_ptr); + precheck; + sum_array<<<1, 1, 0, cudaStreamPerThread>>>(d_array, cupy_size); + postcheck; + cudaStreamSynchronize(cudaStreamPerThread); + } - private: - TypeOne one_; - TypeTwo two_; + private: + TypeOne one_; + TypeTwo two_; }; -template -void declare_array(py::module &m, const std::string &typestr) { - using Class = TestClass; +template +void declare_array(py::module& m, const std::string& typestr) { + using Class = TestClass; std::string pyclass_name = std::string("TestClass") + typestr; - py::class_(m, pyclass_name.c_str()) - .def(py::init()) - .def("getOne", &TestClass::getOne) - .def("getTwo", &TestClass::getTwo) - .def("add", &TestClass::add) - .def("sum_cupy_array", &TestClass::sum_cupy_array); - + py::class_(m, pyclass_name.c_str( )) + .def(py::init( )) + .def("getOne", &TestClass::getOne) + .def("getTwo", &TestClass::getTwo) + .def("add", &TestClass::add) + .def("sum_cupy_array", &TestClass::sum_cupy_array); } PYBIND11_MODULE(fastfft_test, m) { - + declare_array(m, "_int_float"); declare_array(m, "_float_int"); declare_array(m, "_float_float"); - } diff --git a/src/tests/constant_image_test.cu b/src/tests/constant_image_test.cu new file mode 100644 index 0000000..438f2c0 --- /dev/null +++ b/src/tests/constant_image_test.cu @@ -0,0 +1,193 @@ + +#include "tests.h" + +template +bool const_image_test(std::vector& size) { + + bool all_passed = true; + std::vector init_passed(size.size( ), true); + std::vector FFTW_passed(size.size( ), true); + std::vector FastFFT_forward_passed(size.size( ), true); + std::vector FastFFT_roundTrip_passed(size.size( ), true); + + for ( int n = 0; n < size.size( ); n++ ) { + + short4 input_size; + short4 output_size; + long full_sum = long(size[n]); + if ( Rank == 3 ) { + input_size = make_short4(size[n], size[n], size[n], 0); + output_size = make_short4(size[n], size[n], size[n], 0); + full_sum = full_sum * full_sum * full_sum * full_sum * full_sum * full_sum; + } + else { + input_size = make_short4(size[n], size[n], 1, 0); + output_size = make_short4(size[n], size[n], 1, 0); + full_sum = full_sum * full_sum * full_sum * full_sum; + } + + float sum; + + Image host_input(input_size); + Image host_output(output_size); + Image device_output(output_size); + + // Pointers to the arrays on the host -- maybe make this a struct of some sort? I'm sure there is a parallel in cuda, look into cuarray/texture code + + // We just make one instance of the FourierTransformer class, with calc type float. + // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. 
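Condensed from the tests added in this patch, the life cycle of the class instantiated on the next line is: plan both directions, attach (and optionally pin) the host buffer, copy in, transform, copy back. A sketch using only calls that appear elsewhere in this diff; template arguments, error handling, and the placeholder names nx/ny/nz/host_data are omitted or invented here:

    void RoundTrip(float* host_data, int nx, int ny, int nz) {
        FastFFT::FourierTransformer FT;
        FT.SetForwardFFTPlan(nx, ny, nz, nx, ny, nz); // fwd input -> fwd output dimensions
        FT.SetInverseFFTPlan(nx, ny, nz, nx, ny, nz); // inv input -> inv output dimensions
        FT.SetInputPointer(host_data, false);         // false: not pinned yet, pin it for me
        FT.CopyHostToDevice(host_data);
        FT.FwdFFT( );
        FT.InvFFT( );
        FT.CopyDeviceToHostAndSynchronize(host_data, true);
        // Without normalization the round trip scales every value by nx * ny * nz.
    }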
+ FastFFT::FourierTransformer FT; + + // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU + FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + + // The padding (dims.w) is calculated based on the setup + short4 dims_in = FT.ReturnFwdInputDimensions( ); + short4 dims_out = FT.ReturnFwdOutputDimensions( ); + + // Determine how much memory we need, working with FFTW/CUDA style in place transform padding. + // Note: there is no reason we really need this, because the xforms will always be out of place. + // For now, this is just in place because all memory in cisTEM is allocated accordingly. + host_input.real_memory_allocated = FT.ReturnInputMemorySize( ); + host_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); + + // On the device, we will always allocate enough memory for the larger of input/output including the buffer array. + // Minmize the number of calls to malloc which are slow and can lead to fragmentation. + device_output.real_memory_allocated = std::max(host_input.real_memory_allocated, host_output.real_memory_allocated); + + // In your own programs, you will be handling this memory allocation yourself. We'll just make something here. + // I think fftwf_malloc may potentially create a different alignment than new/delete, but kinda doubt it. For cisTEM consistency... + bool set_fftw_plan = true; + host_input.Allocate(set_fftw_plan); + host_output.Allocate(set_fftw_plan); + + // Set our input host memory to a constant. Then FFT[0] = host_input_memory_allocated + FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 1.0f); + + // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which + // ensures faster transfer. If false, it will be pinned for you. + FT.SetInputPointer(host_output.real_values, false); + sum = host_output.ReturnSumOfReal(host_output.real_values, dims_out); + + if ( sum != long(dims_in.x) * long(dims_in.y) * long(dims_in.z) ) { + all_passed = false; + init_passed[n] = false; + } + + // MyFFTDebugAssertTestTrue( sum == dims_out.x*dims_out.y*dims_out.z,"Unit impulse Init "); + + // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. + FT.CopyHostToDevice(host_output.real_values); + + host_output.FwdFFT( ); + + bool test_passed = true; + for ( long index = 1; index < host_output.real_memory_allocated / 2; index++ ) { + if ( host_output.complex_values[index].x != 0.0f && host_output.complex_values[index].y != 0.0f ) { + std::cout << host_output.complex_values[index].x << " " << host_output.complex_values[index].y << " " << std::endl; + test_passed = false; + } + } + if ( host_output.complex_values[0].x != (float)dims_out.x * (float)dims_out.y * (float)dims_out.z ) + test_passed = false; + + if ( test_passed == false ) { + all_passed = false; + FFTW_passed[n] = false; + } + // MyFFTDebugAssertTestTrue( test_passed, "FFTW unit impulse forward FFT"); + + // Just to make sure we don't get a false positive, set the host memory to some undesired value. + FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 2.0f); + + // This method will call the regular FFT kernels given the input/output dimensions are equal when the class is instantiated. 
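The FwdFFT()/InvFFT() calls and the debug_partial_fft checks that follow only make sense together with FFT_DEBUG_STAGE: kernels beyond the requested stage are compiled out and only the buffer flag is flipped, so a partial result can be copied back and inspected. A self-contained toy of the same compile-time gating pattern; the stage names are invented for the example:

    #include <cstdio>

    #ifndef FFT_DEBUG_STAGE
    #define FFT_DEBUG_STAGE 3 // normally supplied by the build system
    #endif

    template <int debug_stage>
    void RunPipeline( ) {
        bool is_in_buffer_memory = false;
        if constexpr ( debug_stage > 0 )
            printf("stage 1: forward transform along X\n");
        else
            is_in_buffer_memory = ! is_in_buffer_memory; // skipped stage: keep the buffer bookkeeping consistent
        if constexpr ( debug_stage > 2 )
            printf("stage 3: forward transform along Y\n");
        else
            is_in_buffer_memory = ! is_in_buffer_memory;
        if constexpr ( debug_stage > 4 )
            printf("stage 5: inverse transform along Y\n");
        else
            is_in_buffer_memory = ! is_in_buffer_memory;
        printf("partial result is in the %s array\n", is_in_buffer_memory ? "buffer" : "primary");
    }

    int main( ) {
        RunPipeline<FFT_DEBUG_STAGE>( );
        return 0;
    }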
+ // bool swap_real_space_quadrants = false; + FT.FwdFFT( ); + + // in buffer, do not deallocate, do not unpin memory + FT.CopyDeviceToHostAndSynchronize(host_output.real_values, false); + test_passed = true; + // FIXME: centralized test conditions + for ( long index = 1; index < host_output.real_memory_allocated / 2; index++ ) { + if ( host_output.complex_values[index].x != 0.0f && host_output.complex_values[index].y != 0.0f ) { + test_passed = false; + } // std::cout << host_output.complex_values[index].x << " " << host_output.complex_values[index].y << " " );} + } + if ( host_output.complex_values[0].x != (float)dims_out.x * (float)dims_out.y * (float)dims_out.z ) + test_passed = false; + + bool continue_debugging; + // We don't want this to break compilation of other tests, so only check at runtime. + if constexpr ( FFT_DEBUG_STAGE < 5 ) { + continue_debugging = debug_partial_fft(host_output, dims_in, dims_out, dims_in, dims_out, __LINE__); + } + + if ( test_passed == false ) { + all_passed = false; + FastFFT_forward_passed[n] = false; + } + // MyFFTDebugAssertTestTrue( test_passed, "FastFFT unit impulse forward FFT"); + FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 2.0f); + + FT.InvFFT( ); + FT.CopyDeviceToHostAndSynchronize(host_output.real_values, true); + + if constexpr ( FFT_DEBUG_STAGE > 4 ) { + continue_debugging = debug_partial_fft(host_output, dims_in, dims_out, dims_in, dims_out, __LINE__); + } + + // Assuming the outputs are always even dimensions, padding_jump_val is always 2. + sum = host_output.ReturnSumOfReal(host_output.real_values, dims_out, true); + + if ( sum != full_sum ) { + all_passed = false; + FastFFT_roundTrip_passed[n] = false; + } + MyFFTDebugAssertTestTrue(sum == full_sum, "FastFFT constant image round trip for size " + std::to_string(dims_in.x)); + } // loop over sizes + + if ( all_passed ) { + if ( Rank == 3 ) + std::cout << " All 3d const_image tests passed!" << std::endl; + else + std::cout << " All 2d const_image tests passed!" << std::endl; + } + else { + for ( int n = 0; n < size.size( ); n++ ) { + if ( ! init_passed[n] ) + std::cout << " Initialization failed for size " << size[n] << std::endl; + if ( ! FFTW_passed[n] ) + std::cout << " FFTW failed for size " << size[n] << std::endl; + if ( ! FastFFT_forward_passed[n] ) + std::cout << " FastFFT failed for forward transform size " << size[n] << std::endl; + if ( ! FastFFT_roundTrip_passed[n] ) + std::cout << " FastFFT failed for roundtrip transform size " << size[n] << std::endl; + } + } + return all_passed; +} + +int main(int argc, char** argv) { + + std::string test_name; + // Default to running all tests + bool run_2d_unit_tests = false; + bool run_3d_unit_tests = false; + + const std::string_view text_line = "constant image"; + FastFFT::CheckInputArgs(argc, argv, text_line, run_2d_unit_tests, run_3d_unit_tests); + + if ( run_2d_unit_tests ) { + if ( ! const_image_test<2>(FastFFT::test_size) ) + return 1; + } + + if ( run_3d_unit_tests ) { + if ( ! const_image_test<3>(FastFFT::test_size_3d) ) + return 1; + // if (! 
unit_impulse_test(test_size_3d, true, true)) return 1; + } + + return 0; +}; \ No newline at end of file diff --git a/src/tests/helper_functions.cuh b/src/tests/helper_functions.cuh new file mode 100644 index 0000000..71f26a2 --- /dev/null +++ b/src/tests/helper_functions.cuh @@ -0,0 +1,191 @@ +#ifndef SRC_CPP_HELPER_FUNCTIONS_CUH_ +#define SRC_CPP_HELPER_FUNCTIONS_CUH_ + +#ifndef FFT_DEBUG_LEVEL +#error "FFT_DEBUG_LEVEL not defined" +#endif + +#ifndef FFT_DEBUG_STAGE +#error "FFT_DEBUG_STAGE not defined" +#endif + +#include + +#include "../fastfft/Image.cuh" +#include "../../include/FastFFT.cuh" + +// clang-format off +#define MyTestPrintAndExit(...) { std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; exit(-1); } + +// clang-format on + +void PrintArray(float2* array, short NX, short NY, short NZ, int line_wrapping = 34) { + // COMPLEX TODO make these functions. + int n = 0; + for ( int z = 0; z < NZ; z++ ) { + for ( int x = 0; x < NX; x++ ) { + std::cout << x << "[ "; + for ( int y = 0; y < NY; y++ ) { + // TODO: could these use the indexing functions? + std::cout << array[x + NX * (y + z * NY)].x << "," << array[x + NX * (y + z * NY)].y << " "; + n++; + if ( n == line_wrapping ) { + n = 0; + std::cout << std::endl; + } // line wrapping + } + std::cout << "] " << std::endl; + n = 0; + } + if ( NZ > 0 ) + std::cout << " ... ... ... " << z << " ... ... ..." << std::endl; + } +}; + +void PrintArray(float* array, short NX, short NY, short NZ, short NW, int line_wrapping = 34) { + int n = 0; + for ( int z = 0; z < NZ; z++ ) { + for ( int x = 0; x < NX; x++ ) { + + std::cout << x << "[ "; + for ( int y = 0; y < NY; y++ ) { + std::cout << array[x + (2 * NW) * (y + z * NY)] << " "; + n++; + if ( n == line_wrapping ) { + n = 0; + std::cout << std::endl; + } // line wrapping + } + std::cout << "] " << std::endl; + n = 0; + } + if ( NZ > 0 ) + std::cout << " ... ... ... " << z << " ... ... ..." << std::endl; + } +}; + +void PrintArray_XZ(float2* array, short NX, short NY, short NZ, int line_wrapping = 34) { + // COMPLEX TODO make these functions. + int n = 0; + for ( int x = 0; x < NX; x++ ) { + for ( int z = 0; z < NZ; z++ ) { + + std::cout << z << "[ "; + for ( int y = 0; y < NY; y++ ) { + std::cout << array[z + NZ * (y + x * NY)].x << "," << array[z + NZ * (y + x * NY)].y << " "; + n++; + if ( n == line_wrapping ) { + n = 0; + std::cout << std::endl; + } // line wrapping + } + std::cout << "] " << std::endl; + n = 0; + } + if ( NZ > 0 ) + std::cout << " ... ... ... " << x << " ... ... ..." << std::endl; + } +}; + +template +void CheckUnitImpulseRealImage(Image& positive_control, int input_line) { + + long address = 0; + // Loop over the real values z,y,x skipping the fft padding + for ( int k = 0; k < positive_control.size.z; k++ ) { + for ( int j = 0; j < positive_control.size.y; j++ ) { + for ( int i = 0; i < positive_control.size.x; i++ ) { + // Only check the address if we have too. + if ( positive_control.real_values[address] != 0.0f && address != 0 ) { + PrintArray(positive_control.real_values, positive_control.size.x, positive_control.size.y, positive_control.size.z, positive_control.size.w); + MyTestPrintAndExit(" "); + } + address++; + } + address += positive_control.padding_jump_value; + } + } + return; +} + +// For debugging the individual stages of the xforms +// Note: for some reason, passing by value altered the values while passing by reference did not. (Opposite?) +// eg. 
+template +bool debug_partial_fft(Image& test_image, + short4 fwd_dims_in, + short4 fwd_dims_out, + short4 inv_dims_in, + short4 inv_dims_out, + int input_line) { + + bool debug_stage_is_8 = false; + if constexpr ( fft_debug_stage == 0 ) { + PrintArray(test_image.real_values, fwd_dims_in.x, fwd_dims_in.y, fwd_dims_in.z, fwd_dims_in.w); + } + else if constexpr ( fft_debug_stage == 1 ) { + if ( Rank == 2 ) + // Transformed X transposed XY + PrintArray(test_image.complex_values, fwd_dims_in.y, fwd_dims_out.w, fwd_dims_in.z); + else + // Transformed X transposed XZ + PrintArray(test_image.complex_values, fwd_dims_in.z, fwd_dims_in.y, fwd_dims_out.w); + } + else if constexpr ( fft_debug_stage == 2 ) { + if ( Rank == 2 ) + // Noop, Transformed X transposed XY + PrintArray(test_image.complex_values, fwd_dims_in.y, fwd_dims_out.w, fwd_dims_in.z); + else + // Transformed Z, permute XYZ + PrintArray(test_image.complex_values, fwd_dims_in.y, fwd_dims_out.w, fwd_dims_out.z); + } + else if constexpr ( fft_debug_stage == 3 ) { + if ( Rank == 2 ) + // Transormed Y, no reordering + PrintArray(test_image.complex_values, fwd_dims_out.y, fwd_dims_out.w, fwd_dims_out.z); + else + // Transormed Y, no reordering + PrintArray(test_image.complex_values, fwd_dims_out.y, fwd_dims_out.w, fwd_dims_out.z); + } + else if constexpr ( fft_debug_stage == 4 ) { + // Same for 2d/3d intra-transorm op (if specified) + PrintArray(test_image.complex_values, fwd_dims_out.y, fwd_dims_out.w, fwd_dims_out.z); + } + else if constexpr ( fft_debug_stage == 5 ) { + if ( Rank == 2 ) + // Inv Transformed Y, no transpose + PrintArray(test_image.complex_values, inv_dims_out.y, inv_dims_in.w, inv_dims_out.z); + else + // Inv Transformed Y, swap YZ + PrintArray(test_image.complex_values, inv_dims_in.z, inv_dims_in.w, inv_dims_out.y); + } + else if constexpr ( fft_debug_stage == 6 ) { + if ( Rank == 2 ) + // Nothing different from debug 5 for 2d + PrintArray(test_image.complex_values, inv_dims_out.y, inv_dims_in.w, inv_dims_out.z); + else + // Inv Transformed Z, permute XYZ + PrintArray(test_image.complex_values, inv_dims_in.w, inv_dims_out.y, inv_dims_out.z); + } + else if constexpr ( fft_debug_stage == 7 ) { + if ( Rank == 2 ) + // Inv transformed X, no transpose + PrintArray(test_image.real_values, inv_dims_out.x, inv_dims_out.y, inv_dims_out.z, inv_dims_out.w); + else + // Inv transformed X, no transpose + PrintArray(test_image.real_values, inv_dims_out.x, inv_dims_out.y, inv_dims_out.z, inv_dims_out.w); + } + else if constexpr ( fft_debug_stage == 8 ) { + debug_stage_is_8 = true; + } + else + MyTestPrintAndExit("FFT_DEBUG_STAGE not recognized " + std::to_string(FFT_DEBUG_STAGE)); + + // std::cerr << "Debug stage " << fft_debug_stage << " passed." << std::endl; + return debug_stage_is_8; + + if ( ! 
debug_stage_is_8 ) + std::cerr << " Failed Assert at " << __FILE__ << " " << input_line << " " << __PRETTY_FUNCTION__ << std::endl; +} + +#endif \ No newline at end of file diff --git a/src/tests/non_cuda_compilation_unit.cu b/src/tests/non_cuda_compilation_unit.cu new file mode 100644 index 0000000..65660fb --- /dev/null +++ b/src/tests/non_cuda_compilation_unit.cu @@ -0,0 +1,78 @@ +// Named .cu for convenience with building + +// The purpose of this test is to ensure that we can build a "pure" cpp file and only link against the CUDA business at the end + +#include "../../include/FastFFT.h" + +int main(int argc, char** argv) { + + const int input_size = 64; + + FastFFT::FourierTransformer FT; + // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU + FT.SetForwardFFTPlan(input_size, input_size, 1, input_size, input_size, 1, true); + FT.SetInverseFFTPlan(input_size, input_size, 1, input_size, input_size, 1, false); + + // The padding (dims.w) is calculated based on the setup + short4 dims_in = FT.ReturnFwdInputDimensions( ); + short4 dims_out = FT.ReturnFwdOutputDimensions( ); + + std::array host_input; + std::array host_output; + + int host_input_real_memory_allocated = FT.ReturnInputMemorySize( ); + int host_output_real_memory_allocated = FT.ReturnInvOutputMemorySize( ); + + if ( host_input_real_memory_allocated != host_output_real_memory_allocated ) { + std::cout << "Error: input and output memory sizes do not match" << std::endl; + std::cout << "Input: " << host_input_real_memory_allocated << " Output: " << host_output_real_memory_allocated << std::endl; + return 1; + } + + if ( host_input_real_memory_allocated != host_input.size( ) ) { + std::cout << "Error: input memory size does not match expected" << std::endl; + std::cout << "Input: " << host_input_real_memory_allocated << " Expected: " << host_input.size( ) << std::endl; + return 1; + } + + // fill with negative ones so we can make sure the copy and set function works + host_input.fill(-1.0f); + host_output.fill(-1.0f); + + // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which + // ensures faster transfer. If false, it will be pinned for you. 
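"Pinned" here means page-locked host memory: the driver can DMA it directly, whereas pageable memory is first staged through an internal bounce buffer, making transfers slower and harder to overlap. A standalone sketch of pinning an existing allocation, which is roughly what the library does for you when the flag is false; its exact internals are not shown in this patch:

    #include <vector>
    #include <cuda_runtime.h>

    int main( ) {
        std::vector<float> host(64 * 64, 0.0f);
        // Page-lock the existing buffer so async copies run at full bandwidth.
        cudaHostRegister(host.data( ), host.size( ) * sizeof(float), cudaHostRegisterDefault);

        float* device = nullptr;
        cudaMalloc(&device, host.size( ) * sizeof(float));
        cudaMemcpyAsync(device, host.data( ), host.size( ) * sizeof(float),
                        cudaMemcpyHostToDevice, cudaStreamPerThread);
        cudaStreamSynchronize(cudaStreamPerThread);

        cudaFree(device);
        cudaHostUnregister(host.data( ));
        return 0;
    }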
+ FT.SetInputPointer(host_input.data( ), false); + + // Check basic initialization function + FT.SetToConstant(host_input.data( ), host_output_real_memory_allocated, 3.14f); + for ( auto& val : host_input ) { + if ( val != 3.14f ) { + std::cout << "Error: input memory not set to constant" << std::endl; + return 1; + } + } + + // Now set to a unit impulse + host_input.fill(0.0f); + host_input.at(0) = 1.0f; + + // Copy to the device + FT.CopyHostToDevice(host_input.data( )); + + // Do a round trip FFT + FT.FwdFFT( ); + FT.InvFFT( ); + + // Now copy back to the output array (still set to -1) + FT.CopyDeviceToHost(host_output.data( ), true, host_input_real_memory_allocated); + if ( host_output.at(0) == input_size * input_size ) { + std::cout << "Success: output memory copied back correctly after fft/ifft pair" << std::endl; + } + else { + std::cout << "Error: output memory not copied back correctly" << std::endl; + std::cout << "Output: " << host_output.at(0) << " Expected: " << input_size * input_size << std::endl; + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/src/tests/padded_convolution_FastFFT_vs_cuFFT.cu b/src/tests/padded_convolution_FastFFT_vs_cuFFT.cu new file mode 100644 index 0000000..3e0d359 --- /dev/null +++ b/src/tests/padded_convolution_FastFFT_vs_cuFFT.cu @@ -0,0 +1,457 @@ +#include "tests.h" +#include +#include + +template +void compare_libraries(std::vector size, FastFFT::SizeChangeType::Enum size_change_type, bool do_rectangle) { + + using SCT = FastFFT::SizeChangeType::Enum; + + constexpr bool skip_cufft_for_profiling = false; + constexpr bool print_out_time = true; + // bool set_padding_callback = false; // the padding callback is slower than pasting in b/c the read size of the pointers is larger than the actual data. do not use. + bool set_conjMult_callback = true; + bool is_size_change_decrease = false; + + if ( size_change_type == SCT::decrease ) { + is_size_change_decrease = true; + } + + // For an increase or decrease in size, we have to shrink the loop by one, + // for a no_change, we don't because every size is compared to itself. 
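The loop bookkeeping starting on the next line (flagged by the TODO as confusing) just enumerates which input/output sizes get paired. A standalone sketch that prints the pairs for {64, 128, 256}; for a decrease the same pairs are produced but the input and output roles are swapped, as in the code below:

    #include <cstdio>
    #include <vector>

    enum class Change { no_change, increase };

    int main( ) {
        const std::vector<int> size = {64, 128, 256};
        const Change c = Change::increase;

        const size_t loop_limit = (c == Change::no_change) ? 0 : 1;
        for ( size_t iSize = 0; iSize < size.size( ) - loop_limit; iSize++ ) {
            size_t oSize     = (c == Change::no_change) ? iSize : iSize + 1;
            size_t loop_size = (c == Change::no_change) ? oSize + 1 : size.size( );
            while ( oSize < loop_size ) {
                // no_change: 64->64, 128->128, 256->256
                // increase : 64->128, 64->256, 128->256
                printf("pad %d -> %d\n", size[iSize], size[oSize]);
                oSize++;
            }
        }
        return 0;
    }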
+ int loop_limit = 1; + if ( size_change_type == SCT::no_change ) + loop_limit = 0; + + // Currently, to test a non-square input, the fixed input sizes are used + // and the input x size is reduced by input_x / make_rect_x + int make_rect_x; + int make_rect_y = 1; + if ( do_rectangle ) + make_rect_x = 2; + else + make_rect_x = 1; + + if ( Rank == 3 && do_rectangle ) { + std::cout << "ERROR: cannot do 3d and rectangle at the same time" << std::endl; + return; + } + + short4 input_size; + short4 output_size; + for ( int iSize = 0; iSize < size.size( ) - loop_limit; iSize++ ) { + int oSize; + int loop_size; + // TODO: the logic here is confusing, clean it up + if ( size_change_type != SCT::no_change ) { + oSize = iSize + 1; + loop_size = size.size( ); + } + else { + oSize = iSize; + loop_size = oSize + 1; + } + + while ( oSize < loop_size ) { + + if ( is_size_change_decrease ) { + output_size = make_short4(size[iSize] / make_rect_x, size[iSize] / make_rect_y, 1, 0); + input_size = make_short4(size[oSize] / make_rect_x, size[oSize] / make_rect_y, 1, 0); + if ( Rank == 3 ) { + output_size.z = size[iSize]; + input_size.z = size[oSize]; + } + } + else { + input_size = make_short4(size[iSize] / make_rect_x, size[iSize] / make_rect_y, 1, 0); + output_size = make_short4(size[oSize] / make_rect_x, size[oSize] / make_rect_y, 1, 0); + if ( Rank == 3 ) { + input_size.z = size[iSize]; + output_size.z = size[oSize]; + } + } + if ( print_out_time ) { + printf("Testing padding from %i,%i,%i to %i,%i,%i\n", input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + } + + if ( (input_size.x == output_size.x && input_size.y == output_size.y && input_size.z == output_size.z) ) { + // Also will change the path called in FastFFT to just be fwd/inv xform. + set_conjMult_callback = false; + } + + // bool test_passed = true; + + Image FT_input(input_size); + Image FT_output(output_size); + Image cuFFT_input(input_size); + Image cuFFT_output(output_size); + + short4 target_size; + + if ( is_size_change_decrease ) + target_size = input_size; // assuming xcorr_fwd_NOOP_inv_DECREASE + else + target_size = output_size; + + Image target_search_image(target_size); + Image positive_control(target_size); + + // We just make one instance of the FourierTransformer class, with calc type float. + // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. + FastFFT::FourierTransformer FT; + // Create an instance to copy memory also for the cufft tests. + FastFFT::FourierTransformer cuFFT; + FastFFT::FourierTransformer targetFT; + + if ( is_size_change_decrease ) { + FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z); + FT.SetInverseFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + + // For the subset of outputs this is just the input size, assuming the program then accesses just the valid data (could explicitly put into a new array which would be even slower.) 
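The cuFFT reference path configured below ultimately reduces to one plan per direction and one exec call per transform, wrapped in the precheck;/postcheck; macros. Their definitions are not shown in this patch and are assumed to be cudaGetLastError-style guards, so this standalone sketch uses a plain helper instead:

    #include <cstdio>
    #include <cufft.h>
    #include <cuda_runtime.h>

    static void CheckLastError(const char* where) {
        cudaError_t err = cudaGetLastError( );
        if ( err != cudaSuccess )
            printf("%s: %s\n", where, cudaGetErrorString(err));
    }

    void RoundTripCufft(float* d_real, cufftComplex* d_complex, int nx, int ny) {
        cufftHandle fwd, inv;
        cufftPlan2d(&fwd, ny, nx, CUFFT_R2C); // cuFFT takes the slowest dimension first
        cufftPlan2d(&inv, ny, nx, CUFFT_C2R);

        cufftExecR2C(fwd, (cufftReal*)d_real, d_complex);
        CheckLastError("R2C");
        cufftExecC2R(inv, d_complex, (cufftReal*)d_real);
        CheckLastError("C2R");

        cudaStreamSynchronize(0);
        cufftDestroy(fwd);
        cufftDestroy(inv);
    }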
+ cuFFT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z); + cuFFT.SetInverseFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z); + + targetFT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z); + targetFT.SetInverseFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + } + else { + FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + + cuFFT.SetForwardFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + cuFFT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + + targetFT.SetForwardFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + targetFT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + } + + short4 fwd_dims_in = FT.ReturnFwdInputDimensions( ); + short4 fwd_dims_out = FT.ReturnFwdOutputDimensions( ); + short4 inv_dims_in = FT.ReturnInvInputDimensions( ); + short4 inv_dims_out = FT.ReturnInvOutputDimensions( ); + + FT_input.real_memory_allocated = FT.ReturnInputMemorySize( ); + FT_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); + + cuFFT_input.real_memory_allocated = cuFFT.ReturnInputMemorySize( ); + cuFFT_output.real_memory_allocated = cuFFT.ReturnInvOutputMemorySize( ); + + if ( is_size_change_decrease ) + target_search_image.real_memory_allocated = targetFT.ReturnInputMemorySize( ); + else + target_search_image.real_memory_allocated = targetFT.ReturnInvOutputMemorySize( ); // the larger of the two. + + positive_control.real_memory_allocated = target_search_image.real_memory_allocated; // this won't change size + + bool set_fftw_plan = false; + FT_input.Allocate(set_fftw_plan); + FT_output.Allocate(set_fftw_plan); + + cuFFT_input.Allocate(set_fftw_plan); + cuFFT_output.Allocate(set_fftw_plan); + + target_search_image.Allocate(true); + positive_control.Allocate(true); + + // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which + // ensures faster transfer. If false, it will be pinned for you. + FT.SetInputPointer(FT_input.real_values, false); + cuFFT.SetInputPointer(cuFFT_input.real_values, false); + targetFT.SetInputPointer(target_search_image.real_values, false); + + // Set a unit impulse at the center of the input array. + // For now just considering the real space image to have been implicitly quadrant swapped so the center is at the origin. + FT.SetToConstant(FT_input.real_values, FT_input.real_memory_allocated, 0.0f); + FT.SetToConstant(cuFFT_input.real_values, cuFFT_input.real_memory_allocated, 0.0f); + FT.SetToConstant(FT_output.real_values, FT_output.real_memory_allocated, 0.0f); + FT.SetToConstant(cuFFT_output.real_values, cuFFT_output.real_memory_allocated, 0.0f); + FT.SetToConstant(target_search_image.real_values, target_search_image.real_memory_allocated, 0.0f); + FT.SetToConstant(positive_control.real_values, target_search_image.real_memory_allocated, 0.0f); + + // Place these values at the origin of the image and after convolution, should be at 0,0,0. 
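The expected peak value checked further down follows from three facts: the forward FFT of a single impulse of height v at the origin is a constant spectrum v; the conjugate multiplication then makes every coefficient v1*v2; and the unnormalized inverse FFT sums all N coefficients back into the origin pixel. Worked through for a 64 x 64 image with testVal_1 = 2 and testVal_2 = 3:

    N             = 64 * 64   = 4096
    expected peak = N * 2 * 3 = 24576 at pixel (0,0), with 0 everywhere else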
+ float testVal_1 = 2.0f; + float testVal_2 = set_conjMult_callback ? 3.0f : 1.0; // This way the test conditions are the same, the 1. indicating no conj + FT_input.real_values[0] = testVal_1; + cuFFT_input.real_values[0] = testVal_1; + target_search_image.real_values[0] = testVal_2; + positive_control.real_values[0] = testVal_1; + + // Transform the target on the host prior to transfer. + target_search_image.FwdFFT( ); + + // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. + FT.CopyHostToDevice(FT_input.real_values); + + cuFFT.CopyHostToDevice(cuFFT_input.real_values); + + targetFT.CopyHostToDevice(target_search_image.real_values); + + // Wait on the transfers to finish. + cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); + + // Positive control on the host. + // After both forward FFT's we should constant values in each pixel = testVal_1 and testVal_2. + // After the Conjugate multiplication, we should have a constant value of testVal_1*testVal_2. + // After the inverse FFT, we should have a constant value of testVal_1*testVal_2 in the center pixel and 0 everywhere else. + positive_control.FwdFFT( ); + if ( set_conjMult_callback ) + positive_control.MultiplyConjugateImage(target_search_image.complex_values); + positive_control.InvFFT( ); + + CheckUnitImpulseRealImage(positive_control, __LINE__); + + if ( positive_control.real_values[0] == positive_control.size.x * positive_control.size.y * positive_control.size.z * testVal_1 * testVal_2 ) { + if ( print_out_time ) { + std::cout << "Test passed for FFTW positive control." << std::endl; + } + } + else { + std::cout << "Test failed for FFTW positive control. Value at zero is " << positive_control.real_values[0] << std::endl; + MyTestPrintAndExit(" "); + } + + cuFFT_output.create_timing_events( ); + if ( Rank == 3 ) { + cuFFT_output.MakeCufftPlan3d( ); + } + else { + if ( print_out_time ) { + std::cout << "2D test " << std::endl; + } + + cuFFT_input.MakeCufftPlan( ); + cuFFT_output.MakeCufftPlan( ); + } + + std::cout << "Test lambda" << std::endl; + + FastFFT::KernelFunction::my_functor noop; + FastFFT::KernelFunction::my_functor conj_mul; + + ////////////////////////////////////////// + ////////////////////////////////////////// + // Warm up and check for accuracy + // we set set_conjMult_callback = false + if ( set_conjMult_callback || is_size_change_decrease ) { + // FT.CrossCorrelate(targetFT.d_ptr.momentum_space, false); + // Will type deduction work here? + MyFFTDebugPrintWithDetails("Calling Generic_Fwd_Image_Inv"); + FT.Generic_Fwd_Image_Inv(targetFT.d_ptr.momentum_space, noop, conj_mul, noop); + } + else { + MyFFTDebugPrintWithDetails("Calling Generic_Fwd_Image_Inv"); + FT.FwdFFT( ); + FT.InvFFT( ); + } + + bool continue_debugging; + if ( is_size_change_decrease ) { + // Because the output is smaller than the input, we just copy to FT input. + // FIXME: In reality, we didn't need to allocate FT_output at all in this case + FT.CopyDeviceToHostAndSynchronize(FT_input.real_values, false); + continue_debugging = debug_partial_fft(FT_input, fwd_dims_in, fwd_dims_out, inv_dims_in, inv_dims_out, __LINE__); + } + else { + // the output is equal or > the input, so we can always copy there. + FT.CopyDeviceToHostAndSynchronize(FT_output.real_values, false, false); + continue_debugging = debug_partial_fft(FT_output, fwd_dims_in, fwd_dims_out, inv_dims_in, inv_dims_out, __LINE__); + } + + if ( ! 
continue_debugging ) { + MyTestPrintAndExit(" "); + } + + if ( is_size_change_decrease ) { + CheckUnitImpulseRealImage(FT_input, __LINE__); + } + else { + CheckUnitImpulseRealImage(FT_output, __LINE__); + } + + int n_loops; + if ( Rank == 3 ) { + int max_size = std::max(fwd_dims_in.x, fwd_dims_out.x); + if ( max_size < 128 ) { + n_loops = 1000; + } + else if ( max_size <= 256 ) { + n_loops = 400; + } + else if ( max_size <= 512 ) { + n_loops = 150; + } + else { + n_loops = 50; + } + } + else { + int max_size = std::max(fwd_dims_in.x, fwd_dims_out.x); + if ( max_size < 256 ) { + n_loops = 10000; + } + else if ( max_size <= 512 ) { + n_loops = 5000; + } + else if ( max_size <= 2048 ) { + n_loops = 2500; + } + else { + n_loops = 1000; + } + } + + cuFFT_output.record_start( ); + for ( int i = 0; i < n_loops; ++i ) { + if ( set_conjMult_callback || is_size_change_decrease ) { + // FT.CrossCorrelate(targetFT.d_ptr.momentum_space_buffer, false); + // Will type deduction work here? + FT.Generic_Fwd_Image_Inv(targetFT.d_ptr.momentum_space, noop, conj_mul, noop); + } + else { + FT.FwdFFT( ); + FT.InvFFT( ); + } + } + cuFFT_output.record_stop( ); + cuFFT_output.synchronize( ); + cuFFT_output.print_time("FastFFT", print_out_time); + float FastFFT_time = cuFFT_output.elapsed_gpu_ms; + + // if (set_padding_callback) + // { + // precheck; + // cufftReal* overlap_pointer; + // overlap_pointer = cuFFT.d_ptr.position_space; + // cuFFT_output.SetClipIntoCallback(overlap_pointer, cuFFT_input.size.x, cuFFT_input.size.y, cuFFT_input.size.w*2); + // postcheck; + // } + + if ( set_conjMult_callback ) { + precheck; + // FIXME scaling factor + cuFFT_output.SetComplexConjMultiplyAndLoadCallBack((cufftComplex*)targetFT.d_ptr.momentum_space_buffer, 1.0f); + postcheck; + } + + if ( ! 
skip_cufft_for_profiling ) { + ////////////////////////////////////////// + ////////////////////////////////////////// + // Warm up and check for accuracy + if ( is_size_change_decrease ) { + + precheck; + cudaErr(cufftExecR2C(cuFFT_input.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); + postcheck; + + precheck; + cudaErr(cufftExecC2R(cuFFT_input.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); + postcheck; + } + else { + // cuFFT.ClipIntoTopLeft(); + // cuFFT.ClipIntoReal(cuFFT_output.size.x/2, cuFFT_output.size.y/2, cuFFT_output.size.z/2); + // cuFFT.CopyDeviceToHostAndSynchronize(cuFFT_output.real_values,false); + + precheck; + cudaErr(cufftExecR2C(cuFFT_output.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); + postcheck; + + precheck; + cudaErr(cufftExecC2R(cuFFT_output.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); + postcheck; + } + + cuFFT_output.record_start( ); + for ( int i = 0; i < n_loops; ++i ) { + // std::cout << i << "i / " << n_loops << "n_loops" << std::endl; + if ( set_conjMult_callback ) + cuFFT.ClipIntoTopLeft( ); + // cuFFT.ClipIntoReal(input_size.x/2, input_size.y/2, input_size.z/2); + + if ( is_size_change_decrease ) { + precheck; + cudaErr(cufftExecR2C(cuFFT_input.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); + postcheck; + + precheck; + cudaErr(cufftExecC2R(cuFFT_input.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); + postcheck; + } + else { + precheck; + cudaErr(cufftExecR2C(cuFFT_output.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); + postcheck; + + precheck; + cudaErr(cufftExecC2R(cuFFT_output.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); + postcheck; + } + } + cuFFT_output.record_stop( ); + cuFFT_output.synchronize( ); + cuFFT_output.print_time("cuFFT", print_out_time); + } // end of if (! skip_cufft_for_profiling) + std::cout << "For size " << input_size.x << " to " << output_size.x << ": "; + std::cout << "Ratio cuFFT/FastFFT : " << cuFFT_output.elapsed_gpu_ms / FastFFT_time << "\n\n" + << std::endl; + + oSize++; + // We don't want to loop if the size is not actually changing. 
+ } // while loop over pad to size + } // for loop over pad from size +} + +int main(int argc, char** argv) { + + using SCT = FastFFT::SizeChangeType::Enum; + + std::string test_name; + // Default to running all tests + bool run_2d_performance_tests = false; + bool run_3d_performance_tests = false; + + const std::string_view text_line = "simple convolution"; + FastFFT::CheckInputArgs(argc, argv, text_line, run_2d_performance_tests, run_3d_performance_tests); + + // TODO: size decrease + if ( run_2d_performance_tests ) { +#ifdef HEAVYERRORCHECKING_FFT + std::cout << "Running performance tests with heavy error checking.\n"; + std::cout << "This doesn't make sense as the synchronizations are invalidating.\n"; +// exit(1); +#endif + SCT size_change_type; + // Set the SCT to no_change, increase, or decrease + size_change_type = SCT::no_change; + compare_libraries<2>(FastFFT::test_size, size_change_type, false); + // compare_libraries<2>(test_size_rectangle, do_3d, size_change_type, true); + + size_change_type = SCT::increase; + compare_libraries<2>(FastFFT::test_size, size_change_type, false); + // compare_libraries<2>(test_size_rectangle, do_3d, size_change_type, true); + + size_change_type = SCT::decrease; + compare_libraries<2>(FastFFT::test_size, size_change_type, false); + } + + if ( run_3d_performance_tests ) { +#ifdef HEAVYERRORCHECKING_FFT + std::cout << "Running performance tests with heavy error checking.\n"; + std::cout << "This doesn't make sense as the synchronizations are invalidating.\n"; +#endif + + SCT size_change_type; + + size_change_type = SCT::no_change; + compare_libraries<3>(FastFFT::test_size, size_change_type, false); + + // TODO: These are not yet completed. + // size_change_type = SCT::increase; + // compare_libraries<3>(FastFFT::test_size, do_3d, size_change_type, false); + + // size_change_type = SCT::decrease; + // compare_libraries(FastFFT::test_size, do_3d, size_change_type, false); + } + + return 0; +}; \ No newline at end of file diff --git a/src/tests/test.cu b/src/tests/test.cu deleted file mode 120000 index 915a8e1..0000000 --- a/src/tests/test.cu +++ /dev/null @@ -1 +0,0 @@ -test.cu.cpp \ No newline at end of file diff --git a/src/tests/test.cu b/src/tests/test.cu new file mode 100644 index 0000000..f934565 --- /dev/null +++ b/src/tests/test.cu @@ -0,0 +1,307 @@ +#include "tests.h" + +// Define an enum for size change type to indecate a decrease, no change or increase + +// The Fourier transform of a constant should be a unit impulse, and on back fft, without normalization, it should be a constant * N. +// It is assumed the input/output have the same dimension (i.e. 
no padding)
+
+template
+bool random_image_test(std::vector size, bool do_3d = false) {
+
+    bool all_passed = true;
+    std::vector init_passed(size.size( ), true);
+    std::vector FFTW_passed(size.size( ), true);
+    std::vector FastFFT_forward_passed(size.size( ), true);
+    std::vector FastFFT_roundTrip_passed(size.size( ), true);
+
+    for ( int n = 0; n < size.size( ); n++ ) {
+
+        short4 input_size;
+        short4 output_size;
+        long full_sum = long(size[n]);
+        if ( do_3d ) {
+            input_size = make_short4(size[n], size[n], size[n], 0);
+            output_size = make_short4(size[n], size[n], size[n], 0);
+            full_sum = full_sum * full_sum * full_sum * full_sum * full_sum * full_sum;
+        }
+        else {
+            input_size = make_short4(size[n], size[n], 1, 0);
+            output_size = make_short4(size[n], size[n], 1, 0);
+            full_sum = full_sum * full_sum * full_sum * full_sum;
+        }
+
+        float sum;
+
+        Image host_input(input_size);
+        Image host_output(output_size);
+        Image host_copy(output_size);
+        Image device_output(output_size);
+
+        // Pointers to the arrays on the host -- maybe make this a struct of some sort? I'm sure there is a parallel in cuda, look into cuarray/texture code
+
+        // We just make one instance of the FourierTransformer class, with calc type float.
+        // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bfloat16, TODO inputs at lower precision for bandwidth improvement.
+        FastFFT::FourierTransformer FT;
+
+        // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU
+        FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z);
+        FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z);
+
+        // The padding (dims.w) is calculated based on the setup
+        short4 dims_in = FT.ReturnFwdInputDimensions( );
+        short4 dims_out = FT.ReturnFwdOutputDimensions( );
+
+        // Determine how much memory we need, working with FFTW/CUDA style in-place transform padding.
+        // Note: there is no reason we really need this, because the xforms will always be out of place.
+        // For now, this is just in place because all memory in cisTEM is allocated accordingly.
+        host_input.real_memory_allocated = FT.ReturnInputMemorySize( );
+        host_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( );
+        host_copy.real_memory_allocated = FT.ReturnInvOutputMemorySize( );
+
+        // On the device, we will always allocate enough memory for the larger of input/output including the buffer array.
+        // Minimize the number of calls to malloc, which are slow and can lead to fragmentation.
+        device_output.real_memory_allocated = std::max(host_input.real_memory_allocated, host_output.real_memory_allocated);
+
+        // In your own programs, you will be handling this memory allocation yourself. We'll just make something here.
+        // I think fftwf_malloc may potentially create a different alignment than new/delete, but kinda doubt it. For cisTEM consistency...
+        bool set_fftw_plan = true;
+        host_input.Allocate(set_fftw_plan);
+        host_output.Allocate(set_fftw_plan);
+        host_copy.Allocate(set_fftw_plan);
+
+        // Set our input host memory to random values.
+        FT.SetToRandom(host_output.real_values, host_output.real_memory_allocated, 0.0f, 1.0f);
+
+        // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory), which
+        // ensures faster transfer. If false, it will be pinned for you.
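        // A quick illustration of what the pinning refers to (sketch only, not FastFFT API; host_ptr,
        // device_ptr and n_floats are hypothetical names). With the plain CUDA runtime, page-locking an
        // existing allocation and transferring it asynchronously would look roughly like:
        //
        //     cudaErr(cudaHostRegister(host_ptr, n_floats * sizeof(float), cudaHostRegisterDefault));
        //     cudaErr(cudaMemcpyAsync(device_ptr, host_ptr, n_floats * sizeof(float),
        //                             cudaMemcpyHostToDevice, cudaStreamPerThread));
        //     cudaErr(cudaStreamSynchronize(cudaStreamPerThread));
        //     cudaErr(cudaHostUnregister(host_ptr));
        //
        // Page-locked memory can be DMA'd directly by the GPU, avoiding a staging copy through pageable
        // memory, which is why pinned transfers are faster.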
+ FT.SetInputPointer(host_output.real_values, false); + + // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. + FT.CopyHostToDevice(host_output.real_values); + +#if FFT_DEBUG_STAGE > 0 + host_output.FwdFFT( ); +#endif + + for ( long i = 0; i < host_output.real_memory_allocated / 2; i++ ) { + host_copy.complex_values[i] = host_output.complex_values[i]; + } + + // This method will call the regular FFT kernels given the input/output dimensions are equal when the class is instantiated. + bool swap_real_space_quadrants = false; + FT.FwdFFT( ); + + // in buffer, do not deallocate, do not unpin memory + FT.CopyDeviceToHostAndSynchronize(host_output.real_values, false); + bool test_passed = true; + +#if FFT_DEBUG_STAGE == 0 + PrintArray(host_output.real_values, dims_out.x, dims_in.y, dims_in.z, dims_out.w); + PrintArray(host_copy.real_values, dims_out.x, dims_in.y, dims_in.z, dims_out.w); + MyTestPrintAndExit("stage 0 "); +#elif FFT_DEBUG_STAGE == 1 + std::cout << " For random_image_test partial transforms aren't supported, b/c we need to compare to the cpu output." << std::endl; + MyTestPrintAndExit("stage 1 "); +#elif FFT_DEBUG_STAGE == 2 + std::cout << " For random_image_test partial transforms aren't supported, b/c we need to compare to the cpu output." << std::endl; + MyTestPrintAndExit("stage 2 "); +#elif FFT_DEBUG_STAGE == 3 + PrintArray(host_output.complex_values, dims_in.y, dims_out.w, dims_out.z); + PrintArray(host_copy.complex_values, dims_in.y, dims_out.w, dims_out.z); + + // std::cout << "Distance between FastFFT and CPU: " << distance << std::endl; + MyTestPrintAndExit("stage 3 "); +#endif + + double distance = 0.0; + for ( long index = 0; index < host_output.real_memory_allocated / 2; index++ ) { + distance += sqrt((host_output.complex_values[index].x - host_copy.complex_values[index].x) * (host_output.complex_values[index].x - host_copy.complex_values[index].x) + + (host_output.complex_values[index].y - host_copy.complex_values[index].y) * (host_output.complex_values[index].y - host_copy.complex_values[index].y)); + } + distance /= (host_output.real_memory_allocated / 2); + + std::cout << "Distance between FastFFT and CPU: " << distance << std::endl; + exit(0); + if ( test_passed == false ) { + all_passed = false; + FastFFT_forward_passed[n] = false; + } + // MyFFTDebugAssertTestTrue( test_passed, "FastFFT unit impulse forward FFT"); + FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 2.0f); + + FT.InvFFT( ); + FT.CopyDeviceToHostAndSynchronize(host_output.real_values, true); + +#if FFT_DEBUG_STAGE == 4 + PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); + MyTestPrintAndExit("stage 4 "); +#elif FFT_DEBUG_STAGE == 5 + PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); + MyTestPrintAndExit("stage 5 "); +#elif FFT_DEBUG_STAGE == 6 + if ( do_3d ) { + std::cout << " in 3d print inv " << dims_out.w << "w" << std::endl; + PrintArray(host_output.complex_values, dims_out.w, dims_out.y, dims_out.z); + } + else + PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); + MyTestPrintAndExit("stage 6 "); +#elif FFT_DEBUG_STAGE == 7 + PrintArray(host_output.real_values, dims_out.x, dims_out.y, dims_out.z, dims_out.w); + MyTestPrintAndExit("stage 7 "); +#elif FFT_DEBUG_STAGE > 7 + // No debug, keep going +#else + MyTestPrintAndExit(" This block is only valid for FFT_DEBUG_STAGE == 4, 5, 7 "); +#endif + + // Assuming the outputs are 
always even dimensions, padding_jump_val is always 2. + sum = host_output.ReturnSumOfReal(host_output.real_values, dims_out, true); + + if ( sum != full_sum ) { + all_passed = false; + FastFFT_roundTrip_passed[n] = false; + } + MyFFTDebugAssertTestTrue(sum == full_sum, "FastFFT constant image round trip for size " + std::to_string(dims_in.x)); + } // loop over sizes + + if ( all_passed ) { + if ( do_3d ) + std::cout << " All 3d const_image tests passed!" << std::endl; + else + std::cout << " All 2d const_image tests passed!" << std::endl; + } + else { + for ( int n = 0; n < size.size( ); n++ ) { + if ( ! init_passed[n] ) + std::cout << " Initialization failed for size " << size[n] << std::endl; + if ( ! FFTW_passed[n] ) + std::cout << " FFTW failed for size " << size[n] << std::endl; + if ( ! FastFFT_forward_passed[n] ) + std::cout << " FastFFT failed for forward transform size " << size[n] << std::endl; + if ( ! FastFFT_roundTrip_passed[n] ) + std::cout << " FastFFT failed for roundtrip transform size " << size[n] << std::endl; + } + } + return all_passed; +} + +template +void run_oned(std::vector size) { + + // Override the size to be one dimensional in x + std::cout << "Running one-dimensional tests\n" + << std::endl; + + for ( int n : size ) { + short4 input_size = make_short4(n, 1, 1, 0); + short4 output_size = make_short4(n, 1, 1, 0); + + Image FT_input(input_size); + Image FT_output(output_size); + Image FT_input_complex(input_size); + Image FT_output_complex(output_size); + + // We just make one instance of the FourierTransformer class, with calc type float. + // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. + FastFFT::FourierTransformer FT; + FastFFT::FourierTransformer FT_complex; + + // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU + FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + + FT_complex.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + FT_complex.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + + FT_input.real_memory_allocated = FT.ReturnInputMemorySize( ); + FT_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); + + FT_input_complex.real_memory_allocated = FT_complex.ReturnInputMemorySize( ); + FT_output_complex.real_memory_allocated = FT_complex.ReturnInvOutputMemorySize( ); + + bool set_fftw_plan = true; + FT_input.Allocate(set_fftw_plan); + FT_output.Allocate(set_fftw_plan); + + FT_input_complex.Allocate( ); + FT_output_complex.Allocate( ); + + // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which + // ensures faster transfer. If false, it will be pinned for you. + FT.SetInputPointer(FT_input.real_values, false); + FT_complex.SetInputPointer(FT_input_complex.complex_values, false); + + FT.SetToConstant(FT_input.real_values, FT_input.real_memory_allocated, 1.f); + + // Set a unit impulse at the center of the input array. 
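        // For reference, a worked example of what this 1D test should produce (sketch only, assuming the
        // constant real/complex inputs set just above and below, and no normalization on either transform,
        // as described in the comment at the top of this file). For n = 4:
        //
        //     input        : 1, 1, 1, 1
        //     forward FFT  : bin 0 = n = 4, all other bins = 0   (a constant transforms to a DC impulse)
        //     inverse FFT  : 4, 4, 4, 4                          (the unnormalized round trip scales by n)
        //
        // so the "FFTW inv" / "Ft inv" printouts further down should show n in every element.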
+ // FT.SetToConstant(FT_input.real_values, FT_input.real_memory_allocated, 1.0f); + float2 const_val = make_float2(1.0f, 0.0f); + FT_complex.SetToConstant(FT_input_complex.complex_values, FT_input.real_memory_allocated, const_val); + for ( int i = 0; i < 10; i++ ) { + std::cout << FT_input_complex.complex_values[i].x << "," << FT_input_complex.complex_values[i].y << std::endl; + } + + FT.CopyHostToDevice(FT_input.real_values); + FT_complex.CopyHostToDevice(FT_input_complex.complex_values); + cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); + + // Set the outputs to a clearly wrong answer. + FT.SetToConstant(FT_output.real_values, FT_input.real_memory_allocated, 2.0f); + const_val = make_float2(2.0f, 2.0f); + FT_complex.SetToConstant(FT_output_complex.complex_values, FT_output.real_memory_allocated, const_val); + + FT_input.FwdFFT( ); + + bool transpose_output = false; + bool swap_real_space_quadrants = false; + FT.FwdFFT( ); + FT_complex.FwdFFT( ); + + FT.CopyDeviceToHostAndSynchronize(FT_output.real_values, false, false); + FT_complex.CopyDeviceToHostAndSynchronize(FT_output_complex.real_values, false, false); + + FT_input.InvFFT( ); + + for ( int i = 0; i < 5; ++i ) { + std::cout << "FFTW inv " << FT_input.real_values[i] << std::endl; + } + std::cout << std::endl; + + FT.InvFFT( ); + FT_complex.InvFFT( ); + FT.CopyDeviceToHostAndSynchronize(FT_output.real_values, true); + FT_complex.CopyDeviceToHostAndSynchronize(FT_output_complex.real_values, true); + + for ( int i = 0; i < 10; i++ ) { + std::cout << "Ft inv " << FT_output.real_values[i] << std::endl; + } + for ( int i = 0; i < 10; i++ ) { + std::cout << "Ft complex inv " << FT_output_complex.real_values[i].x << "," << FT_output_complex.real_values[i].y << std::endl; + } + } +} + +int main(int argc, char** argv) { + + using SCT = FastFFT::SizeChangeType::Enum; + + if ( argc != 2 ) { + return 1; + } + std::string test_name = argv[1]; + std::printf("Standard is %li\n\n", __cplusplus); + + // Input size vectors to be tested. + std::vector test_size = {32, 64, 128, 256, 512, 1024, 2048, 4096}; + std::vector test_size_rectangle = {64, 128, 256, 512, 1024, 2048, 4096}; + std::vector test_size_3d = {32, 64, 128, 256, 512}; + // std::vector test_size_3d ={512}; + + // The launch parameters fail for 4096 -> < 64 for r2c_decrease, not sure if it is the elements_per_thread or something else. + // For now, just over-ride these small sizes + std::vector test_size_for_decrease = {64, 128, 256, 512, 1024, 2048, 4096}; + + // If we get here, all tests passed. + return 0; +}; diff --git a/src/tests/test.cu.cpp b/src/tests/test.cu.cpp deleted file mode 100644 index cd16974..0000000 --- a/src/tests/test.cu.cpp +++ /dev/null @@ -1,1487 +0,0 @@ -#include "../cpp/Image.cu" -#include "../FastFFT.cu" -#include -#include - -#define MyTestPrintAndExit(...) \ - { \ - std::cerr << __VA_ARGS__ << " From: " << __FILE__ << " " << __LINE__ << " " << __PRETTY_FUNCTION__ << std::endl; \ - exit(-1); \ - } - -#ifndef FFT_DEBUG_LEVEL -#error "FFT_DEBUG_LEVEL not defined" -#endif - -#ifndef DEBUG_FFT_STAGE -#error "DEBUG_FFT_STAGE not defined" -#endif - -void PrintArray(float2* array, short NX, short NY, short NZ, int line_wrapping = 34) { - // COMPLEX TODO make these functions. 
- int n = 0; - for ( int z = 0; z < NZ; z++ ) { - for ( int x = 0; x < NX; x++ ) { - - std::cout << x << "[ "; - for ( int y = 0; y < NY; y++ ) { - std::cout << array[x + NX * (y + z * NY)].x << "," << array[x + NX * (y + z * NY)].y << " "; - n++; - if ( n == line_wrapping ) { - n = 0; - std::cout << std::endl; - } // line wrapping - } - std::cout << "] " << std::endl; - n = 0; - } - if ( NZ > 0 ) - std::cout << " ... ... ... " << z << " ... ... ..." << std::endl; - } -}; - -void PrintArray(float* array, short NX, short NY, short NZ, short NW, int line_wrapping = 34) { - int n = 0; - for ( int z = 0; z < NZ; z++ ) { - for ( int x = 0; x < NX; x++ ) { - - std::cout << x << "[ "; - for ( int y = 0; y < NY; y++ ) { - std::cout << array[x + (2 * NW) * (y + z * NY)] << " "; - n++; - if ( n == line_wrapping ) { - n = 0; - std::cout << std::endl; - } // line wrapping - } - std::cout << "] " << std::endl; - n = 0; - } - if ( NZ > 0 ) - std::cout << " ... ... ... " << z << " ... ... ..." << std::endl; - } -}; - -void PrintArray_XZ(float2* array, short NX, short NY, short NZ, int line_wrapping = 34) { - // COMPLEX TODO make these functions. - int n = 0; - for ( int x = 0; x < NX; x++ ) { - for ( int z = 0; z < NZ; z++ ) { - - std::cout << z << "[ "; - for ( int y = 0; y < NY; y++ ) { - std::cout << array[z + NZ * (y + x * NY)].x << "," << array[z + NZ * (y + x * NY)].y << " "; - n++; - if ( n == line_wrapping ) { - n = 0; - std::cout << std::endl; - } // line wrapping - } - std::cout << "] " << std::endl; - n = 0; - } - if ( NZ > 0 ) - std::cout << " ... ... ... " << x << " ... ... ..." << std::endl; - } -}; - -// The Fourier transform of a constant should be a unit impulse, and on back fft, without normalization, it should be a constant * N. -// It is assumed the input/output have the same dimension (i.e. no padding) - -template -bool const_image_test(std::vector size, bool do_3d = false) { - - bool all_passed = true; - std::vector init_passed(size.size( ), true); - std::vector FFTW_passed(size.size( ), true); - std::vector FastFFT_forward_passed(size.size( ), true); - std::vector FastFFT_roundTrip_passed(size.size( ), true); - - for ( int n = 0; n < size.size( ); n++ ) { - - short4 input_size; - short4 output_size; - long full_sum = long(size[n]); - if ( do_3d ) { - input_size = make_short4(size[n], size[n], size[n], 0); - output_size = make_short4(size[n], size[n], size[n], 0); - full_sum = full_sum * full_sum * full_sum * full_sum * full_sum * full_sum; - } - else { - input_size = make_short4(size[n], size[n], 1, 0); - output_size = make_short4(size[n], size[n], 1, 0); - full_sum = full_sum * full_sum * full_sum * full_sum; - } - - float sum; - - Image host_input(input_size); - Image host_output(output_size); - Image device_output(output_size); - - // Pointers to the arrays on the host -- maybe make this a struct of some sort? I'm sure there is a parallel in cuda, look into cuarray/texture code - - // We just make one instance of the FourierTransformer class, with calc type float. - // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. 
- FastFFT::FourierTransformer FT; - - // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU - FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true, false); - FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - - // The padding (dims.w) is calculated based on the setup - short4 dims_in = FT.ReturnFwdInputDimensions( ); - short4 dims_out = FT.ReturnFwdOutputDimensions( ); - - // Determine how much memory we need, working with FFTW/CUDA style in place transform padding. - // Note: there is no reason we really need this, because the xforms will always be out of place. - // For now, this is just in place because all memory in cisTEM is allocated accordingly. - host_input.real_memory_allocated = FT.ReturnInputMemorySize( ); - host_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); - - // On the device, we will always allocate enough memory for the larger of input/output including the buffer array. - // Minmize the number of calls to malloc which are slow and can lead to fragmentation. - device_output.real_memory_allocated = std::max(host_input.real_memory_allocated, host_output.real_memory_allocated); - - // In your own programs, you will be handling this memory allocation yourself. We'll just make something here. - // I think fftwf_malloc may potentially create a different alignment than new/delete, but kinda doubt it. For cisTEM consistency... - bool set_fftw_plan = true; - host_input.Allocate(set_fftw_plan); - host_output.Allocate(set_fftw_plan); - - // Set our input host memory to a constant. Then FFT[0] = host_input_memory_allocated - FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 1.0f); - - // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which - // ensures faster transfer. If false, it will be pinned for you. - FT.SetInputPointer(host_output.real_values, false); - sum = ReturnSumOfReal(host_output.real_values, dims_out); - - if ( sum != long(dims_in.x) * long(dims_in.y) * long(dims_in.z) ) { - all_passed = false; - init_passed[n] = false; - } - - // MyFFTDebugAssertTestTrue( sum == dims_out.x*dims_out.y*dims_out.z,"Unit impulse Init "); - - // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. - FT.CopyHostToDevice( ); - - host_output.FwdFFT( ); - - bool test_passed = true; - for ( long index = 1; index < host_output.real_memory_allocated / 2; index++ ) { - if ( host_output.complex_values[index].x != 0.0f && host_output.complex_values[index].y != 0.0f ) { - std::cout << host_output.complex_values[index].x << " " << host_output.complex_values[index].y << " " << std::endl; - test_passed = false; - } - } - if ( host_output.complex_values[0].x != (float)dims_out.x * (float)dims_out.y * (float)dims_out.z ) - test_passed = false; - - if ( test_passed == false ) { - all_passed = false; - FFTW_passed[n] = false; - } - // MyFFTDebugAssertTestTrue( test_passed, "FFTW unit impulse forward FFT"); - - // Just to make sure we don't get a false positive, set the host memory to some undesired value. - FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 2.0f); - - // This method will call the regular FFT kernels given the input/output dimensions are equal when the class is instantiated. 
- bool swap_real_space_quadrants = false; - FT.FwdFFT( ); - - // in buffer, do not deallocate, do not unpin memory - FT.CopyDeviceToHost(false, false); - test_passed = true; - for ( long index = 1; index < host_output.real_memory_allocated / 2; index++ ) { - if ( host_output.complex_values[index].x != 0.0f && host_output.complex_values[index].y != 0.0f ) { - test_passed = false; - } // std::cout << host_output.complex_values[index].x << " " << host_output.complex_values[index].y << " " );} - } - if ( host_output.complex_values[0].x != (float)dims_out.x * (float)dims_out.y * (float)dims_out.z ) - test_passed = false; - -#if DEBUG_FFT_STAGE == 0 - PrintArray(host_output.real_values, dims_out.x, dims_in.y, dims_in.z, dims_out.w); - MyTestPrintAndExit("stage 0 "); -#elif DEBUG_FFT_STAGE == 1 - if ( do_3d ) { - std::cout << " in 3d print " << std::endl; - PrintArray(host_output.complex_values, dims_in.z, dims_in.y, dims_out.w); - } - else - PrintArray(host_output.complex_values, dims_in.y, dims_out.w, dims_in.z); - MyTestPrintAndExit("stage 1 "); -#elif DEBUG_FFT_STAGE == 2 - PrintArray(host_output.complex_values, dims_in.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 2 "); -#elif DEBUG_FFT_STAGE == 3 - PrintArray(host_output.complex_values, dims_in.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 3 "); -#endif - - if ( test_passed == false ) { - all_passed = false; - FastFFT_forward_passed[n] = false; - } - // MyFFTDebugAssertTestTrue( test_passed, "FastFFT unit impulse forward FFT"); - FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 2.0f); - - FT.InvFFT( ); - FT.CopyDeviceToHost(true, true); - -#if DEBUG_FFT_STAGE == 4 - PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 4 "); -#elif DEBUG_FFT_STAGE == 5 - PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 5 "); -#elif DEBUG_FFT_STAGE == 6 - if ( do_3d ) { - std::cout << " in 3d print inv " << dims_out.w << "w" << std::endl; - PrintArray(host_output.complex_values, dims_out.w, dims_out.y, dims_out.z); - } - else - PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 6 "); -#elif DEBUG_FFT_STAGE == 7 - PrintArray(host_output.real_values, dims_out.x, dims_out.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 7 "); -#elif DEBUG_FFT_STAGE > 7 - // No debug, keep going -#else - MyTestPrintAndExit(" This block is only valid for DEBUG_FFT_STAGE == 4, 5, 7 "); -#endif - - // Assuming the outputs are always even dimensions, padding_jump_val is always 2. - sum = ReturnSumOfReal(host_output.real_values, dims_out, true); - - if ( sum != full_sum ) { - all_passed = false; - FastFFT_roundTrip_passed[n] = false; - } - MyFFTDebugAssertTestTrue(sum == full_sum, "FastFFT constant image round trip for size " + std::to_string(dims_in.x)); - } // loop over sizes - - if ( all_passed ) { - if ( do_3d ) - std::cout << " All 3d const_image tests passed!" << std::endl; - else - std::cout << " All 2d const_image tests passed!" << std::endl; - } - else { - for ( int n = 0; n < size.size( ); n++ ) { - if ( ! init_passed[n] ) - std::cout << " Initialization failed for size " << size[n] << std::endl; - if ( ! FFTW_passed[n] ) - std::cout << " FFTW failed for size " << size[n] << std::endl; - if ( ! FastFFT_forward_passed[n] ) - std::cout << " FastFFT failed for forward transform size " << size[n] << std::endl; - if ( ! 
FastFFT_roundTrip_passed[n] ) - std::cout << " FastFFT failed for roundtrip transform size " << size[n] << std::endl; - } - } - return all_passed; -} - -template -bool random_image_test(std::vector size, bool do_3d = false) { - - bool all_passed = true; - std::vector init_passed(size.size( ), true); - std::vector FFTW_passed(size.size( ), true); - std::vector FastFFT_forward_passed(size.size( ), true); - std::vector FastFFT_roundTrip_passed(size.size( ), true); - - for ( int n = 0; n < size.size( ); n++ ) { - - short4 input_size; - short4 output_size; - long full_sum = long(size[n]); - if ( do_3d ) { - input_size = make_short4(size[n], size[n], size[n], 0); - output_size = make_short4(size[n], size[n], size[n], 0); - full_sum = full_sum * full_sum * full_sum * full_sum * full_sum * full_sum; - } - else { - input_size = make_short4(size[n], size[n], 1, 0); - output_size = make_short4(size[n], size[n], 1, 0); - full_sum = full_sum * full_sum * full_sum * full_sum; - } - - float sum; - - Image host_input(input_size); - Image host_output(output_size); - Image host_copy(output_size); - Image device_output(output_size); - - // Pointers to the arrays on the host -- maybe make this a struct of some sort? I'm sure there is a parallel in cuda, look into cuarray/texture code - - // We just make one instance of the FourierTransformer class, with calc type float. - // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. - FastFFT::FourierTransformer FT; - - // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU - FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true, false); - FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - - // The padding (dims.w) is calculated based on the setup - short4 dims_in = FT.ReturnFwdInputDimensions( ); - short4 dims_out = FT.ReturnFwdOutputDimensions( ); - - // Determine how much memory we need, working with FFTW/CUDA style in place transform padding. - // Note: there is no reason we really need this, because the xforms will always be out of place. - // For now, this is just in place because all memory in cisTEM is allocated accordingly. - host_input.real_memory_allocated = FT.ReturnInputMemorySize( ); - host_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); - host_copy.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); - - // On the device, we will always allocate enough memory for the larger of input/output including the buffer array. - // Minmize the number of calls to malloc which are slow and can lead to fragmentation. - device_output.real_memory_allocated = std::max(host_input.real_memory_allocated, host_output.real_memory_allocated); - - // In your own programs, you will be handling this memory allocation yourself. We'll just make something here. - // I think fftwf_malloc may potentially create a different alignment than new/delete, but kinda doubt it. For cisTEM consistency... - bool set_fftw_plan = true; - host_input.Allocate(set_fftw_plan); - host_output.Allocate(set_fftw_plan); - host_copy.Allocate(set_fftw_plan); - - // Set our input host memory to a constant. Then FFT[0] = host_input_memory_allocated - FT.SetToRandom(host_output.real_values, host_output.real_memory_allocated, 0.0f, 1.0f); - - // Now we want to associate the host memory with the device memory. 
The method here asks if the host pointer is pinned (in page locked memory) which - // ensures faster transfer. If false, it will be pinned for you. - FT.SetInputPointer(host_output.real_values, false); - - // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. - FT.CopyHostToDevice( ); - -#if DEBUG_FFT_STAGE > 0 - host_output.FwdFFT( ); -#endif - - for ( long i = 0; i < host_output.real_memory_allocated / 2; i++ ) { - host_copy.complex_values[i] = host_output.complex_values[i]; - } - - // This method will call the regular FFT kernels given the input/output dimensions are equal when the class is instantiated. - bool swap_real_space_quadrants = false; - FT.FwdFFT( ); - - // in buffer, do not deallocate, do not unpin memory - FT.CopyDeviceToHost(false, false); - bool test_passed = true; - -#if DEBUG_FFT_STAGE == 0 - PrintArray(host_output.real_values, dims_out.x, dims_in.y, dims_in.z, dims_out.w); - PrintArray(host_copy.real_values, dims_out.x, dims_in.y, dims_in.z, dims_out.w); - MyTestPrintAndExit("stage 0 "); -#elif DEBUG_FFT_STAGE == 1 - std::cout << " For random_image_test partial transforms aren't supported, b/c we need to compare to the cpu output." << std::endl; - MyTestPrintAndExit("stage 1 "); -#elif DEBUG_FFT_STAGE == 2 - std::cout << " For random_image_test partial transforms aren't supported, b/c we need to compare to the cpu output." << std::endl; - MyTestPrintAndExit("stage 2 "); -#elif DEBUG_FFT_STAGE == 3 - PrintArray(host_output.complex_values, dims_in.y, dims_out.w, dims_out.z); - PrintArray(host_copy.complex_values, dims_in.y, dims_out.w, dims_out.z); - - // std::cout << "Distance between FastFFT and CPU: " << distance << std::endl; - MyTestPrintAndExit("stage 3 "); -#endif - - double distance = 0.0; - for ( long index = 0; index < host_output.real_memory_allocated / 2; index++ ) { - distance += sqrt((host_output.complex_values[index].x - host_copy.complex_values[index].x) * (host_output.complex_values[index].x - host_copy.complex_values[index].x) + - (host_output.complex_values[index].y - host_copy.complex_values[index].y) * (host_output.complex_values[index].y - host_copy.complex_values[index].y)); - } - distance /= (host_output.real_memory_allocated / 2); - - std::cout << "Distance between FastFFT and CPU: " << distance << std::endl; - exit(0); - if ( test_passed == false ) { - all_passed = false; - FastFFT_forward_passed[n] = false; - } - // MyFFTDebugAssertTestTrue( test_passed, "FastFFT unit impulse forward FFT"); - FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 2.0f); - - FT.InvFFT( ); - FT.CopyDeviceToHost(true, true); - -#if DEBUG_FFT_STAGE == 4 - PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 4 "); -#elif DEBUG_FFT_STAGE == 5 - PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 5 "); -#elif DEBUG_FFT_STAGE == 6 - if ( do_3d ) { - std::cout << " in 3d print inv " << dims_out.w << "w" << std::endl; - PrintArray(host_output.complex_values, dims_out.w, dims_out.y, dims_out.z); - } - else - PrintArray(host_output.complex_values, dims_out.y, dims_out.w, dims_out.z); - MyTestPrintAndExit("stage 6 "); -#elif DEBUG_FFT_STAGE == 7 - PrintArray(host_output.real_values, dims_out.x, dims_out.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 7 "); -#elif DEBUG_FFT_STAGE > 7 - // No debug, keep going -#else - MyTestPrintAndExit(" This block is only valid for 
DEBUG_FFT_STAGE == 4, 5, 7 "); -#endif - - // Assuming the outputs are always even dimensions, padding_jump_val is always 2. - sum = ReturnSumOfReal(host_output.real_values, dims_out, true); - - if ( sum != full_sum ) { - all_passed = false; - FastFFT_roundTrip_passed[n] = false; - } - MyFFTDebugAssertTestTrue(sum == full_sum, "FastFFT constant image round trip for size " + std::to_string(dims_in.x)); - } // loop over sizes - - if ( all_passed ) { - if ( do_3d ) - std::cout << " All 3d const_image tests passed!" << std::endl; - else - std::cout << " All 2d const_image tests passed!" << std::endl; - } - else { - for ( int n = 0; n < size.size( ); n++ ) { - if ( ! init_passed[n] ) - std::cout << " Initialization failed for size " << size[n] << std::endl; - if ( ! FFTW_passed[n] ) - std::cout << " FFTW failed for size " << size[n] << std::endl; - if ( ! FastFFT_forward_passed[n] ) - std::cout << " FastFFT failed for forward transform size " << size[n] << std::endl; - if ( ! FastFFT_roundTrip_passed[n] ) - std::cout << " FastFFT failed for roundtrip transform size " << size[n] << std::endl; - } - } - return all_passed; -} - -template -bool unit_impulse_test(std::vector size, bool do_3d, bool do_increase_size) { - - bool all_passed = true; - std::vector init_passed(size.size( ), true); - std::vector FFTW_passed(size.size( ), true); - std::vector FastFFT_forward_passed(size.size( ), true); - std::vector FastFFT_roundTrip_passed(size.size( ), true); - - short4 input_size; - short4 output_size; - for ( int iSize = 0; iSize < size.size( ) - 1; iSize++ ) { - int oSize = iSize + 1; - while ( oSize < size.size( ) ) { - - // std::cout << std::endl << "Testing padding from " << size[iSize] << " to " << size[oSize] << std::endl; - if ( do_increase_size ) { - if ( do_3d ) { - input_size = make_short4(size[iSize], size[iSize], size[iSize], 0); - output_size = make_short4(size[oSize], size[oSize], size[oSize], 0); - } - else { - input_size = make_short4(size[iSize], size[iSize], 1, 0); - output_size = make_short4(size[oSize], size[oSize], 1, 0); - } - } - else { - if ( do_3d ) { - output_size = make_short4(size[iSize], size[iSize], size[iSize], 0); - input_size = make_short4(size[oSize], size[oSize], size[oSize], 0); - } - else { - output_size = make_short4(size[iSize], size[iSize], 1, 0); - input_size = make_short4(size[oSize], size[oSize], 1, 0); - } - } - - float sum; - - Image host_input(input_size); - Image host_output(output_size); - Image device_output(output_size); - - // We just make one instance of the FourierTransformer class, with calc type float. - // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. - FastFFT::FourierTransformer FT; - // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU - FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true, false); - FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - - // The padding (dims.w) is calculated based on the setup - short4 dims_in = FT.ReturnFwdInputDimensions( ); - short4 dims_out = FT.ReturnFwdOutputDimensions( ); - // Determine how much memory we need, working with FFTW/CUDA style in place transform padding. - // Note: there is no reason we really need this, because the xforms will always be out of place. 
- // For now, this is just in place because all memory in cisTEM is allocated accordingly. - host_input.real_memory_allocated = FT.ReturnInputMemorySize( ); - host_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); - - // On the device, we will always allocate enough memory for the larger of input/output including the buffer array. - // Minmize the number of calls to malloc which are slow and can lead to fragmentation. - device_output.real_memory_allocated = std::max(host_input.real_memory_allocated, host_output.real_memory_allocated); - - // In your own programs, you will be handling this memory allocation yourself. We'll just make something here. - // I think fftwf_malloc may potentially create a different alignment than new/delete, but kinda doubt it. For cisTEM consistency... - bool set_fftw_plan = true; - host_input.Allocate(set_fftw_plan); - host_output.Allocate(set_fftw_plan); - - // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which - // ensures faster transfer. If false, it will be pinned for you. - FT.SetInputPointer(host_input.real_values, false); - - // Set a unit impulse at the center of the input array. - FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 0.0f); - FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 0.0f); - - sum = ReturnSumOfReal(host_output.real_values, dims_out); - // host_input.real_values[ dims_in.y/2 * (dims_in.x+host_input.padding_jump_value) + dims_in.x/2] = 1.0f; - // short4 wanted_center = make_short4(0,0,0,0); - // ClipInto(host_input.real_values, host_output.real_values, dims_in , dims_out, wanted_center, 0.f); - - // FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 0.0f); - host_input.real_values[0] = 1.0f; - host_output.real_values[0] = 1.0f; - - sum = ReturnSumOfReal(host_output.real_values, dims_out); - if ( sum != 1 ) { - all_passed = false; - init_passed[iSize] = false; - } - - // MyFFTDebugAssertTestTrue( sum == 1,"Unit impulse Init "); - - // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. - FT.CopyHostToDevice( ); - - host_output.FwdFFT( ); - - host_output.fftw_epsilon = ReturnSumOfComplexAmplitudes(host_output.complex_values, host_output.real_memory_allocated / 2); - // std::cout << "host " << host_output.fftw_epsilon << " " << host_output.real_memory_allocated<< std::endl; - - host_output.fftw_epsilon -= (host_output.real_memory_allocated / 2); - if ( std::abs(host_output.fftw_epsilon) > 1e-8 ) { - all_passed = false; - FFTW_passed[iSize] = false; - } - - // MyFFTDebugAssertTestTrue( std::abs(host_output.fftw_epsilon) < 1e-8 , "FFTW unit impulse forward FFT"); - - // Just to make sure we don't get a false positive, set the host memory to some undesired value. - FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 2.0f); - - // This method will call the regular FFT kernels given the input/output dimensions are equal when the class is instantiated. 
- bool swap_real_space_quadrants = true; - - FT.FwdFFT( ); - - if ( do_increase_size ) { - FT.CopyDeviceToHost(host_output.real_values, false, false); - -#if DEBUG_FFT_STAGE == 0 - PrintArray(host_output.real_values, dims_in.x, dims_in.y, dims_in.z, dims_in.w); - MyTestPrintAndExit("stage 0 "); -#elif DEBUG_FFT_STAGE == 1 - // If we are doing a fwd increase, the data will have only been expanded along the (transposed) X dimension at this point - // So the (apparent) X is dims_in.y not dims_out.y - if ( do_3d ) { - std::cout << " in 3d print " << std::endl; - PrintArray(host_output.complex_values, dims_in.z, dims_in.y, dims_out.w); - } - else - PrintArray(host_output.complex_values, dims_in.y, dims_in.z, dims_out.w); - - MyTestPrintAndExit("stage 1 "); -#elif DEBUG_FFT_STAGE == 2 - // If we are doing a fwd increase, the data will have only been expanded along the (transposed) X dimension at this point - // So the (apparent) X is dims_in.y not dims_out.y - PrintArray(host_output.complex_values, dims_in.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 2 "); -#elif DEBUG_FFT_STAGE == 3 - // Now the array is fully expanded to dims_out, but still transposed - PrintArray(host_output.complex_values, dims_out.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 3 "); -#endif - sum = ReturnSumOfComplexAmplitudes(host_output.complex_values, host_output.real_memory_allocated / 2); - } - else { - FT.CopyDeviceToHost(false, false, FT.ReturnInputMemorySize( )); -#if DEBUG_FFT_STAGE == 0 - PrintArray(host_input.real_values, dims_in.x, dims_in.y, dims_in.z, dims_in.w); - MyTestPrintAndExit("stage 0 "); -#elif DEBUG_FFT_STAGE == 1 - // If we are doing a fwd increase, the data will have only been expanded along the (transposed) X dimension at this point - // So the (apparent) X is dims_in.y not dims_out.y - PrintArray(host_input.complex_values, dims_in.y, dims_in.z, dims_out.w); - MyTestPrintAndExit("stage 1 "); -#elif DEBUG_FFT_STAGE == 2 - // If we are doing a fwd increase, the data will have only been expanded along the (transposed) X dimension at this point - // So the (apparent) X is dims_in.y not dims_out.y - PrintArray(host_input.complex_values, dims_in.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 2 "); -#elif DEBUG_FFT_STAGE == 3 - // Now the array is fully expanded to dims_out, but still transposed - PrintArray(host_input.complex_values, dims_out.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 3 "); -#endif - sum = ReturnSumOfComplexAmplitudes(host_input.complex_values, host_input.real_memory_allocated / 2); - } - - sum -= (host_output.real_memory_allocated / 2); - - // std::cout << "sum " << sum << std::endl; - // std::cout << "FFT Unit Impulse Forward FFT: " << sum << " epsilon " << host_output.fftw_epsilon << std::endl; - // std::cout << "epsilon " << abs(sum - host_output.fftw_epsilon) << std::endl; - if ( abs(sum) > 1e-8 ) { - all_passed = false; - FastFFT_forward_passed[iSize] = false; - } - - // MyFFTDebugAssertTestTrue( abs(sum - host_output.fftw_epsilon) < 1e-8, "FastFFT unit impulse forward FFT"); - FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 2.0f); - - FT.InvFFT( ); - FT.CopyDeviceToHost(host_output.real_values, true, true); - -#if DEBUG_FFT_STAGE == 5 - PrintArray(host_output.complex_values, dims_out.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 5 "); -#endif -#if DEBUG_FFT_STAGE == 6 - PrintArray(host_output.complex_values, dims_out.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 6 "); -#elif DEBUG_FFT_STAGE 
== 7 - PrintArray(host_output.real_values, dims_out.x, dims_out.y, dims_out.z, dims_out.w); - MyTestPrintAndExit("stage 7 "); -#elif DEBUG_FFT_STAGE > 7 - // No debug, keep going -#else - MyTestPrintAndExit(" This block is only valid for DEBUG_FFT_STAGE == 3 || 4 "); -#endif - - sum = ReturnSumOfReal(host_output.real_values, dims_out); - if ( sum != dims_out.x * dims_out.y * dims_out.z ) { - all_passed = false; - FastFFT_roundTrip_passed[iSize] = false; - } - - // std::cout << "size in/out " << dims_in.x << ", " << dims_out.x << std::endl; - // MyFFTDebugAssertTestTrue( sum == dims_out.x*dims_out.y*dims_out.z,"FastFFT unit impulse round trip FFT"); - - oSize++; - } // while loop over pad to size - } // for loop over pad from size - - if ( all_passed ) { - if ( ! do_increase_size ) - std::cout << " All size_decrease unit impulse tests passed!" << std::endl; - else - std::cout << " All size_increase unit impulse tests passed!" << std::endl; - } - else { - for ( int n = 0; n < size.size( ); n++ ) { - if ( ! init_passed[n] ) - std::cout << " Initialization failed for size " << size[n] << std::endl; - if ( ! FFTW_passed[n] ) - std::cout << " FFTW failed for size " << size[n] << std::endl; - if ( ! FastFFT_forward_passed[n] ) - std::cout << " FastFFT failed for forward transform size " << size[n] << std::endl; - if ( ! FastFFT_roundTrip_passed[n] ) - std::cout << " FastFFT failed for roundtrip transform size " << size[n] << std::endl; - } - } - return all_passed; -} - -template -void compare_libraries(std::vector size, bool do_3d, int size_change_type, bool do_rectangle) { - - bool skip_cufft_for_profiling = false; - bool print_out_time = true; - // bool set_padding_callback = false; // the padding callback is slower than pasting in b/c the read size of the pointers is larger than the actual data. do not use. - bool set_conjMult_callback = true; - bool is_size_change_decrease = false; - - if ( size_change_type < 0 ) { - is_size_change_decrease = true; - } - int loop_limit = 1; - if ( size_change_type == 0 ) - loop_limit = 0; - - int make_rect_x; - int make_rect_y = 1; - if ( do_rectangle ) - make_rect_x = 2; - else - make_rect_x = 1; - - if ( do_3d && do_rectangle ) { - std::cout << "ERROR: cannot do 3d and rectangle at the same time" << std::endl; - return; - } - - short4 input_size; - short4 output_size; - for ( int iSize = 0; iSize < size.size( ) - loop_limit; iSize++ ) { - int oSize; - int loop_size; - if ( size_change_type != 0 ) { - oSize = iSize + 1; - loop_size = size.size( ); - } - else { - oSize = iSize; - loop_size = oSize + 1; - } - - while ( oSize < loop_size ) { - - if ( is_size_change_decrease ) { - output_size = make_short4(size[iSize] / make_rect_x, size[iSize] / make_rect_y, 1, 0); - input_size = make_short4(size[oSize] / make_rect_x, size[oSize] / make_rect_y, 1, 0); - if ( do_3d ) { - output_size.z = size[iSize]; - input_size.z = size[oSize]; - } - } - else { - input_size = make_short4(size[iSize] / make_rect_x, size[iSize] / make_rect_y, 1, 0); - output_size = make_short4(size[oSize] / make_rect_x, size[oSize] / make_rect_y, 1, 0); - if ( do_3d ) { - input_size.z = size[iSize]; - output_size.z = size[oSize]; - } - } - if ( print_out_time ) { - printf("Testing padding from %i,%i,%i to %i,%i,%i\n", input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); - } - - if ( (input_size.x == output_size.x && input_size.y == output_size.y && input_size.z == output_size.z) ) { - // Also will change the path called in FastFFT to just be fwd/inv xform. 
- set_conjMult_callback = false; - } - - bool test_passed = true; - long address = 0; - - Image FT_input(input_size); - Image FT_output(output_size); - Image cuFFT_input(input_size); - Image cuFFT_output(output_size); - - short4 target_size; - - if ( is_size_change_decrease ) - target_size = input_size; // assuming xcorr_fwd_NOOP_inv_DECREASE - else - target_size = output_size; - - Image target_search_image(target_size); - Image positive_control(target_size); - - // We just make one instance of the FourierTransformer class, with calc type float. - // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. - FastFFT::FourierTransformer FT; - // Create an instance to copy memory also for the cufft tests. - FastFFT::FourierTransformer cuFFT; - FastFFT::FourierTransformer targetFT; - - if ( is_size_change_decrease ) { - FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z, true, false); - FT.SetInverseFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true); - - // For the subset of outputs this is just the input size, assuming the program then accesses just the valid data (could explicitly put into a new array which would be even slower.) - cuFFT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z, true, false); - cuFFT.SetInverseFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z, true); - - targetFT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, input_size.x, input_size.y, input_size.z, true, false); - targetFT.SetInverseFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true); - } - else { - FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true, false); - FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - - cuFFT.SetForwardFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true, false); - cuFFT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - - targetFT.SetForwardFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true, false); - targetFT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - } - - short4 fwd_dims_in = FT.ReturnFwdInputDimensions( ); - short4 fwd_dims_out = FT.ReturnFwdOutputDimensions( ); - short4 inv_dims_in = FT.ReturnInvInputDimensions( ); - short4 inv_dims_out = FT.ReturnInvOutputDimensions( ); - - FT_input.real_memory_allocated = FT.ReturnInputMemorySize( ); - FT_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); - - cuFFT_input.real_memory_allocated = cuFFT.ReturnInputMemorySize( ); - cuFFT_output.real_memory_allocated = cuFFT.ReturnInvOutputMemorySize( ); - - if ( is_size_change_decrease ) - target_search_image.real_memory_allocated = targetFT.ReturnInputMemorySize( ); - else - target_search_image.real_memory_allocated = targetFT.ReturnInvOutputMemorySize( ); // the larger of the two. 
- - positive_control.real_memory_allocated = target_search_image.real_memory_allocated; // this won't change size - - bool set_fftw_plan = false; - FT_input.Allocate(set_fftw_plan); - FT_output.Allocate(set_fftw_plan); - - cuFFT_input.Allocate(set_fftw_plan); - cuFFT_output.Allocate(set_fftw_plan); - - target_search_image.Allocate(true); - positive_control.Allocate(true); - - // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which - // ensures faster transfer. If false, it will be pinned for you. - FT.SetInputPointer(FT_input.real_values, false); - cuFFT.SetInputPointer(cuFFT_input.real_values, false); - targetFT.SetInputPointer(target_search_image.real_values, false); - - // Set a unit impulse at the center of the input array. - // For now just considering the real space image to have been implicitly quadrant swapped so the center is at the origin. - FT.SetToConstant(FT_input.real_values, FT_input.real_memory_allocated, 0.0f); - FT.SetToConstant(cuFFT_input.real_values, cuFFT_input.real_memory_allocated, 0.0f); - FT.SetToConstant(FT_output.real_values, FT_output.real_memory_allocated, 0.0f); - FT.SetToConstant(cuFFT_output.real_values, cuFFT_output.real_memory_allocated, 0.0f); - FT.SetToConstant(target_search_image.real_values, target_search_image.real_memory_allocated, 0.0f); - FT.SetToConstant(positive_control.real_values, target_search_image.real_memory_allocated, 0.0f); - - // Place these values at the origin of the image and after convolution, should be at 0,0,0. - float testVal_1 = 2.0f; - float testVal_2 = 3.0f; - FT_input.real_values[0] = testVal_1; - cuFFT_input.real_values[0] = testVal_1; - target_search_image.real_values[0] = testVal_2; //target_search_image.size.w*2*target_search_image.size.y/2 + target_search_image.size.x/2] = testVal_2; - positive_control.real_values[0] = testVal_1; //target_search_image.size.w*2*target_search_image.size.y/2 + target_search_image.size.x/2] = testVal_1; - - // Transform the target on the host prior to transfer. - target_search_image.FwdFFT( ); - - // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. - FT.CopyHostToDevice( ); - FT.CopyDeviceToHost(false, false); - - cuFFT.CopyHostToDevice( ); - - targetFT.CopyHostToDevice( ); - - // Wait on the transfers to finish. - cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); - - // Positive control on the host. - positive_control.FwdFFT( ); - positive_control.MultiplyConjugateImage(target_search_image.complex_values); - positive_control.InvFFT( ); - - // address = 0; - test_passed = true; - for ( int z = 1; z < positive_control.size.z; z++ ) { - for ( int y = 1; y < positive_control.size.y; y++ ) { - for ( int x = 1; x < positive_control.size.x; x++ ) { - if ( positive_control.real_values[address] != 0.0f ) - test_passed = false; - } - } - } - if ( test_passed ) { - if ( positive_control.real_values[address] == positive_control.size.x * positive_control.size.y * positive_control.size.z * testVal_1 * testVal_2 ) { - if ( print_out_time ) { - std::cout << "Test passed for FFTW positive control." << std::endl; - } - } - else { - if ( print_out_time ) { - std::cout << "Test failed for FFTW positive control. Value at zero is " << positive_control.real_values[address] << std::endl; - } - } - } - else { - if ( print_out_time ) { - std::cout << "Test failed for positive control, non-zero values found away from the origin." 
<< std::endl; - } - } - - cuFFT_output.create_timing_events( ); - if ( do_3d ) { - cuFFT_output.MakeCufftPlan3d( ); - } - else { - if ( print_out_time ) { - std::cout << "2D test " << std::endl; - } - - cuFFT_input.MakeCufftPlan( ); - cuFFT_output.MakeCufftPlan( ); - } - - std::cout << "Test lambda" << std::endl; - - FastFFT::KernelFunction::my_functor noop; - FastFFT::KernelFunction::my_functor conj_mul; - - ////////////////////////////////////////// - ////////////////////////////////////////// - // Warm up and check for accuracy - if ( set_conjMult_callback || is_size_change_decrease ) // we set set_conjMult_callback = false - { - // FT.CrossCorrelate(targetFT.d_ptr.momentum_space, false); - // Will type deduction work here? - FT.Generic_Fwd_Image_Inv(targetFT.d_ptr.momentum_space, noop, conj_mul, noop); - } - else { - FT.FwdFFT( ); - FT.InvFFT( ); - } - - if ( is_size_change_decrease ) { - FT.CopyDeviceToHost(false, false); -#if DEBUG_FFT_STAGE == 0 - - PrintArray(FT_input.real_values, fwd_dims_in.x, fwd_dims_in.y, fwd_dims_in.z, fwd_dims_in.w); - MyTestPrintAndExit(" Stage 0"); -#elif DEBUG_FFT_STAGE == 1 - - PrintArray(FT_input.complex_values, fwd_dims_in.y, fwd_dims_in.z, fwd_dims_out.w); - MyTestPrintAndExit(" Stage 1"); -#elif DEBUG_FFT_STAGE == 2 - PrintArray(FT_input.complex_values, fwd_dims_in.y, fwd_dims_out.z, fwd_dims_out.w); - MyTestPrintAndExit(" Stage 2"); -#elif DEBUG_FFT_STAGE == 3 - - PrintArray(FT_input.complex_values, fwd_dims_in.y, fwd_dims_out.z, fwd_dims_out.w); - MyTestPrintAndExit(" Stage 3"); -#elif DEBUG_FFT_STAGE == 4 - - PrintArray(FT_input.complex_values, fwd_dims_in.y, fwd_dims_out.z, fwd_dims_out.w); - MyTestPrintAndExit(" Stage 4"); -#elif DEBUG_FFT_STAGE == 5 - PrintArray(FT_input.complex_values, inv_dims_out.y, inv_dims_in.z, inv_dims_in.w); - MyTestPrintAndExit(" Stage 5"); -#elif DEBUG_FFT_STAGE == 6 - PrintArray(FT_input.complex_values, inv_dims_out.y, inv_dims_out.z, inv_dims_in.w); - MyTestPrintAndExit(" Stage 6"); -#elif DEBUG_FFT_STAGE == 7 - PrintArray(FT_input.real_values, inv_dims_out.x, inv_dims_out.y, inv_dims_out.z, inv_dims_out.w); - MyTestPrintAndExit(" Stage 7"); -#elif DEBUG_FFT_STAGE > 7 - // Do nothing, we are doing all ops and not debugging. -#else - MyTestPrintAndExit("DEBUG_FFT_STAGE not recognized " + std::to_string(DEBUG_FFT_STAGE)); -#endif - } - else { - // the output is equal or > the input, so we can always copy there. 
- FT.CopyDeviceToHost(FT_output.real_values, false, false); - -#if DEBUG_FFT_STAGE == 0 - PrintArray(FT_output.real_values, fwd_dims_in.x, fwd_dims_in.y, fwd_dims_in.z, fwd_dims_in.w); - MyTestPrintAndExit(" Stage 0"); -#elif DEBUG_FFT_STAGE == 1 - - PrintArray(FT_output.complex_values, fwd_dims_in.y, fwd_dims_in.z, fwd_dims_out.w); - FastFFT::PrintVectorType(fwd_dims_in); - FastFFT::PrintVectorType(fwd_dims_out); - MyTestPrintAndExit(" Stage 1"); -#elif DEBUG_FFT_STAGE == 2 - PrintArray(FT_output.complex_values, fwd_dims_in.y, fwd_dims_out.z, fwd_dims_out.w); - MyTestPrintAndExit(" Stage 2"); -#elif DEBUG_FFT_STAGE == 3 - PrintArray(FT_output.complex_values, fwd_dims_out.y, fwd_dims_out.w, fwd_dims_out.z); - MyTestPrintAndExit(" Stage 3"); -#elif DEBUG_FFT_STAGE == 4 - PrintArray(FT_output.complex_values, fwd_dims_out.y, fwd_dims_out.w, fwd_dims_out.z); - MyTestPrintAndExit(" Stage 4"); -#elif DEBUG_FFT_STAGE == 5 - PrintArray(FT_output.complex_values, inv_dims_out.y, inv_dims_in.z, inv_dims_out.w); - MyTestPrintAndExit(" Stage 5"); -#elif DEBUG_FFT_STAGE == 6 - PrintArray(FT_output.complex_values, inv_dims_out.y, inv_dims_out.z, inv_dims_out.w); - MyTestPrintAndExit(" Stage 6"); -#elif DEBUG_FFT_STAGE == 7 - PrintArray(FT_output.real_values, inv_dims_out.x, inv_dims_out.y, inv_dims_out.z, inv_dims_out.w); - MyTestPrintAndExit(" Stage 7"); -#elif DEBUG_FFT_STAGE > 7 - // Do nothing, we are doing all ops and not debugging. -#else - MyTestPrintAndExit("DEBUG_FFT_STAGE not recognized " + std::to_string(DEBUG_FFT_STAGE)); -#endif - } - - address = 0; - test_passed = true; - if ( is_size_change_decrease ) { - for ( int z = 1; z < FT_input.size.z; z++ ) { - for ( int y = 1; y < FT_input.size.y; y++ ) { - for ( int x = 1; x < FT_input.size.x; x++ ) { - if ( FT_input.real_values[address] != 0.0f ) - test_passed = false; - } - } - } - if ( test_passed ) { - if ( FT_input.real_values[address] == FT_input.size.x * FT_input.size.y * FT_input.size.z * testVal_1 * testVal_2 ) { - if ( print_out_time ) { - std::cout << "Test passed for FastFFT positive control.\n" - << std::endl; - } - } - else { - if ( print_out_time ) { - std::cout << "Test failed for FastFFT positive control. Value at zero is " << FT_input.real_values[address] << std::endl; - } - } - } - else { - if ( print_out_time ) { - std::cout << "Test failed for FastFFT control, non-zero values found away from the origin." << std::endl; - } - } - } - else { - for ( int z = 1; z < FT_output.size.z; z++ ) { - for ( int y = 1; y < FT_output.size.y; y++ ) { - for ( int x = 1; x < FT_output.size.x; x++ ) { - if ( FT_output.real_values[address] != 0.0f ) - test_passed = false; - } - } - } - if ( test_passed ) { - if ( FT_output.real_values[address] == FT_output.size.x * FT_output.size.y * FT_output.size.z * testVal_1 * testVal_2 ) { - if ( print_out_time ) { - std::cout << "Test passed for FastFFT positive control.\n" - << std::endl; - } - } - else { - if ( print_out_time ) { - std::cout << "Test failed for FastFFT positive control. Value at zero is " << FT_output.real_values[address] << std::endl; - } - } - } - else { - if ( print_out_time ) { - std::cout << "Test failed for FastFFT control, non-zero values found away from the origin." 
<< std::endl; - } - } - } - - //////////////////////////////////////// - ////////////////////////////////////////// - -#if DEBUG_FFT_STAGE == 0 - - PrintArray(FT_output.real_values, fwd_dims_in.x, fwd_dims_in.y, fwd_dims_in.z, fwd_dims_in.w); - - MyTestPrintAndExit("stage 0 "); -#elif DEBUG_FFT_STAGE == 1 - // If we are doing a fwd increase, the data will have only been expanded along the (transposed) X dimension at this point - // So the (apparent) X is dims_in.y not output_size.y - // decrease is currently just tested on the output. Really, to simplify there should be 3 different functions, fwd_none_inv_decrease (current decrease), fwd_decrease_inc_decrease (not yet) fwd_increase_inv_none - if ( is_size_change_decrease ) { - MyTestPrintAndExit("stage 1 decrease"); - PrintArray(FT_output.complex_values, fwd_dims_out.y, fwd_dims_out.z, fwd_dims_out.w); - } - else { - MyTestPrintAndExit("stage 1 increase"); - PrintArray(FT_output.complex_values, fwd_dims_in.y, fwd_dims_in.z, fwd_dims_out.w); - } - -#elif DEBUG_FFT_STAGE == 2 - // Now the array is fully expanded to output_size, but still transposed - PrintArray(FT_output.complex_values, fwd_dims_out.y, fwd_dims_out.z, fwd_dims_out.w); - MyTestPrintAndExit("stage 2 "); - -#elif DEBUG_FFT_STAGE == 3 - PrintArray(FT_output.complex_values, inv_dims_out.y, inv_dims_out.z, inv_dims_out.w); - MyTestPrintAndExit("stage 3 "); -#elif DEBUG_FFT_STAGE == 4 - PrintArray(FT_output.real_values, inv_dims_out.x, inv_dims_out.y, inv_dims_out.z, inv_dims_out.w); - MyTestPrintAndExit("stage 4 "); -#elif DEBUG_FFT_STAGE > 7 - // This is the final stage, the data is fully expanded and transposed -#else - MyTestPrintAndExit("This blah blah"); -#endif - - int n_loops; - if ( do_3d ) { - int max_size = std::max(fwd_dims_in.x, fwd_dims_out.x); - if ( max_size < 128 ) { - n_loops = 1000; - } - else if ( max_size <= 256 ) { - n_loops = 400; - } - else if ( max_size <= 512 ) { - n_loops = 150; - } - else { - n_loops = 50; - } - } - else { - int max_size = std::max(fwd_dims_in.x, fwd_dims_out.x); - if ( max_size < 256 ) { - n_loops = 10000; - } - else if ( max_size <= 512 ) { - n_loops = 5000; - } - else if ( max_size <= 2048 ) { - n_loops = 2500; - } - else { - n_loops = 1000; - } - } - - cuFFT_output.record_start( ); - for ( int i = 0; i < n_loops; ++i ) { - if ( set_conjMult_callback || is_size_change_decrease ) { - // FT.CrossCorrelate(targetFT.d_ptr.momentum_space_buffer, false); - // Will type deduction work here? - FT.Generic_Fwd_Image_Inv(targetFT.d_ptr.momentum_space, noop, conj_mul, noop); - } - else { - FT.FwdFFT( ); - FT.InvFFT( ); - } - } - cuFFT_output.record_stop( ); - cuFFT_output.synchronize( ); - cuFFT_output.print_time("FastFFT", print_out_time); - float FastFFT_time = cuFFT_output.elapsed_gpu_ms; - - // if (set_padding_callback) - // { - // precheck - // cufftReal* overlap_pointer; - // overlap_pointer = cuFFT.d_ptr.position_space; - // cuFFT_output.SetClipIntoCallback(overlap_pointer, cuFFT_input.size.x, cuFFT_input.size.y, cuFFT_input.size.w*2); - // postcheck - // } - - if ( set_conjMult_callback ) { - precheck - // FIXME scaling factor - cuFFT_output.SetComplexConjMultiplyAndLoadCallBack((cufftComplex*)targetFT.d_ptr.momentum_space_buffer, 1.0f); - postcheck - } - - if ( ! 
skip_cufft_for_profiling ) { - ////////////////////////////////////////// - ////////////////////////////////////////// - // Warm up and check for accuracy - if ( is_size_change_decrease ) { - - precheck - cudaErr(cufftExecR2C(cuFFT_input.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); - postcheck - - precheck - cudaErr(cufftExecC2R(cuFFT_input.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); - postcheck - } - else { - // cuFFT.ClipIntoTopLeft(); - // cuFFT.ClipIntoReal(cuFFT_output.size.x/2, cuFFT_output.size.y/2, cuFFT_output.size.z/2); - // cuFFT.CopyDeviceToHost(cuFFT_output.real_values,false, false); - - precheck - cudaErr(cufftExecR2C(cuFFT_output.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); - postcheck - - precheck - cudaErr(cufftExecC2R(cuFFT_output.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); - postcheck - } - - cuFFT_output.record_start( ); - for ( int i = 0; i < n_loops; ++i ) { - // std::cout << i << "i / " << n_loops << "n_loops" << std::endl; - if ( set_conjMult_callback ) - cuFFT.ClipIntoTopLeft( ); - // cuFFT.ClipIntoReal(input_size.x/2, input_size.y/2, input_size.z/2); - - if ( is_size_change_decrease ) { - precheck - cudaErr(cufftExecR2C(cuFFT_input.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); - postcheck - - precheck - cudaErr(cufftExecC2R(cuFFT_input.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); - postcheck - } - else { - precheck - cudaErr(cufftExecR2C(cuFFT_output.cuda_plan_forward, (cufftReal*)cuFFT.d_ptr.position_space, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer)); - postcheck - - precheck - cudaErr(cufftExecC2R(cuFFT_output.cuda_plan_inverse, (cufftComplex*)cuFFT.d_ptr.momentum_space_buffer, (cufftReal*)cuFFT.d_ptr.position_space)); - postcheck - } - } - cuFFT_output.record_stop( ); - cuFFT_output.synchronize( ); - cuFFT_output.print_time("cuFFT", print_out_time); - } // end of if (! skip_cufft_for_profiling) - std::cout << "For size " << input_size.x << " to " << output_size.x << ": "; - std::cout << "Ratio cuFFT/FastFFT : " << cuFFT_output.elapsed_gpu_ms / FastFFT_time << "\n\n" - << std::endl; - - oSize++; - // We don't want to loop if the size is not actually changing. - } // while loop over pad to size - } // for loop over pad from size -} - -template -void run_oned(std::vector size) { - - // Override the size to be one dimensional in x - std::cout << "Running one-dimensional tests\n" - << std::endl; - - for ( int n : size ) { - short4 input_size = make_short4(n, 1, 1, 0); - short4 output_size = make_short4(n, 1, 1, 0); - - Image FT_input(input_size); - Image FT_output(output_size); - Image FT_input_complex(input_size); - Image FT_output_complex(output_size); - - // We just make one instance of the FourierTransformer class, with calc type float. - // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bloat16, TODO inputs at lower precision for bandwidth improvement. 
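The cuFFT baseline and the record_start / record_stop timing above distill to the standalone pattern below. This is a hedged sketch using only stock cuFFT and CUDA runtime calls; the buffer names and the time_cufft_round_trip wrapper are hypothetical and are not members of the Image class. It is included mainly to make explicit what the reported elapsed_gpu_ms measures: n_loops of an R2C/C2R round trip on the per-thread stream, bracketed by CUDA events.

#include <cuda_runtime.h>
#include <cufft.h>

// Times n_loops of an out-of-place R2C/C2R round trip of size ny x nx, in milliseconds.
static float time_cufft_round_trip(int nx, int ny, int n_loops) {
    cufftReal*    d_real    = nullptr;
    cufftComplex* d_complex = nullptr;
    cudaMalloc((void**)&d_real, sizeof(cufftReal) * nx * ny);
    cudaMalloc((void**)&d_complex, sizeof(cufftComplex) * (nx / 2 + 1) * ny);

    cufftHandle plan_fwd, plan_inv;
    cufftPlan2d(&plan_fwd, ny, nx, CUFFT_R2C); // cuFFT takes the slowest-varying dimension first
    cufftPlan2d(&plan_inv, ny, nx, CUFFT_C2R);
    cufftSetStream(plan_fwd, cudaStreamPerThread);
    cufftSetStream(plan_inv, cudaStreamPerThread);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, cudaStreamPerThread);
    for ( int i = 0; i < n_loops; ++i ) {
        cufftExecR2C(plan_fwd, d_real, d_complex);
        cufftExecC2R(plan_inv, d_complex, d_real);
    }
    cudaEventRecord(stop, cudaStreamPerThread);
    cudaEventSynchronize(stop);

    float elapsed_ms = 0.f;
    cudaEventElapsedTime(&elapsed_ms, start, stop);

    cufftDestroy(plan_fwd);
    cufftDestroy(plan_inv);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_real);
    cudaFree(d_complex);
    return elapsed_ms;
}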
- FastFFT::FourierTransformer FT; - FastFFT::FourierTransformer FT_complex; - - // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU - FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true, false); - FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - - FT_complex.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z, true, false); - FT_complex.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z, true); - - FT_input.real_memory_allocated = FT.ReturnInputMemorySize( ); - FT_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); - - FT_input_complex.real_memory_allocated = FT_complex.ReturnInputMemorySize( ); - FT_output_complex.real_memory_allocated = FT_complex.ReturnInvOutputMemorySize( ); - std::cout << "Allocated " << FT_input_complex.real_memory_allocated << " bytes for input.\n"; - std::cout << "Allocated complex " << FT_output_complex.real_memory_allocated << " bytes for input.\n"; - - bool set_fftw_plan = true; - FT_input.Allocate(set_fftw_plan); - FT_output.Allocate(set_fftw_plan); - - FT_input_complex.Allocate( ); - FT_output_complex.Allocate( ); - - // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which - // ensures faster transfer. If false, it will be pinned for you. - FT.SetInputPointer(FT_input.real_values, false); - FT_complex.SetInputPointer(FT_input_complex.complex_values, false); - - FT.SetToConstant(FT_input.real_values, FT_input.real_memory_allocated, 1.f); - - // Set a unit impulse at the center of the input array. - // FT.SetToConstant(FT_input.real_values, FT_input.real_memory_allocated, 1.0f); - float2 const_val = make_float2(1.0f, 0.0f); - FT_complex.SetToConstant(FT_input_complex.complex_values, FT_input.real_memory_allocated, const_val); - for ( int i = 0; i < 10; i++ ) { - std::cout << FT_input_complex.complex_values[i].x << "," << FT_input_complex.complex_values[i].y << std::endl; - } - - FT.CopyHostToDevice( ); - FT_complex.CopyHostToDevice( ); - cudaErr(cudaStreamSynchronize(cudaStreamPerThread)); - - // Set the outputs to a clearly wrong answer. 
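The pinned-memory comment above is what makes the host/device copies fast: the transformer only needs a page-locked host buffer, and you can either let SetInputPointer pin it for you or prepare it yourself. A minimal sketch of the do-it-yourself route with plain CUDA runtime calls follows; the function and buffer names are placeholders, not part of the FastFFT API.

#include <cuda_runtime.h>

// Either allocate pinned memory up front, or page-lock an existing allocation in place.
void make_pinned_host_buffer(float** host_buffer, size_t n_bytes) {
    // Option 1: allocate page-locked memory directly.
    cudaHostAlloc((void**)host_buffer, n_bytes, cudaHostAllocDefault);

    // Option 2 (instead of option 1): pin memory obtained elsewhere, e.g. from fftwf_malloc.
    // cudaHostRegister(*host_buffer, n_bytes, cudaHostRegisterDefault);

    // Release later with cudaFreeHost( ) or cudaHostUnregister( ), matching the option used.
}

A buffer prepared this way could then be handed over with the pinned flag set, e.g. FT.SetInputPointer(host_buffer, true), assuming the flag carries the meaning described in the comment above.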
- FT.SetToConstant(FT_output.real_values, FT_input.real_memory_allocated, 2.0f); - const_val = make_float2(2.0f, 2.0f); - FT_complex.SetToConstant(FT_output_complex.complex_values, FT_output.real_memory_allocated, const_val); - - FT_input.FwdFFT( ); - - for ( int i = 0; i < 5; ++i ) - std::cout << "FFTW fwd " << FT_input.real_values[i] << std::endl; - std::cout << std::endl; - - bool transpose_output = false; - bool swap_real_space_quadrants = false; - FT.FwdFFT( ); - FT_complex.FwdFFT( ); - - FT.CopyDeviceToHost(FT_output.real_values, false, false); - FT_complex.CopyDeviceToHost(FT_output_complex.real_values, false, false); - - for ( int i = 0; i < 10; ++i ) { - std::cout << "FT fwd " << FT_output.real_values[i] << std::endl; - } - for ( int i = 0; i < 10; ++i ) { - std::cout << "FT complex fwd " << FT_output_complex.real_values[i].x << "," << FT_output_complex.real_values[i].y << std::endl; - } - - FT_input.InvFFT( ); - - for ( int i = 0; i < 5; ++i ) { - std::cout << "FFTW inv " << FT_input.real_values[i] << std::endl; - } - std::cout << std::endl; - - FT.InvFFT( ); - FT_complex.InvFFT( ); - FT.CopyDeviceToHost(FT_output.real_values, true, true); - FT_complex.CopyDeviceToHost(FT_output_complex.real_values, true, true); - - for ( int i = 0; i < 10; i++ ) { - std::cout << "Ft inv " << FT_output.real_values[i] << std::endl; - } - for ( int i = 0; i < 10; i++ ) { - std::cout << "Ft complex inv " << FT_output_complex.real_values[i].x << "," << FT_output_complex.real_values[i].y << std::endl; - } - } -} - -void print_options(char** argv) { - std::cout << "Usage: " << argv[0] << "\n\n"; - std::printf("%-24s : %-24s\n", "--all", "run all available tests"); - std::printf("%-24s : %-24s\n", "--2d-unit-tests", "run constant image and unit impulse tests for 2d"); - std::printf("%-24s : %-24s\n", "--3d-unit-tests", "run constant image and unit impulse tests for 3d"); - std::printf("%-24s : %-24s\n", "--2d-performance-tests", "run base fft, and cross-correlation tests for 2d"); - std::printf("%-24s : %-24s\n", "--3d-performance-tests", "run base fft, and cross-correlation tests for 3d"); - - std::cout << "\n" - << std::endl; -} - -int main(int argc, char** argv) { - - if ( argc != 2 ) { - print_options(argv); - return 1; - } - - std::string test_name = argv[1]; - std::printf("Standard is %li\n\n", __cplusplus); - - // Input size vectors to be tested. - std::vector test_size = {32, 64, 128, 256, 512, 1024, 2048, 4096}; - std::vector test_size_rectangle = {64, 128, 256, 512, 1024, 2048, 4096}; - std::vector test_size_3d = {32, 64, 128, 256, 512}; - // std::vector test_size_3d ={512}; - - // The launch parameters fail for 4096 -> < 64 for r2c_decrease, not sure if it is the elements_per_thread or something else. 
- // For now, just over-ride these small sizes - std::vector test_size_for_decrease = {64, 128, 256, 512, 1024, 2048, 4096}; - bool run_2d_unit_tests = false; - bool run_3d_unit_tests = false; - bool run_2d_performance_tests = false; - bool run_3d_performance_tests = false; - - if ( test_name == "--all" ) { - std::cout << "Running all tests" << std::endl; - run_2d_unit_tests = true; - run_3d_unit_tests = true; - run_2d_performance_tests = true; - run_3d_performance_tests = true; - } - else if ( test_name == "--2d-unit-tests" ) { - std::cout << "Running 2d unit tests" << std::endl; - run_2d_unit_tests = true; - } - else if ( test_name == "--3d-unit-tests" ) { - std::cout << "Running 3d unit tests" << std::endl; - run_3d_unit_tests = true; - } - else if ( test_name == "--2d-performance-tests" ) { - std::cout << "Running 2d performance tests" << std::endl; - run_2d_performance_tests = true; - } - else if ( test_name == "--3d-performance-tests" ) { - std::cout << "Running 3d performance tests" << std::endl; - run_3d_performance_tests = true; - } - else { - std::cout << "\n\nUnknown test name: " << test_name << "\n\n"; - print_options(argv); - return 1; - } - - if ( run_2d_unit_tests ) { - if ( ! const_image_test<2>(test_size, false) ) - return 1; - if ( ! unit_impulse_test<2>(test_size, false, true) ) - return 1; - } - - if ( run_3d_unit_tests ) { - if ( ! const_image_test<3>(test_size_3d, true) ) - return 1; - // if (! unit_impulse_test(test_size_3d, true, true)) return 1; - } - - if ( run_2d_performance_tests ) { -#ifdef HEAVYERRORCHECKING_FFT - std::cout << "Running performance tests with heavy error checking.\n"; - std::cout << "This doesn't make sense as the synchronizations are invalidating.\n"; -// exit(1); -#endif - - int size_change_type; - bool do_3d = false; - - size_change_type = 0; // no - compare_libraries<2>(test_size, do_3d, size_change_type, false); - // compare_libraries<2>(test_size_rectangle, do_3d, size_change_type, true); - - size_change_type = 1; // increase - compare_libraries<2>(test_size, do_3d, size_change_type, false); - // compare_libraries<2>(test_size_rectangle, do_3d, size_change_type, true); - exit(1); - size_change_type = -1; // decrease - compare_libraries<2>(test_size, do_3d, size_change_type, false); - } - - if ( run_3d_performance_tests ) { -#ifdef HEAVYERRORCHECKING_FFT - std::cout << "Running performance tests with heavy error checking.\n"; - std::cout << "This doesn't make sense as the synchronizations are invalidating.\n"; -#endif - - int size_change_type; - bool do_3d = true; - - size_change_type = 0; // no change - compare_libraries<3>(test_size, do_3d, size_change_type, false); - - // TODO: These are not yet completed. - // size_change_type = 1; // increase - // compare_libraries(test_size, do_3d, size_change_type, false); - - // size_change_type = -1; // decrease - // compare_libraries(test_size, do_3d, size_change_type, false); - } - - // If we get here, all tests passed. - return 0; -}; diff --git a/src/tests/tests.h b/src/tests/tests.h new file mode 100644 index 0000000..3818d50 --- /dev/null +++ b/src/tests/tests.h @@ -0,0 +1,59 @@ +#ifndef _SRC_TESTS_TESTS_H +#define _SRC_TESTS_TESTS_H + +#include "../fastfft/Image.cuh" +#include "../../include/FastFFT.cuh" +#include "helper_functions.cuh" + +#include + +namespace FastFFT { +// Input size vectors to be tested. 
+std::vector<int> test_size = {32, 64, 128, 256, 512, 1024, 2048, 4096};
+std::vector<int> test_size_rectangle = {64, 128, 256, 512, 1024, 2048, 4096};
+std::vector<int> test_size_3d = {32, 64, 128, 256, 512};
+// std::vector<int> test_size_3d ={512};
+
+// The launch parameters fail for 4096 -> < 64 for r2c_decrease, not sure if it is the elements_per_thread or something else.
+// For now, just override these small sizes
+std::vector<int> test_size_for_decrease = {64, 128, 256, 512, 1024, 2048, 4096};
+
+void CheckInputArgs(int argc, char** argv, const std::string_view& text_line, bool& run_2d_unit_tests, bool& run_3d_unit_tests) {
+    switch ( argc ) {
+        case 1: {
+            std::cout << "Running all tests" << std::endl;
+            run_2d_unit_tests = true;
+            run_3d_unit_tests = true;
+            break;
+        }
+        case 2: {
+            std::string test_name = argv[1];
+            if ( test_name == "--all" ) {
+                std::cout << "Running all tests" << std::endl;
+                run_2d_unit_tests = true;
+                run_3d_unit_tests = true;
+            }
+            else if ( test_name == "--2d" ) {
+                std::cout << "Running 2d " << text_line << " tests" << std::endl;
+                run_2d_unit_tests = true;
+            }
+            else if ( test_name == "--3d" ) {
+                std::cout << "Running 3d " << text_line << " tests" << std::endl;
+                run_3d_unit_tests = true;
+            }
+            else {
+                std::cout << "Usage: " << argv[0] << " < --all (default w/ no arg), --2d, --3d>" << std::endl;
+                std::exit(0);
+            }
+            break;
+        }
+        default: {
+            std::cout << "Usage: " << argv[0] << " < --all (default w/ no arg), --2d, --3d>" << std::endl;
+            std::exit(0);
+        }
+    };
+};
+
+} // namespace FastFFT
+
+#endif
\ No newline at end of file
diff --git a/src/tests/unit_impulse_test.cu b/src/tests/unit_impulse_test.cu
new file mode 100644
index 0000000..94fe6fb
--- /dev/null
+++ b/src/tests/unit_impulse_test.cu
@@ -0,0 +1,211 @@
+
+#include "tests.h"
+
+template <int Rank>
+bool unit_impulse_test(std::vector<int> size, bool do_increase_size) {
+
+    bool all_passed = true;
+    std::vector<bool> init_passed(size.size( ), true);
+    std::vector<bool> FFTW_passed(size.size( ), true);
+    std::vector<bool> FastFFT_forward_passed(size.size( ), true);
+    std::vector<bool> FastFFT_roundTrip_passed(size.size( ), true);
+
+    short4 input_size;
+    short4 output_size;
+    for ( int iSize = 0; iSize < size.size( ) - 1; iSize++ ) {
+        int oSize = iSize + 1;
+        while ( oSize < size.size( ) ) {
+
+            // std::cout << std::endl << "Testing padding from " << size[iSize] << " to " << size[oSize] << std::endl;
+            if ( do_increase_size ) {
+                if ( Rank == 3 ) {
+                    input_size = make_short4(size[iSize], size[iSize], size[iSize], 0);
+                    output_size = make_short4(size[oSize], size[oSize], size[oSize], 0);
+                }
+                else {
+                    input_size = make_short4(size[iSize], size[iSize], 1, 0);
+                    output_size = make_short4(size[oSize], size[oSize], 1, 0);
+                }
+            }
+            else {
+                if ( Rank == 3 ) {
+                    output_size = make_short4(size[iSize], size[iSize], size[iSize], 0);
+                    input_size = make_short4(size[oSize], size[oSize], size[oSize], 0);
+                }
+                else {
+                    output_size = make_short4(size[iSize], size[iSize], 1, 0);
+                    input_size = make_short4(size[oSize], size[oSize], 1, 0);
+                }
+            }
+
+            float sum;
+
+            Image host_input(input_size);
+            Image host_output(output_size);
+            Image device_output(output_size);
+
+            // We just make one instance of the FourierTransformer class, with calc type float.
+            // For the time being input and output are also float. TODO calc optionally either fp16 or nv_bfloat16, TODO inputs at lower precision for bandwidth improvement.
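The instance created just below drives the whole test; for orientation, the sequence of FastFFT calls it goes through is summarized here in outline. Only calls that already appear in these tests are listed, with template parameters and some optional arguments omitted.

// 1) Describe the forward (possibly padded) and inverse transforms up front, like making a cuFFT plan:
//        FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z);
//        FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z);
// 2) Size and attach the host buffer:
//        host_input.real_memory_allocated = FT.ReturnInputMemorySize( );
//        FT.SetInputPointer(host_input.real_values, false);
// 3) Move data and transform (optionally fused with functors, as Generic_Fwd_Image_Inv does above):
//        FT.CopyHostToDevice(host_input.real_values);
//        FT.FwdFFT( );
//        FT.InvFFT( );
// 4) Copy back for the checks:
//        FT.CopyDeviceToHostAndSynchronize(host_output.real_values, ...);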
+ FastFFT::FourierTransformer FT; + // This is similar to creating an FFT/CUFFT plan, so set these up before doing anything on the GPU + FT.SetForwardFFTPlan(input_size.x, input_size.y, input_size.z, output_size.x, output_size.y, output_size.z); + FT.SetInverseFFTPlan(output_size.x, output_size.y, output_size.z, output_size.x, output_size.y, output_size.z); + + // The padding (dims.w) is calculated based on the setup + short4 dims_in = FT.ReturnFwdInputDimensions( ); + short4 dims_out = FT.ReturnFwdOutputDimensions( ); + // Determine how much memory we need, working with FFTW/CUDA style in place transform padding. + // Note: there is no reason we really need this, because the xforms will always be out of place. + // For now, this is just in place because all memory in cisTEM is allocated accordingly. + host_input.real_memory_allocated = FT.ReturnInputMemorySize( ); + host_output.real_memory_allocated = FT.ReturnInvOutputMemorySize( ); + + // On the device, we will always allocate enough memory for the larger of input/output including the buffer array. + // Minmize the number of calls to malloc which are slow and can lead to fragmentation. + device_output.real_memory_allocated = std::max(host_input.real_memory_allocated, host_output.real_memory_allocated); + + // In your own programs, you will be handling this memory allocation yourself. We'll just make something here. + // I think fftwf_malloc may potentially create a different alignment than new/delete, but kinda doubt it. For cisTEM consistency... + bool set_fftw_plan = true; + host_input.Allocate(set_fftw_plan); + host_output.Allocate(set_fftw_plan); + + // Now we want to associate the host memory with the device memory. The method here asks if the host pointer is pinned (in page locked memory) which + // ensures faster transfer. If false, it will be pinned for you. + FT.SetInputPointer(host_input.real_values, false); + + // Set a unit impulse at the center of the input array. + FT.SetToConstant(host_input.real_values, host_input.real_memory_allocated, 0.0f); + FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 0.0f); + + host_input.real_values[0] = 1.0f; + host_output.real_values[0] = 1.0f; + + // This will exit if fail, so the following bools are not really needed any more. + CheckUnitImpulseRealImage(host_output, __LINE__); + + // TODO: remove me + // if ( sum != 1 ) { + // all_passed = true; + // init_passed[iSize] = true; + // } + + // This copies the host memory into the device global memory. If needed, it will also allocate the device memory first. + FT.CopyHostToDevice(host_input.real_values); + + host_output.FwdFFT( ); + + host_output.fftw_epsilon = host_output.ReturnSumOfComplexAmplitudes(host_output.complex_values, host_output.real_memory_allocated / 2); + // std::cout << "host " << host_output.fftw_epsilon << " " << host_output.real_memory_allocated<< std::endl; + + host_output.fftw_epsilon -= (host_output.real_memory_allocated / 2); + if ( std::abs(host_output.fftw_epsilon) > 1e-8 ) { + all_passed = false; + FFTW_passed[iSize] = false; + } + + // MyFFTDebugAssertTestTrue( std::abs(host_output.fftw_epsilon) < 1e-8 , "FFTW unit impulse forward FFT"); + + // Just to make sure we don't get a false positive, set the host memory to some undesired value. 
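The two numeric checks in this test come straight from textbook DFT identities rather than anything FastFFT-specific, so it is worth stating them once. For a unit impulse at index 0 and unnormalized transforms,

$$\hat{x}[k] = \sum_{n=0}^{N-1} x[n]\,e^{-2\pi i k n / N} = x[0] = 1 \quad \text{for every } k, \qquad \mathcal{F}^{-1}\big[\mathcal{F}[x]\big] = N\,x.$$

Hence the summed magnitude of the forward result equals the number of complex coefficients kept by the in-place layout (the quantity fftw_epsilon is compared against above), and after the forward/inverse pair the impulse image sums to dims_out.x * dims_out.y * dims_out.z, which is exactly the round-trip check below.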
+ // FIXME: This wouldn't work for size decrease + FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 2.0f); + + // This method will call the regular FFT kernels given the input/output dimensions are equal when the class is instantiated. + // bool swap_real_space_quadrants = true; + + FT.FwdFFT( ); + + bool continue_debugging; + // We don't want this to break compilation of other tests, so only check at runtime. + if constexpr ( FFT_DEBUG_STAGE < 5 ) { + if ( do_increase_size ) { + FT.CopyDeviceToHostAndSynchronize(host_output.real_values, false); + // Right now, only testing a size change on the forward transform, + continue_debugging = debug_partial_fft(host_output, input_size, output_size, output_size, output_size, __LINE__); + sum = host_output.ReturnSumOfComplexAmplitudes(host_output.complex_values, host_output.real_memory_allocated / 2); + } + else { + FT.CopyDeviceToHostAndSynchronize(host_input.real_values, FT.ReturnInputMemorySize( )); + continue_debugging = debug_partial_fft(host_input, input_size, output_size, output_size, output_size, __LINE__); + sum = host_input.ReturnSumOfComplexAmplitudes(host_input.complex_values, host_input.real_memory_allocated / 2); + } + + sum -= (host_output.real_memory_allocated / 2); + + // FIXME : shared comparison functions + if ( abs(sum) > 1e-8 ) { + all_passed = false; + FastFFT_forward_passed[iSize] = false; + } + } + // MyFFTDebugAssertTestTrue( abs(sum - host_output.fftw_epsilon) < 1e-8, "FastFFT unit impulse forward FFT"); + FT.SetToConstant(host_output.real_values, host_output.real_memory_allocated, 2.0f); + + FT.InvFFT( ); + FT.CopyDeviceToHostAndSynchronize(host_output.real_values, true); + + if constexpr ( FFT_DEBUG_STAGE > 4 ) { + // Right now, only testing a size change on the forward transform, + continue_debugging = debug_partial_fft(host_output, input_size, output_size, output_size, output_size, __LINE__); + + sum = host_output.ReturnSumOfReal(host_output.real_values, dims_out); + if ( sum != dims_out.x * dims_out.y * dims_out.z ) { + all_passed = false; + FastFFT_roundTrip_passed[iSize] = false; + } + } + + // std::cout << "size in/out " << dims_in.x << ", " << dims_out.x << std::endl; + // MyFFTDebugAssertTestTrue( sum == dims_out.x*dims_out.y*dims_out.z,"FastFFT unit impulse round trip FFT"); + + oSize++; + } // while loop over pad to size + } // for loop over pad from size + + if ( all_passed ) { + if ( ! do_increase_size ) + std::cout << " All rank " << Rank << " size_decrease unit impulse tests passed!" << std::endl; + else + std::cout << " All rank " << Rank << " size_increase unit impulse tests passed!" << std::endl; + } + else { + for ( int n = 0; n < size.size( ); n++ ) { + if ( ! init_passed[n] ) + std::cout << " Initialization failed for size " << size[n] << " rank " << Rank << std::endl; + if ( ! FFTW_passed[n] ) + std::cout << " FFTW failed for size " << size[n] << " rank " << Rank << std::endl; + if ( ! FastFFT_forward_passed[n] ) + std::cout << " FastFFT failed for forward transform size " << size[n] << " rank " << Rank << std::endl; + if ( ! 
FastFFT_roundTrip_passed[n] ) + std::cout << " FastFFT failed for roundtrip transform size " << size[n] << " rank " << Rank << std::endl; + } + } + return all_passed; +} + +int main(int argc, char** argv) { + + std::string test_name; + // Default to running all tests + bool run_2d_unit_tests = false; + bool run_3d_unit_tests = false; + + const std::string_view text_line = "unit impulse"; + FastFFT::CheckInputArgs(argc, argv, text_line, run_2d_unit_tests, run_3d_unit_tests); + + constexpr bool do_increase_size = true; + // TODO: size decrease + if ( run_2d_unit_tests ) { + if ( ! unit_impulse_test<2>(FastFFT::test_size, do_increase_size) ) + return 1; + } + + if ( run_3d_unit_tests ) { + // FIXME: tests are failing for 3d + if ( ! unit_impulse_test<3>(FastFFT::test_size_3d, do_increase_size) ) + return 1; + // if (! unit_impulse_test(test_size_3d, true, true)) return 1; + } + + return 0; +}; \ No newline at end of file
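As a closing note on the precheck / postcheck macros used throughout, and on the earlier warning that HEAVYERRORCHECKING_FFT invalidates the performance numbers: a typical shape for such launch-checking macros is sketched below. The names checked_pre / checked_post and the exact expansion are assumptions, not copied from FastFFT; the point is that the stream synchronize in the post-launch check serializes every kernel, which is why timing loops built on it are not meaningful.

#include <cstdio>
#include <cuda_runtime.h>

// Assumed pattern only: report any earlier error before a launch...
#define checked_pre()                                                                 \
    {                                                                                 \
        cudaError_t err_ = cudaGetLastError( );                                       \
        if ( err_ != cudaSuccess )                                                    \
            std::fprintf(stderr, "prior CUDA error: %s\n", cudaGetErrorString(err_)); \
    }

// ...and synchronize after it so launch or runtime failures surface at the right call site.
#define checked_post()                                                                \
    {                                                                                 \
        cudaError_t err_ = cudaStreamSynchronize(cudaStreamPerThread);                \
        if ( err_ != cudaSuccess )                                                    \
            std::fprintf(stderr, "kernel error: %s\n", cudaGetErrorString(err_));     \
    }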