Merge pull request #35 from StochasticAnalytics/paperwork

Paperwork
StochasticAnalytics · Jan 29, 2024 · 5be151f · 5be151f
2 parents 0de8fdd + 14262a8
commit 5be151f
Show file tree

Hide file tree

Showing 43 changed files with 10,237 additions and 3,212 deletions.
diff --git a/.editorconfig b/.editorconfig
diff --git a/.vscode_shared/BenHimes/settings.json b/.vscode_shared/BenHimes/settings.json
@@ -74,7 +74,17 @@
         "semaphore": "cpp",
         "stop_token": "cpp",
         "complex": "cpp",
-        "unordered_set": "cpp"
+        "unordered_set": "cpp",
+        "__config": "cpp",
+        "target": "cpp",
+        "ios": "cpp",
+        "__pragma_push": "cpp",
+        "__locale": "cpp",
+        "__bit_reference": "cpp",
+        "__functional_base": "cpp",
+        "__node_handle": "cpp",
+        "__memory": "cpp",
+        "locale": "cpp"
     },
     "C_Cpp.clang_format_path": "/usr/bin/clang-format-14",
     "editor.formatOnSave": true,

diff --git a/.vscode_shared/BenHimes/tasks.json b/.vscode_shared/BenHimes/tasks.json
@@ -14,7 +14,12 @@
     {
       "label": "Book_build",
       "type": "shell",
-      "command": "${HOME}/.FastFFTDocs/bin/jupyter-book build --all ./docs && firefox ./docs/_build/html/index.html"
+      "command": "${HOME}/.FastFFTDocs/bin/jupyter-book build --all ./docs && firefox ./docs/_build/html/index.html",
+      "problemMatcher": [],
+      "group": {
+        "kind": "build",
+        "isDefault": true
+      }
     },
     {
       "label": "Book_publish",

diff --git a/build/Makefile b/build/Makefile
@@ -2,7 +2,6 @@
 NVCC=nvcc
 # TODO: test with gcc and clang
 NVCC_FLAGS=-ccbin=g++ -t 8 
-
 # TODO: static may need to be added later or independently just for benchmarks, while libs to link through python are probably going to need to be dynamic.
 NVCC_FLAGS+=--cudart=static
 NVCC_FLAGS+=--default-stream per-thread -m64 -O3 --use_fast_math --extra-device-vectorization --extended-lambda --Wext-lambda-captures-this -std=c++17 -Xptxas --warn-on-local-memory-usage,--warn-on-spills, --generate-line-info -Xcompiler=-std=c++17
@@ -11,6 +10,9 @@ NVCC_FLAGS+=--expt-relaxed-constexpr
 # For testing, particularly until the cufftdx::BlockDim() operator is enabled
 NVCC_FLAGS+=-DCUFFTDX_DISABLE_RUNTIME_ASSERTS
 
+# Don't explicitly instantiate the 3d versions unless requested
+NVCC_FLAGS+=-DFastFFT_3d_instantiation
+
 # Gencode arguments, only supporting Volta or newer
 # SMS ?= 70 75 80 86
 # In initial dev, only compile for 70 or 86 depending on which workstation I'm on, b/c it is faster.
@@ -38,7 +40,7 @@ $(info $$CUFFTDX_INCLUDE_DIR is [${CUFFTDX_INCLUDE_DIR}])
 CUDA_BIN_DIR=$(shell dirname `which $(NVCC)`)
 CUDA_INCLUDE_DIR=$(CUDA_BIN_DIR)/../include
 
-
+# TODO: Changing the debug flags will not force re compilation
 DEBUG_FLAGS :=  
 # Debug level determines various asserts and print macros defined in FastFFT.cuh These should only be set when building tests and developing.
 ifeq (${FFT_DEBUG_LEVEL},)
@@ -79,6 +81,10 @@ endif
 ifeq (${FastFFT_sync_checking},1)
 # If HEAVYERRORCHECKING_FFT is not already asked for, then add it anytime debug_stage < 8 (partial FFTs)
 	DEBUG_FLAGS+=-DHEAVYERRORCHECKING_FFT
+else
+	ifeq (${debug_level}, 4)
+		DEBUG_FLAGS+=-DHEAVYERRORCHECKING_FFT
+	endif
 endif
 
 
@@ -89,6 +95,7 @@ EXTERNAL_LIBS= -lfftw3f -lcufft_static -lculibos -lcudart_static -lrt
 TEST_BUILD_DIR=tests
 TEST_SRC_DIR=../src/tests
 # Get all the test source files and remove cu extension
+# TODO: selective targets
 TEST_TARGETS=$(patsubst %.cu,$(TEST_BUILD_DIR)/%,$(notdir $(wildcard $(TEST_SRC_DIR)/*.cu)))
 TEST_DEPS=$(wildcard $(TEST_SRC_DIR)/*.cuh)
 

diff --git a/docs/_docs/MS/manuscript.md b/docs/_docs/MS/manuscript.md
@@ -7,28 +7,37 @@
 
 ## Abstract
 
-The Fast Fourier transform (FFT) is one of the most widely used and heavily optimized digital signal processing algorithms. Many of the image processing algorithms used in cryo-Electron Microscopy rely on the FFT to accelerate convolution operations central to image alignment and also in reconstruction algorithms that operate most accurately in Fourier space. FFT libraries like FFTW and cuFFT provide routines for highly-optimized general purpose multi-dimensional FFTs; however, they overlook several use-cases where only a subset of the input or output points are required. We demonstrate that algorithms based on the transform decomposition approach are well suited to the memory hierarchy of modern GPUs by implementing them in CUDA/C++ using the cufftdx header-only library. The results have practical implications, accelerating several key image processing algorithms by factors of 3-10x over those built using Nvidia’s general purpose FFT library cuFFT. These include movie-frame alignment, image resampling via Fourier cropping, 2d and 3d template matching, and subtomogram averaging and alignment.
+The Fast Fourier transform (FFT) is one of the most widely used and heavily optimized digital signal processing algorithms. Many of the image processing algorithms used in cryo-Electron Microscopy rely on the FFT to accelerate convolution operations central to image alignment and also in reconstruction algorithms that operate most accurately in Fourier space. FFT libraries like FFTW and cuFFT provide routines for highly-optimized general purpose multi-dimensional FFTs; however, their generality comes at a cost of performance in several use-cases where only a subset of the input or output points are required. We demonstrate that algorithms based on the transform decomposition approach are well suited to the memory hierarchy of modern GPUs by implementing them in CUDA/C++ using the cufftdx header-only library. The results have practical implications, accelerating several key image processing algorithms by factors of 3-10x over those built using Nvidia’s general purpose FFT library cuFFT. These include movie-frame alignment, image resampling via Fourier cropping, 2d and 3d template matching, 3d reconstruction, and subtomogram averaging and alignment.
 
 ## Introduction
 
-The Discrete Fourier Transform (DFT) and linear filtering, *e.g.* convolution, are among the most common operations in digital signal processing. It is therefore assumed that the reader has a basic familiarity with Fourier Analysis and it's applications in their respective fields; we will focus here on digital image processing for convenience. For a detailed introduction to the reader is referred to the free book by Smith {cite:p}`smith_mathematics_2008`. There are many ways to implement the computation of the DFT, the most simple of which can be understood as a matrix multiplication, which scales in computational complexity as O(n^2). The Fast Fourier Transform (FFT) implements some level of recursion, as described below, which in turn reduces the computational complexity to O(nlog(n)). This reduction in number of operations leads to both an increase in the speed, reduced roundoff error, as well as a reduced memory footprint. The goal of many popular FFT library's is to provide optimal routines for FFT's of any size that are also adapted to the vagarys of the specific computer hardware they are being run on ⚠️. Most papers describing FFT algorithms focus on computational complexity from the perspective of the number of arithmetic operations  ⚠️, however, cache coherency and limited on chip memory ultimately reduce the efficiency of large FFTs (⚠️ TODO: Intro figure 1). For certain applications, particularly research that requires high performance computing resources, this generality is not strictly needed, and substantial financial and environmental resources could be minimized by using specialezed hardware or software. (Add something here about asics and fpgas ⚠️) By incorporating prior information about specific image processing algorithms, we present optimized two and three dimensional FFT routines that blah blah blahh⚠️.
+The Discrete Fourier Transform (DFT) and linear filtering, *e.g.* convolution, are among the most common operations in digital signal processing. It is therefore assumed that the reader has a basic familiarity with Fourier Analysis and it's applications in their respective fields; we will focus here on digital image processing for convenience. For a detailed introduction to the reader is referred to the free book by Smith {cite:p}`smith_mathematics_2008`. There are many ways to implement the computation of the DFT, the most simple of which can be understood as a matrix multiplication, which scales in computational complexity as O(n^2). In order improve the performance of the DFT, several algorithms have been devised, that collectively may be called the Fast Fourier Transform (FFT). All of these implement some level of recursion, as described below, which in turn reduces the computational complexity to O(nlog(n)). The FFT additionally improves performance by reducing the total memory footprint of the calculation, allowing for better utilization of high-performance, low capacity memory cache. In addition to increasing computational speed, this reduction in the total number of floating point operations also leads to reduced roundoff error, making the end result more accurate. The goal of many popular FFT library's is to provide optimal routines for FFT's of any size that are also adapted to the vagarys of the specific computer hardware they are being run on ⚠️. Most papers describing FFT algorithms focus on computational complexity from the perspective of the number of arithmetic operations  ⚠️, however, cache coherency and limited on chip memory ultimately reduce the efficiency of large FFTs (⚠️ TODO: Intro figure 1). For certain applications, particularly research that requires high performance computing resources, some generality may be exchanged for even more performance, resulting in substantial reduction of financial, and environmental, resources. At the extreme end of this tradeoff between generality and performance are hardware build specifically for a single algorithm, (Add something here about asics and fpgas and ANTON ⚠️) By incorporating prior information about specific image processing algorithms, we present optimized two and three dimensional FFT routines that blah blah blahh⚠️.
 
 ### Background
 
+#### Examples from cryo-EM
+
+⚠️ I think it makes sense to lead with the specific use cases we will be having to test and implement the desired improvements, from the perspective of cryo-EM. From there, providing a few examples of parallels, probably in CNNs etc and only then going into specifics about the implementation.
+
 #### The discrete Fourier Transform
 
-The Fourier Transform is a mathematical operation that converts an input function into a dual space; for example a function of time into a function of frequency. These dual spaces are sometimes referred to as position and momentum space, real and Fourier space, image space and k-space etc. Given that a "real-space" function can be complex valued, we will use the position and momentum space nomenclature to avoid ambiguity.
+The Fourier Transform is a mathematical operation that maps an input function into a dual space; for example a function of time into a function of frequency. These dual spaces are sometimes referred to as position and momentum space, real and Fourier space, image space and k-space etc. Given that a "real-space" function can be complex valued, we will use the position and momentum space nomenclature to avoid ambiguity.
 
-creatinThe discrete Fourier Transform (DFT) extends the operation of the Fourier Transform to a band-limited sequence of evenly spaced samples of a continuous function. In one dimension, it is defined for a sequence of N samples $x(n)$ as: Throughout the text we use lower/upper case to refer to position/momemtum space variables.
+The discrete Fourier Transform (DFT) extends the operation of the Fourier Transform to a band-limited sequence of evenly spaced samples of a continuous function. In one dimension, it is defined for a sequence of N samples $x(n)$ as: 
 
 % This produces a labelled eqution in jupyter book that will at least render the math in vscode preview, just without the label.
-$$ X(k) = \sum_{n=0}^{N-1} x(n) \exp\left( -2\pi i k n \right) $$ (dft-1d-equation)
+$$ X(k) = \sum_{n=0}^{N-1} x(n) \exp\left( \frac{-2\pi i k}{N}n \right) $$ (dft-1d-equation)
+
+```{note}
+*Throughout the text we use lower/upper case to refer to position/momemtum space variables.*
+```
+
 
 The DFT is fully seperable when calculated with respect to a Cartesian coordintate system. For example, for an M x N array, the DFT is defined as:
 
-$$ X(k_m,k_n) = \sum_{m=0}^{M-1} \left[ \sum_{n=0}^{N-1} x(m,n) \exp\left( -2\pi i k_n n \right) \right] \exp\left( -2\pi i k_m m \right) $$ (dft-2d-equation)
+$$ X(k_m,k_n) = \sum_{m=0}^{M-1} \left[ \sum_{n=0}^{N-1} x(m,n) \exp\left( \frac{-2\pi i k_n}{N} n \right) \right] \exp\left( \frac{-2\pi i k_m}{M} m \right) $$ (dft-2d-equation)
 
-From this equation, it should be clear in the most simple case the 2D DFT can be calculated by first calculating the 1D FFT for each column and then each row, resulting in $ M \times N $ 1D DFTs. This seperability extends to higher dimensions, and is what permits us to exploit regions of the input that are known to be zero.
+From this equation, it should be clear in the most simple cases the 2D DFT can be calculated by first calculating the 1D FFT for each column and then each row, resulting in $ M \times N $ 1D DFTs. This seperability extends to higher dimensions, and is what permits us to exploit regions of the input that are known to be zero.
 
 In addition to being seperable, the DFT has several other important properties:
 
@@ -82,9 +91,6 @@ The simplest approach to avoiding redundant calculations and memory transfers in
 
 ### The DFT and FFT
 
-Fast Fourier Transform (FFT) is a mathematical operation that transforms a function $ X(n) $ from the real-valued plane into the complex-valued plane. The function $ f(x) $ is a function of $ x $ and is often a function of the real-valued signal $ x $ or a function of the complex-valued signal $ x + i\cdot y $. The FFT is an optimized algorithm for copying the discrete Fourier transform (DFT) defined as: 
-
-( I think this paragraph is probably belonging in the introduction )
 
 The FFT is useful in image analysis as it can help to isolate repetitive features of a given size. It is also useful in signal processing as it can be used to perform convolution operations. Multidimensional FFTs are fully seperable, such that in the simpliest case, FFTs in higher dimensions are composed of a sequence of 1D FFTs. For example, in two dimensions, the FFT of a 2D image is composed of the 1D FFTs of the rows and columns. A naive implementation is compuatationally ineffecient due to the strided memory access pattern⚠️. One solution is to transpose the data after the FFT and then transpose the result back ⚠️. This requires extra trips to and from main memory. Another solution is to decompose the 2D transform using vector-radix FFTs ⚠️.
 

diff --git a/docs/_docs/references/class_reference.md b/docs/_docs/references/class_reference.md
@@ -1,5 +1,7 @@
 # Fourier Transformer Class Reference
 
+
+
 ## Public Methods
 
 ## State Variables

diff --git a/docs/_docs/references/development_tools.md b/docs/_docs/references/development_tools.md
@@ -101,16 +101,14 @@ FastFFT::PrintState()
     std::cout << std::endl;
 
     std::cout << "State Variables:\n" << std::endl;
-    std::cout << "is_in_memory_host_pointer " << is_in_memory_host_pointer << std::endl;
-    std::cout << "is_in_memory_device_pointer " << is_in_memory_device_pointer << std::endl;
+        // std::cerr << "is_in_memory_device_pointer " << is_in_memory_device_pointer << std::endl; // FIXME: switched to is_pointer_in_device_memory(d_ptr.buffer_1) defined in FastFFT.cuh
     std::cout << "is_in_buffer_memory " << is_in_buffer_memory << std::endl;
     std::cout << "is_fftw_padded_input " << is_fftw_padded_input << std::endl;
     std::cout << "is_fftw_padded_output " << is_fftw_padded_output << std::endl;
-    std::cout << "is_real_valued_input " << is_real_valued_input << std::endl;
+    std::cout << "is_real_valued_input " << IsAllowedRealType<InputType>  << std::endl;
     std::cout << "is_set_input_params " << is_set_input_params << std::endl;
     std::cout << "is_set_output_params " << is_set_output_params << std::endl;
     std::cout << "is_size_validated " << is_size_validated << std::endl;
-    std::cout << "is_set_input_pointer " << is_set_input_pointer << std::endl;
     std::cout << std::endl;
 
     std::cout << "Size variables:\n" << std::endl;
@@ -125,11 +123,11 @@ FastFFT::PrintState()
     std::cout << std::endl;
 
     std::cout << "Misc:\n" << std::endl;
-    std::cout << "compute_memory_allocated " << compute_memory_allocated << std::endl;
+    std::cout << "compute_memory_wanted " << compute_memory_wanted << std::endl;
     std::cout << "memory size to copy " << memory_size_to_copy << std::endl;
     std::cout << "fwd_size_change_type " << SizeChangeName[fwd_size_change_type] << std::endl;
     std::cout << "inv_size_change_type " << SizeChangeName[inv_size_change_type] << std::endl;
-    std::cout << "transform stage complete " << TransformStageCompletedName[transform_stage_completed] << std::endl;
+    std::cout << "transform stage complete " << transform_stage_completed << std::endl;
     std::cout << "input_origin_type " << OriginTypeName[input_origin_type] << std::endl;
     std::cout << "output_origin_type " << OriginTypeName[output_origin_type] << std::endl;