Skip to content

Commit

Permalink
now seems further from working, but sorting through shit. Time for a …
Browse files Browse the repository at this point in the history
…little weekend
  • Loading branch information
bHimes committed Dec 20, 2024
1 parent df64cce commit 5a43474
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 33 deletions.
58 changes: 33 additions & 25 deletions include/FastFFT.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -900,36 +900,41 @@ struct io {

const unsigned int lane_idx = get_lane_id( ); // We may have threads coming from x/y so we get the lane ide from special reg, which could be a tiny bit slower, but more dependible than : threadIdx.x & 31;
// This should be safe b/c if blockDim.x >= 32 we should have blockDim.y = 1, For now, I have an assert prior to the kernel launch to save overhead
const unsigned int warp_idx = threadIdx.x / 32;
const unsigned int warp_idx = threadIdx.x / 32;
const unsigned int logical_x_in_1d = (threadIdx.x & 31);
// Normally we would have a gridDim.y = pixel_pitch, but since here we have reduced the number of blocks to
// pixel_pitch / n_coalesced_ffts,
const unsigned int tile_idx_x = lane_idx % (n_coalesced_ffts * read_multiplier);
const unsigned int physical_x = tile_idx_x + (blockIdx.y * n_coalesced_ffts * read_multiplier);

constexpr unsigned int n_scalar_vals_to_be_consumed = 32 / n_coalesced_ffts;
constexpr unsigned int n_complex_vals_to_be_consumed = 16 / n_coalesced_ffts;
constexpr unsigned int n_consumers_per_read = n_complex_vals_to_be_consumed / 2;
constexpr unsigned int n_sub_warp_blocks = 32 / n_complex_vals_to_be_consumed; // also pitch in elements for the tile
constexpr float no_val = scalar_compute_t{-std::numeric_limits<float>::max( )};
const unsigned int expected_reads_per_i = blockDim.x; // also can get at compile time
constexpr unsigned int tile_size_y = 16 / n_coalesced_ffts; // # threads in each dimension 4
constexpr unsigned int tile_size_x = 32 / tile_size_y; // 8
constexpr unsigned int n_complex_vals_per_read = tile_size_y / 2; // 2
constexpr unsigned int n_sub_warp_blocks = 32 / n_complex_vals_per_read; // also pitch in elements for the tile 16
constexpr float no_val = scalar_compute_t{-std::numeric_limits<float>::max( )};
const unsigned int expected_reads_per_i = blockDim.x; // also can get at compile time 4 (8 threads in y)

// In the normal implemenation there is only on read per loop, and every thread is both a producer and a consumer.
// so for 64 with 8 ept, there are 8 threads and a stride of 8 In this implementation there is a minimum of 32 threads, so
//
//revert FFT::input_ept
for ( unsigned int i = 0; i < FFT::input_ept; i++ ) {

unsigned int physical_y_in_warp = lane_idx / (read_multiplier * n_coalesced_ffts);
// All reads need to be within the warp to use shfl_sync, so we need to break up the input data into 2d blocks
for ( unsigned int warp_stride = 0; warp_stride < std::min(std::max(1u, expected_reads_per_i / n_sub_warp_blocks), n_sub_warp_blocks); warp_stride++ ) {

unsigned int physical_y_in_warp = lane_idx / tile_size_x; // 0 1 2 3
unsigned int base_physical_y_in_warp = physical_y_in_warp;
// All reads need to be within the warp to use shfl_sync, so we need to break up the input data into 2d blocks,
for ( unsigned int i_tile = 0; i_tile < std::min(std::max(1u, expected_reads_per_i / n_complex_vals_per_read), n_sub_warp_blocks); i_tile++ ) {
// 4 / 2 = 2, min(2, 16) = 2
// read in as floats
unsigned int read_from_data_index = (physical_y_in_warp + warp_idx * 32) + i * FFT::stride;
const float read_val =
// i0 0: 0 1 2 3
// 1: 4 5 6 7
// i 1: 4 5 6 7 (8 9 10 11)
const float read_val =
read_from_data_index < SignalLength ? reinterpret_cast<const float*>(input)[read_from_data_index * pixel_pitch + physical_x] : no_val;

if ( no_val != read_val )
printf("tidx: %i , tidy: %i , blockIDx.y: %i , read val: %2.2f , lane: %i , i: %i , warp: %i , tile_idx_x: %i , x: %i , y: %i , readidx: %i , n_sub_warp_blocks: %i , n_coalesced_ffts: %i , FFT::input_ept: %i\n",
threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, warp_stride, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept);
// if ( no_val != read_val )
// printf("tidx: %i tidy: %i blockIDx.y: %i read val: %2.2f lane: %i i: %i warp: %i tile_idx_x: %i x: %i y: %i readidx: %i n_sub_warp_blocks: %i n_coalesced_ffts: %i FFT::input_ept: %i\n",
// threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, i_tile, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept);

for ( int i_fft = 0; i_fft < n_coalesced_ffts; i_fft++ ) {
// all threads in the warp will now have data and we need to know who gets that data
Expand All @@ -940,31 +945,34 @@ struct io {
// We may have read in a dummy value if we are beyond the signal length
__syncwarp( );

unsigned int producer_tidx = i_fft + n_sub_warp_blocks * lane_idx; // prodcuer index should stay the same and is defined by the tile size.
unsigned int producer_tidx = i_fft + 2 * tile_size_x * (base_physical_y_in_warp / 2);
float copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx, 32);

if ( threadIdx.y == 0 &&
(threadIdx.x & 31) >= warp_stride * n_scalar_vals_to_be_consumed &&
(threadIdx.x & 31) < (warp_stride + 1) * n_scalar_vals_to_be_consumed &&
(threadIdx.x & 31) >= i_tile * n_complex_vals_per_read &&
(threadIdx.x & 31) < (i_tile + 1) * n_complex_vals_per_read &&
no_val != copied_val ) {
printf("readVal:%3.3f, copiedVal:%3.3f\n", read_val, copied_val);
printf("tidx: %i tidy: %i blockIDx.y: %i read val: %2.2f lane: %i i: %i warp: %i tile_idx_x: %i x: %i y: %i readidx: %i n_sub_warp_blocks: %i n_coalesced_ffts: %i FFT::input_ept: %i\n",
threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, i_tile, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept);
printf("x: %i, y:%i l:%i from: %i, readVal:%3.3f, copiedVal:%3.3f\n", physical_x, base_physical_y_in_warp, producer_tidx, read_val, copied_val);
thread_data[thread_data_linear_idx] = copied_val;
}
__syncwarp( );
copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx + n_coalesced_ffts, 32);
// n_coalesced_ffts * read_multiplier is just tile pitch which I'm calculting like 4 different ways here
copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx + tile_size_x, 32);
if ( threadIdx.y == 0 &&
(threadIdx.x & 31) >= warp_stride * n_scalar_vals_to_be_consumed &&
(threadIdx.x & 31) < (warp_stride + 1) * n_scalar_vals_to_be_consumed &&
(threadIdx.x & 31) >= i_tile * n_complex_vals_per_read &&
(threadIdx.x & 31) < (i_tile + 1) * n_complex_vals_per_read &&
no_val != copied_val ) {
printf("readVal:%3.3f, copiedVal:%3.3f\n", read_val, copied_val);
printf("x: %i, y:%i l:%i from: %i, readVal:%3.3f, copiedVal:%3.3f\n", physical_x, base_physical_y_in_warp, producer_tidx, read_val, copied_val);
thread_data[thread_data_linear_idx + 1] = copied_val;
}

__syncwarp( );
}

// increment the physical y in warp
physical_y_in_warp += n_complex_vals_to_be_consumed;
physical_y_in_warp += tile_size_y;
}
}
}
Expand Down
22 changes: 14 additions & 8 deletions src/fastfft/FastFFT.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1637,6 +1637,7 @@ __launch_bounds__(MAX_TPB) __global__

FastFFT_SMEM complex_compute_t shared_mem[];

// when C2R_BUFFER_LINES and nthreadsx < 32 this will be wasted on the Y threads, but no biggie
complex_compute_t thread_data[FFT::storage_size * n_ffts];

// Total concurent FFTs, n-1 in shared and then on in thread data
Expand All @@ -1646,7 +1647,7 @@ __launch_bounds__(MAX_TPB) __global__
// TODO: probably need to check we don't pick a weird odd number or something.
io<FFT, MAX_TPB, n_ffts>::load_c2r_transposed_coalesced(&input_values[ReturnZplane(gridDim.y, mem_offsets.physical_x_input)],
(scalar_compute_t*)thread_data,
gridDim.y * n_ffts * 2); // 2 for the complex data
gridDim.y * n_ffts * 2);
//revert
// // // For loop zero the twiddles don't need to be computed
// #pragma unroll(n_ffts)
Expand All @@ -1671,9 +1672,10 @@ __launch_bounds__(MAX_TPB) __global__
for ( int i_fft = 0; i_fft < n_ffts; i_fft++ ) {
// normally Return1DFFTAddress(mem_offsets.physical_x_output) = pixel_pitch * (blockIdx.y + blockIdx.z * gridDim.y)
// we reduce the number of blocks in Y by n_ffts, hysical_x = fft_idx + blockIdx.y * n_coalesced_ffts;
io<FFT>::store_c2r(&thread_data[i_fft * FFT::storage_size],
&output_values[mem_offsets.physical_x_output *
((blockIdx.y * n_ffts + i_fft) + blockIdx.z * gridDim.y)]);
if ( threadIdx.y == 0 ) {
io<FFT>::store_c2r(&thread_data[i_fft * FFT::storage_size],
&output_values[mem_offsets.physical_x_output * (blockIdx.y * n_ffts + i_fft)]); // FIXME only 2d
}
}

#else
Expand Down Expand Up @@ -1865,8 +1867,8 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::Selec

// Use recursion to step through the allowed sizes.
GetTransformSize(kernel_type);
// Maybe revert 16 as 4
constexpr unsigned int Ept = SizeValue <= 32 ? 4 : SizeValue < 4096 ? 8

constexpr unsigned int Ept = SizeValue == 16 ? 4 : SizeValue < 4096 ? 8
: 16;
// Note: the size of the input/output may not match the size of the transform, i.e. transform_size.L <= transform_size.P
if ( SizeValue == transform_size.P ) {
Expand Down Expand Up @@ -2615,15 +2617,18 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::SetAn
int shared_memory = FFT::shared_memory_size;
PrintLaunchParameters(LP);
#ifdef C2R_BUFFER_LINES
constexpr unsigned int n_buffer_lines = size_of<FFT>::value < 64 ? 2 : size_of<FFT>::value < 512 ? 2
: 4;
// neads to be chosen so that 16 / n_buffer_lines >= FFT::stride (blockDim.x)
constexpr unsigned int min_buffer_lines = std::max(16u / FFT::stride, 2u);
constexpr unsigned int n_buffer_lines = size_of<FFT>::value < 64u ? std::max(min_buffer_lines, 2u) : size_of<FFT>::value < 512 ? std::max(min_buffer_lines, 4u)
: std::max(min_buffer_lines, 4u);
LP.gridDims.y /= n_buffer_lines;
// FIXME assuming we get a multiple of 32
constexpr unsigned int max_threads_per_block = FFT::max_threads_per_block < 32 ? 32 / FFT::max_threads_per_block * FFT::max_threads_per_block : FFT::max_threads_per_block;
if ( max_threads_per_block > FFT::max_threads_per_block ) {
LP.threadsPerBlock.y = 32 / FFT::max_threads_per_block;
// TODO: other asserts, but basically, for the buffering we want to have at least 32 threads per block, and we can't modify x
}
std::cerr << "min_buffer_lines " << min_buffer_lines << " n_buffer " << n_buffer_lines << std::endl;

#else

Expand All @@ -2634,6 +2639,7 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::SetAn
PrintLaunchParameters(LP);
std::cout << "n_buffer_lines: " << n_buffer_lines << std::endl;
CheckSharedMemory(shared_memory, device_properties);

// ZeroBufferMemory( );

#if FFT_DEBUG_STAGE > 6
Expand Down

0 comments on commit 5a43474

Please sign in to comment.