Skip to content

Commit

Permalink
shared impl working and 15% faster with banks
Browse files Browse the repository at this point in the history
  • Loading branch information
bHimes committed Dec 26, 2024
1 parent 9187400 commit 7d0ba4e
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 0 deletions.
10 changes: 10 additions & 0 deletions include/FastFFT.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1051,13 +1051,22 @@ struct io {
//
// 0 1 2 3 0 1 2 3

// unsigned int index = threadIdx.x;
// for ( unsigned int i = 0; i < FFT::input_ept; i++ ) {
// if ( index < SignalLength ) {
// thread_data[i] = convert_if_needed<FFT, complex_compute_t>(input, Return1DFFTAddress_transpose_XY(index, pixel_pitch));
// index += FFT::stride;
// }
// }

unsigned int index = threadIdx.x;
// 0 1 2 3 0 1 2 3 + 4 * (0 0 0 0 1 1 1 1) = 0 1 2 3 4 5 6 7 ...
unsigned int x_prime = threadIdx.y + n_coalesced_ffts * (threadIdx.x / n_coalesced_ffts);
const unsigned int fft_idx = threadIdx.x % n_coalesced_ffts; // change to fft_idx

const unsigned int smem_idx = x_prime + fft_idx * FFT::stride;
for ( unsigned int i = 0; i < FFT::input_ept; i++ ) {
__syncthreads( );

if ( x_prime < SignalLength )
smem_buffer[smem_idx] = convert_if_needed<FFT, complex_compute_t>(input, x_prime * pixel_pitch + fft_idx + blockIdx.y * n_coalesced_ffts);
Expand All @@ -1067,6 +1076,7 @@ struct io {
thread_data[i] = convert_if_needed<FFT, complex_compute_t>(smem_buffer, threadIdx.x + threadIdx.y * FFT::stride);
__syncthreads( );

x_prime += FFT::stride;
index += FFT::stride;
}
}
Expand Down
2 changes: 2 additions & 0 deletions src/fastfft/FastFFT.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2671,6 +2671,8 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::SetAn
CheckSharedMemory(shared_memory, device_properties);

// PrintLaunchParameters(LP);
// std::cerr << "max tpb " << max_threads_per_block << " n_buffer " << n_buffer_lines << std::endl;
// exit(0);

#if FFT_DEBUG_STAGE > 6
cudaErr(cudaFuncSetAttribute((void*)block_fft_kernel_C2R_NONE_XY<FFT, max_threads_per_block, data_buffer_t, data_io_t, n_buffer_lines>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory));
Expand Down

0 comments on commit 7d0ba4e

Please sign in to comment.