diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh index 34723e7..76501ce 100644 --- a/include/FastFFT.cuh +++ b/include/FastFFT.cuh @@ -900,36 +900,41 @@ struct io { const unsigned int lane_idx = get_lane_id( ); // We may have threads coming from x/y so we get the lane ide from special reg, which could be a tiny bit slower, but more dependible than : threadIdx.x & 31; // This should be safe b/c if blockDim.x >= 32 we should have blockDim.y = 1, For now, I have an assert prior to the kernel launch to save overhead - const unsigned int warp_idx = threadIdx.x / 32; + const unsigned int warp_idx = threadIdx.x / 32; + const unsigned int logical_x_in_1d = (threadIdx.x & 31); // Normally we would have a gridDim.y = pixel_pitch, but since here we have reduced the number of blocks to // pixel_pitch / n_coalesced_ffts, const unsigned int tile_idx_x = lane_idx % (n_coalesced_ffts * read_multiplier); const unsigned int physical_x = tile_idx_x + (blockIdx.y * n_coalesced_ffts * read_multiplier); - constexpr unsigned int n_scalar_vals_to_be_consumed = 32 / n_coalesced_ffts; - constexpr unsigned int n_complex_vals_to_be_consumed = 16 / n_coalesced_ffts; - constexpr unsigned int n_consumers_per_read = n_complex_vals_to_be_consumed / 2; - constexpr unsigned int n_sub_warp_blocks = 32 / n_complex_vals_to_be_consumed; // also pitch in elements for the tile - constexpr float no_val = scalar_compute_t{-std::numeric_limits::max( )}; - const unsigned int expected_reads_per_i = blockDim.x; // also can get at compile time + constexpr unsigned int tile_size_y = 16 / n_coalesced_ffts; // # threads in each dimension 4 + constexpr unsigned int tile_size_x = 32 / tile_size_y; // 8 + constexpr unsigned int n_complex_vals_per_read = tile_size_y / 2; // 2 + constexpr unsigned int n_sub_warp_blocks = 32 / n_complex_vals_per_read; // also pitch in elements for the tile 16 + constexpr float no_val = scalar_compute_t{-std::numeric_limits::max( )}; + const unsigned int expected_reads_per_i = blockDim.x; // also can get at compile time 4 (8 threads in y) // In the normal implemenation there is only on read per loop, and every thread is both a producer and a consumer. // so for 64 with 8 ept, there are 8 threads and a stride of 8 In this implementation there is a minimum of 32 threads, so - // + //revert FFT::input_ept for ( unsigned int i = 0; i < FFT::input_ept; i++ ) { - unsigned int physical_y_in_warp = lane_idx / (read_multiplier * n_coalesced_ffts); - // All reads need to be within the warp to use shfl_sync, so we need to break up the input data into 2d blocks - for ( unsigned int warp_stride = 0; warp_stride < std::min(std::max(1u, expected_reads_per_i / n_sub_warp_blocks), n_sub_warp_blocks); warp_stride++ ) { - + unsigned int physical_y_in_warp = lane_idx / tile_size_x; // 0 1 2 3 + unsigned int base_physical_y_in_warp = physical_y_in_warp; + // All reads need to be within the warp to use shfl_sync, so we need to break up the input data into 2d blocks, + for ( unsigned int i_tile = 0; i_tile < std::min(std::max(1u, expected_reads_per_i / n_complex_vals_per_read), n_sub_warp_blocks); i_tile++ ) { + // 4 / 2 = 2, min(2, 16) = 2 // read in as floats unsigned int read_from_data_index = (physical_y_in_warp + warp_idx * 32) + i * FFT::stride; - const float read_val = + // i0 0: 0 1 2 3 + // 1: 4 5 6 7 + // i 1: 4 5 6 7 (8 9 10 11) + const float read_val = read_from_data_index < SignalLength ? reinterpret_cast(input)[read_from_data_index * pixel_pitch + physical_x] : no_val; - if ( no_val != read_val ) - printf("tidx: %i , tidy: %i , blockIDx.y: %i , read val: %2.2f , lane: %i , i: %i , warp: %i , tile_idx_x: %i , x: %i , y: %i , readidx: %i , n_sub_warp_blocks: %i , n_coalesced_ffts: %i , FFT::input_ept: %i\n", - threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, warp_stride, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept); + // if ( no_val != read_val ) + // printf("tidx: %i tidy: %i blockIDx.y: %i read val: %2.2f lane: %i i: %i warp: %i tile_idx_x: %i x: %i y: %i readidx: %i n_sub_warp_blocks: %i n_coalesced_ffts: %i FFT::input_ept: %i\n", + // threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, i_tile, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept); for ( int i_fft = 0; i_fft < n_coalesced_ffts; i_fft++ ) { // all threads in the warp will now have data and we need to know who gets that data @@ -940,23 +945,26 @@ struct io { // We may have read in a dummy value if we are beyond the signal length __syncwarp( ); - unsigned int producer_tidx = i_fft + n_sub_warp_blocks * lane_idx; // prodcuer index should stay the same and is defined by the tile size. + unsigned int producer_tidx = i_fft + 2 * tile_size_x * (base_physical_y_in_warp / 2); float copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx, 32); if ( threadIdx.y == 0 && - (threadIdx.x & 31) >= warp_stride * n_scalar_vals_to_be_consumed && - (threadIdx.x & 31) < (warp_stride + 1) * n_scalar_vals_to_be_consumed && + (threadIdx.x & 31) >= i_tile * n_complex_vals_per_read && + (threadIdx.x & 31) < (i_tile + 1) * n_complex_vals_per_read && no_val != copied_val ) { - printf("readVal:%3.3f, copiedVal:%3.3f\n", read_val, copied_val); + printf("tidx: %i tidy: %i blockIDx.y: %i read val: %2.2f lane: %i i: %i warp: %i tile_idx_x: %i x: %i y: %i readidx: %i n_sub_warp_blocks: %i n_coalesced_ffts: %i FFT::input_ept: %i\n", + threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, i_tile, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept); + printf("x: %i, y:%i l:%i from: %i, readVal:%3.3f, copiedVal:%3.3f\n", physical_x, base_physical_y_in_warp, producer_tidx, read_val, copied_val); thread_data[thread_data_linear_idx] = copied_val; } __syncwarp( ); - copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx + n_coalesced_ffts, 32); + // n_coalesced_ffts * read_multiplier is just tile pitch which I'm calculting like 4 different ways here + copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx + tile_size_x, 32); if ( threadIdx.y == 0 && - (threadIdx.x & 31) >= warp_stride * n_scalar_vals_to_be_consumed && - (threadIdx.x & 31) < (warp_stride + 1) * n_scalar_vals_to_be_consumed && + (threadIdx.x & 31) >= i_tile * n_complex_vals_per_read && + (threadIdx.x & 31) < (i_tile + 1) * n_complex_vals_per_read && no_val != copied_val ) { - printf("readVal:%3.3f, copiedVal:%3.3f\n", read_val, copied_val); + printf("x: %i, y:%i l:%i from: %i, readVal:%3.3f, copiedVal:%3.3f\n", physical_x, base_physical_y_in_warp, producer_tidx, read_val, copied_val); thread_data[thread_data_linear_idx + 1] = copied_val; } @@ -964,7 +972,7 @@ struct io { } // increment the physical y in warp - physical_y_in_warp += n_complex_vals_to_be_consumed; + physical_y_in_warp += tile_size_y; } } } diff --git a/src/fastfft/FastFFT.cu b/src/fastfft/FastFFT.cu index 65c1b07..976a360 100644 --- a/src/fastfft/FastFFT.cu +++ b/src/fastfft/FastFFT.cu @@ -1637,6 +1637,7 @@ __launch_bounds__(MAX_TPB) __global__ FastFFT_SMEM complex_compute_t shared_mem[]; + // when C2R_BUFFER_LINES and nthreadsx < 32 this will be wasted on the Y threads, but no biggie complex_compute_t thread_data[FFT::storage_size * n_ffts]; // Total concurent FFTs, n-1 in shared and then on in thread data @@ -1646,7 +1647,7 @@ __launch_bounds__(MAX_TPB) __global__ // TODO: probably need to check we don't pick a weird odd number or something. io::load_c2r_transposed_coalesced(&input_values[ReturnZplane(gridDim.y, mem_offsets.physical_x_input)], (scalar_compute_t*)thread_data, - gridDim.y * n_ffts * 2); // 2 for the complex data + gridDim.y * n_ffts * 2); //revert // // // For loop zero the twiddles don't need to be computed // #pragma unroll(n_ffts) @@ -1671,9 +1672,10 @@ __launch_bounds__(MAX_TPB) __global__ for ( int i_fft = 0; i_fft < n_ffts; i_fft++ ) { // normally Return1DFFTAddress(mem_offsets.physical_x_output) = pixel_pitch * (blockIdx.y + blockIdx.z * gridDim.y) // we reduce the number of blocks in Y by n_ffts, hysical_x = fft_idx + blockIdx.y * n_coalesced_ffts; - io::store_c2r(&thread_data[i_fft * FFT::storage_size], - &output_values[mem_offsets.physical_x_output * - ((blockIdx.y * n_ffts + i_fft) + blockIdx.z * gridDim.y)]); + if ( threadIdx.y == 0 ) { + io::store_c2r(&thread_data[i_fft * FFT::storage_size], + &output_values[mem_offsets.physical_x_output * (blockIdx.y * n_ffts + i_fft)]); // FIXME only 2d + } } #else @@ -1865,8 +1867,8 @@ void FourierTransformer::Selec // Use recursion to step through the allowed sizes. GetTransformSize(kernel_type); - // Maybe revert 16 as 4 - constexpr unsigned int Ept = SizeValue <= 32 ? 4 : SizeValue < 4096 ? 8 + + constexpr unsigned int Ept = SizeValue == 16 ? 4 : SizeValue < 4096 ? 8 : 16; // Note: the size of the input/output may not match the size of the transform, i.e. transform_size.L <= transform_size.P if ( SizeValue == transform_size.P ) { @@ -2615,8 +2617,10 @@ void FourierTransformer::SetAn int shared_memory = FFT::shared_memory_size; PrintLaunchParameters(LP); #ifdef C2R_BUFFER_LINES - constexpr unsigned int n_buffer_lines = size_of::value < 64 ? 2 : size_of::value < 512 ? 2 - : 4; + // neads to be chosen so that 16 / n_buffer_lines >= FFT::stride (blockDim.x) + constexpr unsigned int min_buffer_lines = std::max(16u / FFT::stride, 2u); + constexpr unsigned int n_buffer_lines = size_of::value < 64u ? std::max(min_buffer_lines, 2u) : size_of::value < 512 ? std::max(min_buffer_lines, 4u) + : std::max(min_buffer_lines, 4u); LP.gridDims.y /= n_buffer_lines; // FIXME assuming we get a multiple of 32 constexpr unsigned int max_threads_per_block = FFT::max_threads_per_block < 32 ? 32 / FFT::max_threads_per_block * FFT::max_threads_per_block : FFT::max_threads_per_block; @@ -2624,6 +2628,7 @@ void FourierTransformer::SetAn LP.threadsPerBlock.y = 32 / FFT::max_threads_per_block; // TODO: other asserts, but basically, for the buffering we want to have at least 32 threads per block, and we can't modify x } + std::cerr << "min_buffer_lines " << min_buffer_lines << " n_buffer " << n_buffer_lines << std::endl; #else @@ -2634,6 +2639,7 @@ void FourierTransformer::SetAn PrintLaunchParameters(LP); std::cout << "n_buffer_lines: " << n_buffer_lines << std::endl; CheckSharedMemory(shared_memory, device_properties); + // ZeroBufferMemory( ); #if FFT_DEBUG_STAGE > 6