diff --git a/include/FastFFT.cuh b/include/FastFFT.cuh
index 34723e7..76501ce 100644
--- a/include/FastFFT.cuh
+++ b/include/FastFFT.cuh
@@ -900,36 +900,41 @@ struct io {
 
         const unsigned int lane_idx = get_lane_id( ); // We may have threads coming from x/y so we get the lane ide from special reg, which could be a tiny bit slower, but more dependible than : threadIdx.x & 31;
         // This should be safe b/c if blockDim.x >= 32 we should have blockDim.y = 1, For now, I have an assert prior to the kernel launch to save overhead
-        const unsigned int warp_idx = threadIdx.x / 32;
+        const unsigned int warp_idx        = threadIdx.x / 32;
+        const unsigned int logical_x_in_1d = (threadIdx.x & 31);
         // Normally we would have a gridDim.y = pixel_pitch, but since here we have reduced the number of blocks to
         // pixel_pitch / n_coalesced_ffts,
         const unsigned int tile_idx_x = lane_idx % (n_coalesced_ffts * read_multiplier);
         const unsigned int physical_x = tile_idx_x + (blockIdx.y * n_coalesced_ffts * read_multiplier);
 
-        constexpr unsigned int n_scalar_vals_to_be_consumed  = 32 / n_coalesced_ffts;
-        constexpr unsigned int n_complex_vals_to_be_consumed = 16 / n_coalesced_ffts;
-        constexpr unsigned int n_consumers_per_read          = n_complex_vals_to_be_consumed / 2;
-        constexpr unsigned int n_sub_warp_blocks             = 32 / n_complex_vals_to_be_consumed; // also pitch in elements for the tile
-        constexpr float        no_val                        = scalar_compute_t{-std::numeric_limits<float>::max( )};
-        const unsigned int     expected_reads_per_i          = blockDim.x; // also can get at compile time
+        constexpr unsigned int tile_size_y             = 16 / n_coalesced_ffts; // # threads in each dimension 4
+        constexpr unsigned int tile_size_x             = 32 / tile_size_y; // 8
+        constexpr unsigned int n_complex_vals_per_read = tile_size_y / 2; // 2
+        constexpr unsigned int n_sub_warp_blocks       = 32 / n_complex_vals_per_read; // also pitch in elements for the tile 16
+        constexpr float        no_val                  = scalar_compute_t{-std::numeric_limits<float>::max( )};
+        const unsigned int     expected_reads_per_i    = blockDim.x; // also can get at compile time 4 (8 threads in y)
 
         // In the normal implemenation there is only on read per loop, and every thread is both a producer and a consumer.
         // so for 64 with 8 ept, there are 8 threads and a stride of 8 In this implementation there is a minimum of 32 threads, so
-        //
+        //revert FFT::input_ept
         for ( unsigned int i = 0; i < FFT::input_ept; i++ ) {
 
-            unsigned int physical_y_in_warp = lane_idx / (read_multiplier * n_coalesced_ffts);
-            // All reads need to be within the warp to use shfl_sync, so we need to break up the input data into 2d blocks
-            for ( unsigned int warp_stride = 0; warp_stride < std::min(std::max(1u, expected_reads_per_i / n_sub_warp_blocks), n_sub_warp_blocks); warp_stride++ ) {
-
+            unsigned int physical_y_in_warp      = lane_idx / tile_size_x; // 0 1 2 3
+            unsigned int base_physical_y_in_warp = physical_y_in_warp;
+            // All reads need to be within the warp to use shfl_sync, so we need to break up the input data into 2d blocks,
+            for ( unsigned int i_tile = 0; i_tile < std::min(std::max(1u, expected_reads_per_i / n_complex_vals_per_read), n_sub_warp_blocks); i_tile++ ) {
+                // 4 / 2 = 2, min(2, 16) = 2
                 // read in as floats
                 unsigned int read_from_data_index = (physical_y_in_warp + warp_idx * 32) + i * FFT::stride;
-                const float  read_val =
+                // i0 0: 0 1 2 3
+                //    1: 4 5 6 7
+                // i 1:  4 5 6 7 (8 9 10 11)
+                const float read_val =
                         read_from_data_index < SignalLength ? reinterpret_cast<const float*>(input)[read_from_data_index * pixel_pitch + physical_x] : no_val;
 
-                if ( no_val != read_val )
-                    printf("tidx: %i , tidy: %i , blockIDx.y: %i , read val: %2.2f , lane: %i , i: %i , warp: %i , tile_idx_x: %i , x: %i , y: %i , readidx: %i , n_sub_warp_blocks: %i , n_coalesced_ffts: %i , FFT::input_ept: %i\n",
-                           threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, warp_stride, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept);
+                // if ( no_val != read_val )
+                //     printf("tidx: %i tidy: %i blockIDx.y: %i read val: %2.2f lane: %i i: %i warp: %i tile_idx_x: %i x: %i y: %i readidx: %i n_sub_warp_blocks: %i n_coalesced_ffts: %i FFT::input_ept: %i\n",
+                //            threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, i_tile, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept);
 
                 for ( int i_fft = 0; i_fft < n_coalesced_ffts; i_fft++ ) {
                     // all threads in the warp will now have data and we need to know who gets that data
@@ -940,23 +945,26 @@ struct io {
                     // We may have read in a dummy value if we are beyond the signal length
                     __syncwarp( );
 
-                    unsigned int producer_tidx = i_fft + n_sub_warp_blocks * lane_idx; // prodcuer index should stay the same and is defined by the tile size.
+                    unsigned int producer_tidx = i_fft + 2 * tile_size_x * (base_physical_y_in_warp / 2);
                     float        copied_val    = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx, 32);
 
                     if ( threadIdx.y == 0 &&
-                         (threadIdx.x & 31) >= warp_stride * n_scalar_vals_to_be_consumed &&
-                         (threadIdx.x & 31) < (warp_stride + 1) * n_scalar_vals_to_be_consumed &&
+                         (threadIdx.x & 31) >= i_tile * n_complex_vals_per_read &&
+                         (threadIdx.x & 31) < (i_tile + 1) * n_complex_vals_per_read &&
                          no_val != copied_val ) {
-                        printf("readVal:%3.3f, copiedVal:%3.3f\n", read_val, copied_val);
+                        printf("tidx: %i tidy: %i blockIDx.y: %i read val: %2.2f lane: %i i: %i warp: %i tile_idx_x: %i x: %i y: %i readidx: %i n_sub_warp_blocks: %i n_coalesced_ffts: %i FFT::input_ept: %i\n",
+                               threadIdx.x, threadIdx.y, blockIdx.y, read_val, lane_idx, i, i_tile, tile_idx_x, physical_x, physical_y_in_warp, read_from_data_index, n_sub_warp_blocks, n_coalesced_ffts, FFT::input_ept);
+                        printf("x: %i, y:%i l:%i from: %i, readVal:%3.3f, copiedVal:%3.3f\n", physical_x, base_physical_y_in_warp, producer_tidx, read_val, copied_val);
                         thread_data[thread_data_linear_idx] = copied_val;
                     }
                     __syncwarp( );
-                    copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx + n_coalesced_ffts, 32);
+                    // n_coalesced_ffts * read_multiplier is just tile pitch which I'm calculting like 4 different ways here
+                    copied_val = __shfl_sync(0xFFFFFFFF, read_val, producer_tidx + tile_size_x, 32);
                     if ( threadIdx.y == 0 &&
-                         (threadIdx.x & 31) >= warp_stride * n_scalar_vals_to_be_consumed &&
-                         (threadIdx.x & 31) < (warp_stride + 1) * n_scalar_vals_to_be_consumed &&
+                         (threadIdx.x & 31) >= i_tile * n_complex_vals_per_read &&
+                         (threadIdx.x & 31) < (i_tile + 1) * n_complex_vals_per_read &&
                          no_val != copied_val ) {
-                        printf("readVal:%3.3f, copiedVal:%3.3f\n", read_val, copied_val);
+                        printf("x: %i, y:%i l:%i from: %i, readVal:%3.3f, copiedVal:%3.3f\n", physical_x, base_physical_y_in_warp, producer_tidx, read_val, copied_val);
                         thread_data[thread_data_linear_idx + 1] = copied_val;
                     }
 
@@ -964,7 +972,7 @@ struct io {
                 }
 
                 // increment the physical y in warp
-                physical_y_in_warp += n_complex_vals_to_be_consumed;
+                physical_y_in_warp += tile_size_y;
             }
         }
     }
diff --git a/src/fastfft/FastFFT.cu b/src/fastfft/FastFFT.cu
index 65c1b07..976a360 100644
--- a/src/fastfft/FastFFT.cu
+++ b/src/fastfft/FastFFT.cu
@@ -1637,6 +1637,7 @@ __launch_bounds__(MAX_TPB) __global__
 
     FastFFT_SMEM complex_compute_t shared_mem[];
 
+    // when C2R_BUFFER_LINES and nthreadsx < 32 this will be wasted on the Y threads, but no biggie
     complex_compute_t thread_data[FFT::storage_size * n_ffts];
 
     // Total concurent FFTs, n-1 in shared and then on in thread data
@@ -1646,7 +1647,7 @@ __launch_bounds__(MAX_TPB) __global__
         // TODO: probably need to check we don't pick a weird odd number or something.
         io<FFT, MAX_TPB, n_ffts>::load_c2r_transposed_coalesced(&input_values[ReturnZplane(gridDim.y, mem_offsets.physical_x_input)],
                                                                 (scalar_compute_t*)thread_data,
-                                                                gridDim.y * n_ffts * 2); // 2 for the complex data
+                                                                gridDim.y * n_ffts * 2);
 //revert
 // // // For loop zero the twiddles don't need to be computed
 // #pragma unroll(n_ffts)
@@ -1671,9 +1672,10 @@ __launch_bounds__(MAX_TPB) __global__
     for ( int i_fft = 0; i_fft < n_ffts; i_fft++ ) {
         // normally  Return1DFFTAddress(mem_offsets.physical_x_output) = pixel_pitch * (blockIdx.y + blockIdx.z * gridDim.y)
         // we reduce the number of blocks in Y by n_ffts, hysical_x = fft_idx + blockIdx.y * n_coalesced_ffts;
-        io<FFT>::store_c2r(&thread_data[i_fft * FFT::storage_size],
-                           &output_values[mem_offsets.physical_x_output *
-                                          ((blockIdx.y * n_ffts + i_fft) + blockIdx.z * gridDim.y)]);
+        if ( threadIdx.y == 0 ) {
+            io<FFT>::store_c2r(&thread_data[i_fft * FFT::storage_size],
+                               &output_values[mem_offsets.physical_x_output * (blockIdx.y * n_ffts + i_fft)]); // FIXME only 2d
+        }
     }
 
 #else
@@ -1865,8 +1867,8 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::Selec
 
     // Use recursion to step through the allowed sizes.
     GetTransformSize(kernel_type);
-    // Maybe revert 16 as 4
-    constexpr unsigned int Ept = SizeValue <= 32 ? 4 : SizeValue < 4096 ? 8
+
+    constexpr unsigned int Ept = SizeValue == 16 ? 4 : SizeValue < 4096 ? 8
                                                                         : 16;
     // Note: the size of the input/output may not match the size of the transform, i.e. transform_size.L <= transform_size.P
     if ( SizeValue == transform_size.P ) {
@@ -2615,8 +2617,10 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::SetAn
                     int shared_memory = FFT::shared_memory_size;
                     PrintLaunchParameters(LP);
 #ifdef C2R_BUFFER_LINES
-                    constexpr unsigned int n_buffer_lines = size_of<FFT>::value < 64 ? 2 : size_of<FFT>::value < 512 ? 2
-                                                                                                                     : 4;
+                    // neads to be chosen so that 16 / n_buffer_lines >= FFT::stride (blockDim.x)
+                    constexpr unsigned int min_buffer_lines = std::max(16u / FFT::stride, 2u);
+                    constexpr unsigned int n_buffer_lines   = size_of<FFT>::value < 64u ? std::max(min_buffer_lines, 2u) : size_of<FFT>::value < 512 ? std::max(min_buffer_lines, 4u)
+                                                                                                                                                     : std::max(min_buffer_lines, 4u);
                     LP.gridDims.y /= n_buffer_lines;
                     // FIXME assuming we get a multiple of 32
                     constexpr unsigned int max_threads_per_block = FFT::max_threads_per_block < 32 ? 32 / FFT::max_threads_per_block * FFT::max_threads_per_block : FFT::max_threads_per_block;
@@ -2624,6 +2628,7 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::SetAn
                         LP.threadsPerBlock.y = 32 / FFT::max_threads_per_block;
                         // TODO: other asserts, but basically, for the buffering we want to have at least 32 threads per block, and we can't modify x
                     }
+                    std::cerr << "min_buffer_lines " << min_buffer_lines << " n_buffer " << n_buffer_lines << std::endl;
 
 #else
 
@@ -2634,6 +2639,7 @@ void FourierTransformer<ComputeBaseType, InputType, OtherImageType, Rank>::SetAn
                     PrintLaunchParameters(LP);
                     std::cout << "n_buffer_lines: " << n_buffer_lines << std::endl;
                     CheckSharedMemory(shared_memory, device_properties);
+
                     // ZeroBufferMemory( );
 
 #if FFT_DEBUG_STAGE > 6