diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh
index 735b283c7..97f820e9c 100644
--- a/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/agent/single_pass_scan_operators.cuh
@@ -35,11 +35,12 @@

 #include

-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../config.cuh"
-#include "../util_device.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/thread/thread_load.cuh>
+#include <cub/thread/thread_store.cuh>
+#include <cub/util_device.cuh>
+#include <cub/warp/warp_reduce.cuh>

 CUB_NAMESPACE_BEGIN
@@ -738,8 +739,10 @@ struct TilePrefixCallbackOp
         // Update our status with our tile-aggregate
         if (threadIdx.x == 0)
         {
-            new (&temp_storage.block_aggregate) T(block_aggregate);
-            tile_status.SetPartial(tile_idx, block_aggregate);
+            detail::uninitialized_copy(&temp_storage.block_aggregate,
+                                       block_aggregate);
+
+            tile_status.SetPartial(tile_idx, block_aggregate);
         }

         int predecessor_idx = tile_idx - threadIdx.x - 1;
@@ -768,8 +771,11 @@ struct TilePrefixCallbackOp
             inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
             tile_status.SetInclusive(tile_idx, inclusive_prefix);

-            new (&temp_storage.exclusive_prefix) T(exclusive_prefix);
-            new (&temp_storage.inclusive_prefix) T(inclusive_prefix);
+            detail::uninitialized_copy(&temp_storage.exclusive_prefix,
+                                       exclusive_prefix);
+
+            detail::uninitialized_copy(&temp_storage.inclusive_prefix,
+                                       inclusive_prefix);
         }

         // Return exclusive_prefix
diff --git a/cub/block/block_exchange.cuh b/cub/block/block_exchange.cuh
index d3521ea03..765e126ce 100644
--- a/cub/block/block_exchange.cuh
+++ b/cub/block/block_exchange.cuh
@@ -33,10 +33,11 @@

 #pragma once

-#include "../config.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../warp/warp_exchange.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_type.cuh>
+#include <cub/warp/warp_exchange.cuh>

 CUB_NAMESPACE_BEGIN
@@ -209,7 +210,8 @@ private:
         {
             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         CTA_SYNC();
@@ -250,7 +252,8 @@
         {
             int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
     }

@@ -298,7 +301,8 @@ private:
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -328,7 +332,8 @@ private:
         {
             int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -354,7 +359,8 @@ private:
         {
             int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -385,7 +391,8 @@ private:
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         CTA_SYNC();
@@ -434,7 +441,9 @@ private:
             if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
             {
                 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+                detail::uninitialized_copy(temp_storage.buff +
+                                             item_offset,
+                                           input_items[ITEM]);
             }
         }
     }
@@ -476,7 +485,8 @@ private:
         {
             int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -486,7 +496,8 @@ private:
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]);
+            detail::uninitialized_copy(output_items + ITEM,
+                                       temp_storage.buff[item_offset]);
         }
     }

@@ -512,7 +523,8 @@ private:
         {
             int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -544,7 +556,8 @@ private:
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         CTA_SYNC();
@@ -584,7 +597,8 @@ private:
             if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
             {
                 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+                detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                           input_items[ITEM]);
             }
         }

@@ -626,7 +640,8 @@ private:
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
        }

         CTA_SYNC();
@@ -668,7 +683,8 @@ private:
             if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
             {
                 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+                detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                           input_items[ITEM]);
             }
         }

diff --git a/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/block/specializations/block_reduce_warp_reductions.cuh
index 4fec6cad1..44db484ed 100644
--- a/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ b/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -33,9 +33,10 @@

 #pragma once

-#include "../../warp/warp_reduce.cuh"
-#include "../../config.cuh"
-#include "../../util_ptx.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/warp/warp_reduce.cuh>

 CUB_NAMESPACE_BEGIN
@@ -143,7 +144,8 @@ struct BlockReduceWarpReductions
         // Share lane aggregates
         if (lane_id == 0)
         {
-            new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate);
+            detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
+                                       warp_aggregate);
         }

         CTA_SYNC();
diff --git a/cub/block/specializations/block_scan_warp_scans.cuh b/cub/block/specializations/block_scan_warp_scans.cuh
index 1a8f4bb85..7fd53dc0f 100644
--- a/cub/block/specializations/block_scan_warp_scans.cuh
+++ b/cub/block/specializations/block_scan_warp_scans.cuh
@@ -33,9 +33,10 @@

 #pragma once

-#include "../../config.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/warp/warp_scan.cuh>

 CUB_NAMESPACE_BEGIN
@@ -152,7 +153,8 @@ struct BlockScanWarpScans
         // Last lane in each warp shares its warp-aggregate
         if (lane_id == WARP_THREADS - 1)
         {
-            new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate);
+            detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
+                                       warp_aggregate);
         }

         CTA_SYNC();
@@ -295,9 +297,11 @@ struct BlockScanWarpScans
             T block_prefix = block_prefix_callback_op(block_aggregate);
             if (lane_id == 0)
             {
-                // Share the prefix with all threads
-                new (&temp_storage.block_prefix) T(block_prefix);
-                exclusive_output = block_prefix;    // The block prefix is the exclusive output for tid0
+                // Share the prefix with all threads
+                detail::uninitialized_copy(&temp_storage.block_prefix,
+                                           block_prefix);
+
+                exclusive_output = block_prefix;    // The block prefix is the exclusive output for tid0
             }
         }

@@ -369,7 +373,8 @@ struct BlockScanWarpScans
             if (lane_id == 0)
             {
                 // Share the prefix with all threads
-                new(&temp_storage.block_prefix) T(block_prefix);
+                detail::uninitialized_copy(&temp_storage.block_prefix,
+                                           block_prefix);
             }
         }

diff --git a/cub/detail/memory.cuh b/cub/detail/memory.cuh
new file mode 100644
index 000000000..70418456e
--- /dev/null
+++ b/cub/detail/memory.cuh
@@ -0,0 +1,66 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cuda/std/type_traits>
+
+CUB_NAMESPACE_BEGIN
+
+
+namespace detail
+{
+
+template <typename T,
+          typename U,
+          typename ::cuda::std::enable_if<
+            ::cuda::std::is_trivially_copyable<T>::value,
+            int
+          >::type = 0>
+__device__ void uninitialized_copy(T *ptr, U &&val)
+{
+  *ptr = cuda::std::forward<U>(val);
+}
+
+template <typename T,
+          typename U,
+          typename ::cuda::std::enable_if<
+            !::cuda::std::is_trivially_copyable<T>::value,
+            int
+          >::type = 0>
+__device__ void uninitialized_copy(T *ptr, U &&val)
+{
+  new (ptr) T(cuda::std::forward<U>(val));
+}
+
+} // namespace detail
+
+
+CUB_NAMESPACE_END
+
diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
index 9b0148665..04ff62051 100644
--- a/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/device/dispatch/dispatch_reduce.cuh
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include <cub/detail/memory.cuh>
 #include
 #include
@@ -123,7 +124,7 @@ __global__ void DeviceReduceKernel(InputIteratorT d_in,
   // Output result
   if (threadIdx.x == 0)
   {
-    new (d_out + blockIdx.x) AccumT(block_aggregate);
+    detail::uninitialized_copy(d_out + blockIdx.x, block_aggregate);
   }
 }

diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh
index 3af2613d4..7f711bf06 100644
--- a/cub/thread/thread_store.cuh
+++ b/cub/thread/thread_store.cuh
@@ -33,9 +33,9 @@

 #pragma once

-#include "../config.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
+#include <cub/config.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_type.cuh>

 CUB_NAMESPACE_BEGIN
diff --git a/cub/util_type.cuh b/cub/util_type.cuh
index cd6ac4d6d..5fc977663 100644
--- a/cub/util_type.cuh
+++ b/cub/util_type.cuh
@@ -47,12 +47,12 @@
 #include
 #endif

+#include <cub/detail/memory.cuh>
 #include
 #include
 #include
 #include

-
 CUB_NAMESPACE_BEGIN
@@ -312,7 +312,8 @@ struct InputValue
         if (m_is_future) {
             m_future_value = other.m_future_value;
         } else {
-            new (&m_immediate_value) T(other.m_immediate_value);
+            detail::uninitialized_copy(&m_immediate_value,
+                                       other.m_immediate_value);
         }
     }
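
Usage note (illustration only, not part of the patch): the sketch below shows how the new helper is meant to be called from device code. The kernel name, the Accum type, and the serial per-block sum are invented for this example; only cub::detail::uninitialized_copy and the cub/detail/memory.cuh header come from the change above, and the cub:: qualification assumes the default CUB namespace. Because Accum has a user-provided copy constructor it is not trivially copyable, so the helper takes the placement-new overload and constructs the object directly in the raw cudaMalloc'd slot; for trivially copyable types the other overload is selected and the call compiles down to a plain assignment, which is why the call sites touched in this patch do not pay extra for the common case.

// Hypothetical usage sketch -- not from the patch above.
#include <cub/detail/memory.cuh>

// A user-provided copy constructor makes Accum non-trivially copyable, so
// detail::uninitialized_copy placement-news it instead of assigning.
struct Accum
{
  double sum;
  __device__ Accum(double s = 0.0) : sum(s) {}
  __device__ Accum(const Accum &other) : sum(other.sum) {}
};

__global__ void BlockSumKernel(const double *d_in,
                               int items_per_block,
                               Accum *d_out)
{
  if (threadIdx.x == 0)
  {
    // Serial per-block sum, purely for illustration; a real kernel would
    // combine per-thread partials with something like cub::BlockReduce.
    double block_aggregate = 0.0;
    for (int i = 0; i < items_per_block; ++i)
    {
      block_aggregate += d_in[blockIdx.x * items_per_block + i];
    }

    // d_out comes straight from cudaMalloc, so no Accum object lives in
    // slot blockIdx.x yet; a plain assignment would not be valid for a
    // non-trivially-copyable type. uninitialized_copy constructs the value
    // in place via placement new.
    cub::detail::uninitialized_copy(d_out + blockIdx.x,
                                    Accum(block_aggregate));
  }
}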