diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh
index 735b283c7..97f820e9c 100644
--- a/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/agent/single_pass_scan_operators.cuh
@@ -35,11 +35,12 @@

 #include

-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../config.cuh"
-#include "../util_device.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/thread/thread_load.cuh>
+#include <cub/thread/thread_store.cuh>
+#include <cub/util_device.cuh>
+#include <cub/warp/warp_reduce.cuh>

 CUB_NAMESPACE_BEGIN
@@ -738,8 +739,10 @@ struct TilePrefixCallbackOp
         // Update our status with our tile-aggregate
         if (threadIdx.x == 0)
         {
-            new (&temp_storage.block_aggregate) T(block_aggregate);
-            tile_status.SetPartial(tile_idx, block_aggregate);
+            detail::uninitialized_copy(&temp_storage.block_aggregate,
+                                       block_aggregate);
+
+            tile_status.SetPartial(tile_idx, block_aggregate);
         }

         int predecessor_idx = tile_idx - threadIdx.x - 1;
@@ -768,8 +771,11 @@ struct TilePrefixCallbackOp
             inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
             tile_status.SetInclusive(tile_idx, inclusive_prefix);

-            new (&temp_storage.exclusive_prefix) T(exclusive_prefix);
-            new (&temp_storage.inclusive_prefix) T(inclusive_prefix);
+            detail::uninitialized_copy(&temp_storage.exclusive_prefix,
+                                       exclusive_prefix);
+
+            detail::uninitialized_copy(&temp_storage.inclusive_prefix,
+                                       inclusive_prefix);
         }

         // Return exclusive_prefix
diff --git a/cub/block/block_exchange.cuh b/cub/block/block_exchange.cuh
index d3521ea03..765e126ce 100644
--- a/cub/block/block_exchange.cuh
+++ b/cub/block/block_exchange.cuh
@@ -33,10 +33,11 @@

 #pragma once

-#include "../config.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../warp/warp_exchange.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_type.cuh>
+#include <cub/warp/warp_exchange.cuh>

 CUB_NAMESPACE_BEGIN
@@ -209,7 +210,8 @@ private:
         {
             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         CTA_SYNC();
@@ -250,7 +252,8 @@
         {
             int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }
     }

@@ -298,7 +301,8 @@ private:
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -328,7 +332,8 @@ private:
         {
             int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -354,7 +359,8 @@ private:
         {
             int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -385,7 +391,8 @@ private:
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         CTA_SYNC();
@@ -434,7 +441,9 @@ private:
             if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
             {
                 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+                detail::uninitialized_copy(temp_storage.buff +
+                                             item_offset,
+                                           input_items[ITEM]);
             }
         }
     }
@@ -476,7 +485,8 @@ private:
         {
             int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -486,7 +496,8 @@ private:
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]);
+            detail::uninitialized_copy(output_items + ITEM,
+                                       temp_storage.buff[item_offset]);
         }
     }

@@ -512,7 +523,8 @@ private:
         {
             int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         WARP_SYNC(0xffffffff);
@@ -544,7 +556,8 @@ private:
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
         }

         CTA_SYNC();
@@ -584,7 +597,8 @@ private:
             if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
             {
                 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+                detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                           input_items[ITEM]);
             }
         }

@@ -626,7 +640,8 @@ private:
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+            detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                       input_items[ITEM]);
        }

         CTA_SYNC();
@@ -668,7 +683,8 @@ private:
             if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
             {
                 if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                new (temp_storage.buff + item_offset) InputT(input_items[ITEM]);
+                detail::uninitialized_copy(temp_storage.buff + item_offset,
+                                           input_items[ITEM]);
             }
         }

diff --git a/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/block/specializations/block_reduce_warp_reductions.cuh
index 4fec6cad1..44db484ed 100644
--- a/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ b/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -33,9 +33,10 @@

 #pragma once

-#include "../../warp/warp_reduce.cuh"
-#include "../../config.cuh"
-#include "../../util_ptx.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/warp/warp_reduce.cuh>

 CUB_NAMESPACE_BEGIN
@@ -143,7 +144,8 @@ struct BlockReduceWarpReductions
         // Share lane aggregates
         if (lane_id == 0)
         {
-            new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate);
+            detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
+                                       warp_aggregate);
         }

         CTA_SYNC();
diff --git a/cub/block/specializations/block_scan_warp_scans.cuh b/cub/block/specializations/block_scan_warp_scans.cuh
index 1a8f4bb85..7fd53dc0f 100644
--- a/cub/block/specializations/block_scan_warp_scans.cuh
+++ b/cub/block/specializations/block_scan_warp_scans.cuh
@@ -33,9 +33,10 @@

 #pragma once

-#include "../../config.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
+#include <cub/config.cuh>
+#include <cub/detail/memory.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/warp/warp_scan.cuh>

 CUB_NAMESPACE_BEGIN
@@ -152,7 +153,8 @@ struct BlockScanWarpScans
         // Last lane in each warp shares its warp-aggregate
         if (lane_id == WARP_THREADS - 1)
         {
-            new (temp_storage.warp_aggregates + warp_id) T(warp_aggregate);
+            detail::uninitialized_copy(temp_storage.warp_aggregates + warp_id,
+                                       warp_aggregate);
         }

         CTA_SYNC();
@@ -295,9 +297,11 @@ struct BlockScanWarpScans
             T block_prefix = block_prefix_callback_op(block_aggregate);
             if (lane_id == 0)
             {
-                // Share the prefix with all threads
-                new (&temp_storage.block_prefix) T(block_prefix);
-                exclusive_output = block_prefix;    // The block prefix is the exclusive output for tid0
+                // Share the prefix with all threads
+                detail::uninitialized_copy(&temp_storage.block_prefix,
+                                           block_prefix);
+
+                exclusive_output = block_prefix;    // The block prefix is the exclusive output for tid0
             }
         }

@@ -369,7 +373,8 @@ struct BlockScanWarpScans
             if (lane_id == 0)
             {
                 // Share the prefix with all threads
-                new(&temp_storage.block_prefix) T(block_prefix);
+                detail::uninitialized_copy(&temp_storage.block_prefix,
+                                           block_prefix);
             }
         }

diff --git a/cub/detail/memory.cuh b/cub/detail/memory.cuh
new file mode 100644
index 000000000..70418456e
--- /dev/null
+++ b/cub/detail/memory.cuh
@@ -0,0 +1,66 @@
+/******************************************************************************
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <cub/config.cuh>
+
+#include <cuda/std/type_traits>
+
+CUB_NAMESPACE_BEGIN
+
+
+namespace detail
+{
+
+template <typename T,
+          typename U,
+          typename ::cuda::std::enable_if<
+            ::cuda::std::is_trivially_copyable<T>::value,
+            int
+          >::type = 0>
+__device__ void uninitialized_copy(T *ptr, U &&val)
+{
+  *ptr = cuda::std::forward<U>(val);
+}
+
+template <typename T,
+          typename U,
+          typename ::cuda::std::enable_if<
+            !::cuda::std::is_trivially_copyable<T>::value,
+            int
+          >::type = 0>
+__device__ void uninitialized_copy(T *ptr, U &&val)
+{
+  new (ptr) T(cuda::std::forward<U>(val));
+}
+
+} // namespace detail
+
+
+CUB_NAMESPACE_END
+
diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
index 9b0148665..04ff62051 100644
--- a/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/device/dispatch/dispatch_reduce.cuh
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include <cub/detail/memory.cuh>
 #include
 #include
@@ -123,7 +124,7 @@ __global__ void DeviceReduceKernel(InputIteratorT d_in,
   // Output result
   if (threadIdx.x == 0)
   {
-    new (d_out + blockIdx.x) AccumT(block_aggregate);
+    detail::uninitialized_copy(d_out + blockIdx.x, block_aggregate);
   }
 }

diff --git a/cub/thread/thread_store.cuh b/cub/thread/thread_store.cuh
index 3af2613d4..7f711bf06 100644
--- a/cub/thread/thread_store.cuh
+++ b/cub/thread/thread_store.cuh
@@ -33,9 +33,9 @@

 #pragma once

-#include "../config.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
+#include <cub/config.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_type.cuh>

 CUB_NAMESPACE_BEGIN
diff --git a/cub/util_type.cuh b/cub/util_type.cuh
index cd6ac4d6d..5fc977663 100644
--- a/cub/util_type.cuh
+++ b/cub/util_type.cuh
@@ -47,12 +47,12 @@
 #include
 #endif

+#include <cub/detail/memory.cuh>
 #include
 #include
 #include
 #include

-
 CUB_NAMESPACE_BEGIN
@@ -312,7 +312,8 @@ struct InputValue
         if (m_is_future) {
             m_future_value = other.m_future_value;
         } else {
-            new (&m_immediate_value) T(other.m_immediate_value);
+            detail::uninitialized_copy(&m_immediate_value,
+                                       other.m_immediate_value);
         }
     }
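
Usage note (illustration only, not part of the patch): the sketch below shows how the new helper is meant to be called from device code. The kernel name, the Accum type, and the serial per-block sum are invented for this example; only cub::detail::uninitialized_copy and the cub/detail/memory.cuh header come from the change above, and the cub:: qualification assumes the default CUB namespace. Because Accum has a user-provided copy constructor it is not trivially copyable, so the helper takes the placement-new overload and constructs the object directly in the raw cudaMalloc'd slot; for trivially copyable types the other overload is selected and the call compiles down to a plain assignment, which is why the call sites touched in this patch do not pay extra for the common case.

// Hypothetical usage sketch -- not from the patch above.
#include <cub/detail/memory.cuh>

// A user-provided copy constructor makes Accum non-trivially copyable, so
// detail::uninitialized_copy placement-news it instead of assigning.
struct Accum
{
  double sum;
  __device__ Accum(double s = 0.0) : sum(s) {}
  __device__ Accum(const Accum &other) : sum(other.sum) {}
};

__global__ void BlockSumKernel(const double *d_in,
                               int items_per_block,
                               Accum *d_out)
{
  if (threadIdx.x == 0)
  {
    // Serial per-block sum, purely for illustration; a real kernel would
    // combine per-thread partials with something like cub::BlockReduce.
    double block_aggregate = 0.0;
    for (int i = 0; i < items_per_block; ++i)
    {
      block_aggregate += d_in[blockIdx.x * items_per_block + i];
    }

    // d_out comes straight from cudaMalloc, so no Accum object lives in
    // slot blockIdx.x yet; a plain assignment would not be valid for a
    // non-trivially-copyable type. uninitialized_copy constructs the value
    // in place via placement new.
    cub::detail::uninitialized_copy(d_out + blockIdx.x,
                                    Accum(block_aggregate));
  }
}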