diff --git a/.clangd b/.clangd
index c3edf49bcc..e60a0cb1a7 100644
--- a/.clangd
+++ b/.clangd
@@ -32,6 +32,7 @@ CompileFlags:
     # report all errors
     - "-ferror-limit=0"
     - "-ftemplate-backtrace-limit=0"
+    - "-stdlib=libc++"
   Remove:
     - -stdpar
     # strip CUDA fatbin args
diff --git a/cub/agent/single_pass_scan_operators.cuh b/cub/agent/single_pass_scan_operators.cuh
index 63fbd7c85e..e5d1566213 100644
--- a/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/agent/single_pass_scan_operators.cuh
@@ -36,12 +36,16 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
+
 CUB_NAMESPACE_BEGIN
@@ -106,6 +110,44 @@ enum ScanTileStatus
     SCAN_TILE_INCLUSIVE,        // Inclusive tile prefix is available
 };
 
+namespace detail
+{
+
+template
+__device__ __forceinline__ void delay()
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (if (Delay > 0)
+                {
+                  if (gridDim.x < GridThreshold)
+                  {
+                    __threadfence_block();
+                  }
+                  else
+                  {
+                    __nanosleep(Delay);
+                  }
+                }));
+}
+
+template
+__device__ __forceinline__ void delay_or_prevent_hoisting()
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (delay();),
+               (__threadfence_block();));
+}
+
+template
+__device__ __forceinline__ void delay_on_dc_gpu_or_prevent_hoisting()
+{
+  NV_DISPATCH_TARGET(
+    NV_IS_EXACTLY_SM_80, (delay();),
+    NV_PROVIDES_SM_70, (delay< 0, GridThreshold>();),
+    NV_IS_DEVICE, (__threadfence_block();));
+}
+
+}
 
 /**
  * Tile status interface.
@@ -127,20 +169,20 @@ struct ScanTileState
     // Status word type
     using StatusWord = cub::detail::conditional_t<
       sizeof(T) == 8,
-      long long,
+      unsigned long long,
       cub::detail::conditional_t<
         sizeof(T) == 4,
-        int,
-        cub::detail::conditional_t>>;
+        unsigned int,
+        cub::detail::conditional_t>>;
 
     // Unit word type
     using TxnWord = cub::detail::conditional_t<
       sizeof(T) == 8,
-      longlong2,
+      ulonglong2,
       cub::detail::conditional_t<
         sizeof(T) == 4,
-        int2,
-        cub::detail::conditional_t>>;
+        uint2,
+        unsigned int>>;
 
     // Device word type
     struct TileDescriptor
@@ -230,7 +272,8 @@ struct ScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
 
@@ -245,7 +288,8 @@ struct ScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
     /**
@@ -257,13 +301,18 @@ struct ScanTileState
         T &value)
     {
         TileDescriptor tile_descriptor;
-        do
+
         {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast(alias);
+            TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast(alias);
+        }
 
-        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
+        while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff))
+        {
+            detail::delay_or_prevent_hoisting();
+            TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast(alias);
+        }
 
         status = tile_descriptor.status;
         value = tile_descriptor.value;
@@ -281,7 +330,7 @@ template
 struct ScanTileState
 {
     // Status word type
-    typedef char StatusWord;
+    using StatusWord = unsigned int;
 
     // Constants
     enum
@@ -382,12 +431,7 @@ struct ScanTileState
     {
         // Update tile inclusive value
         ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+        detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
     }
 
 
@@ -398,12 +442,7 @@ struct ScanTileState
     {
         // Update tile partial value
         ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+        detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
     }
 
     /**
@@ -414,17 +453,21 @@ struct ScanTileState
         StatusWord &status,
         T &value)
     {
-        do {
-            status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-
-            __threadfence(); // prevent hoisting loads from loop or loads below above this one
+        do
+        {
+            status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+            __threadfence();
 
-        } while (status == SCAN_TILE_INVALID);
+        } while (WARP_ANY((status == SCAN_TILE_INVALID), 0xffffffff));
 
         if (status == StatusWord(SCAN_TILE_PARTIAL))
-            value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        {
+            value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        }
         else
-            value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+        {
+            value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+        }
     }
 };
@@ -471,7 +514,7 @@ template <
     typename KeyT>
 struct ReduceByKeyScanTileState
 {
-    typedef KeyValuePair KeyValuePairT;
+    using KeyValuePairT = KeyValuePair;
 
     // Constants
     enum
@@ -486,17 +529,17 @@ struct ReduceByKeyScanTileState
     // Status word type
     using StatusWord = cub::detail::conditional_t<
       STATUS_WORD_SIZE == 8,
-      long long,
+      unsigned long long,
       cub::detail::conditional_t<
         STATUS_WORD_SIZE == 4,
-        int,
-        cub::detail::conditional_t>>;
+        unsigned int,
+        cub::detail::conditional_t>>;
 
     // Status word type
     using TxnWord = cub::detail::conditional_t<
       TXN_WORD_SIZE == 16,
-      longlong2,
-      cub::detail::conditional_t>;
+      ulonglong2,
+      cub::detail::conditional_t>;
 
     // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
     struct TileDescriptorBigStatus
@@ -594,7 +637,8 @@ struct ReduceByKeyScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
 
@@ -610,7 +654,8 @@ struct ReduceByKeyScanTileState
 
         TxnWord alias;
         *reinterpret_cast(&alias) = tile_descriptor;
-        ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+
+        detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
     /**
@@ -637,11 +682,12 @@ struct ReduceByKeyScanTileState
         //     value.key = tile_descriptor.key;
 
         TileDescriptor tile_descriptor;
+
         do
         {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast(alias);
+            detail::delay_on_dc_gpu_or_prevent_hoisting();
+            TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast(alias);
 
         } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
 
@@ -750,6 +796,7 @@ struct TilePrefixCallbackOp
         T window_aggregate;
 
         // Wait for the warp-wide window of predecessor tiles to become valid
+        detail::delay<450>();
         ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
 
         // The exclusive tile prefix starts out as the current window aggregate
diff --git a/cub/detail/strong_load.cuh b/cub/detail/strong_load.cuh
new file mode 100644
index 0000000000..12e6672b9e
--- /dev/null
+++ b/cub/detail/strong_load.cuh
@@ -0,0 +1,182 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * @file Utilities for strong memory operations.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+CUB_NAMESPACE_BEGIN
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+namespace detail
+{
+
+static __device__ __forceinline__ uint4 load_relaxed(uint4 const *ptr)
+{
+  uint4 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v4.u32 {%0, %1, %2, %3}, [%4];"
+                             : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v4.u32 {%0, %1, %2, %3}, [%4];"
+                             : "=r"(retval.x), "=r"(retval.y), "=r"(retval.z), "=r"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ ulonglong2 load_relaxed(ulonglong2 const *ptr)
+{
+  ulonglong2 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v2.u64 {%0, %1}, [%2];"
+                             : "=l"(retval.x), "=l"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v2.u64 {%0, %1}, [%2];"
+                             : "=l"(retval.x), "=l"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ ushort4 load_relaxed(ushort4 const *ptr)
+{
+  ushort4 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v4.u16 {%0, %1, %2, %3}, [%4];"
+                             : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v4.u16 {%0, %1, %2, %3}, [%4];"
+                             : "=h"(retval.x), "=h"(retval.y), "=h"(retval.z), "=h"(retval.w)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ uint2 load_relaxed(uint2 const *ptr)
+{
+  uint2 retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.v2.u32 {%0, %1}, [%2];"
+                             : "=r"(retval.x), "=r"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.v2.u32 {%0, %1}, [%2];"
+                             : "=r"(retval.x), "=r"(retval.y)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned long long load_relaxed(unsigned long long const *ptr)
+{
+  unsigned long long retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.u64 %0, [%1];"
+                             : "=l"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.u64 %0, [%1];"
+                             : "=l"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned int load_relaxed(unsigned int const *ptr)
+{
+  unsigned int retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.u32 %0, [%1];"
+                             : "=r"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.u32 %0, [%1];"
+                             : "=r"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned short load_relaxed(unsigned short const *ptr)
+{
+  unsigned short retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("ld.relaxed.gpu.u16 %0, [%1];"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("ld.cg.u16 %0, [%1];"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
+  return retval;
+}
+
+static __device__ __forceinline__ unsigned char load_relaxed(unsigned char const *ptr)
+{
+  unsigned short retval;
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  ld.relaxed.gpu.u8 datum, [%1];"
+                             "  cvt.u16.u8 %0, datum;"
+                             "}"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");),
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  ld.cg.u8 datum, [%1];"
+                             "  cvt.u16.u8 %0, datum;"
+                             "}"
+                             : "=h"(retval)
+                             : _CUB_ASM_PTR_(ptr)
+                             : "memory");));
"memory");)); + return (unsigned char)retval; +} + +} // namespace detail + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +CUB_NAMESPACE_END + diff --git a/cub/detail/strong_store.cuh b/cub/detail/strong_store.cuh new file mode 100644 index 0000000000..fd293519a3 --- /dev/null +++ b/cub/detail/strong_store.cuh @@ -0,0 +1,279 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * @file Utilities for strong memory operations. 
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+CUB_NAMESPACE_BEGIN
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
+
+namespace detail
+{
+
+static __device__ __forceinline__ void store_relaxed(uint4 *ptr, uint4 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");),
+               (asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(ulonglong2 *ptr, ulonglong2 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v2.u64 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y)
+                             : "memory");),
+               (asm volatile("st.cg.v2.u64 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(ushort4 *ptr, ushort4 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v4.u16 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)
+                             : "memory");),
+               (asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(uint2 *ptr, uint2 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.v2.u32 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y)
+                             : "memory");),
+               (asm volatile("st.cg.v2.u32 [%0], {%1, %2};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned long long *ptr,
+                                                     unsigned long long val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.u64 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val)
+                             : "memory");),
+               (asm volatile("st.cg.u64 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "l"(val)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned int *ptr, unsigned int val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.u32 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val)
+                             : "memory");),
+               (asm volatile("st.cg.u32 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned short *ptr, unsigned short val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.relaxed.gpu.u16 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val)
+                             : "memory");),
+               (asm volatile("st.cg.u16 [%0], %1;"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"(val)
+                             : "memory");));
+}
+
+static __device__ __forceinline__ void store_relaxed(unsigned char *ptr, unsigned char val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  cvt.u8.u16 datum, %1;"
+                             "  st.relaxed.gpu.u8 [%0], datum;"
+                             "}"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val)
+                             : "memory");),
+               (asm volatile("{"
+                             "  .reg .u8 datum;"
+                             "  cvt.u8.u16 datum, %1;"
+                             "  st.cg.u8 [%0], datum;"
+                             "}"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val)
+                             : "memory");));
+}
+
+__device__ __forceinline__ void store_release(uint4 *ptr, uint4 val)
+{
+  NV_IF_TARGET(NV_PROVIDES_SM_70,
+               (asm volatile("st.release.gpu.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");),
+               (__threadfence();
+                asm volatile("st.cg.v4.u32 [%0], {%1, %2, %3, %4};"
+                             :
+                             : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)
+                             : "memory");));
+}
%2, %3, %4};" + : + : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) + : "memory");)); +} + +__device__ __forceinline__ void store_release(ulonglong2 *ptr, ulonglong2 val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.v2.u64 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) + : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u64 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y) + : "memory");)); +} + +__device__ __forceinline__ void store_release(ushort4 *ptr, ushort4 val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.v4.u16 [%0], {%1, %2, %3, %4};" + : + : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) + : "memory");), + (__threadfence(); + asm volatile("st.cg.v4.u16 [%0], {%1, %2, %3, %4};" + : + : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w) + : "memory");)); +} + +__device__ __forceinline__ void store_release(uint2 *ptr, uint2 val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.v2.u32 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) + : "memory");), + (__threadfence(); asm volatile("st.cg.v2.u32 [%0], {%1, %2};" + : + : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned long long *ptr, unsigned long long val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u64 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "l"(val) + : "memory");), + (__threadfence(); asm volatile("st.cg.u64 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "l"(val) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned int *ptr, unsigned int val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u32 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "r"(val) + : "memory");), + (__threadfence(); asm volatile("st.cg.u32 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "r"(val) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned short *ptr, unsigned short val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("st.release.gpu.u16 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "h"(val) + : "memory");), + (__threadfence(); asm volatile("st.cg.u16 [%0], %1;" + : + : _CUB_ASM_PTR_(ptr), "h"(val) + : "memory");)); +} + +__device__ __forceinline__ void store_release(unsigned char *ptr, unsigned char val) +{ + NV_IF_TARGET(NV_PROVIDES_SM_70, + (asm volatile("{" + " .reg .u8 datum;" + " cvt.u8.u16 datum, %1;" + " st.release.gpu.u8 [%0], datum;" + "}" + : + : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val) + : "memory");), + (__threadfence(); asm volatile("{" + " .reg .u8 datum;" + " cvt.u8.u16 datum, %1;" + " st.cg.u8 [%0], datum;" + "}" + : + : _CUB_ASM_PTR_(ptr), "h"((unsigned short)val) + : "memory");)); +} + +} // namespace detail + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +CUB_NAMESPACE_END +
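
Note (illustration, not part of the patch): the sketch below shows how the new cub::detail primitives compose into the publish/consume protocol that the patched ScanTileState implements: the producer writes its payload with a plain store and then release-stores the status word, while the consumer polls with relaxed loads and a fence. The function and constant names here (publish_tile, wait_for_tile, TILE_INVALID, TILE_READY) are hypothetical placeholders, not identifiers from CUB.

// Illustrative CUDA sketch only; the real protocol lives in ScanTileState.
#include <cub/detail/strong_load.cuh>
#include <cub/detail/strong_store.cuh>

constexpr unsigned int TILE_INVALID = 0; // placeholder status values, not CUB's enum
constexpr unsigned int TILE_READY   = 1;

__device__ void publish_tile(unsigned int *tile_status,
                             unsigned int *tile_value,
                             int tile_idx,
                             unsigned int inclusive)
{
  // Write the payload first, then release-store the status word. On SM70+ this
  // emits st.release.gpu; on older targets store_release falls back to
  // __threadfence() + st.cg, i.e. the fence-plus-ThreadStore() pair the patch
  // removes from SetInclusive()/SetPartial().
  tile_value[tile_idx] = inclusive;
  cub::detail::store_release(tile_status + tile_idx, TILE_READY);
}

__device__ unsigned int wait_for_tile(const unsigned int *tile_status,
                                      const unsigned int *tile_value,
                                      int tile_idx)
{
  // Poll the status word with relaxed loads; the fence inside the loop keeps the
  // payload read below from being hoisted, mirroring the patched WaitForValid().
  unsigned int status;
  do
  {
    status = cub::detail::load_relaxed(tile_status + tile_idx);
    __threadfence();
  } while (status == TILE_INVALID);

  return tile_value[tile_idx];
}

Because the fallback path reproduces the old fence-then-st.cg sequence exactly, behavior on pre-Volta architectures is unchanged; only SM70+ gets the cheaper release/relaxed instructions.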
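Note (illustration, not part of the patch): the delay helpers added to single_pass_scan_operators.cuh implement a polling backoff: on Volta and newer a waiting thread can __nanosleep() instead of repeatedly fencing, and small grids skip the sleep because the predecessor tile is usually already available. A standalone sketch of that idea follows, with placeholder names and values rather than the ones the patch uses, and with a plain __CUDA_ARCH__ guard instead of the NV_IF_TARGET/NV_DISPATCH_TARGET macros the patch relies on.

// Illustrative backoff sketch for a decoupled look-back polling loop.
template <unsigned int DelayNs, unsigned int GridThreshold>
__device__ __forceinline__ void backoff_sketch()
{
#if __CUDA_ARCH__ >= 700
  if (DelayNs > 0)
  {
    if (gridDim.x < GridThreshold)
    {
      __threadfence_block(); // small grid: just prevent load hoisting
    }
    else
    {
      __nanosleep(DelayNs);  // large grid: yield briefly to reduce polling pressure
    }
  }
#else
  __threadfence_block();     // __nanosleep() is unavailable before SM 70
#endif
}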