diff --git a/cub/detail/detect_cuda_runtime.cuh b/cub/detail/detect_cuda_runtime.cuh
new file mode 100644
index 0000000000..7ac947277d
--- /dev/null
+++ b/cub/detail/detect_cuda_runtime.cuh
@@ -0,0 +1,100 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Utilities for CUDA dynamic parallelism.
+ */
+
+#pragma once
+
+#include <cub/util_namespace.cuh>
+
+#include <cuda_runtime_api.h>
+
+CUB_NAMESPACE_BEGIN
+namespace detail
+{
+
+#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes:
+
+/**
+ * \def CUB_RDC_ENABLED
+ *
+ * Defined if RDC is enabled.
+ */
+#define CUB_RDC_ENABLED
+
+/**
+ * \def CUB_RUNTIME_FUNCTION
+ *
+ * Execution space for functions that can use the CUDA runtime API (`__host__`
+ * when RDC is off, `__host__ __device__` when RDC is on).
+ */
+#define CUB_RUNTIME_FUNCTION
+
+/**
+ * \def CUB_RUNTIME_ENABLED
+ *
+ * Whether or not the active compiler pass is allowed to invoke device kernels
+ * or methods from the CUDA runtime API.
+ *
+ * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__`
+ * and is not compatible with `NV_IF_TARGET`. It is provided for legacy
+ * purposes only.
+ *
+ * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`.
+ */
+#define CUB_RUNTIME_ENABLED
+
+#else // Non-doxygen pass:
+
+#ifndef CUB_RUNTIME_FUNCTION
+
+#if defined(__CUDACC_RDC__)
+
+#define CUB_RDC_ENABLED
+#define CUB_RUNTIME_FUNCTION __host__ __device__
+
+#else // RDC disabled:
+
+#define CUB_RUNTIME_FUNCTION __host__
+
+#endif // RDC enabled
+
+#if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
+// Legacy only -- do not use in new code.
+#define CUB_RUNTIME_ENABLED
+#endif
+
+#endif // CUB_RUNTIME_FUNCTION predefined
+
+#endif // Do not document
+
+} // namespace detail
+CUB_NAMESPACE_END
diff --git a/cub/detail/device_synchronize.cuh b/cub/detail/device_synchronize.cuh
index 52c5a10663..1a868ff637 100644
--- a/cub/detail/device_synchronize.cuh
+++ b/cub/detail/device_synchronize.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cub/detail/detect_cuda_runtime.cuh>
 #include <cub/detail/exec_check_disable.cuh>
 #include <cub/util_arch.cuh>
 #include <cub/util_namespace.cuh>
@@ -38,8 +39,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
 {
   cudaError_t result = cudaErrorUnknown;
 
-#ifdef CUB_RUNTIME_ENABLED
-
 #if defined(__CUDACC__) &&                                                     \
   ((__CUDACC_VER_MAJOR__ > 11) ||                                              \
    ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
@@ -48,11 +47,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
   result = __cudaDeviceSynchronizeDeprecationAvoidance();
 #else // CUDA < 11.6
 #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
-#endif
-
-#else // Device code without the CUDA runtime.
-  // Device side CUDA API calls are not supported in this configuration.
-#define CUB_TMP_DEVICE_SYNC_IMPL result = cudaErrorInvalidConfiguration;
 #endif
 
   NV_IF_TARGET(NV_IS_HOST,
diff --git a/cub/detail/type_traits.cuh b/cub/detail/type_traits.cuh
index 9b28e2c959..803dbf74ee 100644
--- a/cub/detail/type_traits.cuh
+++ b/cub/detail/type_traits.cuh
@@ -32,8 +32,8 @@
 
 #pragma once
 
-#include "../util_cpp_dialect.cuh"
-#include "../util_namespace.cuh"
+#include <cub/util_cpp_dialect.cuh>
+#include <cub/util_namespace.cuh>
 
 #include <type_traits>
 
diff --git a/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 676ae6bfaf..60f37b2adb 100644
--- a/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -259,12 +259,10 @@ struct DispatchAdjacentDifference : public SelectedPolicy
                 num_tiles,
                 tile_size);
 
-        if (debug_synchronous)
+        error = detail::DebugSyncStream(stream, debug_synchronous);
+        if (CubDebug(error))
         {
-          if (CubDebug(error = SyncStream(stream)))
-          {
-            break;
-          }
+          break;
         }
 
         // Check for failure to launch
@@ -302,12 +300,11 @@ struct DispatchAdjacentDifference : public SelectedPolicy
               difference_op,
               num_items);
 
-      if (debug_synchronous)
+
+      error = detail::DebugSyncStream(stream, debug_synchronous);
+      if (CubDebug(error))
       {
-        if (CubDebug(error = SyncStream(stream)))
-        {
-          break;
-        }
+        break;
       }
 
       // Check for failure to launch
diff --git a/cub/device/dispatch/dispatch_histogram.cuh b/cub/device/dispatch/dispatch_histogram.cuh
index c78f125a47..1fae8638d0 100644
--- a/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/device/dispatch/dispatch_histogram.cuh
@@ -35,12 +35,12 @@
 #pragma once
 
 #include <cub/agent/agent_histogram.cuh>
+#include <cub/config.cuh>
+#include <cub/grid/grid_queue.cuh>
+#include <cub/thread/thread_search.cuh>
 #include <cub/util_debug.cuh>
 #include <cub/util_device.cuh>
 #include <cub/util_math.cuh>
-#include <cub/thread/thread_search.cuh>
-#include <cub/grid/grid_queue.cuh>
-#include <cub/config.cuh>
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
@@ -530,13 +530,6 @@ public:
         cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
         bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
     {
-    #ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-    #else
-
         cudaError error = cudaSuccess;
         do
         {
@@ -674,17 +667,21 @@ public:
                 tile_queue);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+              break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+                break;
+            }
         }
         while (0);
 
         return error;
-
-    #endif // CUB_RUNTIME_ENABLED
     }
 
 
diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh
index 0b480899c5..86f1f8cd0e 100644
--- a/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -704,12 +704,10 @@ struct DispatchMergeSort : SelectedPolicy
 
       block_sort_launcher.launch();
 
-      if (debug_synchronous)
+      error = detail::DebugSyncStream(stream, debug_synchronous);
+      if (CubDebug(error))
       {
-        if (CubDebug(error = SyncStream(stream)))
-        {
-          break;
-        }
+        break;
       }
 
       // Check for failure to launch
@@ -769,12 +767,10 @@ struct DispatchMergeSort : SelectedPolicy
                 target_merged_tiles_number,
                 tile_size);
 
-        if (debug_synchronous)
+        error = detail::DebugSyncStream(stream, debug_synchronous);
+        if (CubDebug(error))
         {
-          if (CubDebug(error = SyncStream(stream)))
-          {
-            break;
-          }
+          break;
         }
 
         // Check for failure to launch
@@ -786,12 +782,10 @@ struct DispatchMergeSort : SelectedPolicy
         // Merge
         merge_launcher.launch(ping, target_merged_tiles_number);
 
-        if (debug_synchronous)
+        error = detail::DebugSyncStream(stream, debug_synchronous);
+        if (CubDebug(error))
         {
-          if (CubDebug(error = SyncStream(stream)))
-          {
-            break;
-          }
+          break;
         }
 
         // Check for failure to launch
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
index 37cd74d84c..fa189ba7b8 100644
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1044,11 +1044,6 @@ struct DispatchRadixSort :
     cudaError_t InvokeSingleTile(
         SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
     {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)single_tile_kernel;
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
         cudaError error = cudaSuccess;
         do
         {
@@ -1078,10 +1073,17 @@ struct DispatchRadixSort :
                 end_bit);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Update selector
             d_keys.selector ^= 1;
@@ -1090,8 +1092,6 @@ struct DispatchRadixSort :
         while (0);
 
         return error;
-
-#endif // CUB_RUNTIME_ENABLED
     }
 
 
@@ -1141,10 +1141,17 @@ struct DispatchRadixSort :
                 pass_config.even_share);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Log scan_kernel configuration
             if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
@@ -1158,10 +1165,17 @@ struct DispatchRadixSort :
                 pass_spine_length);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Log downsweep_kernel configuration
             if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
@@ -1184,10 +1198,17 @@ struct DispatchRadixSort :
                 pass_config.even_share);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Update current bit
             current_bit += pass_bits;
@@ -1352,14 +1373,13 @@ struct DispatchRadixSort :
             {
                 break;
             }
-            if (debug_synchronous)
+
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
             {
-                if (CubDebug(error = SyncStream(stream)))
-                {
-                    break;
-                }
+              break;
             }
-            
+
             // exclusive sums to determine starts
             const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS;
 
@@ -1377,14 +1397,13 @@ struct DispatchRadixSort :
                             d_bins);
             if (CubDebug(error))
             {
-              break;
+                break;
             }
-            if (debug_synchronous)
+
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
             {
-                if (CubDebug(error = SyncStream(stream)))
-                {
-                    break;
-                }
+              break;
             }
 
             // use the other buffer if no overwrite is allowed
@@ -1442,12 +1461,10 @@ struct DispatchRadixSort :
                       break;
                     }
 
-                    if (debug_synchronous)
+                    error = detail::DebugSyncStream(stream, debug_synchronous);
+                    if (CubDebug(error))
                     {
-                        if (CubDebug(error = SyncStream(stream)))
-                        {
-                            break;
-                        }
+                      break;
                     }
                 }
 
@@ -1483,17 +1500,6 @@ struct DispatchRadixSort :
         DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
         DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
     {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)upsweep_kernel;
-        (void)alt_upsweep_kernel;
-        (void)scan_kernel;
-        (void)downsweep_kernel;
-        (void)alt_downsweep_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
         cudaError error = cudaSuccess;
         do
         {
@@ -1606,8 +1612,6 @@ struct DispatchRadixSort :
         while (0);
 
         return error;
-
-#endif // CUB_RUNTIME_ENABLED
     }
 
 
@@ -1847,10 +1851,17 @@ struct DispatchSegmentedRadixSort :
                 current_bit, pass_bits);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Update current bit
             current_bit += pass_bits;
@@ -1893,14 +1904,6 @@ struct DispatchSegmentedRadixSort :
         SegmentedKernelT     segmented_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
         SegmentedKernelT     alt_segmented_kernel)      ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
     {
-#ifndef CUB_RUNTIME_ENABLED
-      (void)segmented_kernel;
-      (void)alt_segmented_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
         cudaError error = cudaSuccess;
         do
         {
@@ -1979,8 +1982,6 @@ struct DispatchSegmentedRadixSort :
         while (0);
 
         return error;
-
-#endif // CUB_RUNTIME_ENABLED
     }
 
 
diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
index 0ac3b7b3c0..ff2adf2005 100644
--- a/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/device/dispatch/dispatch_reduce.cuh
@@ -387,12 +387,6 @@ struct DispatchReduce :
     cudaError_t InvokeSingleTile(
         SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
     {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)single_tile_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
         cudaError error = cudaSuccess;
         do
         {
@@ -420,16 +414,21 @@ struct DispatchReduce :
                 init);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
         }
         while (0);
 
         return error;
-
-#endif // CUB_RUNTIME_ENABLED
     }
 
 
@@ -447,14 +446,6 @@ struct DispatchReduce :
         ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
         SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
     {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)                  reduce_kernel;
-        (void)                  single_tile_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
         cudaError error = cudaSuccess;
         do
         {
@@ -517,10 +508,17 @@ struct DispatchReduce :
                 reduction_op);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Log single_reduce_sweep_kernel configuration
             if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
@@ -539,17 +537,21 @@ struct DispatchReduce :
                 init);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
         }
         while (0);
 
         return error;
-
-#endif // CUB_RUNTIME_ENABLED
-
     }
 
 
@@ -717,11 +719,6 @@ struct DispatchSegmentedReduce :
     cudaError_t InvokePasses(
         DeviceSegmentedReduceKernelT    segmented_reduce_kernel)        ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
     {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)segmented_reduce_kernel;
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
         cudaError error = cudaSuccess;
         do
         {
@@ -758,17 +755,21 @@ struct DispatchSegmentedReduce :
                 init);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
         }
         while (0);
 
         return error;
-
-#endif // CUB_RUNTIME_ENABLED
-
     }
 
 
diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 952ace4240..e292abbc56 100644
--- a/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -260,29 +260,6 @@ struct DispatchReduceByKey
         ReduceByKeyKernelT             reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
         KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
     {
-
-#ifndef CUB_RUNTIME_ENABLED
-      (void)d_temp_storage;
-      (void)temp_storage_bytes;
-      (void)d_keys_in;
-      (void)d_unique_out;
-      (void)d_values_in;
-      (void)d_aggregates_out;
-      (void)d_num_runs_out;
-      (void)equality_op;
-      (void)reduction_op;
-      (void)num_items;
-      (void)stream;
-      (void)debug_synchronous;
-      (void)init_kernel;
-      (void)reduce_by_key_kernel;
-      (void)reduce_by_key_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
         cudaError error = cudaSuccess;
         do
         {
@@ -324,14 +301,23 @@ struct DispatchReduceByKey
                 d_num_runs_out);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Return if empty problem
             if (num_items == 0)
+            {
                 break;
+            }
 
             // Get SM occupancy for reduce_by_key_kernel
             int reduce_by_key_sm_occupancy;
@@ -369,17 +355,22 @@ struct DispatchReduceByKey
                     num_items);
 
                 // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
+                if (CubDebug(error = cudaPeekAtLastError()))
+                {
+                    break;
+                }
 
                 // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+                error = detail::DebugSyncStream(stream, debug_synchronous);
+                if (CubDebug(error))
+                {
+                  break;
+                }
             }
         }
         while (0);
 
         return error;
-
-#endif  // CUB_RUNTIME_ENABLED
     }
 
 
diff --git a/cub/device/dispatch/dispatch_rle.cuh b/cub/device/dispatch/dispatch_rle.cuh
index 692a55d4e9..35ab8aa0fb 100644
--- a/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/device/dispatch/dispatch_rle.cuh
@@ -261,14 +261,6 @@ struct DeviceRleDispatch
         DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
         KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
     {
-
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
         cudaError error = cudaSuccess;
         do
         {
@@ -310,14 +302,23 @@ struct DeviceRleDispatch
                 d_num_runs_out);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Return if empty problem
             if (num_items == 0)
+            {
                 break;
+            }
 
             // Get SM occupancy for device_rle_sweep_kernel
             int device_rle_kernel_sm_occupancy;
@@ -354,17 +355,21 @@ struct DeviceRleDispatch
                 num_tiles);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
         }
         while (0);
 
         return error;
-
-#endif  // CUB_RUNTIME_ENABLED
     }
 
 
diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
index 2d452d3323..f59097fcd9 100644
--- a/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/device/dispatch/dispatch_scan.cuh
@@ -276,16 +276,6 @@ struct DispatchScan:
     CUB_RUNTIME_FUNCTION __host__  __forceinline__
     cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
     {
-#ifndef CUB_RUNTIME_ENABLED
-
-        (void)init_kernel;
-        (void)scan_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
         typedef typename ActivePolicyT::ScanPolicyT Policy;
         typedef typename cub::ScanTileState<OutputT> ScanTileStateT;
 
@@ -339,11 +329,17 @@ struct DispatchScan:
                 num_tiles);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Get SM occupancy for scan_kernel
             int scan_sm_occupancy;
@@ -377,17 +373,22 @@ struct DispatchScan:
                     num_items);
 
                 // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
+                if (CubDebug(error = cudaPeekAtLastError()))
+                {
+                    break;
+                }
 
                 // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+                error = detail::DebugSyncStream(stream, debug_synchronous);
+                if (CubDebug(error))
+                {
+                  break;
+                }
             }
         }
         while (0);
 
         return error;
-
-#endif  // CUB_RUNTIME_ENABLED
     }
 
     template <typename ActivePolicyT>
diff --git a/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/device/dispatch/dispatch_scan_by_key.cuh
index ff22a6208c..3557f19a16 100644
--- a/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -281,15 +281,6 @@ struct DispatchScanByKey:
     CUB_RUNTIME_FUNCTION __host__  __forceinline__
     cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
     {
-#ifndef CUB_RUNTIME_ENABLED
-
-        (void)init_kernel;
-        (void)scan_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
         typedef typename ActivePolicyT::ScanByKeyPolicyT Policy;
         typedef ReduceByKeyScanTileState<OutputT, OffsetT> ScanByKeyTileStateT;
 
@@ -339,10 +330,17 @@ struct DispatchScanByKey:
             ).doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, tile_size, num_tiles);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Get SM occupancy for scan_kernel
             int scan_sm_occupancy;
@@ -380,17 +378,22 @@ struct DispatchScanByKey:
                     num_items);
 
                 // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
+                if (CubDebug(error = cudaPeekAtLastError()))
+                {
+                    break;
+                }
 
                 // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+                error = detail::DebugSyncStream(stream, debug_synchronous);
+                if (CubDebug(error))
+                {
+                  break;
+                }
             }
         }
         while (0);
 
         return error;
-
-#endif  // CUB_RUNTIME_ENABLED
     }
 
     template <typename ActivePolicyT>
diff --git a/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/device/dispatch/dispatch_segmented_sort.cuh
index 6fb68f9002..becd887a30 100644
--- a/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -605,12 +605,10 @@ DeviceSegmentedSortContinuation(
     }
 
     // Sync the stream if specified to flush runtime errors
-    if (debug_synchronous)
+    error = detail::DebugSyncStream(stream, debug_synchronous);
+    if (CubDebug(error))
     {
-      if (CubDebug(error = SyncStream(stream)))
-      {
-        return error;
-      }
+      return error;
     }
   }
 
@@ -666,18 +664,17 @@ DeviceSegmentedSortContinuation(
     }
 
     // Sync the stream if specified to flush runtime errors
-    if (debug_synchronous)
+    error = detail::DebugSyncStream(stream, debug_synchronous);
+    if (CubDebug(error))
     {
-      if (CubDebug(error = SyncStream(stream)))
-      {
-        return error;
-      }
+      return error;
     }
   }
 
   return error;
 }
 
+#ifdef CUB_RDC_ENABLED
 /*
  * Continuation kernel is used only in the CDP mode. It's used to
  * launch DeviceSegmentedSortContinuation as a separate kernel.
@@ -721,7 +718,6 @@ DeviceSegmentedSortContinuationKernel(
   //
   // Due to (4, 5), we can't pass the user-provided stream in the continuation.
   // Due to (1, 2, 3) it's safe to pass the main stream.
-  #ifdef CUB_RUNTIME_ENABLED
   cudaError_t error =
     DeviceSegmentedSortContinuation<LargeSegmentPolicyT, SmallAndMediumPolicyT>(
       large_kernel,
@@ -742,11 +738,8 @@ DeviceSegmentedSortContinuationKernel(
       debug_synchronous);
 
   CubDebug(error);
-  #else
-  // Kernel launch not supported from this device
-  CubDebug(cudaErrorNotSupported);
-  #endif
 }
+#endif // CUB_RDC_ENABLED
 
 template <typename KeyT,
           typename ValueT>
@@ -1576,7 +1569,7 @@ private:
       THRUST_NS_QUALIFIER::make_reverse_iterator(
         large_and_medium_segments_indices.get() + num_segments);
 
-    if (CubDebug(error = cub::DevicePartition::If(
+    error = cub::DevicePartition::If(
       device_partition_temp_storage.get(),
       three_way_partition_temp_storage_bytes,
       THRUST_NS_QUALIFIER::counting_iterator<OffsetT>(0),
@@ -1588,53 +1581,60 @@ private:
       large_segments_selector,
       small_segments_selector,
       stream,
-      debug_synchronous)))
+      debug_synchronous);
+    if (CubDebug(error))
     {
       return error;
     }
 
-#ifdef CUB_RUNTIME_ENABLED
-#define CUB_TMP_DEVICE_CODE                                                    \
+    // The device path is only used (and only compiles) when CDP is enabled.
+    // It's defined in a macro since we can't put `#ifdef`s inside of
+    // `NV_IF_TARGET`.
+#ifndef CUB_RDC_ENABLED
+
+#define CUB_TEMP_DEVICE_CODE
+
+#else // CUB_RDC_ENABLED
+
+#define CUB_TEMP_DEVICE_CODE                                                   \
   using MaxPolicyT = typename DispatchSegmentedSort::MaxPolicy;                \
-  THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)     \
-    .doit(DeviceSegmentedSortContinuationKernel<MaxPolicyT,                    \
-                                                LargeKernelT,                  \
-                                                SmallKernelT,                  \
-                                                KeyT,                          \
-                                                ValueT,                        \
-                                                BeginOffsetIteratorT,          \
-                                                EndOffsetIteratorT>,           \
-          large_kernel,                                                        \
-          small_kernel,                                                        \
-          num_segments,                                                        \
-          d_keys.Current(),                                                    \
-          GetFinalOutput<KeyT>(LargeSegmentPolicyT::RADIX_BITS, d_keys),       \
-          d_keys_double_buffer,                                                \
-          d_values.Current(),                                                  \
-          GetFinalOutput<ValueT>(LargeSegmentPolicyT::RADIX_BITS, d_values),   \
-          d_values_double_buffer,                                              \
-          d_begin_offsets,                                                     \
-          d_end_offsets,                                                       \
-          group_sizes.get(),                                                   \
-          large_and_medium_segments_indices.get(),                             \
-          small_segments_indices.get(),                                        \
-          debug_synchronous);                                                  \
+  error =                                                                      \
+    THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)   \
+      .doit(DeviceSegmentedSortContinuationKernel<MaxPolicyT,                  \
+                                                  LargeKernelT,                \
+                                                  SmallKernelT,                \
+                                                  KeyT,                        \
+                                                  ValueT,                      \
+                                                  BeginOffsetIteratorT,        \
+                                                  EndOffsetIteratorT>,         \
+            large_kernel,                                                      \
+            small_kernel,                                                      \
+            num_segments,                                                      \
+            d_keys.Current(),                                                  \
+            GetFinalOutput<KeyT>(LargeSegmentPolicyT::RADIX_BITS, d_keys),     \
+            d_keys_double_buffer,                                              \
+            d_values.Current(),                                                \
+            GetFinalOutput<ValueT>(LargeSegmentPolicyT::RADIX_BITS, d_values), \
+            d_values_double_buffer,                                            \
+            d_begin_offsets,                                                   \
+            d_end_offsets,                                                     \
+            group_sizes.get(),                                                 \
+            large_and_medium_segments_indices.get(),                           \
+            small_segments_indices.get(),                                      \
+            debug_synchronous);                                                \
                                                                                \
-  if (CubDebug(error = cudaPeekAtLastError()))                                 \
+  if (CubDebug(error))                                                         \
   {                                                                            \
     return error;                                                              \
   }                                                                            \
                                                                                \
-  if (debug_synchronous)                                                       \
+  error = detail::DebugSyncStream(stream, debug_synchronous);                  \
+  if (CubDebug(error))                                                         \
   {                                                                            \
-    if (CubDebug(error = SyncStream(stream)))                                  \
-    {                                                                          \
-      return error;                                                            \
-    }                                                                          \
+    return error;                                                              \
   }
-#else
-#define CUB_TMP_DEVICE_CODE error = CubDebug(cudaErrorNotSupported);
-#endif
+
+#endif // CUB_RDC_ENABLED
 
     // Clang format mangles some of this NV_IF_TARGET block
     // clang-format off
@@ -1677,10 +1677,10 @@ private:
           stream,
           debug_synchronous);),
       // NV_IS_DEVICE:
-      (CUB_TMP_DEVICE_CODE));
+      (CUB_TEMP_DEVICE_CODE));
     // clang-format on
 
-#undef CUB_TMP_DEVICE_CODE
+#undef CUB_TEMP_DEVICE_CODE
 
     return error;
   }
@@ -1732,12 +1732,10 @@ private:
     }
 
     // Sync the stream if specified to flush runtime errors
-    if (debug_synchronous)
+    error = detail::DebugSyncStream(stream, debug_synchronous);
+    if (CubDebug(error))
     {
-      if (CubDebug(error = SyncStream(stream)))
-      {
-        return error;
-      }
+      return error;
     }
 
     return error;
diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh
index fb949e6305..15e3e09a0d 100644
--- a/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/device/dispatch/dispatch_select_if.cuh
@@ -254,28 +254,6 @@ struct DispatchSelectIf
         SelectIfKernelPtrT          select_if_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
         KernelConfig                select_if_config)               ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
     {
-
-#ifndef CUB_RUNTIME_ENABLED
-        (void)d_temp_storage;
-        (void)temp_storage_bytes;
-        (void)d_in;
-        (void)d_flags;
-        (void)d_selected_out;
-        (void)d_num_selected_out;
-        (void)select_op;
-        (void)equality_op;
-        (void)num_items;
-        (void)stream;
-        (void)debug_synchronous;
-        (void)scan_init_kernel;
-        (void)select_if_kernel;
-        (void)select_if_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
         cudaError error = cudaSuccess;
         do
         {
@@ -317,10 +295,17 @@ struct DispatchSelectIf
                 d_num_selected_out);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Return if empty problem
             if (num_items == 0)
@@ -374,16 +359,21 @@ struct DispatchSelectIf
                 num_tiles);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error = cudaPeekAtLastError()))
+            {
+                break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
         }
         while (0);
 
         return error;
-
-#endif  // CUB_RUNTIME_ENABLED
     }
 
 
diff --git a/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/device/dispatch/dispatch_spmv_orig.cuh
index 7e2d74b26d..2c88059eec 100644
--- a/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ b/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -483,12 +483,6 @@ struct DispatchSpmv
         KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
         KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
     {
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-
-#else
         cudaError error = cudaSuccess;
         do
         {
@@ -531,10 +525,17 @@ struct DispatchSpmv
                     spmv_params);
 
                 // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
+                if (CubDebug(error = cudaPeekAtLastError()))
+                {
+                    break;
+                }
 
                 // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+                error = detail::DebugSyncStream(stream, debug_synchronous);
+                if (CubDebug(error))
+                {
+                  break;
+                }
 
                 break;
             }
@@ -639,7 +640,11 @@ struct DispatchSpmv
                 if (CubDebug(error = cudaPeekAtLastError())) break;
 
                 // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+                error = detail::DebugSyncStream(stream, debug_synchronous);
+                if (CubDebug(error))
+                {
+                  break;
+                }
             }
 
             // Log spmv_kernel configuration
@@ -661,7 +666,11 @@ struct DispatchSpmv
             if (CubDebug(error = cudaPeekAtLastError())) break;
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Run reduce-by-key fixup if necessary
             if (num_merge_tiles > 1)
@@ -685,14 +694,16 @@ struct DispatchSpmv
                 if (CubDebug(error = cudaPeekAtLastError())) break;
 
                 // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+                error = detail::DebugSyncStream(stream, debug_synchronous);
+                if (CubDebug(error))
+                {
+                  break;
+                }
             }
         }
         while (0);
 
         return error;
-
-#endif // CUB_RUNTIME_ENABLED
     }
 
 
diff --git a/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/device/dispatch/dispatch_three_way_partition.cuh
index cb06438063..f643d83980 100644
--- a/cub/device/dispatch/dispatch_three_way_partition.cuh
+++ b/cub/device/dispatch/dispatch_three_way_partition.cuh
@@ -360,12 +360,10 @@ struct DispatchThreeWayPartitionIf
       }
 
       // Sync the stream if specified to flush runtime errors
-      if (debug_synchronous)
+      error = detail::DebugSyncStream(stream, debug_synchronous);
+      if (CubDebug(error))
       {
-        if (CubDebug(error = cub::SyncStream(stream)))
-        {
-          break;
-        }
+        break;
       }
 
       // Get max x-dimension of grid
@@ -430,12 +428,10 @@ struct DispatchThreeWayPartitionIf
       }
 
       // Sync the stream if specified to flush runtime errors
-      if (debug_synchronous)
+      error = detail::DebugSyncStream(stream, debug_synchronous);
+      if (CubDebug(error))
       {
-        if (CubDebug(error = cub::SyncStream(stream)))
-        {
-          break;
-        }
+        break;
       }
     }
     while (0);
diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh
index a40e8bef4f..9eb96f0cb9 100644
--- a/cub/device/dispatch/dispatch_unique_by_key.cuh
+++ b/cub/device/dispatch/dispatch_unique_by_key.cuh
@@ -220,16 +220,6 @@ struct DispatchUniqueByKey: SelectedPolicy
     CUB_RUNTIME_FUNCTION __host__  __forceinline__
     cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel)
     {
-#ifndef CUB_RUNTIME_ENABLED
-
-        (void)init_kernel;
-        (void)scan_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
         using Policy = typename ActivePolicyT::UniqueByKeyPolicyT;
         using UniqueByKeyAgentT = AgentUniqueByKey<Policy,
                                                    KeyInputIteratorT,
@@ -292,7 +282,11 @@ struct DispatchUniqueByKey: SelectedPolicy
             if (CubDebug(error = cudaPeekAtLastError())) break;
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Return if empty problem
             if (num_items == 0) break;
@@ -331,7 +325,7 @@ struct DispatchUniqueByKey: SelectedPolicy
             }
 
             // Invoke select_if_kernel
-            THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+            error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
                 scan_grid_size, Policy::BLOCK_THREADS, 0, stream
             ).doit(scan_kernel,
                    d_keys_in,
@@ -345,16 +339,21 @@ struct DispatchUniqueByKey: SelectedPolicy
                    num_tiles);
 
             // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (CubDebug(error))
+            {
+              break;
+            }
 
             // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            error = detail::DebugSyncStream(stream, debug_synchronous);
+            if (CubDebug(error))
+            {
+              break;
+            }
         }
         while(0);
 
         return error;
-
-#endif  // CUB_RUNTIME_ENABLED
     }
 
     template <typename ActivePolicyT>
diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh
index 2c4b36bee4..f4013de568 100644
--- a/cub/util_arch.cuh
+++ b/cub/util_arch.cuh
@@ -33,9 +33,12 @@
 
 #pragma once
 
-#include "util_cpp_dialect.cuh"
-#include "util_namespace.cuh"
-#include "util_macro.cuh"
+#include <cub/util_cpp_dialect.cuh>
+#include <cub/util_namespace.cuh>
+#include <cub/util_macro.cuh>
+
+// Legacy include; this functionality used to be defined in here.
+#include <cub/detail/detect_cuda_runtime.cuh>
 
 CUB_NAMESPACE_BEGIN
 
@@ -96,16 +99,6 @@ CUB_NAMESPACE_BEGIN
 
 static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
 
-/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
-#ifndef CUB_RUNTIME_FUNCTION
-    #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
-        #define CUB_RUNTIME_ENABLED
-        #define CUB_RUNTIME_FUNCTION __host__ __device__
-    #else
-        #define CUB_RUNTIME_FUNCTION __host__
-    #endif
-#endif
-
 
 /// Number of threads per warp
 #ifndef CUB_LOG_WARP_THREADS
diff --git a/cub/util_debug.cuh b/cub/util_debug.cuh
index c38b9f8c06..b4a76a4e7d 100644
--- a/cub/util_debug.cuh
+++ b/cub/util_debug.cuh
@@ -67,14 +67,9 @@ __host__ __device__
 __forceinline__
 cudaError_t Debug(cudaError_t error, const char *filename, int line)
 {
-  (void)filename;
-  (void)line;
-
-#ifdef CUB_RUNTIME_ENABLED
   // Clear the global CUDA error state which may have been set by the last
   // call. Otherwise, errors may "leak" to unrelated kernel launches.
   cudaGetLastError();
-#endif
 
 #ifdef CUB_STDERR
   if (error)
@@ -103,6 +98,9 @@ cudaError_t Debug(cudaError_t error, const char *filename, int line)
       )
     );
   }
+#else
+  (void)filename;
+  (void)line;
 #endif
 
   return error;
diff --git a/cub/util_device.cuh b/cub/util_device.cuh
index 9a8ffa1d06..ad145f6b97 100644
--- a/cub/util_device.cuh
+++ b/cub/util_device.cuh
@@ -34,7 +34,6 @@
 #pragma once
 
 #include <cub/detail/device_synchronize.cuh>
-
 #include <cub/util_arch.cuh>
 #include <cub/util_cpp_dialect.cuh>
 #include <cub/util_debug.cuh>
@@ -121,17 +120,9 @@ __global__ void EmptyKernel(void) { }
  */
 CUB_RUNTIME_FUNCTION inline int CurrentDevice()
 {
-#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
-
     int device = -1;
     if (CubDebug(cudaGetDevice(&device))) return -1;
     return device;
-
-#else // Device code without the CUDA runtime.
-
-    return -1;
-
-#endif
 }
 
 /**
@@ -165,8 +156,6 @@ public:
  */
 CUB_RUNTIME_FUNCTION inline int DeviceCountUncached()
 {
-#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
-
     int count = -1;
     if (CubDebug(cudaGetDeviceCount(&count)))
         // CUDA makes no guarantees about the state of the output parameter if
@@ -174,12 +163,6 @@ CUB_RUNTIME_FUNCTION inline int DeviceCountUncached()
         // paranoia we'll reset `count` to `-1`.
         count = -1;
     return count;
-
-#else // Device code without the CUDA runtime.
-
-    return -1;
-
-#endif
 }
 
 /**
@@ -471,8 +454,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int &ptx_version)
  */
 CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
 {
-#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
-
     cudaError_t error = cudaSuccess;
     do
     {
@@ -484,16 +465,6 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int d
     while (0);
 
     return error;
-
-#else // Device code without the CUDA runtime.
-
-    (void)sm_version;
-    (void)device;
-
-    // CUDA API calls are not supported from this device.
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#endif
 }
 
 /**
@@ -543,9 +514,47 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
                ((void)stream;
                 result = CubDebug(cub::detail::device_synchronize());));
 
-    return result;
+  return result;
 }
 
+namespace detail
+{
+
+/**
+ * Same as SyncStream, but intended for use with the debug_synchronous flags
+ * in device algorithms. This should not be used if synchronization is required
+ * for correctness.
+ *
+ * If `debug_synchronous` is false, this function will immediately return
+ * cudaSuccess. If true, one of the following will occur:
+ *
+ * If synchronization is supported by the current compilation target and
+ * settings, the sync is performed and the sync result is returned.
+ *
+ * If syncs are not supported then no sync is performed, but a message is logged
+ * via _CubLog and cudaSuccess is returned.
+ */
+CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream,
+                                                        bool debug_synchronous)
+{
+  if (!debug_synchronous)
+  {
+    return cudaSuccess;
+  }
+
+#if 1 // All valid targets currently support device-side synchronization
+  _CubLog("%s\n", "Synchronizing...");
+  return SyncStream(stream);
+#else
+  (void)stream;
+  _CubLog("%s\n",
+          "WARNING: Skipping CUB `debug_synchronous` synchronization "
+          "(unsupported target).");
+  return cudaSuccess;
+#endif
+}
+
+} // namespace detail
 
 /**
  * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
@@ -586,25 +595,11 @@ cudaError_t MaxSmOccupancy(
     int                 block_threads,              ///< [in] Number of threads per thread block
     int                 dynamic_smem_bytes = 0)	    ///< [in] Dynamically allocated shared memory in bytes. Default is 0.
 {
-#ifndef CUB_RUNTIME_ENABLED
-
-    (void)dynamic_smem_bytes;
-    (void)block_threads;
-    (void)kernel_ptr;
-    (void)max_sm_occupancy;
-
-    // CUDA API calls not supported from this device
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#else
-
     return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_sm_occupancy,
         kernel_ptr,
         block_threads,
         dynamic_smem_bytes));
-
-#endif  // CUB_RUNTIME_ENABLED
 }
 
 
diff --git a/examples/device/example_device_reduce.cu b/examples/device/example_device_reduce.cu
index fc8fddb0e2..c3182579d6 100644
--- a/examples/device/example_device_reduce.cu
+++ b/examples/device/example_device_reduce.cu
@@ -134,7 +134,7 @@ int main(int argc, char** argv)
 
     // Allocate host arrays
     int* h_in = new int[num_items];
-    int  h_reference;
+    int  h_reference{};
 
     // Initialize problem and solution
     Initialize(h_in, num_items);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 902103f199..bea1d90771 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -23,6 +23,7 @@ function(cub_get_test_params src labels_var defs_var)
       unused
       "${match}"
     )
+
     set(def ${CMAKE_MATCH_1})
     set(label ${CMAKE_MATCH_2})
     set(values "${CMAKE_MATCH_3}")
@@ -57,7 +58,6 @@ function(cub_get_test_params src labels_var defs_var)
       endforeach()
       set(variant_defs "${tmp_defs}")
     endif()
-
   endforeach()
 
   set(${labels_var} "${variant_labels}" PARENT_SCOPE)
@@ -115,15 +115,33 @@ function(cub_add_test target_name_var test_name test_src cub_target)
   endif()
   add_dependencies(${test_meta_target} ${test_target})
 
-  if (CUB_ENABLE_TESTS_WITH_RDC)
-    cub_enable_rdc_for_cuda_target(${test_target})
-  endif()
-
   add_test(NAME ${test_target}
     COMMAND "$<TARGET_FILE:${test_target}>"
   )
 endfunction()
 
+# Sets out_var to 1 if the label contains cdp variants, regardless of whether
+# or not CDP is enabled in this particular variant.
+function(_cub_has_cdp_variant out_var label)
+  string(FIND "${label}" "cdp_" idx)
+  if (idx EQUAL -1)
+    set(${out_var} 0 PARENT_SCOPE)
+  else()
+    set(${out_var} 1 PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Sets out_var to 1 if the label contains "cdp_1", e.g. cdp is explicitly
+# requested for this variant.
+function(_cub_is_cdp_enabled_variant out_var label)
+  string(FIND "${label}" "cdp_1" idx)
+  if (idx EQUAL -1)
+    set(${out_var} 0 PARENT_SCOPE)
+  else()
+    set(${out_var} 1 PARENT_SCOPE)
+  endif()
+endfunction()
+
 foreach (test_src IN LISTS test_srcs)
   get_filename_component(test_name "${test_src}" NAME_WE)
   string(REGEX REPLACE "^test_" "" test_name "${test_name}")
@@ -135,7 +153,7 @@ foreach (test_src IN LISTS test_srcs)
   math(EXPR range_end "${num_variants} - 1")
 
   # Verbose output:
-  if (NOT num_variants EQUAL 0)
+  if (num_variants GREATER 0)
     message(VERBOSE "Detected ${num_variants} variants of test '${test_src}':")
     foreach(var_idx RANGE ${range_end})
       math(EXPR i "${var_idx} + 1")
@@ -151,11 +169,16 @@ foreach (test_src IN LISTS test_srcs)
     if (num_variants EQUAL 0)
       # Only one version of this test.
       cub_add_test(test_target ${test_name} "${test_src}" ${cub_target})
-    else()
+      if (CUB_ENABLE_TESTS_WITH_RDC)
+        cub_enable_rdc_for_cuda_target(${test_target})
+      endif()
+    else() # has variants:
       # Meta target to build all parametrizations of the current test for the
       # current CUB_TARGET config
       set(variant_meta_target ${config_prefix}.test.${test_name}.all)
-      add_custom_target(${variant_meta_target})
+      if (NOT TARGET ${variant_meta_target})
+        add_custom_target(${variant_meta_target})
+      endif()
 
       # Meta target to build all parametrizations of the current test for all
       # CUB_TARGET configs
@@ -171,6 +194,10 @@ foreach (test_src IN LISTS test_srcs)
         list(GET variant_defs ${var_idx} defs)
         string(REPLACE ":" ";" defs "${defs}")
 
+        # Check if the test has explicit CDP variants:
+        _cub_has_cdp_variant(explicit_cdp "${label}")
+        _cub_is_cdp_enabled_variant(enable_cdp "${label}")
+
         cub_add_test(test_target
           ${test_name}.${label}
           "${test_src}"
@@ -179,9 +206,20 @@ foreach (test_src IN LISTS test_srcs)
         add_dependencies(${variant_meta_target} ${test_target})
         add_dependencies(${cub_variant_meta_target} ${test_target})
         target_compile_definitions(${test_target} PRIVATE ${defs})
-      endforeach()
-    endif()
-  endforeach()
-endforeach()
+
+        # Enable RDC if the test either:
+        # 1. Explicitly requests it (cdp_1 label)
+        # 2. Does not have an explicit CDP variant (no cdp_0 or cdp_1) but
+        #    RDC testing is globally enabled.
+        #
+        # Tests that explicitly request no cdp (cdp_0 label) should never enable
+        # RDC.
+        if (enable_cdp OR ((NOT explicit_cdp) AND CUB_ENABLE_TESTS_WITH_RDC))
+          cub_enable_rdc_for_cuda_target(${test_target})
+        endif()
+      endforeach() # Variant
+    endif() # Has variants
+  endforeach() # CUB targets
+endforeach() # Source file
 
 add_subdirectory(cmake)
diff --git a/test/README.md b/test/README.md
index 3f14c38520..81891f495f 100644
--- a/test/README.md
+++ b/test/README.md
@@ -29,26 +29,38 @@ generate multiple test executables for the full cartesian product of values.
 - `values` is a colon-separated list of values used during test generation. Only
   numeric values have been tested.
 
+## Special Labels
+
+### CDP / RDC Testing
+
+If a `label` is `cdp`, it is assumed that the parameter is used to explicitly
+test variants built with and without CDP support. The `values` for such a
+parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1`
+indicating CDP enabled (RDC on).
+
+Tests that do not contain a variant labeled `cdp` will only enable RDC if
+the CMake variable `CUB_ENABLE_TESTS_WITH_RDC` is true.
+
 ## Example
 
 For example, if `test_baz.cu` contains the following lines:
 
 ```cpp
 // %PARAM% TEST_FOO foo 0:1:2
-// %PARAM% TEST_BAR bar 4:8
+// %PARAM% TEST_CDP cdp 0:1
 ```
 
 Six executables and CTest targets will be generated with unique definitions
 (only c++17 targets shown):
 
-| Executable Name                  | Preprocessor Definitions    |
-|----------------------------------|-----------------------------|
-| `cub.cpp17.test.baz.foo_0.bar_4` | `-DTEST_FOO=0 -DTEST_BAR=4` |
-| `cub.cpp17.test.baz.foo_0.bar_8` | `-DTEST_FOO=0 -DTEST_BAR=8` |
-| `cub.cpp17.test.baz.foo_1.bar_4` | `-DTEST_FOO=1 -DTEST_BAR=4` |
-| `cub.cpp17.test.baz.foo_1.bar_8` | `-DTEST_FOO=1 -DTEST_BAR=8` |
-| `cub.cpp17.test.baz.foo_2.bar_4` | `-DTEST_FOO=2 -DTEST_BAR=4` |
-| `cub.cpp17.test.baz.foo_2.bar_8` | `-DTEST_FOO=2 -DTEST_BAR=8` |
+| Executable Name                  | Preprocessor Definitions    | RDC State |
+|----------------------------------|-----------------------------|-----------|
+| `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled  |
+| `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled   |
+| `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled  |
+| `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled   |
+| `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled  |
+| `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled   |
 
 ## Changing `%PARAM%` Hints
 
diff --git a/test/cmake/check_source_files.cmake b/test/cmake/check_source_files.cmake
index 1fba8476f6..1554a2256e 100644
--- a/test/cmake/check_source_files.cmake
+++ b/test/cmake/check_source_files.cmake
@@ -83,6 +83,24 @@ if (NOT valid_count EQUAL 5)
     "Matched ${valid_count} times, expected 5.")
 endif()
 
+################################################################################
+# Legacy macro checks.
+# Check all files in CUB to make sure that they aren't using the legacy
+# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros.
+#
+# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET.
+# They are provided for legacy purposes and should be replaced with
+# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code.
+#
+#
+set(legacy_macro_header_exclusions
+  # This header defines a legacy CUDART macro:
+  cub/detail/detect_cuda_runtime.cuh
+)
+
+set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED")
+set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__")
+
 ################################################################################
 # Read source files:
 foreach(src ${cub_srcs})
@@ -138,6 +156,21 @@ foreach(src ${cub_srcs})
       set(found_errors 1)
     endif()
   endif()
+
+  if (NOT ${src} IN_LIST legacy_macro_header_exclusions)
+    count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count)
+    count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count)
+
+    if (NOT thrust_count EQUAL 0)
+      message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT cub_count EQUAL 0)
+      message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+  endif()
 endforeach()
 
 if (NOT found_errors EQUAL 0)
diff --git a/test/test_allocator.cu b/test/test_allocator.cu
index eabfa99b46..fcd488b8ef 100644
--- a/test/test_allocator.cu
+++ b/test/test_allocator.cu
@@ -324,7 +324,6 @@ int main(int argc, char** argv)
     // Check that that still we have 0 live block across all GPUs
     AssertEquals(allocator.live_blocks.size(), 0);
 
-#ifndef CUB_CDP
     // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
 
     if (num_gpus > 1)
@@ -361,7 +360,6 @@ int main(int argc, char** argv)
         // Check that that still we have 0 live block across all GPUs
         AssertEquals(allocator.live_blocks.size(), 0);
     }
-#endif  // CUB_CDP
 
     //
     // Performance
diff --git a/test/test_cdp_variant_state.cu b/test/test_cdp_variant_state.cu
new file mode 100644
index 0000000000..2a34842786
--- /dev/null
+++ b/test/test_cdp_variant_state.cu
@@ -0,0 +1,34 @@
+/*
+*  Copyright 2022 NVIDIA Corporation
+*
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*/
+
+#include <cub/detail/detect_cuda_runtime.cuh>
+
+#include <cstdlib>
+
+int main()
+{
+  // This test just checks that RDC is enabled and detected properly when using
+  // the %PARAM% system to request CDP support (see the README.md file in
+  // this directory).
+
+  // %PARAM% TEST_CDP cdp 0:1
+
+#ifdef CUB_RDC_ENABLED
+  return (TEST_CDP == 1) ? EXIT_SUCCESS : EXIT_FAILURE;
+#else
+  return (TEST_CDP == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
+#endif
+}
diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu
index e0a6ae04d5..0faf67cbe3 100644
--- a/test/test_device_histogram.cu
+++ b/test/test_device_histogram.cu
@@ -433,77 +433,6 @@ struct Dispatch<1, 1, CUB>
 };
 
 
-
-//---------------------------------------------------------------------
-// CUDA nested-parallelism test kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceHistogram
- * /
-template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
-__global__ void CnpDispatchKernel(
-    Int2Type<ALGORITHM> algorithm,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t              temp_storage_bytes,
-    SampleT             *d_samples,
-    SampleIteratorT      d_sample_itr,
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
-    int                 num_samples,
-    bool                debug_synchronous)
-{
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/ **
- * Dispatch to CDP kernel
- * /
-template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
-cudaError_t Dispatch(
-    Int2Type<ALGORITHM> algorithm,
-    Int2Type<true>      use_cdp,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    SampleT             *d_samples,
-    SampleIteratorT      d_sample_itr,
-    CounterT        *d_histograms[NUM_ACTIVE_CHANNELS],
-    int                 num_samples,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
-
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-*/
-
-
 //---------------------------------------------------------------------
 // Test generation
 //---------------------------------------------------------------------
diff --git a/test/test_device_radix_sort.cu b/test/test_device_radix_sort.cu
index d9a930b3a1..b9e7cb70fb 100644
--- a/test/test_device_radix_sort.cu
+++ b/test/test_device_radix_sort.cu
@@ -53,10 +53,13 @@
 
 #include <cub/util_allocator.cuh>
 #include <cub/util_math.cuh>
+#include <cub/detail/device_synchronize.cuh>
 #include <cub/device/device_radix_sort.cuh>
 #include <cub/device/device_segmented_radix_sort.cuh>
 #include <cub/iterator/transform_input_iterator.cuh>
 
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 #include "test_util.h"
 
 using namespace cub;
@@ -80,7 +83,11 @@ enum Backend
     CUB_SEGMENTED,              // CUB method (allows overwriting of input)
     CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input)
 
-    CDP,                        // GPU-based (dynamic parallelism) dispatch to CUB method
+    // Same as above, but launches kernels from device using CDP.
+    CDP,
+    CDP_NO_OVERWRITE,
+    CDP_SEGMENTED,
+    CDP_SEGMENTED_NO_OVERWRITE,
 };
 
 static const char* BackendToString(Backend b)
@@ -97,6 +104,12 @@ static const char* BackendToString(Backend b)
       return "CUB_SEGMENTED_NO_OVERWRITE";
     case CDP:
       return "CDP";
+    case CDP_NO_OVERWRITE:
+      return "CDP_NO_OVERWRITE";
+    case CDP_SEGMENTED:
+      return "CDP_SEGMENTED";
+    case CDP_SEGMENTED_NO_OVERWRITE:
+      return "CDP_SEGMENTED_NO_OVERWRITE";
     default:
       break;
   }
@@ -114,7 +127,6 @@ static const char* BackendToString(Backend b)
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<false>         /*is_descending*/,
     Int2Type<CUB>           /*dispatch_to*/,
@@ -147,7 +159,6 @@ cudaError_t Dispatch(
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<false>             /*is_descending*/,
     Int2Type<CUB_NO_OVERWRITE>  /*dispatch_to*/,
@@ -187,7 +198,6 @@ cudaError_t Dispatch(
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<true>          /*is_descending*/,
     Int2Type<CUB>           /*dispatch_to*/,
@@ -221,7 +231,6 @@ cudaError_t Dispatch(
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<true>              /*is_descending*/,
     Int2Type<CUB_NO_OVERWRITE>  /*dispatch_to*/,
@@ -286,7 +295,6 @@ __host__ __device__ bool ValidateNumItemsForSegmentedSort(NumItemsT num_items)
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<false>         /*is_descending*/,
     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
@@ -332,7 +340,6 @@ cudaError_t Dispatch(
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<false>                         /*is_descending*/,
     Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    /*dispatch_to*/,
@@ -389,7 +396,6 @@ cudaError_t Dispatch(
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<true>          /*is_descending*/,
     Int2Type<CUB_SEGMENTED> /*dispatch_to*/,
@@ -436,7 +442,6 @@ cudaError_t Dispatch(
 template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT,
           typename NumItemsT>
 CUB_RUNTIME_FUNCTION
-__forceinline__
 cudaError_t Dispatch(
     Int2Type<true>                          /*is_descending*/,
     Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    /*dispatch_to*/,
@@ -490,104 +495,195 @@ cudaError_t Dispatch(
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceRadixSort
  */
-template <int IS_DESCENDING, typename KeyT, typename ValueT, typename BeginOffsetIteratorT,
-          typename EndOffsetIteratorT, typename NumItemsT>
-__global__ void CnpDispatchKernel(
-    Int2Type<IS_DESCENDING> is_descending,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void                    *d_temp_storage,
-    size_t                  temp_storage_bytes,
-    DoubleBuffer<KeyT>      d_keys,
-    DoubleBuffer<ValueT>    d_values,
-    NumItemsT               num_items,
-    int                     num_segments,
-    BeginOffsetIteratorT    d_segment_begin_offsets,
-    EndOffsetIteratorT      d_segment_end_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    bool                    debug_synchronous)
+template <int IsDescending,
+          int CubBackend,
+          typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+__global__ void CDPDispatchKernel(Int2Type<IsDescending> is_descending,
+                                  Int2Type<CubBackend>   cub_backend,
+                                  int                   *d_selector,
+                                  size_t                *d_temp_storage_bytes,
+                                  cudaError_t           *d_cdp_error,
+                                  void                  *d_temp_storage,
+                                  size_t                 temp_storage_bytes,
+                                  DoubleBuffer<KeyT>     d_keys,
+                                  DoubleBuffer<ValueT>   d_values,
+                                  NumItemsT              num_items,
+                                  int                    num_segments,
+                                  BeginOffsetIteratorT d_segment_begin_offsets,
+                                  EndOffsetIteratorT   d_segment_end_offsets,
+                                  int                  begin_bit,
+                                  int                  end_bit,
+                                  bool                 debug_synchronous)
 {
-#ifndef CUB_CDP
-  (void)is_descending;
-  (void)d_selector;
-  (void)d_temp_storage_bytes;
-  (void)d_cdp_error;
-  (void)d_temp_storage;
-  (void)temp_storage_bytes;
-  (void)d_keys;
-  (void)d_values;
-  (void)num_items;
-  (void)num_segments;
-  (void)d_segment_begin_offsets;
-  (void)d_segment_end_offsets;
-  (void)begin_bit;
-  (void)end_bit;
-  (void)debug_synchronous;
-    *d_cdp_error            = cudaErrorNotSupported;
-#else
-    *d_cdp_error            = Dispatch(
-                                is_descending, Int2Type<CUB>(), d_selector, d_temp_storage_bytes, d_cdp_error,
-                                d_temp_storage, temp_storage_bytes, d_keys, d_values,
-                                num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
-                                begin_bit, end_bit, 0, debug_synchronous);
-    *d_temp_storage_bytes   = temp_storage_bytes;
-    *d_selector             = d_keys.selector;
-#endif
+  *d_cdp_error = Dispatch(is_descending,
+                          cub_backend,
+                          d_selector,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_keys,
+                          d_values,
+                          num_items,
+                          num_segments,
+                          d_segment_begin_offsets,
+                          d_segment_end_offsets,
+                          begin_bit,
+                          end_bit,
+                          0,
+                          debug_synchronous);
+
+  *d_temp_storage_bytes = temp_storage_bytes;
+  *d_selector           = d_keys.selector;
 }
 
-
 /**
- * Dispatch to CDP kernel
+ * Launch kernel and dispatch on device. Should only be called from host code.
+ * The CubBackend should be one of the non-CDP CUB backends to invoke from the
+ * device.
  */
-template <int IS_DESCENDING, typename KeyT, typename ValueT, typename BeginOffsetIteratorT,
-          typename EndOffsetIteratorT, typename NumItemsT>
-cudaError_t Dispatch(
-    Int2Type<IS_DESCENDING> is_descending,
-    Int2Type<CDP>           dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void                    *d_temp_storage,
-    size_t                  &temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    NumItemsT               num_items,
-    int                     num_segments,
-    BeginOffsetIteratorT    d_segment_begin_offsets,
-    EndOffsetIteratorT      d_segment_end_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
+template <int IsDescending,
+          int CubBackend,
+          typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+cudaError_t LaunchCDPKernel(Int2Type<IsDescending> is_descending,
+                            Int2Type<CubBackend>   cub_backend,
+                            int                   *d_selector,
+                            size_t                *d_temp_storage_bytes,
+                            cudaError_t           *d_cdp_error,
+
+                            void                 *d_temp_storage,
+                            size_t               &temp_storage_bytes,
+                            DoubleBuffer<KeyT>   &d_keys,
+                            DoubleBuffer<ValueT> &d_values,
+                            NumItemsT             num_items,
+                            int                   num_segments,
+                            BeginOffsetIteratorT  d_segment_begin_offsets,
+                            EndOffsetIteratorT    d_segment_end_offsets,
+                            int                   begin_bit,
+                            int                   end_bit,
+                            cudaStream_t          stream,
+                            bool                  debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(
-        is_descending, d_selector, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys, d_values,
-        num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets,
-        begin_bit, end_bit, debug_synchronous);
+  // Invoke kernel to invoke device-side dispatch:
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<IsDescending,
+                              CubBackend,
+                              KeyT,
+                              ValueT,
+                              BeginOffsetIteratorT,
+                              EndOffsetIteratorT,
+                              NumItemsT>,
+            is_descending,
+            cub_backend,
+            d_selector,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_segment_begin_offsets,
+            d_segment_end_offsets,
+            begin_bit,
+            end_bit,
+            debug_synchronous);
+  CubDebugExit(retval);
+  CubDebugExit(cub::detail::device_synchronize());
+
+  // Copy out selector
+  CubDebugExit(cudaMemcpy(&d_keys.selector,
+                          d_selector,
+                          sizeof(int) * 1,
+                          cudaMemcpyDeviceToHost));
+  d_values.selector = d_keys.selector;
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  return retval;
+}
 
-    // Copy out selector
-    CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost));
-    d_values.selector = d_keys.selector;
+// Specializations of Dispatch that translate the CDP backend to the appropriate
+// CUB backend, and uses the CUB backend to launch the CDP kernel.
+#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend)                          \
+  template <int IsDescending,                                                  \
+            typename KeyT,                                                     \
+            typename ValueT,                                                   \
+            typename BeginOffsetIteratorT,                                     \
+            typename EndOffsetIteratorT,                                       \
+            typename NumItemsT>                                                \
+  cudaError_t Dispatch(Int2Type<IsDescending> is_descending,                   \
+                       Int2Type<CdpBackend> /*dispatch_to*/,                   \
+                       int         *d_selector,                                \
+                       size_t      *d_temp_storage_bytes,                      \
+                       cudaError_t *d_cdp_error,                               \
+                                                                               \
+                       void                 *d_temp_storage,                   \
+                       size_t               &temp_storage_bytes,               \
+                       DoubleBuffer<KeyT>   &d_keys,                           \
+                       DoubleBuffer<ValueT> &d_values,                         \
+                       NumItemsT             num_items,                        \
+                       int                   num_segments,                     \
+                       BeginOffsetIteratorT  d_segment_begin_offsets,          \
+                       EndOffsetIteratorT    d_segment_end_offsets,            \
+                       int                   begin_bit,                        \
+                       int                   end_bit,                          \
+                       cudaStream_t          stream,                           \
+                       bool                  debug_synchronous)                \
+  {                                                                            \
+    Int2Type<CubBackend> cub_backend{};                                        \
+    return LaunchCDPKernel(is_descending,                                      \
+                           cub_backend,                                        \
+                           d_selector,                                         \
+                           d_temp_storage_bytes,                               \
+                           d_cdp_error,                                        \
+                           d_temp_storage,                                     \
+                           temp_storage_bytes,                                 \
+                           d_keys,                                             \
+                           d_values,                                           \
+                           num_items,                                          \
+                           num_segments,                                       \
+                           d_segment_begin_offsets,                            \
+                           d_segment_end_offsets,                              \
+                           begin_bit,                                          \
+                           end_bit,                                            \
+                           stream,                                             \
+                           debug_synchronous);                                 \
+  }
 
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
+DEFINE_CDP_DISPATCHER(CDP, CUB)
+DEFINE_CDP_DISPATCHER(CDP_NO_OVERWRITE, CUB_NO_OVERWRITE)
+DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)
+DEFINE_CDP_DISPATCHER(CDP_SEGMENTED_NO_OVERWRITE, CUB_SEGMENTED_NO_OVERWRITE)
 
+#undef DEFINE_CDP_DISPATCHER
 
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Problem generation
@@ -721,6 +817,13 @@ void InitializeSolution(
     NumItemsT  *&h_reference_ranks,
     KeyT       *&h_reference_keys)
 {
+    if (num_items == 0)
+    {
+        h_reference_ranks = nullptr;
+        h_reference_keys = nullptr;
+        return;
+    }
+
     if (pre_sorted)
     {
         printf("Shuffling reference solution on CPU\n");
@@ -787,7 +890,9 @@ void InitializeSolution(
         {
             h_reference_ranks = new NumItemsT[num_items];
         }
-        NumItemsT max_run = 32, run = 0, i = 0;
+        NumItemsT max_run = 32;
+        NumItemsT run = 0;
+        NumItemsT i = 0;
         while (summary.size() > 0)
         {
             // Pick up a random element and a run.
@@ -842,7 +947,12 @@ void InitializeSolution(
             h_pairs[i].value = i;
         }
 
-        printf("\nSorting reference solution on CPU (%d segments)...", num_segments); fflush(stdout);
+        printf("\nSorting reference solution on CPU "
+               "(%zd items, %d segments, %zd items/seg)...",
+               static_cast<std::size_t>(num_items),
+               num_segments,
+               static_cast<std::size_t>(num_items / num_segments));
+        fflush(stdout);
 
         for (int i = 0; i < num_segments; ++i)
         {
@@ -945,8 +1055,8 @@ void Test(
     const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
 
     printf("%s %s cub::DeviceRadixSort %zd items, %d segments, "
-           "%d-byte keys (%s) %d-byte values (%s), descending %d, "
-           "begin_bit %d, end_bit %d\n",
+           "%d-byte keys (%s) %d-byte values (%s), %d-byte num_items (%s), "
+           "descending %d, begin_bit %d, end_bit %d\n",
            BackendToString(BACKEND),
            (KEYS_ONLY) ? "keys-only" : "key-value",
            static_cast<std::size_t>(num_items),
@@ -955,6 +1065,8 @@ void Test(
            typeid(KeyT).name(),
            (KEYS_ONLY) ? 0 : static_cast<int>(sizeof(ValueT)),
            typeid(ValueT).name(),
+           static_cast<int>(sizeof(NumItemsT)),
+           typeid(NumItemsT).name(),
            IS_DESCENDING,
            begin_bit,
            end_bit);
@@ -1104,73 +1216,136 @@ bool HasEnoughMemory(std::size_t num_items, bool overwrite)
     return test_mem < total_mem;
 }
 
+
 /**
  * Test backend
  */
-template <bool IS_DESCENDING, typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename NumItemsT>
-void TestBackend(
-    KeyT                 *h_keys,
-    NumItemsT            num_items,
-    int                  num_segments,
-    BeginOffsetIteratorT d_segment_begin_offsets,
-    EndOffsetIteratorT   d_segment_end_offsets,
-    int                  begin_bit,
-    int                  end_bit,
-    KeyT                 *h_reference_keys,
-    NumItemsT            *h_reference_ranks)
+template <bool IS_DESCENDING,
+          typename KeyT,
+          typename ValueT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename NumItemsT>
+void TestBackend(KeyT                *h_keys,
+                 NumItemsT            num_items,
+                 int                  num_segments,
+                 BeginOffsetIteratorT d_segment_begin_offsets,
+                 EndOffsetIteratorT   d_segment_end_offsets,
+                 int                  begin_bit,
+                 int                  end_bit,
+                 KeyT                *h_reference_keys,
+                 NumItemsT           *h_reference_ranks)
 {
-    const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+#if TEST_CDP == 0
+  constexpr auto NonSegmentedOverwrite   = CUB;
+  constexpr auto NonSegmentedNoOverwrite = CUB_NO_OVERWRITE;
+  constexpr auto SegmentedOverwrite      = CUB_SEGMENTED;
+  constexpr auto SegmentedNoOverwrite    = CUB_SEGMENTED_NO_OVERWRITE;
+#else  // TEST_CDP
+  constexpr auto NonSegmentedOverwrite   = CDP;
+  constexpr auto NonSegmentedNoOverwrite = CDP_NO_OVERWRITE;
+  constexpr auto SegmentedOverwrite      = CDP_SEGMENTED;
+  constexpr auto SegmentedNoOverwrite    = CDP_SEGMENTED_NO_OVERWRITE;
+#endif // TEST_CDP
+
+  const bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+
+  // A conservative check assuming overwrite is allowed.
+  if (!HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items), true))
+  {
+    printf("Skipping the test due to insufficient device memory\n");
+    return;
+  }
 
-    // A conservative check assuming overwrite is allowed.
-    if (!HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items), true))
-    {
-        printf("Skipping the test due to insufficient device memory\n");
-        return;
-    }
+  std::unique_ptr<ValueT[]> h_value_data{};
 
-    ValueT *h_values             = NULL;
-    ValueT *h_reference_values   = NULL;
-    
-    if (!KEYS_ONLY)
-    {
-        h_values            = new ValueT[num_items];
-        h_reference_values  = new ValueT[num_items];
+  ValueT *h_values           = nullptr;
+  ValueT *h_reference_values = nullptr;
 
-        for (NumItemsT i = 0; i < num_items; ++i)
-        {
-            InitValue(INTEGER_SEED, h_values[i], i);
-            InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
-        }
-    }
+  if (!KEYS_ONLY)
+  {
+    h_value_data.reset(new ValueT[2 * static_cast<std::size_t>(num_items)]);
+    h_values           = h_value_data.get();
+    h_reference_values = h_value_data.get() + num_items;
 
-    // Skip segmented sort if num_items isn't int.
-    // TODO(canonizer): re-enable these tests once num_items is templated for segmented sort.
-    if (std::is_same<NumItemsT, int>::value)
+    for (NumItemsT i = 0; i < num_items; ++i)
     {
-        Test<CUB_SEGMENTED, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-        Test<CUB_SEGMENTED_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+      InitValue(INTEGER_SEED, h_values[i], i);
+      InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
     }
+  }
+
+  // Skip segmented sort if num_items isn't int.
+  // TODO(64bit-seg-sort): re-enable these tests once num_items is templated for
+  // segmented sort.
+  if (std::is_same<NumItemsT, int>::value)
+  {
+    printf("Testing segmented sort with overwrite\n");
+    Test<SegmentedOverwrite, IS_DESCENDING>(h_keys,
+                                            h_values,
+                                            num_items,
+                                            num_segments,
+                                            d_segment_begin_offsets,
+                                            d_segment_end_offsets,
+                                            begin_bit,
+                                            end_bit,
+                                            h_reference_keys,
+                                            h_reference_values);
+    printf("Testing segmented sort with no overwrite\n");
+    Test<SegmentedNoOverwrite, IS_DESCENDING>(h_keys,
+                                              h_values,
+                                              num_items,
+                                              num_segments,
+                                              d_segment_begin_offsets,
+                                              d_segment_end_offsets,
+                                              begin_bit,
+                                              end_bit,
+                                              h_reference_keys,
+                                              h_reference_values);
+  }
+  else
+  {
+    printf("Skipping segmented sort tests (NumItemsT != int)\n");
+  }
 
-    if (num_segments == 1)
+  if (num_segments == 1)
+  {
+    printf("Testing non-segmented sort with overwrite\n");
+    Test<NonSegmentedOverwrite, IS_DESCENDING>(h_keys,
+                                               h_values,
+                                               num_items,
+                                               num_segments,
+                                               d_segment_begin_offsets,
+                                               d_segment_end_offsets,
+                                               begin_bit,
+                                               end_bit,
+                                               h_reference_keys,
+                                               h_reference_values);
+    if (HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items),
+                                      false))
     {
-        Test<CUB, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-        if (HasEnoughMemory<KeyT, ValueT>(static_cast<std::size_t>(num_items), false))
-        {
-            Test<CUB_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-        }
-        else
-        {
-            printf("Skipping CUB_NO_OVERWRITE tests due to insufficient memory\n");
-        }
-    #ifdef CUB_CDP
-            Test<CDP, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, d_segment_begin_offsets, d_segment_end_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-    #endif
+      printf("Testing non-segmented sort with no overwrite\n");
+      Test<NonSegmentedNoOverwrite, IS_DESCENDING>(h_keys,
+                                                   h_values,
+                                                   num_items,
+                                                   num_segments,
+                                                   d_segment_begin_offsets,
+                                                   d_segment_end_offsets,
+                                                   begin_bit,
+                                                   end_bit,
+                                                   h_reference_keys,
+                                                   h_reference_values);
     }
-
-    if (h_values) delete[] h_values;
-    if (h_reference_values) delete[] h_reference_values;
+    else
+    {
+      printf("Skipping no-overwrite tests with %zd items due to "
+             "insufficient memory\n",
+             static_cast<std::size_t>(num_items));
+    }
+  }
 }
 
+
 // Smallest value type for TEST_VALUE_TYPE.
 // Unless TEST_VALUE_TYPE == 3, this is the only value type tested.
 #if TEST_VALUE_TYPE == 0
@@ -1612,6 +1787,7 @@ int main(int argc, char** argv)
     // Initialize device
     CubDebugExit(args.DeviceInit());
 
+    // %PARAM% TEST_CDP cdp 0:1
     // %PARAM% TEST_KEY_BYTES bytes 1:2:4:8
     // %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3
     //   0->Keys only
diff --git a/test/test_device_reduce.cu b/test/test_device_reduce.cu
index 0599add52c..ef81c23c34 100644
--- a/test/test_device_reduce.cu
+++ b/test/test_device_reduce.cu
@@ -42,6 +42,10 @@
 #include <cub/iterator/transform_input_iterator.cuh>
 #include <cub/util_type.cuh>
 
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
+#include <nv/target>
+
 #include "test_util.h"
 
 #include <cstdio>
@@ -70,9 +74,28 @@ enum Backend
 {
     CUB,            // CUB method
     CUB_SEGMENTED,  // CUB segmented method
-    CUB_CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+    CDP,            // GPU-based (dynamic parallelism) dispatch to CUB method
+    CDP_SEGMENTED,  // GPU-based segmented method
 };
 
+inline const char* BackendToString(Backend b)
+{
+  switch (b)
+  {
+    case CUB:
+      return "CUB";
+    case CUB_SEGMENTED:
+      return "CUB_SEGMENTED";
+    case CDP:
+      return "CDP";
+    case CDP_SEGMENTED:
+      return "CDP_SEGMENTED";
+    default:
+      break;
+  }
+
+  return "";
+}
 
 // Custom max functor
 struct CustomMax
@@ -94,7 +117,7 @@ struct CustomMax
  * Dispatch to reduce entrypoint (custom-max)
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -137,7 +160,7 @@ cudaError_t Dispatch(
  * Dispatch to sum entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -170,7 +193,7 @@ cudaError_t Dispatch(
  * Dispatch to min entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -203,7 +226,7 @@ cudaError_t Dispatch(
  * Dispatch to max entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -236,7 +259,7 @@ cudaError_t Dispatch(
  * Dispatch to argmin entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -269,7 +292,7 @@ cudaError_t Dispatch(
  * Dispatch to argmax entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -307,7 +330,7 @@ cudaError_t Dispatch(
  * Dispatch to reduce entrypoint (custom-max)
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -350,7 +373,7 @@ cudaError_t Dispatch(
  * Dispatch to sum entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -384,7 +407,7 @@ cudaError_t Dispatch(
  * Dispatch to min entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -418,7 +441,7 @@ cudaError_t Dispatch(
  * Dispatch to max entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -452,7 +475,7 @@ cudaError_t Dispatch(
  * Dispatch to argmin entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -486,7 +509,7 @@ cudaError_t Dispatch(
  * Dispatch to argmax entrypoint
  */
 template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
+CUB_RUNTIME_FUNCTION
 cudaError_t Dispatch(
     Int2Type<CUB_SEGMENTED>       /*dispatch_to*/,
     int                 timing_iterations,
@@ -521,91 +544,169 @@ cudaError_t Dispatch(
 // CUDA nested-parallelism test kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceReduce
  */
-template <
-    typename            InputIteratorT,
-    typename            OutputIteratorT,
-    typename            BeginOffsetIteratorT,
-    typename            EndOffsetIteratorT,
-    typename            ReductionOpT>
-__global__ void CnpDispatchKernel(
-    int                 timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t              temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    BeginOffsetIteratorT d_segment_begin_offsets,
-    EndOffsetIteratorT  d_segment_end_offsets,
-    ReductionOpT        reduction_op,
-    bool                debug_synchronous)
+template <int CubBackend,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename ReductionOpT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  InputIteratorT       d_in,
+                                  OutputIteratorT      d_out,
+                                  int                  num_items,
+                                  int                  max_segments,
+                                  BeginOffsetIteratorT d_segment_begin_offsets,
+                                  EndOffsetIteratorT   d_segment_end_offsets,
+                                  ReductionOpT         reduction_op,
+                                  bool                 debug_synchronous)
 {
-#ifndef CUB_CDP
-    (void)timing_iterations;
-    (void)d_temp_storage_bytes;
-    (void)d_cdp_error;
-    (void)d_temp_storage;
-    (void)temp_storage_bytes;
-    (void)d_in;
-    (void)d_out;
-    (void)num_items;
-    (void)max_segments;
-    (void)d_segment_begin_offsets;
-    (void)d_segment_end_offsets;
-    (void)reduction_op;
-    (void)debug_synchronous;
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-        d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
+  *d_cdp_error = Dispatch(cub_backend,
+                          timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          num_items,
+                          max_segments,
+                          d_segment_begin_offsets,
+                          d_segment_end_offsets,
+                          reduction_op,
+                          0,
+                          debug_synchronous);
+
+  *d_temp_storage_bytes = temp_storage_bytes;
 }
 
-
 /**
- * Dispatch to CUB_CDP kernel
+ * Launch kernel and dispatch on device. Should only be called from host code.
+ * The CubBackend should be one of the non-CDP CUB backends to invoke from the
+ * device.
  */
-template <typename InputIteratorT, typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_CDP>       dispatch_to,
-    int                 timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    BeginOffsetIteratorT d_segment_begin_offsets,
-    EndOffsetIteratorT  d_segment_end_offsets,
-    ReductionOpT        reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
+template <int CubBackend,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename BeginOffsetIteratorT,
+          typename EndOffsetIteratorT,
+          typename ReductionOpT>
+cudaError_t LaunchCDPKernel(Int2Type<CubBackend> cub_backend,
+                            int                  timing_iterations,
+                            size_t              *d_temp_storage_bytes,
+                            cudaError_t         *d_cdp_error,
+
+                            void                *d_temp_storage,
+                            size_t              &temp_storage_bytes,
+                            InputIteratorT       d_in,
+                            OutputIteratorT      d_out,
+                            int                  num_items,
+                            int                  max_segments,
+                            BeginOffsetIteratorT d_segment_begin_offsets,
+                            EndOffsetIteratorT   d_segment_end_offsets,
+                            ReductionOpT         reduction_op,
+                            cudaStream_t         stream,
+                            bool                 debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-        d_in, d_out, num_items, max_segments, d_segment_begin_offsets, d_segment_end_offsets, reduction_op, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<CubBackend,
+                              InputIteratorT,
+                              OutputIteratorT,
+                              BeginOffsetIteratorT,
+                              EndOffsetIteratorT,
+                              ReductionOpT>,
+            cub_backend,
+            timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_items,
+            max_segments,
+            d_segment_begin_offsets,
+            d_segment_end_offsets,
+            reduction_op,
+            debug_synchronous);
+  CubDebugExit(retval);
+  CubDebugExit(cub::detail::device_synchronize());
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  return retval;
 }
 
-
+// Specializations of Dispatch that translate the CDP backend to the appropriate
+// CUB backend, and uses the CUB backend to launch the CDP kernel.
+#define DEFINE_CDP_DISPATCHER(CdpBackend, CubBackend)                          \
+  template <typename InputIteratorT,                                           \
+            typename OutputIteratorT,                                          \
+            typename BeginOffsetIteratorT,                                     \
+            typename EndOffsetIteratorT,                                       \
+            typename ReductionOpT>                                             \
+  cudaError_t Dispatch(Int2Type<CdpBackend>,                                   \
+                       int          timing_iterations,                         \
+                       size_t      *d_temp_storage_bytes,                      \
+                       cudaError_t *d_cdp_error,                               \
+                                                                               \
+                       void                *d_temp_storage,                    \
+                       size_t              &temp_storage_bytes,                \
+                       InputIteratorT       d_in,                              \
+                       OutputIteratorT      d_out,                             \
+                       int                  num_items,                         \
+                       int                  max_segments,                      \
+                       BeginOffsetIteratorT d_segment_begin_offsets,           \
+                       EndOffsetIteratorT   d_segment_end_offsets,             \
+                       ReductionOpT         reduction_op,                      \
+                       cudaStream_t         stream,                            \
+                       bool                 debug_synchronous)                 \
+  {                                                                            \
+    Int2Type<CubBackend> cub_backend{};                                        \
+    return LaunchCDPKernel(cub_backend,                                        \
+                           timing_iterations,                                  \
+                           d_temp_storage_bytes,                               \
+                           d_cdp_error,                                        \
+                           d_temp_storage,                                     \
+                           temp_storage_bytes,                                 \
+                           d_in,                                               \
+                           d_out,                                              \
+                           num_items,                                          \
+                           max_segments,                                       \
+                           d_segment_begin_offsets,                            \
+                           d_segment_end_offsets,                              \
+                           reduction_op,                                       \
+                           stream,                                             \
+                           debug_synchronous);                                 \
+  }
+
+DEFINE_CDP_DISPATCHER(CDP, CUB)
+DEFINE_CDP_DISPATCHER(CDP_SEGMENTED, CUB_SEGMENTED)
+
+#undef DEFINE_CDP_DISPATCHER
+
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Problem generation
@@ -770,7 +871,7 @@ void Test(
     // Input data types
     using InputT = cub::detail::value_t<DeviceInputIteratorT>;
 
-    // Allocate CUB_CDP device arrays for temp storage size and error
+    // Allocate CDP device arrays for temp storage size and error
     size_t          *d_temp_storage_bytes = NULL;
     cudaError_t     *d_cdp_error = NULL;
     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
@@ -859,8 +960,11 @@ void SolveAndTest(
     using OutputT = typename SolutionT::OutputT;
 
     printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
-        (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB",
-        typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
+           BackendToString(BACKEND),
+           typeid(ReductionOpT).name(),
+           num_items,
+           typeid(HostInputIteratorT).name(),
+           num_segments);
     fflush(stdout);
 
     // Allocate and solve solution
@@ -986,6 +1090,14 @@ void TestByBackend(
     OffsetT     max_segments,
     GenMode     gen_mode)
 {
+#if TEST_CDP == 0
+  constexpr auto NonSegmentedBackend   = CUB;
+  constexpr auto SegmentedBackend      = CUB_SEGMENTED;
+#else  // TEST_CDP
+  constexpr auto NonSegmentedBackend   = CDP;
+  constexpr auto SegmentedBackend      = CDP_SEGMENTED;
+#endif // TEST_CDP
+
     // Initialize host data
     printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
         num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
@@ -1008,18 +1120,14 @@ void TestByBackend(
     InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
 
     // Page-aligned-input tests
-    TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1,
-        h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL);                 // Host-dispatch
-#ifdef CUB_CDP
-    TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1,
-        h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL);             // Device-dispatch
-#endif
+    TestByOp<NonSegmentedBackend, OutputT>(h_in, d_in, num_items, 1,
+        h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL);
 
     // Non-page-aligned-input tests
     if (num_items > 1)
     {
         InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
-        TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1,
+        TestByOp<NonSegmentedBackend, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1,
             h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL);
     }
 
@@ -1037,7 +1145,7 @@ void TestByBackend(
         // Test with segment pointer
         InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
         CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
-        TestByOp<CUB_SEGMENTED, OutputT>(h_in, d_in, num_items, num_segments,
+        TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
             h_segment_offsets, h_segment_offsets + 1, d_segment_offsets, d_segment_offsets + 1);
 
         // Test with segment iterator
@@ -1050,7 +1158,7 @@ void TestByBackend(
             d_segment_offsets,
             identity_op);
 
-        TestByOp<CUB_SEGMENTED, OutputT>(h_in, d_in, num_items, num_segments,
+        TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
             h_segment_offsets_itr, h_segment_offsets_itr + 1, d_segment_offsets_itr, d_segment_offsets_itr + 1);
 
         // Test with transform iterators of different types
@@ -1064,7 +1172,7 @@ void TestByBackend(
         TransformInputIterator<OffsetT, TransformFunctor1T, OffsetT*, OffsetT> d_segment_begin_offsets_itr(d_segment_offsets, TransformFunctor1T());
         TransformInputIterator<OffsetT, TransformFunctor2T, OffsetT*, OffsetT> d_segment_end_offsets_itr(d_segment_offsets + 1, TransformFunctor2T());
 
-        TestByOp<CUB_SEGMENTED, OutputT>(h_in, d_in, num_items, num_segments,
+        TestByOp<SegmentedBackend, OutputT>(h_in, d_in, num_items, num_segments,
             h_segment_begin_offsets_itr, h_segment_end_offsets_itr,
             d_segment_begin_offsets_itr, d_segment_end_offsets_itr);
     }
@@ -1104,92 +1212,94 @@ void TestByGenMode(
     OffsetT *h_segment_offsets = new OffsetT[1 + 1];
     InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
 
-    SolveAndTest<CUB, OutputT>(h_in, h_in, num_items, 1,
-        h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL, Sum());
-#ifdef CUB_CDP
-    SolveAndTest<CUB_CDP, OutputT>(h_in, h_in, num_items, 1,
+#if TEST_CDP == 0
+    constexpr auto Backend   = CUB;
+#else  // TEST_CDP
+    constexpr auto Backend   = CDP;
+#endif // TEST_CDP
+
+    SolveAndTest<Backend, OutputT>(h_in, h_in, num_items, 1,
         h_segment_offsets, h_segment_offsets + 1, (OffsetT*) NULL, (OffsetT*)NULL, Sum());
-#endif
 
     if (h_segment_offsets) delete[] h_segment_offsets;
 }
 
-
 /// Test different problem sizes
-template <
-    typename InputT,
-    typename OutputT,
-    typename OffsetT>
-struct TestBySize
+template <typename InputT, typename OutputT, typename OffsetT>
+void TestBySize(OffsetT max_items, OffsetT max_segments, OffsetT tile_size)
 {
-    OffsetT max_items;
-    OffsetT max_segments;
-
-    TestBySize(OffsetT max_items, OffsetT max_segments) :
-        max_items(max_items),
-        max_segments(max_segments)
-    {}
-
-    template <typename ActivePolicyT>
-    cudaError_t Invoke()
-    {
-        //
-        // Black-box testing on all backends
-        //
-
-        // Test 0, 1, many
-        TestByGenMode<InputT, OutputT>(0,           max_segments);
-        TestByGenMode<InputT, OutputT>(1,           max_segments);
-        TestByGenMode<InputT, OutputT>(max_items,   max_segments);
-
-        // Test random problem sizes from a log-distribution [8, max_items-ish)
-        int     num_iterations = 8;
-        double  max_exp = log(double(max_items)) / log(double(2.0));
-        for (int i = 0; i < num_iterations; ++i)
-        {
-            OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
-            TestByGenMode<InputT, OutputT>(num_items, max_segments);
-        }
-
-        //
-        // White-box testing of single-segment problems around specific sizes
-        //
-
-        // Tile-boundaries: multiple blocks, one tile per block
-        OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
-        TestProblem<CUB, InputT, OutputT>(tile_size * 4,  1,      RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(tile_size * 4 + 1, 1,   RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(tile_size * 4 - 1, 1,   RANDOM, Sum());
+  // Test 0, 1, many
+  TestByGenMode<InputT, OutputT>(0, max_segments);
+  TestByGenMode<InputT, OutputT>(1, max_segments);
+  TestByGenMode<InputT, OutputT>(max_items, max_segments);
+
+  // Test random problem sizes from a log-distribution [8, max_items-ish)
+  int    num_iterations = 8;
+  double max_exp        = log(double(max_items)) / log(double(2.0));
+  for (int i = 0; i < num_iterations; ++i)
+  {
+    OffsetT num_items = (OffsetT)pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
+    TestByGenMode<InputT, OutputT>(num_items, max_segments);
+  }
+
+  //
+  // White-box testing of single-segment problems around specific sizes
+  //
+
+#if TEST_CDP == 0
+  constexpr auto Backend   = CUB;
+#else  // TEST_CDP
+  constexpr auto Backend   = CDP;
+#endif // TEST_CDP
+
+  // Tile-boundaries: multiple blocks, one tile per block
+  TestProblem<Backend, InputT, OutputT>(tile_size * 4, 1, RANDOM, Sum());
+  TestProblem<Backend, InputT, OutputT>(tile_size * 4 + 1, 1, RANDOM, Sum());
+  TestProblem<Backend, InputT, OutputT>(tile_size * 4 - 1, 1, RANDOM, Sum());
+
+  // Tile-boundaries: multiple blocks, multiple tiles per block
+  OffsetT sm_occupancy = 32;
+  OffsetT occupancy    = tile_size * sm_occupancy * g_sm_count;
+  TestProblem<Backend, InputT, OutputT>(occupancy, 1, RANDOM, Sum());
+  TestProblem<Backend, InputT, OutputT>(occupancy + 1, 1, RANDOM, Sum());
+  TestProblem<Backend, InputT, OutputT>(occupancy - 1, 1, RANDOM, Sum());
+}
 
-        // Tile-boundaries: multiple blocks, multiple tiles per block
-        OffsetT sm_occupancy = 32;
-        OffsetT occupancy = tile_size * sm_occupancy * g_sm_count;
-        TestProblem<CUB, InputT, OutputT>(occupancy,  1,      RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(occupancy + 1, 1,   RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(occupancy - 1, 1,   RANDOM, Sum());
 
-        return cudaSuccess;
-    }
+template <typename InputT, typename OutputT, typename OffsetT>
+struct GetTileSize
+{
+  OffsetT max_items{};
+  OffsetT max_segments{};
+  OffsetT tile_size{};
+
+  GetTileSize(OffsetT max_items, OffsetT max_segments)
+      : max_items(max_items)
+      , max_segments(max_segments)
+  {}
+
+  template <typename ActivePolicyT>
+  CUB_RUNTIME_FUNCTION cudaError_t Invoke()
+  {
+    this->tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS *
+                      ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
+    return cudaSuccess;
+  }
 };
 
-
 /// Test problem type
-template <
-    typename    InputT,
-    typename    OutputT,
-    typename    OffsetT>
-void TestType(
-    OffsetT     max_items,
-    OffsetT     max_segments)
+template <typename InputT, typename OutputT, typename OffsetT>
+void TestType(OffsetT max_items, OffsetT max_segments)
 {
-    typedef typename DeviceReducePolicy<InputT, OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT;
+  // Inspect the tuning policies to determine this arch's tile size:
+  using MaxPolicyT =
+    typename DeviceReducePolicy<InputT, OutputT, OffsetT, cub::Sum>::MaxPolicy;
+  GetTileSize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
+  CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, dispatch));
 
-    TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
-
-    MaxPolicyT::Invoke(g_ptx_version, dispatch);
+  TestBySize<InputT, OutputT>(max_items, max_segments, dispatch.tile_size);
 }
 
-
 //---------------------------------------------------------------------
 // Main
 //---------------------------------------------------------------------
@@ -1222,7 +1332,6 @@ int main(int argc, char** argv)
             "[--i=<timing iterations> "
             "[--device=<device-id>] "
             "[--v] "
-            "[--cdp]"
             "\n", argv[0]);
         exit(0);
     }
@@ -1237,7 +1346,8 @@ int main(int argc, char** argv)
     // Get SM count
     g_sm_count = args.deviceProp.multiProcessorCount;
 
-    // %PARAM% TEST_TYPES   types 0:1:2:3
+    // %PARAM% TEST_CDP cdp 0:1
+    // %PARAM% TEST_TYPES types 0:1:2:3
 
 #if TEST_TYPES == 0
     TestType<char, char>(max_items, max_segments);
diff --git a/test/test_device_reduce_by_key.cu b/test/test_device_reduce_by_key.cu
index 168c4ecacc..103084d262 100644
--- a/test/test_device_reduce_by_key.cu
+++ b/test/test_device_reduce_by_key.cu
@@ -33,17 +33,19 @@
 // Ensure printing of CUDA runtime errors to console
 #define CUB_STDERR
 
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
 #include <cub/device/device_reduce.cuh>
 #include <cub/device/device_run_length_encode.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
 #include <cub/thread/thread_operators.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
 #include "test_util.h"
 
+#include <cstdio>
+#include <typeinfo>
+
 using namespace cub;
 
 
@@ -122,94 +124,132 @@ cudaError_t Dispatch(
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceSelect
  */
-template <
-    typename                    KeyInputIteratorT,
-    typename                    KeyOutputIteratorT,
-    typename                    ValueInputIteratorT,
-    typename                    ValueOutputIteratorT,
-    typename                    NumRunsIteratorT,
-    typename                    EqualityOpT,
-    typename                    ReductionOpT,
-    typename                    OffsetT>
-__global__ void CnpDispatchKernel(
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueInputIteratorT         d_values_in,
-    ValueOutputIteratorT        d_values_out,
-    NumRunsIteratorT            d_num_runs,
-    EqualityOpT                 equality_op,
-    ReductionOpT                reduction_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
+template <int CubBackend,
+          typename KeyInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueInputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumRunsIteratorT,
+          typename EqualityOpT,
+          typename ReductionOpT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  KeyInputIteratorT    d_keys_in,
+                                  KeyOutputIteratorT   d_keys_out,
+                                  ValueInputIteratorT  d_values_in,
+                                  ValueOutputIteratorT d_values_out,
+                                  NumRunsIteratorT     d_num_runs,
+                                  EqualityOpT          equality_op,
+                                  ReductionOpT         reduction_op,
+                                  OffsetT              num_items,
+                                  bool                 debug_synchronous)
 {
 
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
-
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
+  *d_cdp_error = Dispatch(cub_backend,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_keys_in,
+                          d_keys_out,
+                          d_values_in,
+                          d_values_out,
+                          d_num_runs,
+                          equality_op,
+                          reduction_op,
+                          num_items,
+                          0,
+                          debug_synchronous);
+
+  *d_temp_storage_bytes = temp_storage_bytes;
 }
 
-
 /**
  * Dispatch to CDP kernel
  */
-template <
-    typename                    KeyInputIteratorT,
-    typename                    KeyOutputIteratorT,
-    typename                    ValueInputIteratorT,
-    typename                    ValueOutputIteratorT,
-    typename                    NumRunsIteratorT,
-    typename                    EqualityOpT,
-    typename                    ReductionOpT,
-    typename                    OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CDP>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueInputIteratorT         d_values_in,
-    ValueOutputIteratorT        d_values_out,
-    NumRunsIteratorT            d_num_runs,
-    EqualityOpT                 equality_op,
-    ReductionOpT                reduction_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
+template <typename KeyInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueInputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumRunsIteratorT,
+          typename EqualityOpT,
+          typename ReductionOpT,
+          typename OffsetT>
+__forceinline__ cudaError_t
+Dispatch(Int2Type<CDP> /*dispatch_to*/,
+         int          timing_timing_iterations,
+         size_t      *d_temp_storage_bytes,
+         cudaError_t *d_cdp_error,
+
+         void                *d_temp_storage,
+         size_t              &temp_storage_bytes,
+         KeyInputIteratorT    d_keys_in,
+         KeyOutputIteratorT   d_keys_out,
+         ValueInputIteratorT  d_values_in,
+         ValueOutputIteratorT d_values_out,
+         NumRunsIteratorT     d_num_runs,
+         EqualityOpT          equality_op,
+         ReductionOpT         reduction_op,
+         OffsetT              num_items,
+         cudaStream_t         stream,
+         bool                 debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
+  // Invoke kernel to invoke device-side dispatch
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<CUB,
+                              KeyInputIteratorT,
+                              KeyOutputIteratorT,
+                              ValueInputIteratorT,
+                              ValueOutputIteratorT,
+                              NumRunsIteratorT,
+                              EqualityOpT,
+                              ReductionOpT,
+                              OffsetT>,
+            Int2Type<CUB>{},
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            d_num_runs,
+            equality_op,
+            reduction_op,
+            num_items,
+            debug_synchronous);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
 }
 
-
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Test generation
@@ -595,10 +635,11 @@ void TestDispatch(
     int             num_items,
     ReductionOpT    reduction_op)
 {
+#if TEST_CDP == 0
     Test<CUB, KeyT, ValueT>(num_items, reduction_op);
-#ifdef CUB_CDP
+#elif TEST_CDP == 1
     Test<CDP, KeyT, ValueT>(num_items, reduction_op);
-#endif
+#endif // TEST_CDP
 }
 
 
@@ -671,7 +712,6 @@ int main(int argc, char** argv)
             "[--maxseg=<max segment length>]"
             "[--entropy=<segment length bit entropy reduction rounds>]"
             "[--v] "
-            "[--cdp]"
             "\n", argv[0]);
         exit(0);
     }
@@ -684,6 +724,8 @@ int main(int argc, char** argv)
     int ptx_version = 0;
     CubDebugExit(PtxVersion(ptx_version));
 
+    // %PARAM% TEST_CDP cdp 0:1
+
     // Test different input types
     TestOp<int, char>(num_items);
     TestOp<int, short>(num_items);
diff --git a/test/test_device_run_length_encode.cu b/test/test_device_run_length_encode.cu
index 95e6331cb5..6b4a0f7ead 100644
--- a/test/test_device_run_length_encode.cu
+++ b/test/test_device_run_length_encode.cu
@@ -27,23 +27,24 @@
  ******************************************************************************/
 
 /******************************************************************************
- * Test of DeviceReduce::RunLengthEncode utilities
+ * Test of DeviceRunLengthEncode utilities
  ******************************************************************************/
 
 // Ensure printing of CUDA runtime errors to console
 #define CUB_STDERR
 
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/device/device_reduce.cuh>
 #include <cub/device/device_run_length_encode.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
 #include <cub/thread/thread_operators.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
 #include "test_util.h"
 
+#include <cstdio>
+#include <typeinfo>
+
 using namespace cub;
 
 
@@ -174,94 +175,131 @@ cudaError_t Dispatch(
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceRunLengthEncode
  */
-template <
-    int                         RLE_METHOD,
-    typename                    InputIteratorT,
-    typename                    UniqueOutputIteratorT,
-    typename                    OffsetsOutputIteratorT,
-    typename                    LengthsOutputIteratorT,
-    typename                    NumRunsIterator,
-    typename                    EqualityOp,
-    typename                    OffsetT>
-__global__ void CnpDispatchKernel(
-    Int2Type<RLE_METHOD>            method,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      temp_storage_bytes,
-    InputIteratorT              d_in,
-    UniqueOutputIteratorT       d_unique_out,
-    OffsetsOutputIteratorT      d_offsets_out,
-    LengthsOutputIteratorT      d_lengths_out,
-    NumRunsIterator             d_num_runs,
-    cub::Equality               equality_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
+template <int RLE_METHOD,
+          int CubBackend,
+          typename InputIteratorT,
+          typename UniqueOutputIteratorT,
+          typename OffsetsOutputIteratorT,
+          typename LengthsOutputIteratorT,
+          typename NumRunsIterator,
+          typename EqualityOp,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<RLE_METHOD> method,
+                                  Int2Type<CubBackend> cub_backend,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                  *d_temp_storage,
+                                  size_t                 temp_storage_bytes,
+                                  InputIteratorT         d_in,
+                                  UniqueOutputIteratorT  d_unique_out,
+                                  OffsetsOutputIteratorT d_offsets_out,
+                                  LengthsOutputIteratorT d_lengths_out,
+                                  NumRunsIterator        d_num_runs,
+                                  cub::Equality          equality_op,
+                                  OffsetT                num_items,
+                                  bool                   debug_synchronous)
 {
-
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
-
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
+  *d_cdp_error = Dispatch(method,
+                          cub_backend,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_unique_out,
+                          d_offsets_out,
+                          d_lengths_out,
+                          d_num_runs,
+                          equality_op,
+                          num_items,
+                          0,
+                          debug_synchronous);
+
+  *d_temp_storage_bytes = temp_storage_bytes;
 }
 
-
 /**
  * Dispatch to CDP kernel
  */
-template <
-    int                         RLE_METHOD,
-    typename                    InputIteratorT,
-    typename                    UniqueOutputIteratorT,
-    typename                    OffsetsOutputIteratorT,
-    typename                    LengthsOutputIteratorT,
-    typename                    NumRunsIterator,
-    typename                    EqualityOp,
-    typename                    OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<RLE_METHOD>        method,
-    Int2Type<CDP>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    UniqueOutputIteratorT       d_unique_out,
-    OffsetsOutputIteratorT      d_offsets_out,
-    LengthsOutputIteratorT      d_lengths_out,
-    NumRunsIterator             d_num_runs,
-    EqualityOp                  equality_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
+template <int RLE_METHOD,
+          typename InputIteratorT,
+          typename UniqueOutputIteratorT,
+          typename OffsetsOutputIteratorT,
+          typename LengthsOutputIteratorT,
+          typename NumRunsIterator,
+          typename EqualityOp,
+          typename OffsetT>
+__forceinline__ cudaError_t
+Dispatch(Int2Type<RLE_METHOD> method,
+         Int2Type<CDP> /*dispatch_to*/,
+         int          timing_timing_iterations,
+         size_t      *d_temp_storage_bytes,
+         cudaError_t *d_cdp_error,
+
+         void                  *d_temp_storage,
+         size_t                &temp_storage_bytes,
+         InputIteratorT         d_in,
+         UniqueOutputIteratorT  d_unique_out,
+         OffsetsOutputIteratorT d_offsets_out,
+         LengthsOutputIteratorT d_lengths_out,
+         NumRunsIterator        d_num_runs,
+         EqualityOp             equality_op,
+         OffsetT                num_items,
+         cudaStream_t           stream,
+         bool                   debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
+  // Invoke kernel to invoke device-side dispatch
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<RLE_METHOD,
+                              CUB,
+                              InputIteratorT,
+                              UniqueOutputIteratorT,
+                              OffsetsOutputIteratorT,
+                              LengthsOutputIteratorT,
+                              NumRunsIterator,
+                              EqualityOp,
+                              OffsetT>,
+            method,
+            Int2Type<CUB>{},
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_unique_out,
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs,
+            equality_op,
+            num_items,
+            debug_synchronous);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
 }
 
-
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Test generation
@@ -533,7 +571,7 @@ void TestPointer(
     int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
 
     printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",
-        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
+        (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
         (BACKEND == CDP) ? "CDP CUB" : "CUB",
         num_items, num_runs, float(num_items) / num_runs,
         typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(),
@@ -586,7 +624,7 @@ void TestIterator(
     int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
 
     printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n",
-        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
+        (RLE_METHOD == RLE) ? "DeviceRunLengthEncode::Encode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
         (BACKEND == CDP) ? "CDP CUB" : "CUB",
         num_items, num_runs, float(num_items) / num_runs,
         typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name());
@@ -653,10 +691,10 @@ template <
 void TestDispatch(
     int             num_items)
 {
+#if TEST_CDP == 0
     Test<RLE,           CUB, T, OffsetT, LengthT>(num_items);
     Test<NON_TRIVIAL,   CUB, T, OffsetT, LengthT>(num_items);
-
-#ifdef CUB_CDP
+#elif TEST_CDP == 1
     Test<RLE,           CDP, T, OffsetT, LengthT>(num_items);
     Test<NON_TRIVIAL,   CDP, T, OffsetT, LengthT>(num_items);
 #endif
@@ -722,6 +760,8 @@ int main(int argc, char** argv)
     CubDebugExit(args.DeviceInit());
     printf("\n");
 
+    // %PARAM% TEST_CDP cdp 0:1
+
     // Test different input types
     TestSize<char, int, int>(num_items);
     TestSize<short, int, int>(num_items);
diff --git a/test/test_device_scan.cu b/test/test_device_scan.cu
index b9f4e41610..ca2b320151 100644
--- a/test/test_device_scan.cu
+++ b/test/test_device_scan.cu
@@ -33,17 +33,19 @@
 // Ensure printing of CUDA runtime errors to console
 #define CUB_STDERR
 
-#include <stdio.h>
-#include <typeinfo>
-
 #include <cub/device/device_scan.cuh>
 #include <cub/iterator/constant_input_iterator.cuh>
 #include <cub/iterator/discard_output_iterator.cuh>
 #include <cub/util_allocator.cuh>
 #include <cub/util_type.cuh>
 
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+
 #include "test_util.h"
 
+#include <cstdio>
+#include <typeinfo>
+
 using namespace cub;
 
 
@@ -384,46 +386,37 @@ Dispatch(Int2Type<false> /*in_place*/,
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceScan
  */
-template <typename IsPrimitiveT,
+template <typename InPlaceT,
+          typename CubBackendT,
+          typename IsPrimitiveT,
           typename InputIteratorT,
           typename OutputIteratorT,
           typename ScanOpT,
           typename InitialValueT,
-          typename OffsetT,
-          bool InPlace>
-__global__ void CnpDispatchKernel(Int2Type<InPlace> /*in_place*/,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(InPlaceT     in_place,
+                                  CubBackendT  cub_backend,
                                   IsPrimitiveT is_primitive,
-                                  int timing_timing_iterations,
-                                  size_t *d_temp_storage_bytes,
+                                  int          timing_timing_iterations,
+                                  size_t      *d_temp_storage_bytes,
                                   cudaError_t *d_cdp_error,
-                                  void *d_temp_storage,
-                                  size_t temp_storage_bytes,
-                                  InputIteratorT d_in,
+
+                                  void           *d_temp_storage,
+                                  size_t          temp_storage_bytes,
+                                  InputIteratorT  d_in,
                                   OutputIteratorT d_out,
-                                  ScanOpT scan_op,
-                                  InitialValueT initial_value,
-                                  OffsetT num_items,
-                                  bool debug_synchronous)
+                                  ScanOpT         scan_op,
+                                  InitialValueT   initial_value,
+                                  OffsetT         num_items,
+                                  bool            debug_synchronous)
 {
-#ifndef CUB_CDP
-  (void)is_primitive;
-  (void)timing_timing_iterations;
-  (void)d_temp_storage_bytes;
-  (void)d_cdp_error;
-  (void)d_temp_storage;
-  (void)temp_storage_bytes;
-  (void)d_in;
-  (void)d_out;
-  (void)scan_op;
-  (void)initial_value;
-  (void)num_items;
-  (void)debug_synchronous;
-  *d_cdp_error = cudaErrorNotSupported;
-#else
-  *d_cdp_error = Dispatch(Int2Type<CUB>(),
+  *d_cdp_error = Dispatch(in_place,
+                          cub_backend,
                           is_primitive,
                           timing_timing_iterations,
                           d_temp_storage_bytes,
@@ -439,48 +432,64 @@ __global__ void CnpDispatchKernel(Int2Type<InPlace> /*in_place*/,
                           debug_synchronous);
 
   *d_temp_storage_bytes = temp_storage_bytes;
-#endif
 }
 
 /**
  * Dispatch to CDP kernel
  */
-template <typename IsPrimitiveT,
+template <typename InPlaceT,
+          typename IsPrimitiveT,
           typename InputIteratorT,
           typename OutputIteratorT,
           typename ScanOpT,
           typename InitialValueT,
-          typename OffsetT,
-          bool InPlace>
-cudaError_t Dispatch(Int2Type<InPlace> /*in_place*/,
+          typename OffsetT>
+cudaError_t Dispatch(InPlaceT      in_place,
                      Int2Type<CDP> dispatch_to,
-                     IsPrimitiveT is_primitive,
-                     int timing_timing_iterations,
-                     size_t *d_temp_storage_bytes,
-                     cudaError_t *d_cdp_error,
-                     void *d_temp_storage,
-                     size_t &temp_storage_bytes,
-                     InputIteratorT d_in,
+                     IsPrimitiveT  is_primitive,
+                     int           timing_timing_iterations,
+                     size_t       *d_temp_storage_bytes,
+                     cudaError_t  *d_cdp_error,
+
+                     void           *d_temp_storage,
+                     size_t         &temp_storage_bytes,
+                     InputIteratorT  d_in,
                      OutputIteratorT d_out,
-                     ScanOpT scan_op,
-                     InitialValueT initial_value,
-                     OffsetT num_items,
-                     cudaStream_t stream,
-                     bool debug_synchronous)
+                     ScanOpT         scan_op,
+                     InitialValueT   initial_value,
+                     OffsetT         num_items,
+                     cudaStream_t    stream,
+                     bool            debug_synchronous)
 {
-  // Invoke kernel to invoke device-side dispatch
-  CnpDispatchKernel<<<1, 1>>>(is_primitive,
-                              timing_timing_iterations,
-                              d_temp_storage_bytes,
-                              d_cdp_error,
-                              d_temp_storage,
-                              temp_storage_bytes,
-                              d_in,
-                              d_out,
-                              scan_op,
-                              initial_value,
-                              num_items,
-                              debug_synchronous);
+  // Invoke kernel to invoke device-side dispatch to CUB backend:
+  (void)dispatch_to;
+  using CubBackendT = Int2Type<CUB>;
+  CubBackendT cub_backend;
+  cudaError_t   retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<InPlaceT,
+                              CubBackendT,
+                              IsPrimitiveT,
+                              InputIteratorT,
+                              OutputIteratorT,
+                              ScanOpT,
+                              InitialValueT,
+                              OffsetT>,
+            in_place,
+            cub_backend,
+            is_primitive,
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            scan_op,
+            initial_value,
+            num_items,
+            debug_synchronous);
+  CubDebugExit(retval);
 
   // Copy out temp_storage_bytes
   CubDebugExit(cudaMemcpy(&temp_storage_bytes,
@@ -489,7 +498,6 @@ cudaError_t Dispatch(Int2Type<InPlace> /*in_place*/,
                           cudaMemcpyDeviceToHost));
 
   // Copy out error
-  cudaError_t retval;
   CubDebugExit(cudaMemcpy(&retval,
                           d_cdp_error,
                           sizeof(cudaError_t) * 1,
@@ -497,6 +505,8 @@ cudaError_t Dispatch(Int2Type<InPlace> /*in_place*/,
   return retval;
 }
 
+#endif // TEST_CDP
+
 //---------------------------------------------------------------------
 // Test generation
 //---------------------------------------------------------------------
@@ -991,10 +1001,11 @@ void Test(
     ScanOpT         scan_op,
     InitialValueT   initial_value)
 {
+#if TEST_CDP == 0
     Test<CUB, InputT, OutputT>(num_items, scan_op, initial_value);
-#ifdef CUB_CDP
+#elif TEST_CDP == 1
     Test<CDP, InputT, OutputT>(num_items, scan_op, initial_value);
-#endif
+#endif // TEST_CDP
 }
 
 
@@ -1082,6 +1093,7 @@ int main(int argc, char** argv)
     g_device_giga_bandwidth = args.device_giga_bandwidth;
     printf("\n");
 
+    // %PARAM% TEST_CDP cdp 0:1
     // %PARAM% TEST_VALUE_TYPES types 0:1:2
 
 #if TEST_VALUE_TYPES == 0
diff --git a/test/test_device_scan_by_key.cu b/test/test_device_scan_by_key.cu
index d6b0da67aa..db46a04661 100644
--- a/test/test_device_scan_by_key.cu
+++ b/test/test_device_scan_by_key.cu
@@ -32,18 +32,20 @@
 // Ensure printing of CUDA runtime errors to console
 #define CUB_STDERR
 
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/util_allocator.cuh>
+#include <cub/device/device_scan.cuh>
 #include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/iterator/discard_output_iterator.cuh>
 #include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
 #include <cub/iterator/transform_input_iterator.cuh>
-#include <cub/device/device_scan.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
 #include "test_util.h"
 
+#include <cstdio>
+#include <typeinfo>
+
 using namespace cub;
 
 
@@ -217,7 +219,6 @@ cudaError_t Dispatch(
     return error;
 }
 
-
 /**
  * Dispatch to inclusive sum entrypoint
  */
@@ -254,114 +255,130 @@ cudaError_t Dispatch(
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceScan
  */
-template <typename IsPrimitiveT, typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT, typename EqualityOpT>
-__global__ void CnpDispatchKernel(
-    IsPrimitiveT          is_primitive,
-    int                   timing_timing_iterations,
-    size_t                *d_temp_storage_bytes,
-    cudaError_t           *d_cdp_error,
-
-    void*                 d_temp_storage,
-    size_t                temp_storage_bytes,
-    KeysInputIteratorT    d_keys_in,
-    ValuesInputIteratorT  d_values_in,
-    ValuesOutputIteratorT d_values_out,
-    ScanOpT               scan_op,
-    InitialValueT         initial_value,
-    OffsetT               num_items,
-    EqualityOpT           equality_op,
-    bool                  debug_synchronous)
+template <int CubBackend,
+          typename IsPrimitiveT,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT,
+          typename EqualityOpT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  IsPrimitiveT         is_primitive,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                 *d_temp_storage,
+                                  size_t                temp_storage_bytes,
+                                  KeysInputIteratorT    d_keys_in,
+                                  ValuesInputIteratorT  d_values_in,
+                                  ValuesOutputIteratorT d_values_out,
+                                  ScanOpT               scan_op,
+                                  InitialValueT         initial_value,
+                                  OffsetT               num_items,
+                                  EqualityOpT           equality_op,
+                                  bool                  debug_synchronous)
 {
-#ifndef CUB_CDP
-    (void)is_primitive;
-    (void)timing_timing_iterations;
-    (void)d_temp_storage_bytes;
-    (void)d_cdp_error;
-    (void)d_temp_storage;
-    (void)temp_storage_bytes;
-    (void)d_keys_in;
-    (void)d_values_in;
-    (void)d_values_out;
-    (void)scan_op;
-    (void)initial_value;
-    (void)num_items;
-    (void)equality_op;
-    (void)debug_synchronous;
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(
-        Int2Type<CUB>(),
-        is_primitive,
-        timing_timing_iterations,
-        d_temp_storage_bytes,
-        d_cdp_error,
-        d_temp_storage,
-        temp_storage_bytes,
-        d_in,
-        d_out,
-        scan_op,
-        initial_value,
-        num_items,
-        0,
-        debug_synchronous);
-
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
+  *d_cdp_error = Dispatch(cub_backend,
+                          is_primitive,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_keys_in,
+                          d_values_in,
+                          d_values_out,
+                          scan_op,
+                          initial_value,
+                          num_items,
+                          equality_op,
+                          0,
+                          debug_synchronous);
+
+  *d_temp_storage_bytes = temp_storage_bytes;
 }
 
-
 /**
  * Dispatch to CDP kernel
  */
-template <typename IsPrimitiveT, typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT, typename EqualityOpT>
-cudaError_t Dispatch(
-    Int2Type<CDP>         dispatch_to,
-    IsPrimitiveT          is_primitive,
-    int                   timing_timing_iterations,
-    size_t                *d_temp_storage_bytes,
-    cudaError_t           *d_cdp_error,
-
-    void*                 d_temp_storage,
-    size_t&               temp_storage_bytes,
-    KeysInputIteratorT    d_keys_in,
-    ValuesInputIteratorT  d_values_in,
-    ValuesOutputIteratorT d_values_out,
-    ScanOpT               scan_op,
-    InitialValueT         initial_value,
-    OffsetT               num_items,
-    EqualityOpT           equality_op,
-    cudaStream_t          stream,
-    bool                  debug_synchronous)
+template <typename IsPrimitiveT,
+          typename KeysInputIteratorT,
+          typename ValuesInputIteratorT,
+          typename ValuesOutputIteratorT,
+          typename ScanOpT,
+          typename InitialValueT,
+          typename OffsetT,
+          typename EqualityOpT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     IsPrimitiveT is_primitive,
+                     int          timing_timing_iterations,
+                     size_t      *d_temp_storage_bytes,
+                     cudaError_t *d_cdp_error,
+
+                     void                 *d_temp_storage,
+                     size_t               &temp_storage_bytes,
+                     KeysInputIteratorT    d_keys_in,
+                     ValuesInputIteratorT  d_values_in,
+                     ValuesOutputIteratorT d_values_out,
+                     ScanOpT               scan_op,
+                     InitialValueT         initial_value,
+                     OffsetT               num_items,
+                     EqualityOpT           equality_op,
+                     cudaStream_t          stream,
+                     bool                  debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(
-        is_primitive,
-        timing_timing_iterations,
-        d_temp_storage_bytes,
-        d_cdp_error,
-        d_temp_storage,
-        temp_storage_bytes,
-        d_keys_in,
-        d_values_in,
-        d_values_out,
-        scan_op,
-        initial_value,
-        equality_op,
-        num_items,
-        debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
+  // Invoke kernel to invoke device-side dispatch
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<CUB,
+                              IsPrimitiveT,
+                              KeysInputIteratorT,
+                              ValuesInputIteratorT,
+                              ValuesOutputIteratorT,
+                              ScanOpT,
+                              InitialValueT,
+                              OffsetT,
+                              EqualityOpT>,
+            Int2Type<CUB>{},
+            is_primitive,
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_values_in,
+            d_values_out,
+            scan_op,
+            initial_value,
+            num_items,
+            equality_op,
+            debug_synchronous);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
 }
 
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Test generation
@@ -924,10 +941,11 @@ void Test(
     InitialValueT   initial_value,
     EqualityOpT     equality_op)
 {
+#if TEST_CDP == 0
     Test<CUB, KeyT, InputT, OutputT>(num_items, scan_op, initial_value, equality_op);
-#ifdef CUB_CDP
+#elif TEST_CDP == 1
     Test<CDP, KeyT, InputT, OutputT>(num_items, scan_op, initial_value, equality_op);
-#endif
+#endif // TEST_CDP
 }
 
 
@@ -1019,7 +1037,6 @@ int main(int argc, char** argv)
             "[--i=<timing iterations> "
             "[--device=<device-id>] "
             "[--v] "
-            "[--cdp]"
             "\n", argv[0]);
         exit(0);
     }
@@ -1029,6 +1046,7 @@ int main(int argc, char** argv)
     g_device_giga_bandwidth = args.device_giga_bandwidth;
     printf("\n");
 
+    // %PARAM% TEST_CDP cdp 0:1
     // %PARAM% TEST_VALUE_TYPES types 0:1:2:3:4:5
 
 #if TEST_VALUE_TYPES == 0
diff --git a/test/test_device_segmented_sort.cu b/test/test_device_segmented_sort.cu
index 32a764f864..d1f58a1f3c 100644
--- a/test/test_device_segmented_sort.cu
+++ b/test/test_device_segmented_sort.cu
@@ -1475,7 +1475,7 @@ void InputTestRandom(Input<KeyT, ValueT> &input)
             }
 #endif
 
-AssertTrue(keys_ok);
+            AssertTrue(keys_ok);
             AssertTrue(values_ok);
 
             input.shuffle();
@@ -1659,7 +1659,7 @@ void Test()
 }
 
 
-#ifdef CUB_CDP
+#if TEST_CDP == 1
 template <typename KeyT>
 __global__ void LauncherKernel(
     void *tmp_storage,
@@ -1771,7 +1771,8 @@ void TestDeviceSideLaunch()
   TestDeviceSideLaunch<KeyT>(1 << 2, 1 << 8);
   TestDeviceSideLaunch<KeyT>(1 << 9, 1 << 19);
 }
-#endif
+
+#endif // TEST_CDP
 
 
 int main(int argc, char** argv)
@@ -1781,10 +1782,9 @@ int main(int argc, char** argv)
   // Initialize device
   CubDebugExit(args.DeviceInit());
 
-#ifdef CUB_CDP
-  TestDeviceSideLaunch<int>();
-#endif
+  // %PARAM% TEST_CDP cdp 0:1
 
+#if TEST_CDP == 0
   TestZeroSegments();
   TestEmptySegments(1 << 2);
   TestEmptySegments(1 << 22);
@@ -1800,5 +1800,9 @@ int main(int argc, char** argv)
   Test<std::uint8_t, std::uint64_t>();
   Test<std::int64_t, std::uint32_t>();
 
+#elif TEST_CDP == 1
+  TestDeviceSideLaunch<int>();
+#endif // TEST_CDP
+
   return 0;
 }
diff --git a/test/test_device_select_if.cu b/test/test_device_select_if.cu
index c917bfd45e..980dd4529c 100644
--- a/test/test_device_select_if.cu
+++ b/test/test_device_select_if.cu
@@ -33,13 +33,10 @@
 // Ensure printing of CUDA runtime errors to console
 #define CUB_STDERR
 
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/util_allocator.cuh>
 #include <cub/device/device_select.cuh>
 #include <cub/device/device_partition.cuh>
 #include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
 
 #include <thrust/count.h>
 #include <thrust/device_vector.h>
@@ -54,6 +51,9 @@
 
 #include "test_util.h"
 
+#include <cstdio>
+#include <typeinfo>
+
 using namespace cub;
 
 
@@ -229,88 +229,130 @@ cudaError_t Dispatch(
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceSelect
  */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
-__global__ void CnpDispatchKernel(
-    IsFlaggedTag                is_flagged,
-    IsPartitionTag              is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t                      temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    bool                        debug_synchronous)
+template <int CubBackend,
+          typename IsFlaggedTag,
+          typename IsPartitionTag,
+          typename InputIteratorT,
+          typename FlagIteratorT,
+          typename SelectOpT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  IsFlaggedTag         is_flagged,
+                                  IsPartitionTag       is_partition,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  InputIteratorT       d_in,
+                                  FlagIteratorT        d_flags,
+                                  OutputIteratorT      d_out,
+                                  NumSelectedIteratorT d_num_selected_out,
+                                  OffsetT              num_items,
+                                  SelectOpT            select_op,
+                                  bool                 debug_synchronous)
 {
+  *d_cdp_error = Dispatch(cub_backend,
+                          is_flagged,
+                          is_partition,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_flags,
+                          d_out,
+                          d_num_selected_out,
+                          num_items,
+                          select_op,
+                          0,
+                          debug_synchronous);
 
-#ifndef CUB_CDP
-    (void)is_flagged;
-    (void)is_partition;
-    (void)timing_timing_iterations;
-    (void)d_temp_storage_bytes;
-    (void)d_temp_storage;
-    (void)temp_storage_bytes;
-    (void)d_in;
-    (void)d_flags;
-    (void)d_out;
-    (void)d_num_selected_out;
-    (void)num_items;
-    (void)select_op;
-    (void)debug_synchronous;
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
+  *d_temp_storage_bytes = temp_storage_bytes;
 }
 
-
 /**
  * Dispatch to CDP kernel
  */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
-cudaError_t Dispatch(
-    Int2Type<CDP>               dispatch_to,
-    IsFlaggedTag                is_flagged,
-    IsPartitionTag              is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
+template <typename IsFlaggedTag,
+          typename IsPartitionTag,
+          typename InputIteratorT,
+          typename FlagIteratorT,
+          typename SelectOpT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     IsFlaggedTag   is_flagged,
+                     IsPartitionTag is_partition,
+                     int            timing_timing_iterations,
+                     size_t        *d_temp_storage_bytes,
+                     cudaError_t   *d_cdp_error,
+
+                     void                *d_temp_storage,
+                     size_t              &temp_storage_bytes,
+                     InputIteratorT       d_in,
+                     FlagIteratorT        d_flags,
+                     OutputIteratorT      d_out,
+                     NumSelectedIteratorT d_num_selected_out,
+                     OffsetT              num_items,
+                     SelectOpT            select_op,
+                     cudaStream_t         stream,
+                     bool                 debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
+  // Invoke kernel to invoke device-side dispatch
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<CUB,
+                              IsFlaggedTag,
+                              IsPartitionTag,
+                              InputIteratorT,
+                              FlagIteratorT,
+                              SelectOpT,
+                              OutputIteratorT,
+                              NumSelectedIteratorT,
+                              OffsetT>,
+            Int2Type<CUB>{},
+            is_flagged,
+            is_partition,
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            num_items,
+            select_op,
+            debug_synchronous);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
 }
 
-
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Test generation
@@ -635,10 +677,11 @@ template <
 void TestOp(
     int             num_items)
 {
+#if TEST_CDP == 0
     TestMethod<CUB, T>(num_items);
-#ifdef CUB_CDP
+#elif TEST_CDP == 1
     TestMethod<CDP, T>(num_items);
-#endif
+#endif // TEST_CDP
 }
 
 
@@ -981,8 +1024,8 @@ void TestFlaggedInPlaceWithAliasedFlags()
   int *d_num_out = thrust::raw_pointer_cast(num_out.data());
   int *d_data = thrust::raw_pointer_cast(data.data());
   int *d_flags = d_data; // alias
-  int *d_allocated_flags = thrust::raw_pointer_cast(data.data()); 
-  int *d_reference = thrust::raw_pointer_cast(reference.data()); 
+  int *d_allocated_flags = thrust::raw_pointer_cast(data.data());
+  int *d_reference = thrust::raw_pointer_cast(reference.data());
 
   void *d_tmp_storage{};
   std::size_t tmp_storage_size{};
@@ -1070,7 +1113,6 @@ int main(int argc, char** argv)
             "[--device=<device-id>] "
             "[--ratio=<selection ratio, default 0.5>] "
             "[--v] "
-            "[--cdp] "
             "\n", argv[0]);
         exit(0);
     }
@@ -1080,6 +1122,8 @@ int main(int argc, char** argv)
     g_device_giga_bandwidth = args.device_giga_bandwidth;
     printf("\n");
 
+    // %PARAM% TEST_CDP cdp 0:1
+
     TestFlagsAliasingInPartition();
 
     TestFlaggedInPlace();
diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu
index c7f6679278..64964e1b37 100644
--- a/test/test_device_select_unique.cu
+++ b/test/test_device_select_unique.cu
@@ -33,19 +33,20 @@
 // Ensure printing of CUDA runtime errors to console
 #define CUB_STDERR
 
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
 #include <cub/device/device_select.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
 
+#include <thrust/device_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
-#include <thrust/device_vector.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
 #include "test_util.h"
 
+#include <cstdio>
+#include <typeinfo>
+
 using namespace cub;
 
 
@@ -103,77 +104,102 @@ cudaError_t Dispatch(
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceSelect
  */
-template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__global__ void CnpDispatchKernel(
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      temp_storage_bytes,
-    InputIteratorT              d_in,
-    OutputIteratorT              d_out,
-    NumSelectedIteratorT         d_num_selected_out,
-    OffsetT                     num_items,
-    bool                        debug_synchronous)
+template <int CubBackend,
+          typename InputIteratorT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  InputIteratorT       d_in,
+                                  OutputIteratorT      d_out,
+                                  NumSelectedIteratorT d_num_selected_out,
+                                  OffsetT              num_items,
+                                  bool                 debug_synchronous)
 {
-
-#ifndef CUB_CDP
-    (void)timing_timing_iterations;
-    (void)d_temp_storage_bytes;
-    (void)d_cdp_error;
-    (void)d_temp_storage;
-    (void)temp_storage_bytes;
-    (void)d_in;
-    (void)d_out;
-    (void)d_num_selected_out;
-    (void)num_items;
-    (void)debug_synchronous;
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
+  *d_cdp_error = Dispatch(cub_backend,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_in,
+                          d_out,
+                          d_num_selected_out,
+                          num_items,
+                          0,
+                          debug_synchronous);
+
+  *d_temp_storage_bytes = temp_storage_bytes;
 }
 
-
 /**
  * Dispatch to CDP kernel
  */
-template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<CDP>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
+template <typename InputIteratorT,
+          typename OutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     int          timing_timing_iterations,
+                     size_t      *d_temp_storage_bytes,
+                     cudaError_t *d_cdp_error,
+
+                     void                *d_temp_storage,
+                     size_t              &temp_storage_bytes,
+                     InputIteratorT       d_in,
+                     OutputIteratorT      d_out,
+                     NumSelectedIteratorT d_num_selected_out,
+                     OffsetT              num_items,
+                     cudaStream_t         stream,
+                     bool                 debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
+  // Invoke kernel to invoke device-side dispatch
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<CUB,
+                              InputIteratorT,
+                              OutputIteratorT,
+                              NumSelectedIteratorT,
+                              OffsetT>,
+            Int2Type<CUB>{},
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            d_num_selected_out,
+            num_items,
+            debug_synchronous);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
 }
 
-
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Test generation
@@ -450,10 +476,11 @@ template <
 void TestOp(
     int             num_items)
 {
+#if TEST_CDP == 0
     Test<CUB, T>(num_items);
-#ifdef CUB_CDP
+#elif TEST_CDP == 1
     Test<CDP, T>(num_items);
-#endif
+#endif // TEST_CDP
 }
 
 
@@ -562,7 +589,6 @@ int main(int argc, char** argv)
             "[--maxseg=<max segment length>]"
             "[--entropy=<segment length bit entropy reduction rounds>]"
             "[--v] "
-            "[--cdp]"
             "\n", argv[0]);
         exit(0);
     }
@@ -572,6 +598,8 @@ int main(int argc, char** argv)
     g_device_giga_bandwidth = args.device_giga_bandwidth;
     printf("\n");
 
+    // %PARAM% TEST_CDP cdp 0:1
+
     // Test different input types
     Test<unsigned char>(num_items);
     Test<unsigned short>(num_items);
diff --git a/test/test_device_select_unique_by_key.cu b/test/test_device_select_unique_by_key.cu
index d8db2f7935..bcfa662c01 100644
--- a/test/test_device_select_unique_by_key.cu
+++ b/test/test_device_select_unique_by_key.cu
@@ -32,15 +32,17 @@
 // Ensure printing of CUDA runtime errors to console
 #define CUB_STDERR
 
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
 #include <cub/device/device_select.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
 #include "test_util.h"
 
+#include <cstdio>
+#include <typeinfo>
+
 using namespace cub;
 
 
@@ -101,83 +103,116 @@ cudaError_t Dispatch(
 // CUDA Nested Parallelism Test Kernel
 //---------------------------------------------------------------------
 
+#if TEST_CDP == 1
+
 /**
  * Simple wrapper kernel to invoke DeviceSelect
  */
-template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__global__ void CnpDispatchKernel(
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t                      temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    ValueInputIteratorT         d_values_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueOutputIteratorT        d_values_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    bool                        debug_synchronous)
+template <int CubBackend,
+          typename KeyInputIteratorT,
+          typename ValueInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+__global__ void CDPDispatchKernel(Int2Type<CubBackend> cub_backend,
+                                  int                  timing_timing_iterations,
+                                  size_t              *d_temp_storage_bytes,
+                                  cudaError_t         *d_cdp_error,
+
+                                  void                *d_temp_storage,
+                                  size_t               temp_storage_bytes,
+                                  KeyInputIteratorT    d_keys_in,
+                                  ValueInputIteratorT  d_values_in,
+                                  KeyOutputIteratorT   d_keys_out,
+                                  ValueOutputIteratorT d_values_out,
+                                  NumSelectedIteratorT d_num_selected_out,
+                                  OffsetT              num_items,
+                                  bool                 debug_synchronous)
 {
-
-#ifndef CUB_CDP
-    (void)timing_timing_iterations;
-    (void)d_temp_storage_bytes;
-    (void)d_cdp_error;
-    (void)d_temp_storage;
-    (void)temp_storage_bytes;
-    (void)d_keys_in;
-    (void)d_values_in;
-    (void)d_keys_out;
-    (void)d_values_out;
-    (void)d_num_selected_out;
-    (void)num_items;
-    (void)debug_synchronous;
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
+  *d_cdp_error = Dispatch(cub_backend,
+                          timing_timing_iterations,
+                          d_temp_storage_bytes,
+                          d_cdp_error,
+                          d_temp_storage,
+                          temp_storage_bytes,
+                          d_keys_in,
+                          d_values_in,
+                          d_keys_out,
+                          d_values_out,
+                          d_num_selected_out,
+                          num_items,
+                          0,
+                          debug_synchronous);
+
+  *d_temp_storage_bytes = temp_storage_bytes;
 }
 
-
 /**
  * Dispatch to CDP kernel
  */
-template <typename KeyInputIteratorT, typename ValueInputIteratorT, typename KeyOutputIteratorT, typename ValueOutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<CDP>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    ValueInputIteratorT         d_values_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueOutputIteratorT        d_values_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
+template <typename KeyInputIteratorT,
+          typename ValueInputIteratorT,
+          typename KeyOutputIteratorT,
+          typename ValueOutputIteratorT,
+          typename NumSelectedIteratorT,
+          typename OffsetT>
+cudaError_t Dispatch(Int2Type<CDP> /*dispatch_to*/,
+                     int          timing_timing_iterations,
+                     size_t      *d_temp_storage_bytes,
+                     cudaError_t *d_cdp_error,
+
+                     void                *d_temp_storage,
+                     size_t              &temp_storage_bytes,
+                     KeyInputIteratorT    d_keys_in,
+                     ValueInputIteratorT  d_values_in,
+                     KeyOutputIteratorT   d_keys_out,
+                     ValueOutputIteratorT d_values_out,
+                     NumSelectedIteratorT d_num_selected_out,
+                     OffsetT              num_items,
+                     cudaStream_t         stream,
+                     bool                 debug_synchronous)
 {
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
+  // Invoke kernel to invoke device-side dispatch
+  cudaError_t retval =
+    thrust::cuda_cub::launcher::triple_chevron(1, 1, 0, stream)
+      .doit(CDPDispatchKernel<CUB,
+                              KeyInputIteratorT,
+                              ValueInputIteratorT,
+                              KeyOutputIteratorT,
+                              ValueOutputIteratorT,
+                              NumSelectedIteratorT,
+                              OffsetT>,
+            Int2Type<CUB>{},
+            timing_timing_iterations,
+            d_temp_storage_bytes,
+            d_cdp_error,
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_values_in,
+            d_keys_out,
+            d_values_out,
+            d_num_selected_out,
+            num_items,
+            debug_synchronous);
+  CubDebugExit(retval);
+
+  // Copy out temp_storage_bytes
+  CubDebugExit(cudaMemcpy(&temp_storage_bytes,
+                          d_temp_storage_bytes,
+                          sizeof(size_t) * 1,
+                          cudaMemcpyDeviceToHost));
+
+  // Copy out error
+  CubDebugExit(cudaMemcpy(&retval,
+                          d_cdp_error,
+                          sizeof(cudaError_t) * 1,
+                          cudaMemcpyDeviceToHost));
+  return retval;
 }
 
-
+#endif // TEST_CDP
 
 //---------------------------------------------------------------------
 // Test generation
@@ -486,10 +521,11 @@ template <
 void TestOp(
     int             num_items)
 {
+#if TEST_CDP == 0
     Test<CUB, KeyT, ValueT>(num_items);
-#ifdef CUB_CDP
+#elif TEST_CDP == 1
     Test<CDP, KeyT, ValueT>(num_items);
-#endif
+#endif // TEST_CDP
 }
 
 
@@ -550,7 +586,6 @@ int main(int argc, char** argv)
             "[--maxseg=<max segment length>]"
             "[--entropy=<segment length bit entropy reduction rounds>]"
             "[--v] "
-            "[--cdp]"
             "\n", argv[0]);
         exit(0);
     }
@@ -560,6 +595,8 @@ int main(int argc, char** argv)
     g_device_giga_bandwidth = args.device_giga_bandwidth;
     printf("\n");
 
+    // %PARAM% TEST_CDP cdp 0:1
+
     // Test different input types
     Test<unsigned char, int>(num_items);
     Test<unsigned char, long>(num_items);
diff --git a/test/test_util.h b/test/test_util.h
index d7b5d28f7c..8d00e435a2 100644
--- a/test/test_util.h
+++ b/test/test_util.h
@@ -811,13 +811,13 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
             PRIMITIVE       = false,                        \
             NULL_TYPE       = false,                        \
         };                                                  \
-        static T Max()                                      \
+        static __host__ __device__ T Max()                  \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Max()};               \
             return retval;                                  \
         }                                                   \
-        static T Lowest()                                   \
+        static __host__ __device__ T Lowest()               \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Lowest()};            \
@@ -899,14 +899,14 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
             PRIMITIVE       = false,                        \
             NULL_TYPE       = false,                        \
         };                                                  \
-        static T Max()                                      \
+        static __host__ __device__ T Max()                  \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Max(),                \
                 NumericTraits<BaseT>::Max()};               \
             return retval;                                  \
         }                                                   \
-        static T Lowest()                                   \
+        static __host__ __device__  T Lowest()              \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Lowest(),             \
@@ -996,7 +996,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
             PRIMITIVE       = false,                        \
             NULL_TYPE       = false,                        \
         };                                                  \
-        static T Max()                                      \
+        static __host__ __device__ T Max()                  \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Max(),                \
@@ -1004,7 +1004,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
                 NumericTraits<BaseT>::Max()};               \
             return retval;                                  \
         }                                                   \
-        static T Lowest()                                   \
+        static __host__ __device__ T Lowest()               \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Lowest(),             \
@@ -1101,7 +1101,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
             PRIMITIVE       = false,                        \
             NULL_TYPE       = false,                        \
         };                                                  \
-        static T Max()                                      \
+        static __host__ __device__ T Max()                  \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Max(),                \
@@ -1110,7 +1110,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
                 NumericTraits<BaseT>::Max()};               \
             return retval;                                  \
         }                                                   \
-        static T Lowest()                                   \
+        static __host__ __device__ T Lowest()               \
         {                                                   \
             T retval = {                                    \
                 NumericTraits<BaseT>::Lowest(),             \
@@ -1253,7 +1253,7 @@ struct NumericTraits<TestFoo>
         PRIMITIVE       = false,
         NULL_TYPE       = false,
     };
-    static TestFoo Max()
+  __host__ __device__ static TestFoo Max()
     {
         return TestFoo::MakeTestFoo(
             NumericTraits<long long>::Max(),
@@ -1262,7 +1262,7 @@ struct NumericTraits<TestFoo>
             NumericTraits<char>::Max());
     }
 
-    static TestFoo Lowest()
+  __host__ __device__ static TestFoo Lowest()
     {
         return TestFoo::MakeTestFoo(
             NumericTraits<long long>::Lowest(),
@@ -1369,14 +1369,14 @@ struct NumericTraits<TestBar>
         PRIMITIVE       = false,
         NULL_TYPE       = false,
     };
-    static TestBar Max()
+    __host__ __device__ static TestBar Max()
     {
         return TestBar(
             NumericTraits<long long>::Max(),
             NumericTraits<int>::Max());
     }
 
-    static TestBar Lowest()
+    __host__ __device__ static TestBar Lowest()
     {
         return TestBar(
             NumericTraits<long long>::Lowest(),
@@ -1513,6 +1513,11 @@ int CompareDeviceResults(
     bool verbose = true,
     bool display_data = false)
 {
+    if (num_items == 0)
+    {
+        return 0;
+    }
+
     // Allocate array on host
     T *h_data = (T*) malloc(num_items * sizeof(T));