
Commit
Merge pull request #359 from elstehle/feature/device-batch-memcpy
Adds DeviceBatchMemcpy algorithm and tests
gevtushenko authored Dec 30, 2022
2 parents 73f3434 + 9ac15eb commit 423f54e
Showing 8 changed files with 2,614 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .clang-format
@@ -47,7 +47,7 @@ IncludeCategories:
Priority: 2
- Regex: '^<cuda'
Priority: 3
- Regex: '^<[a-z]*>$'
- Regex: '^<[a-z_]*>$'
Priority: 4
IndentCaseLabels: true
IndentPPDirectives: None
1,099 changes: 1,099 additions & 0 deletions cub/agent/agent_batch_memcpy.cuh

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions cub/agent/single_pass_scan_operators.cuh
@@ -269,6 +269,16 @@ struct ScanTileState<T, true>
value = tile_descriptor.value;
}

/**
* Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or
* (b) there is no memory fence between reading a non-invalid status and the call to LoadValid.
*/
__device__ __forceinline__ T LoadValid(int tile_idx)
{
TxnWord alias = d_tile_descriptors[TILE_STATUS_PADDING + tile_idx];
TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
return tile_descriptor.value;
}
};
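For context, a minimal device-side sketch (not part of this change) of the ordering the comment above requires; the function name and values are hypothetical, and it assumes the pre-existing WaitForValid member of this header plus CUDA's __threadfence:

__device__ void read_back_tile_value(cub::ScanTileState<int> &tile_state, int tile_idx)
{
  // WaitForValid spins until the tile's status is no longer invalid and returns its value.
  typename cub::ScanTileState<int>::StatusWord status;
  int value;
  tile_state.WaitForValid(tile_idx, status, value);

  // Per the comment above, a memory fence between observing the non-invalid status and
  // calling LoadValid is required for the returned value to be well defined.
  __threadfence();
  int value_again = tile_state.LoadValid(tile_idx);
  (void)value_again;
}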


@@ -426,6 +436,15 @@ struct ScanTileState<T, false>
else
value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
}

/**
* Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or
* (b) there is no memory fence between reading a non-invalid status and the call to LoadValid.
*/
__device__ __forceinline__ T LoadValid(int tile_idx)
{
return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx];
}
};


16 changes: 8 additions & 8 deletions cub/cub.cuh
@@ -1,7 +1,7 @@
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
@@ -12,7 +12,7 @@
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -37,33 +37,34 @@
#include "config.cuh"

// Block
#include "block/block_histogram.cuh"
#include "block/block_adjacent_difference.cuh"
#include "block/block_discontinuity.cuh"
#include "block/block_exchange.cuh"
#include "block/block_histogram.cuh"
#include "block/block_load.cuh"
#include "block/block_merge_sort.cuh"
#include "block/block_radix_rank.cuh"
#include "block/block_radix_sort.cuh"
#include "block/block_merge_sort.cuh"
#include "block/block_reduce.cuh"
#include "block/block_scan.cuh"
#include "block/block_store.cuh"
//#include "block/block_shift.cuh"

// Device
#include "device/device_merge_sort.cuh"
#include "device/device_adjacent_difference.cuh"
#include "device/device_histogram.cuh"
#include "device/device_memcpy.cuh"
#include "device/device_merge_sort.cuh"
#include "device/device_partition.cuh"
#include "device/device_radix_sort.cuh"
#include "device/device_reduce.cuh"
#include "device/device_run_length_encode.cuh"
#include "device/device_scan.cuh"
#include "device/device_segmented_sort.cuh"
#include "device/device_segmented_radix_sort.cuh"
#include "device/device_segmented_reduce.cuh"
#include "device/device_segmented_sort.cuh"
#include "device/device_select.cuh"
#include "device/device_spmv.cuh"
#include "device/device_adjacent_difference.cuh"

// Grid
//#include "grid/grid_barrier.cuh"
@@ -105,4 +106,3 @@
#include "util_macro.cuh"
#include "util_ptx.cuh"
#include "util_type.cuh"

170 changes: 170 additions & 0 deletions cub/device/device_memcpy.cuh
@@ -0,0 +1,170 @@
/******************************************************************************
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

/**
* \file
* cub::DeviceMemcpy provides device-wide, parallel operations for copying data.
*/

#pragma once

#include <cub/config.cuh>
#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>

#include <cstdint>
#include <type_traits>

CUB_NAMESPACE_BEGIN

/**
* @brief cub::DeviceMemcpy provides device-wide, parallel operations for copying data.
* \ingroup SingleModule
*/
struct DeviceMemcpy
{
/**
* @brief Copies data from a batch of given source buffers to their corresponding destination
* buffer.
 * @note If any input buffer aliases memory from any output buffer, the behavior is undefined. If
 * any output buffer aliases memory of another output buffer, the behavior is undefined. Input
 * buffers can alias one another.
*
* @par Snippet
 * The code snippet below illustrates usage of DeviceMemcpy::Batched for permuting strings from one
 * string buffer into another.
* @par
* @code
* struct GetPtrToStringItem
* {
* __host__ __device__ __forceinline__ void *operator()(uint32_t index)
* {
* return &d_string_data_in[d_string_offsets[index]];
* }
* char *d_string_data_in;
* uint32_t *d_string_offsets;
* };
*
* struct GetStringItemSize
* {
* __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
* {
* return d_string_offsets[index + 1] - d_string_offsets[index];
* }
* uint32_t *d_string_offsets;
* };
*
* uint32_t num_strings = 5;
* char *d_string_data_in; // e.g., "TomatoesBananasApplesOrangesGrapes"
* char *d_string_data_out; // e.g., " ... "
* uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34]
* uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34]
* uint32_t *d_gather_index; // e.g., [2, 1, 4, 3, 0]
*
* // Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced
* auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0),
* d_gather_index);
*
* // Returns pointers to the input buffer for each string
* auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator,
* GetPtrToStringItem{d_string_data_in,
* d_string_offsets_old});
*
* // Returns the string size of the i-th string
* auto str_sizes = thrust::make_transform_iterator(gather_iterator,
* GetStringItemSize{d_string_offsets_old});
*
* // Returns pointers to the output buffer for each string
* auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
* GetPtrToStringItem{d_string_data_out,
* d_string_offsets_new});
*
* // Determine temporary device storage requirements
* void *d_temp_storage = nullptr;
* size_t temp_storage_bytes = 0;
* cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
* str_sizes, num_strings);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run batched copy algorithm (used to permute strings)
* cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
* str_sizes, num_strings);
*
 * // d_string_data_out <-- "ApplesBananasGrapesOrangesTomatoes"
* @endcode
* @tparam InputBufferIt <b>[inferred]</b> Device-accessible random-access input iterator type
* providing the pointers to the source memory buffers
* @tparam OutputBufferIt <b>[inferred]</b> Device-accessible random-access input iterator type
* providing the pointers to the destination memory buffers
* @tparam BufferSizeIteratorT <b>[inferred]</b> Device-accessible random-access input iterator
* type providing the number of bytes to be copied for each pair of buffers
* @param d_temp_storage [in] Device-accessible allocation of temporary storage. When NULL, the
* required allocation size is written to \p temp_storage_bytes and no work is done.
* @param temp_storage_bytes [in,out] Reference to size in bytes of \p d_temp_storage allocation
* @param input_buffer_it [in] Device-accessible iterator providing the pointers to the source
* memory buffers
* @param output_buffer_it [in] Device-accessible iterator providing the pointers to the
* destination memory buffers
* @param buffer_sizes [in] Device-accessible iterator providing the number of bytes to be copied
* for each pair of buffers
* @param num_buffers [in] The total number of buffer pairs
* @param stream [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is
* stream<sub>0</sub>.
*/
template <typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT>
CUB_RUNTIME_FUNCTION static cudaError_t Batched(void *d_temp_storage,
size_t &temp_storage_bytes,
InputBufferIt input_buffer_it,
OutputBufferIt output_buffer_it,
BufferSizeIteratorT buffer_sizes,
uint32_t num_buffers,
cudaStream_t stream = 0)
{
// Integer type large enough to hold any offset in [0, num_buffers)
using BufferOffsetT = uint32_t;

// Integer type large enough to hold any offset in [0, num_thread_blocks_launched), where a safe
// upper bound on num_thread_blocks_launched can be assumed to be given by
// IDIV_CEIL(num_buffers, 64)
using BlockOffsetT = uint32_t;

return detail::DispatchBatchMemcpy<InputBufferIt,
OutputBufferIt,
BufferSizeIteratorT,
BufferOffsetT,
BlockOffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
input_buffer_it,
output_buffer_it,
buffer_sizes,
num_buffers,
stream);
}
};

CUB_NAMESPACE_END
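To complement the string-permutation snippet above, a minimal host-side sketch (not part of this change) using plain device arrays of pointers and byte counts; the variable setup is hypothetical, only the two-phase Batched invocation follows the documentation above:

// Hypothetical inputs: d_srcs and d_dsts each hold num_buffers device pointers,
// d_sizes holds the number of bytes to copy for each pair.
void **d_srcs;
void **d_dsts;
uint32_t *d_sizes;
uint32_t num_buffers = 1024;

// First call: determine temporary storage requirements (no copying is done).
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes,
                           d_srcs, d_dsts, d_sizes, num_buffers);

// Allocate temporary storage, then run the batched copy.
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes,
                           d_srcs, d_dsts, d_sizes, num_buffers);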
