From 192d72d0fabe20107c8218de7752f03b33898ba5 Mon Sep 17 00:00:00 2001 From: Tomoya Kimura Date: Mon, 27 Nov 2023 18:36:42 +0900 Subject: [PATCH] perf(image_projection_based_fusion,lidar_centerpoint): add cuda preprocess (#1041) * add cuda preprocess Signed-off-by: wep21 * add cuda preprocess for centerpoint Signed-off-by: wep21 * update pp Signed-off-by: wep21 Signed-off-by: tomoya.kimura * fix bug Signed-off-by: wep21 * style(pre-commit): autofix --------- Signed-off-by: wep21 Signed-off-by: tomoya.kimura Co-authored-by: wep21 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../pointpainting_trt.hpp | 3 +- .../preprocess_kernel.hpp | 12 +- .../pointpainting_fusion/voxel_generator.hpp | 4 +- .../pointpainting_trt.cpp | 45 +++++--- .../pointpainting_fusion/preprocess_kernel.cu | 103 +++++++++++++++++- .../pointpainting_fusion/voxel_generator.cpp | 82 +++----------- .../lidar_centerpoint/centerpoint_trt.hpp | 17 ++- .../network/scatter_kernel.hpp | 2 +- .../preprocess/preprocess_kernel.hpp | 12 +- .../preprocess/voxel_generator.hpp | 8 +- .../lidar_centerpoint/lib/centerpoint_trt.cpp | 67 +++++++----- .../lib/network/scatter_kernel.cu | 6 +- .../lib/preprocess/preprocess_kernel.cu | 99 ++++++++++++++++- .../lib/preprocess/voxel_generator.cpp | 76 ++----------- 14 files changed, 329 insertions(+), 207 deletions(-) diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp index 0f5ada722c31d..d28d9eb31216d 100644 --- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp +++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp @@ -23,6 +23,7 @@ namespace image_projection_based_fusion { +static constexpr size_t CAPACITY_POINT = 1000000; class PointPaintingTRT : public centerpoint::CenterPointTRT { public: @@ -33,8 +34,6 @@ class PointPaintingTRT : public centerpoint::CenterPointTRT const centerpoint::DensificationParam & densification_param, const centerpoint::CenterPointConfig & config); - ~PointPaintingTRT(); - protected: bool preprocess( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp index 7d7f290f71a4c..c913ac33d5e84 100644 --- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp +++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp @@ -20,9 +20,19 @@ namespace image_projection_based_fusion { +cudaError_t generateVoxels_random_launch( + const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, + float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, + float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels, + cudaStream_t stream); + +cudaError_t generateBaseFeatures_launch( + unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num, + float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream); + cudaError_t generateFeatures_launch( const float * voxel_features, const float * voxel_num_points, const int * coords, - const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, + const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, const float voxel_size_y, const float voxel_size_z, const float range_min_x, const float range_min_y, const float range_min_z, float * features, const std::size_t encoder_in_feature_size, cudaStream_t stream); diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp index 3e223dd3101df..ad0f3e38d665c 100755 --- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp +++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp @@ -26,9 +26,7 @@ class VoxelGenerator : public centerpoint::VoxelGenerator public: using centerpoint::VoxelGenerator::VoxelGenerator; - std::size_t pointsToVoxels( - std::vector & voxels, std::vector & coordinates, - std::vector & num_points_per_voxel) override; + std::size_t generateSweepPoints(std::vector & points) override; }; } // namespace image_projection_based_fusion diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp index f0e4de9c988e2..8911442f4c75d 100644 --- a/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp +++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp @@ -43,26 +43,37 @@ bool PointPaintingTRT::preprocess( if (!is_success) { return false; } - num_voxels_ = vg_ptr_pp_->pointsToVoxels(voxels_, coordinates_, num_points_per_voxel_); - if (num_voxels_ == 0) { - return false; - } - const auto voxels_size = - num_voxels_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_; - const auto coordinates_size = num_voxels_ * config_.point_dim_size_; - // memcpy from host to device (not copy empty voxels) - CHECK_CUDA_ERROR(cudaMemcpyAsync( - voxels_d_.get(), voxels_.data(), voxels_size * sizeof(float), cudaMemcpyHostToDevice)); + const auto count = vg_ptr_pp_->generateSweepPoints(points_); CHECK_CUDA_ERROR(cudaMemcpyAsync( - coordinates_d_.get(), coordinates_.data(), coordinates_size * sizeof(int), - cudaMemcpyHostToDevice)); - CHECK_CUDA_ERROR(cudaMemcpyAsync( - num_points_per_voxel_d_.get(), num_points_per_voxel_.data(), num_voxels_ * sizeof(float), - cudaMemcpyHostToDevice)); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float), + cudaMemcpyHostToDevice, stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_, stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_, stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync( + voxels_d_.get(), 0, + config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_ * + sizeof(float), + stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync( + coordinates_d_.get(), 0, config_.max_voxel_size_ * config_.point_dim_size_ * sizeof(int), + stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync( + num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_)); + + CHECK_CUDA_ERROR(image_projection_based_fusion::generateVoxels_random_launch( + points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_, + config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_, + config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_, + mask_d_.get(), voxels_buffer_d_.get(), stream_)); + + CHECK_CUDA_ERROR(image_projection_based_fusion::generateBaseFeatures_launch( + mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_, + num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), + stream_)); CHECK_CUDA_ERROR(image_projection_based_fusion::generateFeatures_launch( - voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_, + voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(), config_.max_voxel_size_, config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_, config_.range_min_x_, config_.range_min_y_, config_.range_min_z_, encoder_in_features_d_.get(), config_.encoder_in_feature_size_, stream_)); diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu b/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu index 9789f52de5f74..d06b60633acf8 100644 --- a/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu +++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu @@ -57,9 +57,105 @@ std::size_t divup(const std::size_t a, const std::size_t b) namespace image_projection_based_fusion { +__global__ void generateVoxels_random_kernel( + const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, + float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, + float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels) +{ + int point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= points_size) return; + + float x = points[point_idx * POINT_FEATURE_SIZE]; + float y = points[point_idx * POINT_FEATURE_SIZE + 1]; + float z = points[point_idx * POINT_FEATURE_SIZE + 2]; + + if ( + x < min_x_range || x >= max_x_range || y < min_y_range || y >= max_y_range || z < min_z_range || + z >= max_z_range) + return; + + int voxel_idx = floorf((x - min_x_range) / pillar_x_size); + int voxel_idy = floorf((y - min_y_range) / pillar_y_size); + unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx; + + unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1); + + if (point_id >= MAX_POINT_IN_VOXEL_SIZE) return; + float * address = + voxels + (voxel_index * MAX_POINT_IN_VOXEL_SIZE + point_id) * POINT_FEATURE_SIZE; + for (unsigned int i = 0; i < POINT_FEATURE_SIZE; ++i) { + atomicExch(address + i, points[point_idx * POINT_FEATURE_SIZE + i]); + } +} + +cudaError_t generateVoxels_random_launch( + const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, + float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, + float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels, + cudaStream_t stream) +{ + dim3 blocks((points_size + 256 - 1) / 256); + dim3 threads(256); + generateVoxels_random_kernel<<>>( + points, points_size, min_x_range, max_x_range, min_y_range, max_y_range, min_z_range, + max_z_range, pillar_x_size, pillar_y_size, pillar_z_size, grid_y_size, grid_x_size, mask, + voxels); + cudaError_t err = cudaGetLastError(); + return err; +} + +__global__ void generateBaseFeatures_kernel( + unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num, + float * voxel_features, float * voxel_num, int * voxel_idxs) +{ + unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y; + + if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return; + + unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx; + unsigned int count = mask[voxel_index]; + if (!(count > 0)) return; + count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE; + + unsigned int current_pillarId = 0; + current_pillarId = atomicAdd(pillar_num, 1); + + voxel_num[current_pillarId] = count; + + uint3 idx = {0, voxel_idy, voxel_idx}; + ((uint3 *)voxel_idxs)[current_pillarId] = idx; + + for (int i = 0; i < count; i++) { + int inIndex = voxel_index * MAX_POINT_IN_VOXEL_SIZE + i; + int outIndex = current_pillarId * MAX_POINT_IN_VOXEL_SIZE + i; + for (unsigned int j = 0; j < POINT_FEATURE_SIZE; ++j) { + voxel_features[outIndex * POINT_FEATURE_SIZE + j] = voxels[inIndex * POINT_FEATURE_SIZE + j]; + } + } + + // clear buffer for next infer + atomicExch(mask + voxel_index, 0); +} + +// create 4 channels +cudaError_t generateBaseFeatures_launch( + unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num, + float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream) +{ + dim3 threads = {32, 32}; + dim3 blocks = { + (grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y}; + + generateBaseFeatures_kernel<<>>( + mask, voxels, grid_y_size, grid_x_size, pillar_num, voxel_features, voxel_num, voxel_idxs); + cudaError_t err = cudaGetLastError(); + return err; +} + __global__ void generateFeatures_kernel( const float * voxel_features, const float * voxel_num_points, const int * coords, - const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z, + const unsigned int * num_voxels, const float voxel_x, const float voxel_y, const float voxel_z, const float range_min_x, const float range_min_y, const float range_min_z, float * features, const std::size_t encoder_in_feature_size) { @@ -70,7 +166,8 @@ __global__ void generateFeatures_kernel( int point_idx = threadIdx.x % MAX_POINT_IN_VOXEL_SIZE; int pillar_idx_inBlock = threadIdx.x / MAX_POINT_IN_VOXEL_SIZE; - if (pillar_idx >= num_voxels) return; + unsigned int num_pillars = num_voxels[0]; + if (pillar_idx >= num_pillars) return; // load src __shared__ float pillarSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE][POINT_FEATURE_SIZE]; @@ -155,7 +252,7 @@ __global__ void generateFeatures_kernel( cudaError_t generateFeatures_launch( const float * voxel_features, const float * voxel_num_points, const int * coords, - const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, + const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, const float voxel_size_y, const float voxel_size_z, const float range_min_x, const float range_min_y, const float range_min_z, float * features, const std::size_t encoder_in_feature_size, cudaStream_t stream) diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp index cfd24829e7806..ea10cb1237436 100755 --- a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp +++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp @@ -18,29 +18,10 @@ namespace image_projection_based_fusion { -std::size_t VoxelGenerator::pointsToVoxels( - std::vector & voxels, std::vector & coordinates, - std::vector & num_points_per_voxel) +size_t VoxelGenerator::generateSweepPoints(std::vector & points) { - // voxels (float): (max_num_voxels * max_num_points_per_voxel * point_feature_size) - // coordinates (int): (max_num_voxels * point_dim_size) - // num_points_per_voxel (float): (max_num_voxels) - - const std::size_t grid_size = config_.grid_size_z_ * config_.grid_size_y_ * config_.grid_size_x_; - std::vector> coord_to_voxel_idx(grid_size, -1); - - std::size_t voxel_cnt = 0; // @return - // std::array point; - // std::array coord_zyx; - std::vector point; - point.resize(config_.point_feature_size_); - std::vector coord_zyx; - coord_zyx.resize(config_.point_dim_size_); - bool out_of_range; - std::size_t point_cnt; - int c, coord_idx, voxel_idx; Eigen::Vector3f point_current, point_past; - + size_t point_counter{}; for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter); pc_cache_iter++) { auto pc_msg = pc_cache_iter->pointcloud_msg; @@ -55,56 +36,25 @@ std::size_t VoxelGenerator::pointsToVoxels( point_past << *x_iter, *y_iter, *z_iter; point_current = affine_past2current * point_past; - point[0] = point_current.x(); - point[1] = point_current.y(); - point[2] = point_current.z(); - point[3] = time_lag; + points.at(point_counter * config_.point_feature_size_) = point_current.x(); + points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y(); + points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z(); + points.at(point_counter * config_.point_feature_size_ + 3) = time_lag; // decode the class value back to one-hot binary and assign it to point - std::fill(point.begin() + 4, point.end(), 0); - point[4 + *class_iter] = 1; - - out_of_range = false; - for (std::size_t di = 0; di < config_.point_dim_size_; di++) { - c = static_cast((point[di] - range_[di]) * recip_voxel_size_[di]); - if (c < 0 || c >= grid_size_[di]) { - out_of_range = true; - break; - } - coord_zyx[config_.point_dim_size_ - di - 1] = c; - } - if (out_of_range) { - continue; + for (size_t i = 0; i < config_.class_size_; ++i) { + points.at(point_counter * config_.point_feature_size_ + 4 + i) = 0.f; } - - coord_idx = coord_zyx[0] * config_.grid_size_y_ * config_.grid_size_x_ + - coord_zyx[1] * config_.grid_size_x_ + coord_zyx[2]; - voxel_idx = coord_to_voxel_idx[coord_idx].value(); - if (voxel_idx == -1) { - voxel_idx = voxel_cnt; - if (voxel_cnt >= config_.max_voxel_size_) { - continue; - } - - voxel_cnt++; - coord_to_voxel_idx[coord_idx] = voxel_idx; - for (std::size_t di = 0; di < config_.point_dim_size_; di++) { - coordinates[voxel_idx * config_.point_dim_size_ + di] = coord_zyx[di]; - } - } - - point_cnt = num_points_per_voxel[voxel_idx]; - if (point_cnt < config_.max_point_in_voxel_size_) { - for (std::size_t fi = 0; fi < config_.point_feature_size_; fi++) { - voxels - [voxel_idx * config_.max_point_in_voxel_size_ * config_.point_feature_size_ + - point_cnt * config_.point_feature_size_ + fi] = point[fi]; - } - num_points_per_voxel[voxel_idx]++; + auto class_value = static_cast(*class_iter); + auto iter = points.begin() + point_counter * config_.point_feature_size_ + 4; + while (class_value > 0) { + *iter = class_value % 2; + class_value /= 2; + ++iter; } + ++point_counter; } } - - return voxel_cnt; + return point_counter; } } // namespace image_projection_based_fusion diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp index fee17f1c0156a..8cf250be0c049 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp @@ -32,6 +32,8 @@ namespace centerpoint { +static constexpr size_t CAPACITY_POINT = 1000000; + class NetworkParam { public: @@ -59,7 +61,7 @@ class CenterPointTRT const NetworkParam & encoder_param, const NetworkParam & head_param, const DensificationParam & densification_param, const CenterPointConfig & config); - ~CenterPointTRT(); + virtual ~CenterPointTRT(); bool detect( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer, @@ -83,12 +85,13 @@ class CenterPointTRT std::size_t class_size_{0}; CenterPointConfig config_; - std::size_t num_voxels_{0}; std::size_t encoder_in_feature_size_{0}; std::size_t spatial_features_size_{0}; - std::vector voxels_; - std::vector coordinates_; - std::vector num_points_per_voxel_; + std::size_t voxels_buffer_size_{0}; + std::size_t mask_size_{0}; + std::size_t voxels_size_{0}; + std::size_t coordinates_size_{0}; + std::vector points_; cuda::unique_ptr voxels_d_{nullptr}; cuda::unique_ptr coordinates_d_{nullptr}; cuda::unique_ptr num_points_per_voxel_d_{nullptr}; @@ -101,6 +104,10 @@ class CenterPointTRT cuda::unique_ptr head_out_dim_d_{nullptr}; cuda::unique_ptr head_out_rot_d_{nullptr}; cuda::unique_ptr head_out_vel_d_{nullptr}; + cuda::unique_ptr points_d_{nullptr}; + cuda::unique_ptr voxels_buffer_d_{nullptr}; + cuda::unique_ptr mask_d_{nullptr}; + cuda::unique_ptr num_voxels_d_{nullptr}; }; } // namespace centerpoint diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp index e754d89468ba3..3dd00c25fd9e7 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp @@ -21,7 +21,7 @@ namespace centerpoint { cudaError_t scatterFeatures_launch( - const float * pillar_features, const int * coords, const std::size_t num_pillars, + const float * pillar_features, const int * coords, const unsigned int * num_pillars, const std::size_t max_voxel_size, const std::size_t encoder_out_feature_size, const std::size_t grid_size_x, const std::size_t grid_size_y, float * scattered_features, cudaStream_t stream); diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp index 3abc8cfeccd5b..824144fe3b22b 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp @@ -20,9 +20,19 @@ namespace centerpoint { +cudaError_t generateVoxels_random_launch( + const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, + float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, + float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels, + cudaStream_t stream); + +cudaError_t generateBaseFeatures_launch( + unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num, + float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream); + cudaError_t generateFeatures_launch( const float * voxel_features, const float * voxel_num_points, const int * coords, - const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, + const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, const float voxel_size_y, const float voxel_size_z, const float range_min_x, const float range_min_y, const float range_min_z, float * features, cudaStream_t stream); diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp index 810ae3afd20d0..be15d51e91715 100644 --- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp +++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp @@ -31,9 +31,7 @@ class VoxelGeneratorTemplate explicit VoxelGeneratorTemplate( const DensificationParam & param, const CenterPointConfig & config); - virtual std::size_t pointsToVoxels( - std::vector & voxels, std::vector & coordinates, - std::vector & num_points_per_voxel) = 0; + virtual std::size_t generateSweepPoints(std::vector & points) = 0; bool enqueuePointCloud( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer); @@ -52,9 +50,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate public: using VoxelGeneratorTemplate::VoxelGeneratorTemplate; - std::size_t pointsToVoxels( - std::vector & voxels, std::vector & coordinates, - std::vector & num_points_per_voxel) override; + std::size_t generateSweepPoints(std::vector & points) override; }; } // namespace centerpoint diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp index 9b01ea8f53675..acdcad1b54c40 100644 --- a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp +++ b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp @@ -69,9 +69,9 @@ CenterPointTRT::~CenterPointTRT() void CenterPointTRT::initPtr() { - const auto voxels_size = + voxels_size_ = config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_; - const auto coordinates_size = config_.max_voxel_size_ * config_.point_dim_size_; + coordinates_size_ = config_.max_voxel_size_ * config_.point_dim_size_; encoder_in_feature_size_ = config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.encoder_in_feature_size_; const auto pillar_features_size = config_.max_voxel_size_ * config_.encoder_out_feature_size_; @@ -79,14 +79,16 @@ void CenterPointTRT::initPtr() config_.grid_size_x_ * config_.grid_size_y_ * config_.encoder_out_feature_size_; const auto grid_xy_size = config_.down_grid_size_x_ * config_.down_grid_size_y_; + voxels_buffer_size_ = config_.grid_size_x_ * config_.grid_size_y_ * + config_.max_point_in_voxel_size_ * config_.point_feature_size_; + mask_size_ = config_.grid_size_x_ * config_.grid_size_y_; + // host - voxels_.resize(voxels_size); - coordinates_.resize(coordinates_size); - num_points_per_voxel_.resize(config_.max_voxel_size_); + points_.resize(CAPACITY_POINT * config_.point_feature_size_); // device - voxels_d_ = cuda::make_unique(voxels_size); - coordinates_d_ = cuda::make_unique(coordinates_size); + voxels_d_ = cuda::make_unique(voxels_size_); + coordinates_d_ = cuda::make_unique(coordinates_size_); num_points_per_voxel_d_ = cuda::make_unique(config_.max_voxel_size_); encoder_in_features_d_ = cuda::make_unique(encoder_in_feature_size_); pillar_features_d_ = cuda::make_unique(pillar_features_size); @@ -97,15 +99,16 @@ void CenterPointTRT::initPtr() head_out_dim_d_ = cuda::make_unique(grid_xy_size * config_.head_out_dim_size_); head_out_rot_d_ = cuda::make_unique(grid_xy_size * config_.head_out_rot_size_); head_out_vel_d_ = cuda::make_unique(grid_xy_size * config_.head_out_vel_size_); + points_d_ = cuda::make_unique(CAPACITY_POINT * config_.point_feature_size_); + voxels_buffer_d_ = cuda::make_unique(voxels_buffer_size_); + mask_d_ = cuda::make_unique(mask_size_); + num_voxels_d_ = cuda::make_unique(1); } bool CenterPointTRT::detect( const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer, std::vector & det_boxes3d) { - std::fill(voxels_.begin(), voxels_.end(), 0); - std::fill(coordinates_.begin(), coordinates_.end(), -1); - std::fill(num_points_per_voxel_.begin(), num_points_per_voxel_.end(), 0); CHECK_CUDA_ERROR(cudaMemsetAsync( encoder_in_features_d_.get(), 0, encoder_in_feature_size_ * sizeof(float), stream_)); CHECK_CUDA_ERROR( @@ -131,27 +134,33 @@ bool CenterPointTRT::preprocess( if (!is_success) { return false; } - num_voxels_ = vg_ptr_->pointsToVoxels(voxels_, coordinates_, num_points_per_voxel_); - if (num_voxels_ == 0) { - return false; - } - - const auto voxels_size = - num_voxels_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_; - const auto coordinates_size = num_voxels_ * config_.point_dim_size_; - // memcpy from host to device (not copy empty voxels) - CHECK_CUDA_ERROR(cudaMemcpyAsync( - voxels_d_.get(), voxels_.data(), voxels_size * sizeof(float), cudaMemcpyHostToDevice)); - CHECK_CUDA_ERROR(cudaMemcpyAsync( - coordinates_d_.get(), coordinates_.data(), coordinates_size * sizeof(int), - cudaMemcpyHostToDevice)); + const auto count = vg_ptr_->generateSweepPoints(points_); CHECK_CUDA_ERROR(cudaMemcpyAsync( - num_points_per_voxel_d_.get(), num_points_per_voxel_.data(), num_voxels_ * sizeof(float), - cudaMemcpyHostToDevice)); - CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float), + cudaMemcpyHostToDevice, stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_)); + CHECK_CUDA_ERROR( + cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_ * sizeof(int), stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_d_.get(), 0, voxels_size_ * sizeof(float), stream_)); + CHECK_CUDA_ERROR( + cudaMemsetAsync(coordinates_d_.get(), 0, coordinates_size_ * sizeof(int), stream_)); + CHECK_CUDA_ERROR(cudaMemsetAsync( + num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_)); + + CHECK_CUDA_ERROR(generateVoxels_random_launch( + points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_, + config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_, + config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_, + mask_d_.get(), voxels_buffer_d_.get(), stream_)); + + CHECK_CUDA_ERROR(generateBaseFeatures_launch( + mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_, + num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), + stream_)); CHECK_CUDA_ERROR(generateFeatures_launch( - voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_, + voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(), config_.max_voxel_size_, config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_, config_.range_min_x_, config_.range_min_y_, config_.range_min_z_, encoder_in_features_d_.get(), stream_)); @@ -171,7 +180,7 @@ void CenterPointTRT::inference() // scatter CHECK_CUDA_ERROR(scatterFeatures_launch( - pillar_features_d_.get(), coordinates_d_.get(), num_voxels_, config_.max_voxel_size_, + pillar_features_d_.get(), coordinates_d_.get(), num_voxels_d_.get(), config_.max_voxel_size_, config_.encoder_out_feature_size_, config_.grid_size_x_, config_.grid_size_y_, spatial_features_d_.get(), stream_)); diff --git a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu index da2f0e2f57c15..09cc83bf606fe 100644 --- a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu +++ b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu @@ -24,7 +24,7 @@ const std::size_t THREADS_PER_BLOCK = 32; namespace centerpoint { __global__ void scatterFeatures_kernel( - const float * pillar_features, const int * coords, const std::size_t num_pillars, + const float * pillar_features, const int * coords, const unsigned int * num_pillars, const std::size_t pillar_feature_size, const std::size_t grid_size_x, const std::size_t grid_size_y, float * scattered_features) { @@ -34,7 +34,7 @@ __global__ void scatterFeatures_kernel( const auto pillar_i = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; const auto feature_i = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; - if (pillar_i >= num_pillars || feature_i >= pillar_feature_size) { + if (pillar_i >= num_pillars[0] || feature_i >= pillar_feature_size) { return; } @@ -50,7 +50,7 @@ __global__ void scatterFeatures_kernel( // cspell: ignore divup cudaError_t scatterFeatures_launch( - const float * pillar_features, const int * coords, const std::size_t num_pillars, + const float * pillar_features, const int * coords, const unsigned int * num_pillars, const std::size_t max_voxel_size, const std::size_t encoder_out_feature_size, const std::size_t grid_size_x, const std::size_t grid_size_y, float * scattered_features, cudaStream_t stream) diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu index 7b59757311ff2..6f77ff84c2cea 100644 --- a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu +++ b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu @@ -41,9 +41,101 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9; // the same as encoder_in_featur namespace centerpoint { +__global__ void generateVoxels_random_kernel( + const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, + float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, + float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels) +{ + int point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= points_size) return; + + float4 point = ((float4 *)points)[point_idx]; + + if ( + point.x < min_x_range || point.x >= max_x_range || point.y < min_y_range || + point.y >= max_y_range || point.z < min_z_range || point.z >= max_z_range) + return; + + int voxel_idx = floorf((point.x - min_x_range) / pillar_x_size); + int voxel_idy = floorf((point.y - min_y_range) / pillar_y_size); + unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx; + + unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1); + + if (point_id >= MAX_POINT_IN_VOXEL_SIZE) return; + float * address = voxels + (voxel_index * MAX_POINT_IN_VOXEL_SIZE + point_id) * 4; + atomicExch(address + 0, point.x); + atomicExch(address + 1, point.y); + atomicExch(address + 2, point.z); + atomicExch(address + 3, point.w); +} + +cudaError_t generateVoxels_random_launch( + const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range, + float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size, + float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels, + cudaStream_t stream) +{ + dim3 blocks((points_size + 256 - 1) / 256); + dim3 threads(256); + generateVoxels_random_kernel<<>>( + points, points_size, min_x_range, max_x_range, min_y_range, max_y_range, min_z_range, + max_z_range, pillar_x_size, pillar_y_size, pillar_z_size, grid_y_size, grid_x_size, mask, + voxels); + cudaError_t err = cudaGetLastError(); + return err; +} + +__global__ void generateBaseFeatures_kernel( + unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num, + float * voxel_features, float * voxel_num, int * voxel_idxs) +{ + unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y; + + if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return; + + unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx; + unsigned int count = mask[voxel_index]; + if (!(count > 0)) return; + count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE; + + unsigned int current_pillarId = 0; + current_pillarId = atomicAdd(pillar_num, 1); + + voxel_num[current_pillarId] = count; + + uint3 idx = {0, voxel_idy, voxel_idx}; + ((uint3 *)voxel_idxs)[current_pillarId] = idx; + + for (int i = 0; i < count; i++) { + int inIndex = voxel_index * MAX_POINT_IN_VOXEL_SIZE + i; + int outIndex = current_pillarId * MAX_POINT_IN_VOXEL_SIZE + i; + ((float4 *)voxel_features)[outIndex] = ((float4 *)voxels)[inIndex]; + } + + // clear buffer for next infer + atomicExch(mask + voxel_index, 0); +} + +// create 4 channels +cudaError_t generateBaseFeatures_launch( + unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num, + float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream) +{ + dim3 threads = {32, 32}; + dim3 blocks = { + (grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y}; + + generateBaseFeatures_kernel<<>>( + mask, voxels, grid_y_size, grid_x_size, pillar_num, voxel_features, voxel_num, voxel_idxs); + cudaError_t err = cudaGetLastError(); + return err; +} + __global__ void generateFeatures_kernel( const float * voxel_features, const float * voxel_num_points, const int * coords, - const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z, + const unsigned int * num_voxels, const float voxel_x, const float voxel_y, const float voxel_z, const float range_min_x, const float range_min_y, const float range_min_z, float * features) { // voxel_features (float): (max_voxel_size, max_point_in_voxel_size, point_feature_size) @@ -53,7 +145,8 @@ __global__ void generateFeatures_kernel( int point_idx = threadIdx.x % MAX_POINT_IN_VOXEL_SIZE; int pillar_idx_inBlock = threadIdx.x / MAX_POINT_IN_VOXEL_SIZE; // max_point_in_voxel_size - if (pillar_idx >= num_voxels) return; + unsigned int num_pillars = num_voxels[0]; + if (pillar_idx >= num_pillars) return; // load src __shared__ float4 pillarSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE]; @@ -144,7 +237,7 @@ __global__ void generateFeatures_kernel( // cspell: ignore divup cudaError_t generateFeatures_launch( const float * voxel_features, const float * voxel_num_points, const int * coords, - const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, + const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x, const float voxel_size_y, const float voxel_size_z, const float range_min_x, const float range_min_y, const float range_min_z, float * features, cudaStream_t stream) { diff --git a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp index 7f4e4a849211c..07a1a2de725f5 100644 --- a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp +++ b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp @@ -43,34 +43,15 @@ bool VoxelGeneratorTemplate::enqueuePointCloud( return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer); } -std::size_t VoxelGenerator::pointsToVoxels( - std::vector & voxels, std::vector & coordinates, - std::vector & num_points_per_voxel) +std::size_t VoxelGenerator::generateSweepPoints(std::vector & points) { - // voxels (float): (max_voxel_size * max_point_in_voxel_size * point_feature_size) - // coordinates (int): (max_voxel_size * point_dim_size) - // num_points_per_voxel (float): (max_voxel_size) - - const std::size_t grid_size = config_.grid_size_z_ * config_.grid_size_y_ * config_.grid_size_x_; - std::vector coord_to_voxel_idx(grid_size, -1); - - std::size_t voxel_cnt = 0; // @return - std::vector point; - point.resize(config_.point_feature_size_); - std::vector coord_zyx; - coord_zyx.resize(config_.point_dim_size_); - bool out_of_range; - std::size_t point_cnt; - int c, coord_idx, voxel_idx; Eigen::Vector3f point_current, point_past; - + size_t point_counter{}; for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter); pc_cache_iter++) { auto pc_msg = pc_cache_iter->pointcloud_msg; auto affine_past2current = - pd_ptr_->pointcloud_cache_size() > 1 - ? pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world - : Eigen::Affine3f::Identity(); + pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world; float time_lag = static_cast( pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds()); @@ -80,53 +61,14 @@ std::size_t VoxelGenerator::pointsToVoxels( point_past << *x_iter, *y_iter, *z_iter; point_current = affine_past2current * point_past; - point[0] = point_current.x(); - point[1] = point_current.y(); - point[2] = point_current.z(); - point[3] = time_lag; - - out_of_range = false; - for (std::size_t di = 0; di < config_.point_dim_size_; di++) { - c = static_cast((point[di] - range_[di]) * recip_voxel_size_[di]); - if (c < 0 || c >= grid_size_[di]) { - out_of_range = true; - break; - } - coord_zyx[config_.point_dim_size_ - di - 1] = c; - } - if (out_of_range) { - continue; - } - - coord_idx = coord_zyx[0] * config_.grid_size_y_ * config_.grid_size_x_ + - coord_zyx[1] * config_.grid_size_x_ + coord_zyx[2]; - voxel_idx = coord_to_voxel_idx[coord_idx]; - if (voxel_idx == -1) { - voxel_idx = voxel_cnt; - if (voxel_cnt >= config_.max_voxel_size_) { - continue; - } - - voxel_cnt++; - coord_to_voxel_idx[coord_idx] = voxel_idx; - for (std::size_t di = 0; di < config_.point_dim_size_; di++) { - coordinates[voxel_idx * config_.point_dim_size_ + di] = coord_zyx[di]; - } - } - - point_cnt = num_points_per_voxel[voxel_idx]; - if (point_cnt < config_.max_point_in_voxel_size_) { - for (std::size_t fi = 0; fi < config_.point_feature_size_; fi++) { - voxels - [voxel_idx * config_.max_point_in_voxel_size_ * config_.point_feature_size_ + - point_cnt * config_.point_feature_size_ + fi] = point[fi]; - } - num_points_per_voxel[voxel_idx]++; - } + points.at(point_counter * config_.point_feature_size_) = point_current.x(); + points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y(); + points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z(); + points.at(point_counter * config_.point_feature_size_ + 3) = time_lag; + ++point_counter; } } - - return voxel_cnt; + return point_counter; } } // namespace centerpoint