From 192d72d0fabe20107c8218de7752f03b33898ba5 Mon Sep 17 00:00:00 2001
From: Tomoya Kimura <tomoya.kimura@tier4.jp>
Date: Mon, 27 Nov 2023 18:36:42 +0900
Subject: [PATCH] perf(image_projection_based_fusion,lidar_centerpoint): add
 cuda preprocess (#1041)

* add cuda preprocess

Signed-off-by: wep21 <daisuke.nishimatsu1021@gmail.com>

* add cuda preprocess for centerpoint

Signed-off-by: wep21 <daisuke.nishimatsu1021@gmail.com>

* update pp

Signed-off-by: wep21 <daisuke.nishimatsu1021@gmail.com>
Signed-off-by: tomoya.kimura <tomoya.kimura@tier4.jp>

* fix bug

Signed-off-by: wep21 <daisuke.nishimatsu1021@gmail.com>

* style(pre-commit): autofix

---------

Signed-off-by: wep21 <daisuke.nishimatsu1021@gmail.com>
Signed-off-by: tomoya.kimura <tomoya.kimura@tier4.jp>
Co-authored-by: wep21 <daisuke.nishimatsu1021@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../pointpainting_trt.hpp                     |   3 +-
 .../preprocess_kernel.hpp                     |  12 +-
 .../pointpainting_fusion/voxel_generator.hpp  |   4 +-
 .../pointpainting_trt.cpp                     |  45 +++++---
 .../pointpainting_fusion/preprocess_kernel.cu | 103 +++++++++++++++++-
 .../pointpainting_fusion/voxel_generator.cpp  |  82 +++-----------
 .../lidar_centerpoint/centerpoint_trt.hpp     |  17 ++-
 .../network/scatter_kernel.hpp                |   2 +-
 .../preprocess/preprocess_kernel.hpp          |  12 +-
 .../preprocess/voxel_generator.hpp            |   8 +-
 .../lidar_centerpoint/lib/centerpoint_trt.cpp |  67 +++++++-----
 .../lib/network/scatter_kernel.cu             |   6 +-
 .../lib/preprocess/preprocess_kernel.cu       |  99 ++++++++++++++++-
 .../lib/preprocess/voxel_generator.cpp        |  76 ++-----------
 14 files changed, 329 insertions(+), 207 deletions(-)

diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp
index 0f5ada722c31d..d28d9eb31216d 100644
--- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp
+++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/pointpainting_trt.hpp
@@ -23,6 +23,7 @@
 
 namespace image_projection_based_fusion
 {
+static constexpr size_t CAPACITY_POINT = 1000000;
 class PointPaintingTRT : public centerpoint::CenterPointTRT
 {
 public:
@@ -33,8 +34,6 @@ class PointPaintingTRT : public centerpoint::CenterPointTRT
     const centerpoint::DensificationParam & densification_param,
     const centerpoint::CenterPointConfig & config);
 
-  ~PointPaintingTRT();
-
 protected:
   bool preprocess(
     const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg,
diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp
index 7d7f290f71a4c..c913ac33d5e84 100644
--- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp
+++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/preprocess_kernel.hpp
@@ -20,9 +20,19 @@
 
 namespace image_projection_based_fusion
 {
+cudaError_t generateVoxels_random_launch(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
+  cudaStream_t stream);
+
+cudaError_t generateBaseFeatures_launch(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream);
+
 cudaError_t generateFeatures_launch(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
+  const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
   const float voxel_size_y, const float voxel_size_z, const float range_min_x,
   const float range_min_y, const float range_min_z, float * features,
   const std::size_t encoder_in_feature_size, cudaStream_t stream);
diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
index 3e223dd3101df..ad0f3e38d665c 100755
--- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
+++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/pointpainting_fusion/voxel_generator.hpp
@@ -26,9 +26,7 @@ class VoxelGenerator : public centerpoint::VoxelGenerator
 public:
   using centerpoint::VoxelGenerator::VoxelGenerator;
 
-  std::size_t pointsToVoxels(
-    std::vector<float> & voxels, std::vector<int> & coordinates,
-    std::vector<float> & num_points_per_voxel) override;
+  std::size_t generateSweepPoints(std::vector<float> & points) override;
 };
 }  // namespace image_projection_based_fusion
 
diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp
index f0e4de9c988e2..8911442f4c75d 100644
--- a/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp
+++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/pointpainting_trt.cpp
@@ -43,26 +43,37 @@ bool PointPaintingTRT::preprocess(
   if (!is_success) {
     return false;
   }
-  num_voxels_ = vg_ptr_pp_->pointsToVoxels(voxels_, coordinates_, num_points_per_voxel_);
-  if (num_voxels_ == 0) {
-    return false;
-  }
-  const auto voxels_size =
-    num_voxels_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_;
-  const auto coordinates_size = num_voxels_ * config_.point_dim_size_;
-  // memcpy from host to device (not copy empty voxels)
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    voxels_d_.get(), voxels_.data(), voxels_size * sizeof(float), cudaMemcpyHostToDevice));
+  const auto count = vg_ptr_pp_->generateSweepPoints(points_);
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    coordinates_d_.get(), coordinates_.data(), coordinates_size * sizeof(int),
-    cudaMemcpyHostToDevice));
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    num_points_per_voxel_d_.get(), num_points_per_voxel_.data(), num_voxels_ * sizeof(float),
-    cudaMemcpyHostToDevice));
-  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+    points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_, stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_, stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(
+    voxels_d_.get(), 0,
+    config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_ *
+      sizeof(float),
+    stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(
+    coordinates_d_.get(), 0, config_.max_voxel_size_ * config_.point_dim_size_ * sizeof(int),
+    stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(
+    num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_));
+
+  CHECK_CUDA_ERROR(image_projection_based_fusion::generateVoxels_random_launch(
+    points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_,
+    config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_,
+    config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_,
+    mask_d_.get(), voxels_buffer_d_.get(), stream_));
+
+  CHECK_CUDA_ERROR(image_projection_based_fusion::generateBaseFeatures_launch(
+    mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_,
+    num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
+    stream_));
 
   CHECK_CUDA_ERROR(image_projection_based_fusion::generateFeatures_launch(
-    voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_,
+    voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(),
     config_.max_voxel_size_, config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_,
     config_.range_min_x_, config_.range_min_y_, config_.range_min_z_, encoder_in_features_d_.get(),
     config_.encoder_in_feature_size_, stream_));
diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu b/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu
index 9789f52de5f74..d06b60633acf8 100644
--- a/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu
+++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/preprocess_kernel.cu
@@ -57,9 +57,105 @@ std::size_t divup(const std::size_t a, const std::size_t b)
 
 namespace image_projection_based_fusion
 {
+__global__ void generateVoxels_random_kernel(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
+  float x = points[point_idx * POINT_FEATURE_SIZE];
+  float y = points[point_idx * POINT_FEATURE_SIZE + 1];
+  float z = points[point_idx * POINT_FEATURE_SIZE + 2];
+
+  if (
+    x < min_x_range || x >= max_x_range || y < min_y_range || y >= max_y_range || z < min_z_range ||
+    z >= max_z_range)
+    return;
+
+  int voxel_idx = floorf((x - min_x_range) / pillar_x_size);
+  int voxel_idy = floorf((y - min_y_range) / pillar_y_size);
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+
+  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
+
+  if (point_id >= MAX_POINT_IN_VOXEL_SIZE) return;
+  float * address =
+    voxels + (voxel_index * MAX_POINT_IN_VOXEL_SIZE + point_id) * POINT_FEATURE_SIZE;
+  for (unsigned int i = 0; i < POINT_FEATURE_SIZE; ++i) {
+    atomicExch(address + i, points[point_idx * POINT_FEATURE_SIZE + i]);
+  }
+}
+
+cudaError_t generateVoxels_random_launch(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
+  cudaStream_t stream)
+{
+  dim3 blocks((points_size + 256 - 1) / 256);
+  dim3 threads(256);
+  generateVoxels_random_kernel<<<blocks, threads, 0, stream>>>(
+    points, points_size, min_x_range, max_x_range, min_y_range, max_y_range, min_z_range,
+    max_z_range, pillar_x_size, pillar_y_size, pillar_z_size, grid_y_size, grid_x_size, mask,
+    voxels);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+__global__ void generateBaseFeatures_kernel(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs)
+{
+  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;
+
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+  unsigned int count = mask[voxel_index];
+  if (!(count > 0)) return;
+  count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE;
+
+  unsigned int current_pillarId = 0;
+  current_pillarId = atomicAdd(pillar_num, 1);
+
+  voxel_num[current_pillarId] = count;
+
+  uint3 idx = {0, voxel_idy, voxel_idx};
+  ((uint3 *)voxel_idxs)[current_pillarId] = idx;
+
+  for (int i = 0; i < count; i++) {
+    int inIndex = voxel_index * MAX_POINT_IN_VOXEL_SIZE + i;
+    int outIndex = current_pillarId * MAX_POINT_IN_VOXEL_SIZE + i;
+    for (unsigned int j = 0; j < POINT_FEATURE_SIZE; ++j) {
+      voxel_features[outIndex * POINT_FEATURE_SIZE + j] = voxels[inIndex * POINT_FEATURE_SIZE + j];
+    }
+  }
+
+  // clear buffer for next infer
+  atomicExch(mask + voxel_index, 0);
+}
+
+// create 4 channels
+cudaError_t generateBaseFeatures_launch(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream)
+{
+  dim3 threads = {32, 32};
+  dim3 blocks = {
+    (grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y};
+
+  generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
+    mask, voxels, grid_y_size, grid_x_size, pillar_num, voxel_features, voxel_num, voxel_idxs);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
 __global__ void generateFeatures_kernel(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
+  const unsigned int * num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
   const float range_min_x, const float range_min_y, const float range_min_z, float * features,
   const std::size_t encoder_in_feature_size)
 {
@@ -70,7 +166,8 @@ __global__ void generateFeatures_kernel(
   int point_idx = threadIdx.x % MAX_POINT_IN_VOXEL_SIZE;
   int pillar_idx_inBlock = threadIdx.x / MAX_POINT_IN_VOXEL_SIZE;
 
-  if (pillar_idx >= num_voxels) return;
+  unsigned int num_pillars = num_voxels[0];
+  if (pillar_idx >= num_pillars) return;
 
   // load src
   __shared__ float pillarSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE][POINT_FEATURE_SIZE];
@@ -155,7 +252,7 @@ __global__ void generateFeatures_kernel(
 
 cudaError_t generateFeatures_launch(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
+  const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
   const float voxel_size_y, const float voxel_size_z, const float range_min_x,
   const float range_min_y, const float range_min_z, float * features,
   const std::size_t encoder_in_feature_size, cudaStream_t stream)
diff --git a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp
index cfd24829e7806..ea10cb1237436 100755
--- a/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp
+++ b/perception/image_projection_based_fusion/src/pointpainting_fusion/voxel_generator.cpp
@@ -18,29 +18,10 @@
 
 namespace image_projection_based_fusion
 {
-std::size_t VoxelGenerator::pointsToVoxels(
-  std::vector<float> & voxels, std::vector<int> & coordinates,
-  std::vector<float> & num_points_per_voxel)
+size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
 {
-  // voxels (float): (max_num_voxels * max_num_points_per_voxel * point_feature_size)
-  // coordinates (int): (max_num_voxels * point_dim_size)
-  // num_points_per_voxel (float): (max_num_voxels)
-
-  const std::size_t grid_size = config_.grid_size_z_ * config_.grid_size_y_ * config_.grid_size_x_;
-  std::vector<std::optional<int>> coord_to_voxel_idx(grid_size, -1);
-
-  std::size_t voxel_cnt = 0;  // @return
-  // std::array<float, config_.point_feature_size> point;
-  // std::array<float, config_.point_dim_size> coord_zyx;
-  std::vector<float> point;
-  point.resize(config_.point_feature_size_);
-  std::vector<float> coord_zyx;
-  coord_zyx.resize(config_.point_dim_size_);
-  bool out_of_range;
-  std::size_t point_cnt;
-  int c, coord_idx, voxel_idx;
   Eigen::Vector3f point_current, point_past;
-
+  size_t point_counter{};
   for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
        pc_cache_iter++) {
     auto pc_msg = pc_cache_iter->pointcloud_msg;
@@ -55,56 +36,25 @@ std::size_t VoxelGenerator::pointsToVoxels(
       point_past << *x_iter, *y_iter, *z_iter;
       point_current = affine_past2current * point_past;
 
-      point[0] = point_current.x();
-      point[1] = point_current.y();
-      point[2] = point_current.z();
-      point[3] = time_lag;
+      points.at(point_counter * config_.point_feature_size_) = point_current.x();
+      points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
+      points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
+      points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
       // decode the class value back to one-hot binary and assign it to point
-      std::fill(point.begin() + 4, point.end(), 0);
-      point[4 + *class_iter] = 1;
-
-      out_of_range = false;
-      for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
-        c = static_cast<int>((point[di] - range_[di]) * recip_voxel_size_[di]);
-        if (c < 0 || c >= grid_size_[di]) {
-          out_of_range = true;
-          break;
-        }
-        coord_zyx[config_.point_dim_size_ - di - 1] = c;
-      }
-      if (out_of_range) {
-        continue;
+      for (size_t i = 0; i < config_.class_size_; ++i) {
+        points.at(point_counter * config_.point_feature_size_ + 4 + i) = 0.f;
       }
-
-      coord_idx = coord_zyx[0] * config_.grid_size_y_ * config_.grid_size_x_ +
-                  coord_zyx[1] * config_.grid_size_x_ + coord_zyx[2];
-      voxel_idx = coord_to_voxel_idx[coord_idx].value();
-      if (voxel_idx == -1) {
-        voxel_idx = voxel_cnt;
-        if (voxel_cnt >= config_.max_voxel_size_) {
-          continue;
-        }
-
-        voxel_cnt++;
-        coord_to_voxel_idx[coord_idx] = voxel_idx;
-        for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
-          coordinates[voxel_idx * config_.point_dim_size_ + di] = coord_zyx[di];
-        }
-      }
-
-      point_cnt = num_points_per_voxel[voxel_idx];
-      if (point_cnt < config_.max_point_in_voxel_size_) {
-        for (std::size_t fi = 0; fi < config_.point_feature_size_; fi++) {
-          voxels
-            [voxel_idx * config_.max_point_in_voxel_size_ * config_.point_feature_size_ +
-             point_cnt * config_.point_feature_size_ + fi] = point[fi];
-        }
-        num_points_per_voxel[voxel_idx]++;
+      auto class_value = static_cast<int>(*class_iter);
+      auto iter = points.begin() + point_counter * config_.point_feature_size_ + 4;
+      while (class_value > 0) {
+        *iter = class_value % 2;
+        class_value /= 2;
+        ++iter;
       }
+      ++point_counter;
     }
   }
-
-  return voxel_cnt;
+  return point_counter;
 }
 
 }  // namespace image_projection_based_fusion
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
index fee17f1c0156a..8cf250be0c049 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/centerpoint_trt.hpp
@@ -32,6 +32,8 @@
 
 namespace centerpoint
 {
+static constexpr size_t CAPACITY_POINT = 1000000;
+
 class NetworkParam
 {
 public:
@@ -59,7 +61,7 @@ class CenterPointTRT
     const NetworkParam & encoder_param, const NetworkParam & head_param,
     const DensificationParam & densification_param, const CenterPointConfig & config);
 
-  ~CenterPointTRT();
+  virtual ~CenterPointTRT();
 
   bool detect(
     const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
@@ -83,12 +85,13 @@ class CenterPointTRT
 
   std::size_t class_size_{0};
   CenterPointConfig config_;
-  std::size_t num_voxels_{0};
   std::size_t encoder_in_feature_size_{0};
   std::size_t spatial_features_size_{0};
-  std::vector<float> voxels_;
-  std::vector<int> coordinates_;
-  std::vector<float> num_points_per_voxel_;
+  std::size_t voxels_buffer_size_{0};
+  std::size_t mask_size_{0};
+  std::size_t voxels_size_{0};
+  std::size_t coordinates_size_{0};
+  std::vector<float> points_;
   cuda::unique_ptr<float[]> voxels_d_{nullptr};
   cuda::unique_ptr<int[]> coordinates_d_{nullptr};
   cuda::unique_ptr<float[]> num_points_per_voxel_d_{nullptr};
@@ -101,6 +104,10 @@ class CenterPointTRT
   cuda::unique_ptr<float[]> head_out_dim_d_{nullptr};
   cuda::unique_ptr<float[]> head_out_rot_d_{nullptr};
   cuda::unique_ptr<float[]> head_out_vel_d_{nullptr};
+  cuda::unique_ptr<float[]> points_d_{nullptr};
+  cuda::unique_ptr<float[]> voxels_buffer_d_{nullptr};
+  cuda::unique_ptr<unsigned int[]> mask_d_{nullptr};
+  cuda::unique_ptr<unsigned int[]> num_voxels_d_{nullptr};
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp
index e754d89468ba3..3dd00c25fd9e7 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/network/scatter_kernel.hpp
@@ -21,7 +21,7 @@
 namespace centerpoint
 {
 cudaError_t scatterFeatures_launch(
-  const float * pillar_features, const int * coords, const std::size_t num_pillars,
+  const float * pillar_features, const int * coords, const unsigned int * num_pillars,
   const std::size_t max_voxel_size, const std::size_t encoder_out_feature_size,
   const std::size_t grid_size_x, const std::size_t grid_size_y, float * scattered_features,
   cudaStream_t stream);
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
index 3abc8cfeccd5b..824144fe3b22b 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/preprocess_kernel.hpp
@@ -20,9 +20,19 @@
 
 namespace centerpoint
 {
+cudaError_t generateVoxels_random_launch(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
+  cudaStream_t stream);
+
+cudaError_t generateBaseFeatures_launch(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream);
+
 cudaError_t generateFeatures_launch(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
+  const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
   const float voxel_size_y, const float voxel_size_z, const float range_min_x,
   const float range_min_y, const float range_min_z, float * features, cudaStream_t stream);
 
diff --git a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp
index 810ae3afd20d0..be15d51e91715 100644
--- a/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp
+++ b/perception/lidar_centerpoint/include/lidar_centerpoint/preprocess/voxel_generator.hpp
@@ -31,9 +31,7 @@ class VoxelGeneratorTemplate
   explicit VoxelGeneratorTemplate(
     const DensificationParam & param, const CenterPointConfig & config);
 
-  virtual std::size_t pointsToVoxels(
-    std::vector<float> & voxels, std::vector<int> & coordinates,
-    std::vector<float> & num_points_per_voxel) = 0;
+  virtual std::size_t generateSweepPoints(std::vector<float> & points) = 0;
 
   bool enqueuePointCloud(
     const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
@@ -52,9 +50,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate
 public:
   using VoxelGeneratorTemplate::VoxelGeneratorTemplate;
 
-  std::size_t pointsToVoxels(
-    std::vector<float> & voxels, std::vector<int> & coordinates,
-    std::vector<float> & num_points_per_voxel) override;
+  std::size_t generateSweepPoints(std::vector<float> & points) override;
 };
 
 }  // namespace centerpoint
diff --git a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
index 9b01ea8f53675..acdcad1b54c40 100644
--- a/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
+++ b/perception/lidar_centerpoint/lib/centerpoint_trt.cpp
@@ -69,9 +69,9 @@ CenterPointTRT::~CenterPointTRT()
 
 void CenterPointTRT::initPtr()
 {
-  const auto voxels_size =
+  voxels_size_ =
     config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_;
-  const auto coordinates_size = config_.max_voxel_size_ * config_.point_dim_size_;
+  coordinates_size_ = config_.max_voxel_size_ * config_.point_dim_size_;
   encoder_in_feature_size_ =
     config_.max_voxel_size_ * config_.max_point_in_voxel_size_ * config_.encoder_in_feature_size_;
   const auto pillar_features_size = config_.max_voxel_size_ * config_.encoder_out_feature_size_;
@@ -79,14 +79,16 @@ void CenterPointTRT::initPtr()
     config_.grid_size_x_ * config_.grid_size_y_ * config_.encoder_out_feature_size_;
   const auto grid_xy_size = config_.down_grid_size_x_ * config_.down_grid_size_y_;
 
+  voxels_buffer_size_ = config_.grid_size_x_ * config_.grid_size_y_ *
+                        config_.max_point_in_voxel_size_ * config_.point_feature_size_;
+  mask_size_ = config_.grid_size_x_ * config_.grid_size_y_;
+
   // host
-  voxels_.resize(voxels_size);
-  coordinates_.resize(coordinates_size);
-  num_points_per_voxel_.resize(config_.max_voxel_size_);
+  points_.resize(CAPACITY_POINT * config_.point_feature_size_);
 
   // device
-  voxels_d_ = cuda::make_unique<float[]>(voxels_size);
-  coordinates_d_ = cuda::make_unique<int[]>(coordinates_size);
+  voxels_d_ = cuda::make_unique<float[]>(voxels_size_);
+  coordinates_d_ = cuda::make_unique<int[]>(coordinates_size_);
   num_points_per_voxel_d_ = cuda::make_unique<float[]>(config_.max_voxel_size_);
   encoder_in_features_d_ = cuda::make_unique<float[]>(encoder_in_feature_size_);
   pillar_features_d_ = cuda::make_unique<float[]>(pillar_features_size);
@@ -97,15 +99,16 @@ void CenterPointTRT::initPtr()
   head_out_dim_d_ = cuda::make_unique<float[]>(grid_xy_size * config_.head_out_dim_size_);
   head_out_rot_d_ = cuda::make_unique<float[]>(grid_xy_size * config_.head_out_rot_size_);
   head_out_vel_d_ = cuda::make_unique<float[]>(grid_xy_size * config_.head_out_vel_size_);
+  points_d_ = cuda::make_unique<float[]>(CAPACITY_POINT * config_.point_feature_size_);
+  voxels_buffer_d_ = cuda::make_unique<float[]>(voxels_buffer_size_);
+  mask_d_ = cuda::make_unique<unsigned int[]>(mask_size_);
+  num_voxels_d_ = cuda::make_unique<unsigned int[]>(1);
 }
 
 bool CenterPointTRT::detect(
   const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
   std::vector<Box3D> & det_boxes3d)
 {
-  std::fill(voxels_.begin(), voxels_.end(), 0);
-  std::fill(coordinates_.begin(), coordinates_.end(), -1);
-  std::fill(num_points_per_voxel_.begin(), num_points_per_voxel_.end(), 0);
   CHECK_CUDA_ERROR(cudaMemsetAsync(
     encoder_in_features_d_.get(), 0, encoder_in_feature_size_ * sizeof(float), stream_));
   CHECK_CUDA_ERROR(
@@ -131,27 +134,33 @@ bool CenterPointTRT::preprocess(
   if (!is_success) {
     return false;
   }
-  num_voxels_ = vg_ptr_->pointsToVoxels(voxels_, coordinates_, num_points_per_voxel_);
-  if (num_voxels_ == 0) {
-    return false;
-  }
-
-  const auto voxels_size =
-    num_voxels_ * config_.max_point_in_voxel_size_ * config_.point_feature_size_;
-  const auto coordinates_size = num_voxels_ * config_.point_dim_size_;
-  // memcpy from host to device (not copy empty voxels)
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    voxels_d_.get(), voxels_.data(), voxels_size * sizeof(float), cudaMemcpyHostToDevice));
-  CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    coordinates_d_.get(), coordinates_.data(), coordinates_size * sizeof(int),
-    cudaMemcpyHostToDevice));
+  const auto count = vg_ptr_->generateSweepPoints(points_);
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    num_points_per_voxel_d_.get(), num_points_per_voxel_.data(), num_voxels_ * sizeof(float),
-    cudaMemcpyHostToDevice));
-  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
+    points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
+    cudaMemcpyHostToDevice, stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
+  CHECK_CUDA_ERROR(
+    cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(mask_d_.get(), 0, mask_size_ * sizeof(int), stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(voxels_d_.get(), 0, voxels_size_ * sizeof(float), stream_));
+  CHECK_CUDA_ERROR(
+    cudaMemsetAsync(coordinates_d_.get(), 0, coordinates_size_ * sizeof(int), stream_));
+  CHECK_CUDA_ERROR(cudaMemsetAsync(
+    num_points_per_voxel_d_.get(), 0, config_.max_voxel_size_ * sizeof(float), stream_));
+
+  CHECK_CUDA_ERROR(generateVoxels_random_launch(
+    points_d_.get(), count, config_.range_min_x_, config_.range_max_x_, config_.range_min_y_,
+    config_.range_max_y_, config_.range_min_z_, config_.range_max_z_, config_.voxel_size_x_,
+    config_.voxel_size_y_, config_.voxel_size_z_, config_.grid_size_y_, config_.grid_size_x_,
+    mask_d_.get(), voxels_buffer_d_.get(), stream_));
+
+  CHECK_CUDA_ERROR(generateBaseFeatures_launch(
+    mask_d_.get(), voxels_buffer_d_.get(), config_.grid_size_y_, config_.grid_size_x_,
+    num_voxels_d_.get(), voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(),
+    stream_));
 
   CHECK_CUDA_ERROR(generateFeatures_launch(
-    voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_,
+    voxels_d_.get(), num_points_per_voxel_d_.get(), coordinates_d_.get(), num_voxels_d_.get(),
     config_.max_voxel_size_, config_.voxel_size_x_, config_.voxel_size_y_, config_.voxel_size_z_,
     config_.range_min_x_, config_.range_min_y_, config_.range_min_z_, encoder_in_features_d_.get(),
     stream_));
@@ -171,7 +180,7 @@ void CenterPointTRT::inference()
 
   // scatter
   CHECK_CUDA_ERROR(scatterFeatures_launch(
-    pillar_features_d_.get(), coordinates_d_.get(), num_voxels_, config_.max_voxel_size_,
+    pillar_features_d_.get(), coordinates_d_.get(), num_voxels_d_.get(), config_.max_voxel_size_,
     config_.encoder_out_feature_size_, config_.grid_size_x_, config_.grid_size_y_,
     spatial_features_d_.get(), stream_));
 
diff --git a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu
index da2f0e2f57c15..09cc83bf606fe 100644
--- a/perception/lidar_centerpoint/lib/network/scatter_kernel.cu
+++ b/perception/lidar_centerpoint/lib/network/scatter_kernel.cu
@@ -24,7 +24,7 @@ const std::size_t THREADS_PER_BLOCK = 32;
 namespace centerpoint
 {
 __global__ void scatterFeatures_kernel(
-  const float * pillar_features, const int * coords, const std::size_t num_pillars,
+  const float * pillar_features, const int * coords, const unsigned int * num_pillars,
   const std::size_t pillar_feature_size, const std::size_t grid_size_x,
   const std::size_t grid_size_y, float * scattered_features)
 {
@@ -34,7 +34,7 @@ __global__ void scatterFeatures_kernel(
   const auto pillar_i = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
   const auto feature_i = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
 
-  if (pillar_i >= num_pillars || feature_i >= pillar_feature_size) {
+  if (pillar_i >= num_pillars[0] || feature_i >= pillar_feature_size) {
     return;
   }
 
@@ -50,7 +50,7 @@ __global__ void scatterFeatures_kernel(
 
 // cspell: ignore divup
 cudaError_t scatterFeatures_launch(
-  const float * pillar_features, const int * coords, const std::size_t num_pillars,
+  const float * pillar_features, const int * coords, const unsigned int * num_pillars,
   const std::size_t max_voxel_size, const std::size_t encoder_out_feature_size,
   const std::size_t grid_size_x, const std::size_t grid_size_y, float * scattered_features,
   cudaStream_t stream)
diff --git a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
index 7b59757311ff2..6f77ff84c2cea 100644
--- a/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
+++ b/perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
@@ -41,9 +41,101 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9;  // the same as encoder_in_featur
 
 namespace centerpoint
 {
+__global__ void generateVoxels_random_kernel(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
+  float4 point = ((float4 *)points)[point_idx];
+
+  if (
+    point.x < min_x_range || point.x >= max_x_range || point.y < min_y_range ||
+    point.y >= max_y_range || point.z < min_z_range || point.z >= max_z_range)
+    return;
+
+  int voxel_idx = floorf((point.x - min_x_range) / pillar_x_size);
+  int voxel_idy = floorf((point.y - min_y_range) / pillar_y_size);
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+
+  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
+
+  if (point_id >= MAX_POINT_IN_VOXEL_SIZE) return;
+  float * address = voxels + (voxel_index * MAX_POINT_IN_VOXEL_SIZE + point_id) * 4;
+  atomicExch(address + 0, point.x);
+  atomicExch(address + 1, point.y);
+  atomicExch(address + 2, point.z);
+  atomicExch(address + 3, point.w);
+}
+
+cudaError_t generateVoxels_random_launch(
+  const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
+  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
+  float pillar_z_size, int grid_y_size, int grid_x_size, unsigned int * mask, float * voxels,
+  cudaStream_t stream)
+{
+  dim3 blocks((points_size + 256 - 1) / 256);
+  dim3 threads(256);
+  generateVoxels_random_kernel<<<blocks, threads, 0, stream>>>(
+    points, points_size, min_x_range, max_x_range, min_y_range, max_y_range, min_z_range,
+    max_z_range, pillar_x_size, pillar_y_size, pillar_z_size, grid_y_size, grid_x_size, mask,
+    voxels);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
+__global__ void generateBaseFeatures_kernel(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs)
+{
+  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;
+
+  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
+  unsigned int count = mask[voxel_index];
+  if (!(count > 0)) return;
+  count = count < MAX_POINT_IN_VOXEL_SIZE ? count : MAX_POINT_IN_VOXEL_SIZE;
+
+  unsigned int current_pillarId = 0;
+  current_pillarId = atomicAdd(pillar_num, 1);
+
+  voxel_num[current_pillarId] = count;
+
+  uint3 idx = {0, voxel_idy, voxel_idx};
+  ((uint3 *)voxel_idxs)[current_pillarId] = idx;
+
+  for (int i = 0; i < count; i++) {
+    int inIndex = voxel_index * MAX_POINT_IN_VOXEL_SIZE + i;
+    int outIndex = current_pillarId * MAX_POINT_IN_VOXEL_SIZE + i;
+    ((float4 *)voxel_features)[outIndex] = ((float4 *)voxels)[inIndex];
+  }
+
+  // clear buffer for next infer
+  atomicExch(mask + voxel_index, 0);
+}
+
+// create 4 channels
+cudaError_t generateBaseFeatures_launch(
+  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, unsigned int * pillar_num,
+  float * voxel_features, float * voxel_num, int * voxel_idxs, cudaStream_t stream)
+{
+  dim3 threads = {32, 32};
+  dim3 blocks = {
+    (grid_x_size + threads.x - 1) / threads.x, (grid_y_size + threads.y - 1) / threads.y};
+
+  generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>(
+    mask, voxels, grid_y_size, grid_x_size, pillar_num, voxel_features, voxel_num, voxel_idxs);
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
+
 __global__ void generateFeatures_kernel(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
+  const unsigned int * num_voxels, const float voxel_x, const float voxel_y, const float voxel_z,
   const float range_min_x, const float range_min_y, const float range_min_z, float * features)
 {
   // voxel_features (float): (max_voxel_size, max_point_in_voxel_size, point_feature_size)
@@ -53,7 +145,8 @@ __global__ void generateFeatures_kernel(
   int point_idx = threadIdx.x % MAX_POINT_IN_VOXEL_SIZE;
   int pillar_idx_inBlock = threadIdx.x / MAX_POINT_IN_VOXEL_SIZE;  // max_point_in_voxel_size
 
-  if (pillar_idx >= num_voxels) return;
+  unsigned int num_pillars = num_voxels[0];
+  if (pillar_idx >= num_pillars) return;
 
   // load src
   __shared__ float4 pillarSM[WARPS_PER_BLOCK][MAX_POINT_IN_VOXEL_SIZE];
@@ -144,7 +237,7 @@ __global__ void generateFeatures_kernel(
 // cspell: ignore divup
 cudaError_t generateFeatures_launch(
   const float * voxel_features, const float * voxel_num_points, const int * coords,
-  const std::size_t num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
+  const unsigned int * num_voxels, const std::size_t max_voxel_size, const float voxel_size_x,
   const float voxel_size_y, const float voxel_size_z, const float range_min_x,
   const float range_min_y, const float range_min_z, float * features, cudaStream_t stream)
 {
diff --git a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
index 7f4e4a849211c..07a1a2de725f5 100644
--- a/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
+++ b/perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
@@ -43,34 +43,15 @@ bool VoxelGeneratorTemplate::enqueuePointCloud(
   return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
 }
 
-std::size_t VoxelGenerator::pointsToVoxels(
-  std::vector<float> & voxels, std::vector<int> & coordinates,
-  std::vector<float> & num_points_per_voxel)
+std::size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
 {
-  // voxels (float): (max_voxel_size * max_point_in_voxel_size * point_feature_size)
-  // coordinates (int): (max_voxel_size * point_dim_size)
-  // num_points_per_voxel (float): (max_voxel_size)
-
-  const std::size_t grid_size = config_.grid_size_z_ * config_.grid_size_y_ * config_.grid_size_x_;
-  std::vector<int> coord_to_voxel_idx(grid_size, -1);
-
-  std::size_t voxel_cnt = 0;  // @return
-  std::vector<float> point;
-  point.resize(config_.point_feature_size_);
-  std::vector<float> coord_zyx;
-  coord_zyx.resize(config_.point_dim_size_);
-  bool out_of_range;
-  std::size_t point_cnt;
-  int c, coord_idx, voxel_idx;
   Eigen::Vector3f point_current, point_past;
-
+  size_t point_counter{};
   for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
        pc_cache_iter++) {
     auto pc_msg = pc_cache_iter->pointcloud_msg;
     auto affine_past2current =
-      pd_ptr_->pointcloud_cache_size() > 1
-        ? pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world
-        : Eigen::Affine3f::Identity();
+      pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
     float time_lag = static_cast<float>(
       pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds());
 
@@ -80,53 +61,14 @@ std::size_t VoxelGenerator::pointsToVoxels(
       point_past << *x_iter, *y_iter, *z_iter;
       point_current = affine_past2current * point_past;
 
-      point[0] = point_current.x();
-      point[1] = point_current.y();
-      point[2] = point_current.z();
-      point[3] = time_lag;
-
-      out_of_range = false;
-      for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
-        c = static_cast<int>((point[di] - range_[di]) * recip_voxel_size_[di]);
-        if (c < 0 || c >= grid_size_[di]) {
-          out_of_range = true;
-          break;
-        }
-        coord_zyx[config_.point_dim_size_ - di - 1] = c;
-      }
-      if (out_of_range) {
-        continue;
-      }
-
-      coord_idx = coord_zyx[0] * config_.grid_size_y_ * config_.grid_size_x_ +
-                  coord_zyx[1] * config_.grid_size_x_ + coord_zyx[2];
-      voxel_idx = coord_to_voxel_idx[coord_idx];
-      if (voxel_idx == -1) {
-        voxel_idx = voxel_cnt;
-        if (voxel_cnt >= config_.max_voxel_size_) {
-          continue;
-        }
-
-        voxel_cnt++;
-        coord_to_voxel_idx[coord_idx] = voxel_idx;
-        for (std::size_t di = 0; di < config_.point_dim_size_; di++) {
-          coordinates[voxel_idx * config_.point_dim_size_ + di] = coord_zyx[di];
-        }
-      }
-
-      point_cnt = num_points_per_voxel[voxel_idx];
-      if (point_cnt < config_.max_point_in_voxel_size_) {
-        for (std::size_t fi = 0; fi < config_.point_feature_size_; fi++) {
-          voxels
-            [voxel_idx * config_.max_point_in_voxel_size_ * config_.point_feature_size_ +
-             point_cnt * config_.point_feature_size_ + fi] = point[fi];
-        }
-        num_points_per_voxel[voxel_idx]++;
-      }
+      points.at(point_counter * config_.point_feature_size_) = point_current.x();
+      points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
+      points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
+      points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
+      ++point_counter;
     }
   }
-
-  return voxel_cnt;
+  return point_counter;
 }
 
 }  // namespace centerpoint