Skip to content

Commit

Permalink
[breaking] fix autoware point type padding for faster memory access
Browse files Browse the repository at this point in the history
can memcpy between input data buffer and PointXYZI*
make assumption on memory layout for faster data fetch
misc optimization (reordering, constexpr, etc)

Signed-off-by: Vincent Richard <richard-v@macnica.co.jp>
  • Loading branch information
VRichardJP committed Apr 7, 2023
1 parent ac3427a commit e3936e0
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct PointXYZI
float x{0.0F};
float y{0.0F};
float z{0.0F};
float _padding{0}; // compatibility with PCL::PointXYZ*
float intensity{0.0F};
friend bool operator==(const PointXYZI & p1, const PointXYZI & p2) noexcept
{
Expand All @@ -59,6 +60,7 @@ struct PointXYZIRADRT
float x{0.0F};
float y{0.0F};
float z{0.0F};
float _padding{0}; // compatibility with PCL::PointXYZ*
float intensity{0.0F};
uint16_t ring{0U};
float azimuth{0.0F};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@

#include "pointcloud_preprocessor/outlier_filter/ring_outlier_filter_nodelet.hpp"

#include <autoware_point_types/types.hpp>
#include <range/v3/view/chunk.hpp>
#include <range/v3/view/zip.hpp>
#include <sensor_msgs/msg/detail/point_field__struct.hpp>

#include <algorithm>
#include <stdexcept>
#include <vector>

namespace pointcloud_preprocessor
Expand Down Expand Up @@ -58,6 +61,49 @@ void RingOutlierFilterComponent::faster_filter(
std::scoped_lock lock(mutex_);
stop_watch_ptr_->toc("processing_time", true);

// The code below makes several assumptions about the data memory layout to speed up memory access
// TODO(VRichardJP) these checks are verbose and cumbersome but I think necessary. All filters should somehow check the input data before using yolo memcpy everywhere. A few helper functions could make the whole process simpler.
// To memcpy PointXYZIRADRT -> PointXYZI
static_assert(offsetof(autoware_point_types::PointXYZI, x) == offsetof(autoware_point_types::PointXYZIRADRT, x));
static_assert(offsetof(autoware_point_types::PointXYZI, y) == offsetof(autoware_point_types::PointXYZIRADRT, y));
static_assert(offsetof(autoware_point_types::PointXYZI, z) == offsetof(autoware_point_types::PointXYZIRADRT, z));
static_assert(offsetof(autoware_point_types::PointXYZI, intensity) == offsetof(autoware_point_types::PointXYZIRADRT, intensity));
// To memcpy from input cloud data to PointXYZIRADRT (thus PointXYZI)
{
bool correct_layout = true;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, x) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::X)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::X)).datatype == sensor_msgs::msg::PointField::FLOAT32;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::X)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, y) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Y)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Y)).datatype == sensor_msgs::msg::PointField::FLOAT32;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Y)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, z) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Z)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Z)).datatype == sensor_msgs::msg::PointField::FLOAT32;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Z)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, intensity) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Intensity)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Intensity)).datatype == sensor_msgs::msg::PointField::FLOAT32;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Intensity)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, ring) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Ring)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Ring)).datatype == sensor_msgs::msg::PointField::UINT16;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Ring)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, azimuth) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Azimuth)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Azimuth)).datatype == sensor_msgs::msg::PointField::FLOAT32;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Azimuth)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, distance) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Distance)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Distance)).datatype == sensor_msgs::msg::PointField::FLOAT32;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Distance)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, return_type) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::ReturnType)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::ReturnType)).datatype == sensor_msgs::msg::PointField::UINT8;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::ReturnType)).count == 1;
correct_layout &= offsetof(autoware_point_types::PointXYZIRADRT, time_stamp) == input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::TimeStamp)).offset;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::TimeStamp)).datatype == sensor_msgs::msg::PointField::FLOAT64;
correct_layout &= input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::TimeStamp)).count == 1;
if (!correct_layout) {
// TODO(VRichardJP) Handle this properly
throw std::runtime_error("Input cloud has invalid data layout");
}
}

// The initial implementation of ring outlier filter looked like this:
// 1. Iterate over the input cloud and group point indices by ring
// 2. For each ring:
Expand Down Expand Up @@ -94,25 +140,22 @@ void RingOutlierFilterComponent::faster_filter(
// helper functions

// extract ring from point raw data
const size_t ring_offset =
input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Ring)).offset;
auto getPointRing = [=](const auto & raw_p) {
auto getPointRing = [=](const unsigned char* raw_p) {
constexpr size_t ring_offset = offsetof(autoware_point_types::PointXYZIRADRT, ring);
uint16_t ring{};
std::memcpy(&ring, &raw_p[ring_offset], sizeof(ring));
return ring;
};
// extract azimuth from point raw data
const size_t azimuth_offset =
input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Azimuth)).offset;
auto getPointAzimuth = [=](const auto & raw_p) {
auto getPointAzimuth = [=](const unsigned char* raw_p) {
constexpr size_t azimuth_offset = offsetof(autoware_point_types::PointXYZIRADRT, azimuth);
float azimuth{};
std::memcpy(&azimuth, &raw_p[azimuth_offset], sizeof(azimuth));
return azimuth;
};
// extract distance from point raw data
const size_t distance_offset =
input->fields.at(static_cast<size_t>(autoware_point_types::PointIndex::Distance)).offset;
auto getPointDistance = [=](const auto & raw_p) {
auto getPointDistance = [=](const unsigned char* raw_p) {
constexpr size_t distance_offset = offsetof(autoware_point_types::PointXYZIRADRT, distance);
float distance{};
std::memcpy(&distance, &raw_p[distance_offset], sizeof(distance));
return distance;
Expand Down Expand Up @@ -142,27 +185,27 @@ void RingOutlierFilterComponent::faster_filter(
walks.reserve(max_rings_num_);
// initialize each ring with empty walk
rings.resize(max_rings_num_, RingWalk{-1U, 0., 0.});
// points are associated to no walk
// points are initially associated to no walk
point_walk_ids.resize(input->width * input->height, -1U);

int invalid_ring_count = 0;

// Build walks and classify points
for (const auto& [raw_p, point_walk_id] : ranges::views::zip(
input->data | ranges::views::chunk(input->point_step),
point_walk_ids))
{
const auto ring_idx = getPointRing(raw_p);
const uint16_t ring_idx = getPointRing(raw_p.data());
const float curr_azimuth = getPointAzimuth(raw_p.data());
const float curr_distance = getPointDistance(raw_p.data());

if (ring_idx >= max_rings_num_) {
// FIXME(VRichardJP) handle this properly
// Either the data is corrupted or max_rings_num_ is not set correctly
// But it would be nice to detect and report an error when it happens
// Note: point_walk_id == -1 so the point will be filtered out
invalid_ring_count += 1;
continue;
}

const float curr_azimuth = getPointAzimuth(raw_p);
const float curr_distance = getPointDistance(raw_p);

auto & ring = rings[ring_idx];
if (ring.walk_id == -1U) {
// first walk ever for this ring
Expand All @@ -174,7 +217,6 @@ void RingOutlierFilterComponent::faster_filter(
continue;
}

// const float distance_diff = curr_distance - ring.prev_distance;
float azimuth_diff = curr_azimuth - ring.prev_azimuth;
azimuth_diff = azimuth_diff < 0.f ? azimuth_diff + 36000.f : azimuth_diff;

Expand Down Expand Up @@ -206,20 +248,19 @@ void RingOutlierFilterComponent::faster_filter(
}

// finally copy points
// Note(VRichardJP) only implemented for PointXYZI output
output.point_step = sizeof(PointXYZI);
output.data.resize(output.point_step * input->width * input->height);
size_t output_size = 0;
for (const auto& [raw_p, point_walk_id] : ranges::views::zip(
input->data | ranges::views::chunk(input->point_step),
point_walk_ids)) {
if (point_walk_id == -1U || !walk_is_cluster[point_walk_id]) {
continue;
}
if (transform_info.need_transform) {
for (const auto& [raw_p, point_walk_id] : ranges::views::zip(
input->data | ranges::views::chunk(input->point_step),
point_walk_ids))
{
if (point_walk_id == -1U || !walk_is_cluster[point_walk_id]) {
continue;
}

if (transform_info.need_transform) {
// assume binary representation of input point is compatible with PointXYZI
// NOTE(VRichardJP) what is the point of having input->fields if we blindly copy data?
PointXYZI out_point;
std::memcpy(&out_point, raw_p.data(), sizeof(PointXYZI));

Expand All @@ -231,18 +272,27 @@ void RingOutlierFilterComponent::faster_filter(
out_point.intensity = out_point.intensity;

std::memcpy(&output.data[output_size], &out_point, sizeof(PointXYZI));
output_size += output.point_step;
} else {
output_size += sizeof(PointXYZI);
}
} else {
for (const auto& [raw_p, point_walk_id] : ranges::views::zip(
input->data | ranges::views::chunk(input->point_step),
point_walk_ids))
{
if (point_walk_id == -1U || !walk_is_cluster[point_walk_id]) {
continue;
}

// assume binary representation of input point is compatible with PointXYZI
std::memcpy(&output.data[output_size], raw_p.data(), sizeof(PointXYZI));
output_size += output.point_step;
output_size += sizeof(PointXYZI);
}
}
output.data.resize(output_size);

output.header.frame_id = !tf_input_frame_.empty() ? tf_input_frame_ : tf_input_orig_frame_;

// FIXME(VRichardJP) Yolo copy. What if input and output don't have the same layout??
// assume input first 4 fields are compatible with PointXYZI type
output.fields.resize(4); // x, y, z, intensity
std::copy(input->fields.begin(), input->fields.begin() + 4, output.fields.begin());

Expand All @@ -252,6 +302,10 @@ void RingOutlierFilterComponent::faster_filter(
output.width = static_cast<uint32_t>(output.data.size() / output.height / output.point_step);
output.row_step = static_cast<uint32_t>(output.data.size() / output.height);

if (invalid_ring_count > 0) {
RCLCPP_WARN(get_logger(), "%d points had ring index over max_rings_num (%d) and have been ignored.", invalid_ring_count, max_rings_num_);
}

// add processing time for debug
if (debug_publisher_) {
const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic_time", true);
Expand Down

0 comments on commit e3936e0

Please sign in to comment.