diff --git a/build_depends.humble.repos b/build_depends.humble.repos
index 5e8ce5953172a..3230e220c9ee9 100644
--- a/build_depends.humble.repos
+++ b/build_depends.humble.repos
@@ -33,10 +33,6 @@ repositories:
     type: git
     url: https://github.com/tier4/muSSP.git
     version: tier4/main
-  universe/external/ndt_omp:
-    type: git
-    url: https://github.com/tier4/ndt_omp.git
-    version: tier4/main
   universe/external/morai_msgs:
     type: git
     url: https://github.com/MORAI-Autonomous/MORAI-ROS2_morai_msgs.git
diff --git a/build_depends.repos b/build_depends.repos
index 2b47af3ef8e81..88a96de65e2fd 100644
--- a/build_depends.repos
+++ b/build_depends.repos
@@ -33,10 +33,6 @@ repositories:
     type: git
     url: https://github.com/tier4/muSSP.git
     version: tier4/main
-  universe/external/ndt_omp:
-    type: git
-    url: https://github.com/tier4/ndt_omp.git
-    version: tier4/main
   universe/external/morai_msgs:
     type: git
     url: https://github.com/MORAI-Autonomous/MORAI-ROS2_morai_msgs.git
diff --git a/localization/ndt_omp/CMakeLists.txt b/localization/ndt_omp/CMakeLists.txt
new file mode 100644
index 0000000000000..a2267d2ff339f
--- /dev/null
+++ b/localization/ndt_omp/CMakeLists.txt
@@ -0,0 +1,124 @@
+cmake_minimum_required(VERSION 3.5)
+project(ndt_omp)
+
+add_definitions(-std=c++14)
+set(CMAKE_CXX_FLAGS "-std=c++14")
+
+# Compile flags for SIMD instructions
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
+  # For x86_64 architecture, SIMD instruction set is fixed below versions,
+  add_compile_options(-msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2)
+else()
+  # For other architecture, like arm64, compile flags are generally prepared by compiler
+  # march=native is disabled as default for specific depending pcl libraries
+  # or pre-building packages for other computers.
+  if(BUILD_WITH_MARCH_NATIVE)
+    add_compile_options(-march=native)
+  endif()
+endif()
+
+# pcl 1.7 causes a segfault when it is built with debug mode
+set(CMAKE_BUILD_TYPE "RELEASE")
+
+if($ENV{ROS_VERSION} EQUAL 1)
+# ROS1
+find_package(catkin REQUIRED COMPONENTS
+  roscpp
+  pcl_ros
+)
+
+find_package(PCL 1.7 REQUIRED)
+include_directories(${PCL_INCLUDE_DIRS})
+link_directories(${PCL_LIBRARY_DIRS})
+add_definitions(${PCL_DEFINITIONS})
+
+message(STATUS "PCL_INCLUDE_DIRS:" ${PCL_INCLUDE_DIRS})
+message(STATUS "PCL_LIBRARY_DIRS:" ${PCL_LIBRARY_DIRS})
+message(STATUS "PCL_DEFINITIONS:" ${PCL_DEFINITIONS})
+
+find_package(OpenMP)
+if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+###################################
+## catkin specific configuration ##
+###################################
+catkin_package(
+  INCLUDE_DIRS include
+  LIBRARIES ndt_omp
+)
+
+###########
+## Build ##
+###########
+include_directories(include)
+include_directories(
+  ${catkin_INCLUDE_DIRS}
+)
+
+add_library(ndt_omp
+  src/pclomp/voxel_grid_covariance_omp.cpp
+  src/pclomp/ndt_omp.cpp
+)
+
+add_dependencies(align
+  ndt_omp
+)
+target_link_libraries(align
+  ${catkin_LIBRARIES}
+  ${PCL_LIBRARIES}
+  ndt_omp
+)
+
+#############
+## INSTALL ##
+#############
+
+install(
+  TARGETS
+    ndt_omp
+  ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+  RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+  LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+)
+
+# install headers
+install(DIRECTORY include/pclomp
+        DESTINATION ${CATKIN_GLOBAL_INCLUDE_DESTINATION})
+else()
+# ROS2
+find_package(ament_cmake_auto REQUIRED)
+ament_auto_find_build_dependencies()
+
+find_package(PCL 1.7 REQUIRED)
+include_directories(${PCL_INCLUDE_DIRS})
+link_directories(${PCL_LIBRARY_DIRS})
+add_definitions(${PCL_DEFINITIONS})
+
+message(STATUS "PCL_INCLUDE_DIRS:" ${PCL_INCLUDE_DIRS})
+message(STATUS "PCL_LIBRARY_DIRS:" ${PCL_LIBRARY_DIRS})
+message(STATUS "PCL_DEFINITIONS:" ${PCL_DEFINITIONS})
+
+find_package(OpenMP)
+
+###########
+## Build ##
+###########
+
+ament_auto_add_library(ndt_omp SHARED
+  src/pclomp/voxel_grid_covariance_omp.cpp
+  src/pclomp/ndt_omp.cpp
+)
+
+target_link_libraries(ndt_omp ${PCL_LIBRARIES})
+
+if(OpenMP_CXX_FOUND)
+  target_link_libraries(ndt_omp OpenMP::OpenMP_CXX)
+else()
+  message(WARNING "OpenMP not found")
+endif()
+
+ament_auto_package()
+endif()
diff --git a/localization/ndt_omp/LICENSE b/localization/ndt_omp/LICENSE
new file mode 100644
index 0000000000000..62941fb22cf4d
--- /dev/null
+++ b/localization/ndt_omp/LICENSE
@@ -0,0 +1,40 @@
+Copyright 2022 Autoware Foundation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+---
+BSD 2-Clause License
+
+Copyright (c) 2019, k.koide
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/localization/ndt_omp/README.md b/localization/ndt_omp/README.md
new file mode 100644
index 0000000000000..16a44634178e2
--- /dev/null
+++ b/localization/ndt_omp/README.md
@@ -0,0 +1,94 @@
+# ndt_omp
+
+This package provides an OpenMP-boosted Normal Distributions Transform (and GICP) algorithm derived from pcl. The NDT algorithm is modified to be SSE-friendly and multi-threaded. It can run up to 10 times faster than its original version in pcl.
+
+NOTE: This package's implementation is a customized version of [ndt_omp from koide3](https://github.com/koide3/ndt_omp) for Autoware.
+
+[![Build](https://github.com/koide3/ndt_omp/actions/workflows/build.yml/badge.svg)](https://github.com/koide3/ndt_omp/actions/workflows/build.yml)
+
+## Benchmark (on Core i7-6700K)
+
+```bash
+$ roscd ndt_omp/data
+$ rosrun ndt_omp align 251370668.pcd 251371071.pcd
+--- pcl::NDT ---
+single : 282.222[msec]
+10times: 2921.92[msec]
+fitness: 0.213937
+
+--- pclomp::NDT (KDTREE, 1 threads) ---
+single : 207.697[msec]
+10times: 2059.19[msec]
+fitness: 0.213937
+
+--- pclomp::NDT (DIRECT7, 1 threads) ---
+single : 139.433[msec]
+10times: 1356.79[msec]
+fitness: 0.214205
+
+--- pclomp::NDT (DIRECT1, 1 threads) ---
+single : 34.6418[msec]
+10times: 317.03[msec]
+fitness: 0.208511
+
+--- pclomp::NDT (KDTREE, 8 threads) ---
+single : 54.9903[msec]
+10times: 500.51[msec]
+fitness: 0.213937
+
+--- pclomp::NDT (DIRECT7, 8 threads) ---
+single : 63.1442[msec]
+10times: 343.336[msec]
+fitness: 0.214205
+
+--- pclomp::NDT (DIRECT1, 8 threads) ---
+single : 17.2353[msec]
+10times: 100.025[msec]
+fitness: 0.208511
+```
+
+Several methods for neighbor voxel search are implemented. If you select pclomp::KDTREE, results will be completely same as the original pcl::NDT. We recommend to use pclomp::DIRECT7 which is faster and stable. If you need extremely fast registration, choose pclomp::DIRECT1, but it might be a bit unstable.
+
+<img src="data/screenshot.png" height="400pix" /><br>
+Red: target, Green: source, Blue: aligned
+
+## Related packages
+
+- [ndt_omp](https://github.com/koide3/ndt_omp)
+- [fast_gicp](https://github.com/SMRT-AIST/fast_gicp)
+
+## License
+
+A license from [koide3/ndt_omp](https://github.com/koide3/ndt_omp):
+
+```markdown
+BSD 2-Clause License
+
+Copyright (c) 2019, k.koide
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
+
+## Special thanks
+
+- [koide3/ndt_omp](https://github.com/koide3/ndt_omp)
diff --git a/localization/ndt_omp/data/251370668.pcd b/localization/ndt_omp/data/251370668.pcd
new file mode 100644
index 0000000000000..bf25f1b980bff
Binary files /dev/null and b/localization/ndt_omp/data/251370668.pcd differ
diff --git a/localization/ndt_omp/data/251371071.pcd b/localization/ndt_omp/data/251371071.pcd
new file mode 100644
index 0000000000000..b9bdb9cf1c1a3
Binary files /dev/null and b/localization/ndt_omp/data/251371071.pcd differ
diff --git a/localization/ndt_omp/data/screenshot.png b/localization/ndt_omp/data/screenshot.png
new file mode 100644
index 0000000000000..c9c5dcd96c494
Binary files /dev/null and b/localization/ndt_omp/data/screenshot.png differ
diff --git a/localization/ndt_omp/include/pclomp/ndt_omp.h b/localization/ndt_omp/include/pclomp/ndt_omp.h
new file mode 100644
index 0000000000000..ec61b92c03a0d
--- /dev/null
+++ b/localization/ndt_omp/include/pclomp/ndt_omp.h
@@ -0,0 +1,514 @@
+// Copyright 2022 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Point Cloud Library (PCL) - www.pointclouds.org
+ *  Copyright (c) 2010-2012, Willow Garage, Inc.
+ *  Copyright (c) 2012-, Open Perception, Inc.
+ *
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of the copyright holder(s) nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id$
+ *
+ */
+
+#ifndef PCLOMP__NDT_OMP_H_
+#define PCLOMP__NDT_OMP_H_
+
+#include "voxel_grid_covariance_omp.h"
+
+#include <pcl/search/impl/search.hpp>
+#include <unsupported/Eigen/NonLinearOptimization>
+
+#include "boost/optional.hpp"
+
+#include <pcl/registration/registration.h>
+
+#include <vector>
+
+namespace pclomp
+{
+enum NeighborSearchMethod { KDTREE, DIRECT26, DIRECT7, DIRECT1 };
+
+/** \brief A 3D Normal Distribution Transform registration implementation for point cloud data.
+ * \note For more information please see
+ * <b>Magnusson, M. (2009). The Three-Dimensional Normal-Distributions Transform —
+ * an Efficient Representation for Registration, Surface Analysis, and Loop Detection.
+ * PhD thesis, Orebro University. Orebro Studies in Technology 36.</b>,
+ * <b>More, J., and Thuente, D. (1994). Line Search Algorithm with Guaranteed Sufficient Decrease
+ * In ACM Transactions on Mathematical Software.</b> and
+ * Sun, W. and Yuan, Y, (2006) Optimization Theory and Methods: Nonlinear Programming. 89-100
+ * \note Math refactored by Todor Stoyanov.
+ * \author Brian Okorn (Space and Naval Warfare Systems Center Pacific)
+ */
+template <typename PointSource, typename PointTarget>
+class NormalDistributionsTransform : public pcl::Registration<PointSource, PointTarget>
+{
+protected:
+  typedef typename pcl::Registration<PointSource, PointTarget>::PointCloudSource PointCloudSource;
+  typedef typename PointCloudSource::Ptr PointCloudSourcePtr;
+  typedef typename PointCloudSource::ConstPtr PointCloudSourceConstPtr;
+
+  typedef typename pcl::Registration<PointSource, PointTarget>::PointCloudTarget PointCloudTarget;
+  typedef typename PointCloudTarget::Ptr PointCloudTargetPtr;
+  typedef typename PointCloudTarget::ConstPtr PointCloudTargetConstPtr;
+
+  typedef pcl::PointIndices::Ptr PointIndicesPtr;
+  typedef pcl::PointIndices::ConstPtr PointIndicesConstPtr;
+
+  /** \brief Typename of searchable voxel grid containing mean and covariance. */
+  typedef pclomp::VoxelGridCovariance<PointTarget> TargetGrid;
+  /** \brief Typename of pointer to searchable voxel grid. */
+  typedef TargetGrid * TargetGridPtr;
+  /** \brief Typename of const pointer to searchable voxel grid. */
+  typedef const TargetGrid * TargetGridConstPtr;
+  /** \brief Typename of const pointer to searchable voxel grid leaf. */
+  typedef typename TargetGrid::LeafConstPtr TargetGridLeafConstPtr;
+
+public:
+#if PCL_VERSION >= PCL_VERSION_CALC(1, 10, 0)
+  typedef pcl::shared_ptr<NormalDistributionsTransform<PointSource, PointTarget>> Ptr;
+  typedef pcl::shared_ptr<const NormalDistributionsTransform<PointSource, PointTarget>> ConstPtr;
+#else
+  typedef boost::shared_ptr<NormalDistributionsTransform<PointSource, PointTarget>> Ptr;
+  typedef boost::shared_ptr<const NormalDistributionsTransform<PointSource, PointTarget>> ConstPtr;
+#endif
+
+  /** \brief Constructor.
+   * Sets \ref outlier_ratio_ to 0.35, \ref step_size_ to 0.05 and \ref resolution_ to 1.0
+   */
+  NormalDistributionsTransform();
+
+  /** \brief Empty destructor */
+  virtual ~NormalDistributionsTransform() {}
+
+  void setNumThreads(int n) { num_threads_ = n; }
+
+  inline int getNumThreads() const { return num_threads_; }
+
+  /** \brief Provide a pointer to the input target (e.g., the point cloud that we want to align the
+   * input source to). \param[in] cloud the input point cloud target
+   */
+  inline void setInputTarget(const PointCloudTargetConstPtr & cloud)
+  {
+    pcl::Registration<PointSource, PointTarget>::setInputTarget(cloud);
+    init();
+  }
+
+  /** \brief Set/change the voxel grid resolution.
+   * \param[in] resolution side length of voxels
+   */
+  inline void setResolution(float resolution)
+  {
+    // Prevents unnecessary voxel initiations
+    if (resolution_ != resolution) {
+      resolution_ = resolution;
+      if (input_) init();
+    }
+  }
+
+  /** \brief Get voxel grid resolution.
+   * \return side length of voxels
+   */
+  inline float getResolution() const { return (resolution_); }
+
+  /** \brief Get the newton line search maximum step length.
+   * \return maximum step length
+   */
+  inline double getStepSize() const { return (step_size_); }
+
+  /** \brief Set/change the newton line search maximum step length.
+   * \param[in] step_size maximum step length
+   */
+  inline void setStepSize(double step_size) { step_size_ = step_size; }
+
+  /** \brief Get the point cloud outlier ratio.
+   * \return outlier ratio
+   */
+  inline double getOutlierRatio() const { return (outlier_ratio_); }
+
+  /** \brief Set/change the point cloud outlier ratio.
+   * \param[in] outlier_ratio outlier ratio
+   */
+  inline void setOutlierRatio(double outlier_ratio) { outlier_ratio_ = outlier_ratio; }
+
+  inline void setNeighborhoodSearchMethod(NeighborSearchMethod method) { search_method = method; }
+
+  inline NeighborSearchMethod getNeighborhoodSearchMethod() const { return search_method; }
+
+  /** \brief Get the registration alignment probability.
+   * \return transformation probability
+   */
+  inline double getTransformationProbability() const { return (trans_probability_); }
+
+  inline double getNearestVoxelTransformationLikelihood() const
+  {
+    return nearest_voxel_transformation_likelihood_;
+  }
+
+  /** \brief Get the number of iterations required to calculate alignment.
+   * \return final number of iterations
+   */
+  inline int getFinalNumIteration() const { return (nr_iterations_); }
+
+  /** \brief Return the hessian matrix */
+  inline Eigen::Matrix<double, 6, 6> getHessian() const { return hessian_; }
+
+  /** \brief Return the transformation array */
+  inline const std::vector<Eigen::Matrix4f, Eigen::aligned_allocator<Eigen::Matrix4f>>
+  getFinalTransformationArray() const
+  {
+    return transformation_array_;
+  }
+
+  /** \brief Convert 6 element transformation vector to affine transformation.
+   * \param[in] x transformation vector of the form [x, y, z, roll, pitch, yaw]
+   * \param[out] trans affine transform corresponding to given transformation vector
+   */
+  static void convertTransform(const Eigen::Matrix<double, 6, 1> & x, Eigen::Affine3f & trans)
+  {
+    trans = Eigen::Translation<float, 3>(
+              static_cast<float>(x(0)), static_cast<float>(x(1)), static_cast<float>(x(2))) *
+            Eigen::AngleAxis<float>(static_cast<float>(x(3)), Eigen::Vector3f::UnitX()) *
+            Eigen::AngleAxis<float>(static_cast<float>(x(4)), Eigen::Vector3f::UnitY()) *
+            Eigen::AngleAxis<float>(static_cast<float>(x(5)), Eigen::Vector3f::UnitZ());
+  }
+
+  /** \brief Convert 6 element transformation vector to transformation matrix.
+   * \param[in] x transformation vector of the form [x, y, z, roll, pitch, yaw]
+   * \param[out] trans 4x4 transformation matrix corresponding to given transformation vector
+   */
+  static void convertTransform(const Eigen::Matrix<double, 6, 1> & x, Eigen::Matrix4f & trans)
+  {
+    Eigen::Affine3f _affine;
+    convertTransform(x, _affine);
+    trans = _affine.matrix();
+  }
+
+  // negative log likelihood function
+  // lower is better
+  double calculateScore(const PointCloudSource & cloud) const;
+  double calculateTransformationProbability(const PointCloudSource & cloud) const;
+  double calculateNearestVoxelTransformationLikelihood(const PointCloudSource & cloud) const;
+
+  inline void setRegularizationScaleFactor(float regularization_scale_factor)
+  {
+    regularization_scale_factor_ = regularization_scale_factor;
+  }
+
+  inline void setRegularizationPose(Eigen::Matrix4f regularization_pose)
+  {
+    regularization_pose_ = regularization_pose;
+  }
+
+  inline void unsetRegularizationPose() { regularization_pose_ = boost::none; }
+
+protected:
+  using pcl::Registration<PointSource, PointTarget>::reg_name_;
+  using pcl::Registration<PointSource, PointTarget>::getClassName;
+  using pcl::Registration<PointSource, PointTarget>::input_;
+  using pcl::Registration<PointSource, PointTarget>::indices_;
+  using pcl::Registration<PointSource, PointTarget>::target_;
+  using pcl::Registration<PointSource, PointTarget>::nr_iterations_;
+  using pcl::Registration<PointSource, PointTarget>::max_iterations_;
+  using pcl::Registration<PointSource, PointTarget>::previous_transformation_;
+  using pcl::Registration<PointSource, PointTarget>::final_transformation_;
+  using pcl::Registration<PointSource, PointTarget>::transformation_;
+  using pcl::Registration<PointSource, PointTarget>::transformation_epsilon_;
+  using pcl::Registration<PointSource, PointTarget>::converged_;
+  using pcl::Registration<PointSource, PointTarget>::corr_dist_threshold_;
+  using pcl::Registration<PointSource, PointTarget>::inlier_threshold_;
+
+  using pcl::Registration<PointSource, PointTarget>::update_visualizer_;
+
+  /** \brief Estimate the transformation and returns the transformed source (input) as output.
+   * \param[out] output the resultant input transformed point cloud dataset
+   */
+  virtual void computeTransformation(PointCloudSource & output)
+  {
+    computeTransformation(output, Eigen::Matrix4f::Identity());
+  }
+
+  /** \brief Estimate the transformation and returns the transformed source (input) as output.
+   * \param[out] output the resultant input transformed point cloud dataset
+   * \param[in] guess the initial gross estimation of the transformation
+   */
+  virtual void computeTransformation(PointCloudSource & output, const Eigen::Matrix4f & guess);
+
+  /** \brief Initiate covariance voxel structure. */
+  void inline init()
+  {
+    target_cells_.setLeafSize(resolution_, resolution_, resolution_);
+    target_cells_.setInputCloud(target_);
+    // Initiate voxel structure.
+    target_cells_.filter(true);
+  }
+
+  /** \brief Compute derivatives of probability function w.r.t. the transformation vector.
+   * \note Equation 6.10, 6.12 and 6.13 [Magnusson 2009].
+   * \param[out] score_gradient the gradient vector of the probability function w.r.t. the
+   * transformation vector \param[out] hessian the hessian matrix of the probability function w.r.t.
+   * the transformation vector \param[in] trans_cloud transformed point cloud \param[in] p the
+   * current transform vector \param[in] compute_hessian flag to calculate hessian, unnecessary for
+   * step calculation.
+   */
+  double computeDerivatives(
+    Eigen::Matrix<double, 6, 1> & score_gradient, Eigen::Matrix<double, 6, 6> & hessian,
+    PointCloudSource & trans_cloud, Eigen::Matrix<double, 6, 1> & p, bool compute_hessian = true);
+
+  /** \brief Compute individual point contributions to derivatives of probability function w.r.t.
+   * the transformation vector. \note Equation 6.10, 6.12 and 6.13 [Magnusson 2009]. \param[in,out]
+   * score_gradient the gradient vector of the probability function w.r.t. the transformation vector
+   * \param[in,out] hessian the hessian matrix of the probability function w.r.t. the transformation
+   * vector \param[in] x_trans transformed point minus mean of occupied covariance voxel \param[in]
+   * c_inv covariance of occupied covariance voxel \param[in] compute_hessian flag to calculate
+   * hessian, unnecessary for step calculation.
+   */
+  double updateDerivatives(
+    Eigen::Matrix<double, 6, 1> & score_gradient, Eigen::Matrix<double, 6, 6> & hessian,
+    const Eigen::Matrix<float, 4, 6> & point_gradient_,
+    const Eigen::Matrix<float, 24, 6> & point_hessian_, const Eigen::Vector3d & x_trans,
+    const Eigen::Matrix3d & c_inv, bool compute_hessian = true) const;
+
+  /** \brief Precompute angular components of derivatives.
+   * \note Equation 6.19 and 6.21 [Magnusson 2009].
+   * \param[in] p the current transform vector
+   * \param[in] compute_hessian flag to calculate hessian, unnecessary for step calculation.
+   */
+  void computeAngleDerivatives(Eigen::Matrix<double, 6, 1> & p, bool compute_hessian = true);
+
+  /** \brief Compute point derivatives.
+   * \note Equation 6.18-21 [Magnusson 2009].
+   * \param[in] x point from the input cloud
+   * \param[in] compute_hessian flag to calculate hessian, unnecessary for step calculation.
+   */
+  void computePointDerivatives(
+    Eigen::Vector3d & x, Eigen::Matrix<double, 3, 6> & point_gradient_,
+    Eigen::Matrix<double, 18, 6> & point_hessian_, bool compute_hessian = true) const;
+
+  void computePointDerivatives(
+    Eigen::Vector3d & x, Eigen::Matrix<float, 4, 6> & point_gradient_,
+    Eigen::Matrix<float, 24, 6> & point_hessian_, bool compute_hessian = true) const;
+
+  /** \brief Compute hessian of probability function w.r.t. the transformation vector.
+   * \note Equation 6.13 [Magnusson 2009].
+   * \param[out] hessian the hessian matrix of the probability function w.r.t. the transformation
+   * vector \param[in] trans_cloud transformed point cloud \param[in] p the current transform vector
+   */
+  void computeHessian(
+    Eigen::Matrix<double, 6, 6> & hessian, PointCloudSource & trans_cloud,
+    Eigen::Matrix<double, 6, 1> & p);
+
+  /** \brief Compute individual point contributions to hessian of probability function w.r.t. the
+   * transformation vector. \note Equation 6.13 [Magnusson 2009]. \param[in,out] hessian the hessian
+   * matrix of the probability function w.r.t. the transformation vector \param[in] x_trans
+   * transformed point minus mean of occupied covariance voxel \param[in] c_inv covariance of
+   * occupied covariance voxel
+   */
+  void updateHessian(
+    Eigen::Matrix<double, 6, 6> & hessian, const Eigen::Matrix<double, 3, 6> & point_gradient_,
+    const Eigen::Matrix<double, 18, 6> & point_hessian_, const Eigen::Vector3d & x_trans,
+    const Eigen::Matrix3d & c_inv) const;
+
+  /** \brief Compute line search step length and update transform and probability derivatives using
+   * More-Thuente method. \note Search Algorithm [More, Thuente 1994] \param[in] x initial
+   * transformation vector, \f$ x \f$ in Equation 1.3 (Moore, Thuente 1994) and \f$ \vec{p} \f$ in
+   * Algorithm 2 [Magnusson 2009] \param[in] step_dir descent direction, \f$ p \f$ in Equation 1.3
+   * (Moore, Thuente 1994) and \f$ \delta \vec{p} \f$ normalized in Algorithm 2 [Magnusson 2009]
+   * \param[in] step_init initial step length estimate, \f$ \alpha_0 \f$ in Moore-Thuente (1994) and
+   * the normal of \f$ \delta \vec{p} \f$ in Algorithm 2 [Magnusson 2009] \param[in] step_max
+   * maximum step length, \f$ \alpha_max \f$ in Moore-Thuente (1994) \param[in] step_min minimum
+   * step length, \f$ \alpha_min \f$ in Moore-Thuente (1994) \param[out] score final score function
+   * value, \f$ f(x + \alpha p) \f$ in Equation 1.3 (Moore, Thuente 1994) and \f$ score \f$ in
+   * Algorithm 2 [Magnusson 2009] \param[in,out] score_gradient gradient of score function w.r.t.
+   * transformation vector, \f$ f'(x + \alpha p) \f$ in Moore-Thuente (1994) and \f$ \vec{g} \f$ in
+   * Algorithm 2 [Magnusson 2009] \param[out] hessian hessian of score function w.r.t.
+   * transformation vector, \f$ f''(x + \alpha p) \f$ in Moore-Thuente (1994) and \f$ H \f$ in
+   * Algorithm 2 [Magnusson 2009] \param[in,out] trans_cloud transformed point cloud, \f$ X \f$
+   * transformed by \f$ T(\vec{p},\vec{x}) \f$ in Algorithm 2 [Magnusson 2009] \return final step
+   * length
+   */
+  double computeStepLengthMT(
+    const Eigen::Matrix<double, 6, 1> & x, Eigen::Matrix<double, 6, 1> & step_dir, double step_init,
+    double step_max, double step_min, double & score, Eigen::Matrix<double, 6, 1> & score_gradient,
+    Eigen::Matrix<double, 6, 6> & hessian, PointCloudSource & trans_cloud);
+
+  /** \brief Update interval of possible step lengths for More-Thuente method, \f$ I \f$ in
+   * More-Thuente (1994) \note Updating Algorithm until some value satisfies \f$ \psi(\alpha_k) \leq
+   * 0 \f$ and \f$ \phi'(\alpha_k) \geq 0 \f$ and Modified Updating Algorithm from then on [More,
+   * Thuente 1994]. \param[in,out] a_l first endpoint of interval \f$ I \f$, \f$ \alpha_l \f$ in
+   * Moore-Thuente (1994) \param[in,out] f_l value at first endpoint, \f$ f_l \f$ in Moore-Thuente
+   * (1994), \f$ \psi(\alpha_l) \f$ for Update Algorithm and \f$ \phi(\alpha_l) \f$ for Modified
+   * Update Algorithm \param[in,out] g_l derivative at first endpoint, \f$ g_l \f$ in Moore-Thuente
+   * (1994), \f$ \psi'(\alpha_l) \f$ for Update Algorithm and \f$ \phi'(\alpha_l) \f$ for Modified
+   * Update Algorithm \param[in,out] a_u second endpoint of interval \f$ I \f$, \f$ \alpha_u \f$ in
+   * Moore-Thuente (1994) \param[in,out] f_u value at second endpoint, \f$ f_u \f$ in Moore-Thuente
+   * (1994), \f$ \psi(\alpha_u) \f$ for Update Algorithm and \f$ \phi(\alpha_u) \f$ for Modified
+   * Update Algorithm \param[in,out] g_u derivative at second endpoint, \f$ g_u \f$ in Moore-Thuente
+   * (1994), \f$ \psi'(\alpha_u) \f$ for Update Algorithm and \f$ \phi'(\alpha_u) \f$ for Modified
+   * Update Algorithm \param[in] a_t trial value, \f$ \alpha_t \f$ in Moore-Thuente (1994)
+   * \param[in] f_t value at trial value, \f$ f_t \f$ in Moore-Thuente (1994), \f$ \psi(\alpha_t)
+   * \f$ for Update Algorithm and \f$ \phi(\alpha_t) \f$ for Modified Update Algorithm \param[in]
+   * g_t derivative at trial value, \f$ g_t \f$ in Moore-Thuente (1994), \f$ \psi'(\alpha_t) \f$ for
+   * Update Algorithm and \f$ \phi'(\alpha_t) \f$ for Modified Update Algorithm \return if interval
+   * converges
+   */
+  bool updateIntervalMT(
+    double & a_l, double & f_l, double & g_l, double & a_u, double & f_u, double & g_u, double a_t,
+    double f_t, double g_t);
+
+  /** \brief Select new trial value for More-Thuente method.
+   * \note Trial Value Selection [More, Thuente 1994], \f$ \psi(\alpha_k) \f$ is used for \f$ f_k
+   * \f$ and \f$ g_k \f$ until some value satisfies the test \f$ \psi(\alpha_k) \leq 0 \f$ and \f$
+   * \phi'(\alpha_k) \geq 0 \f$ then \f$ \phi(\alpha_k) \f$ is used from then on. \note
+   * Interpolation Minimizer equations from Optimization Theory and Methods: Nonlinear Programming
+   * By Wenyu Sun, Ya-xiang Yuan (89-100). \param[in] a_l first endpoint of interval \f$ I \f$, \f$
+   * \alpha_l \f$ in Moore-Thuente (1994) \param[in] f_l value at first endpoint, \f$ f_l \f$ in
+   * Moore-Thuente (1994) \param[in] g_l derivative at first endpoint, \f$ g_l \f$ in Moore-Thuente
+   * (1994) \param[in] a_u second endpoint of interval \f$ I \f$, \f$ \alpha_u \f$ in Moore-Thuente
+   * (1994) \param[in] f_u value at second endpoint, \f$ f_u \f$ in Moore-Thuente (1994) \param[in]
+   * g_u derivative at second endpoint, \f$ g_u \f$ in Moore-Thuente (1994) \param[in] a_t previous
+   * trial value, \f$ \alpha_t \f$ in Moore-Thuente (1994) \param[in] f_t value at previous trial
+   * value, \f$ f_t \f$ in Moore-Thuente (1994) \param[in] g_t derivative at previous trial value,
+   * \f$ g_t \f$ in Moore-Thuente (1994) \return new trial value
+   */
+  double trialValueSelectionMT(
+    double a_l, double f_l, double g_l, double a_u, double f_u, double g_u, double a_t, double f_t,
+    double g_t);
+
+  /** \brief Auxiliary function used to determine endpoints of More-Thuente interval.
+   * \note \f$ \psi(\alpha) \f$ in Equation 1.6 (Moore, Thuente 1994)
+   * \param[in] a the step length, \f$ \alpha \f$ in More-Thuente (1994)
+   * \param[in] f_a function value at step length a, \f$ \phi(\alpha) \f$ in More-Thuente (1994)
+   * \param[in] f_0 initial function value, \f$ \phi(0) \f$ in Moore-Thuente (1994)
+   * \param[in] g_0 initial function gradiant, \f$ \phi'(0) \f$ in More-Thuente (1994)
+   * \param[in] mu the step length, constant \f$ \mu \f$ in Equation 1.1 [More, Thuente 1994]
+   * \return sufficient decrease value
+   */
+  inline double auxiliaryFunction_PsiMT(
+    double a, double f_a, double f_0, double g_0, double mu = 1.e-4)
+  {
+    return (f_a - f_0 - mu * g_0 * a);
+  }
+
+  /** \brief Auxiliary function derivative used to determine endpoints of More-Thuente interval.
+   * \note \f$ \psi'(\alpha) \f$, derivative of Equation 1.6 (Moore, Thuente 1994)
+   * \param[in] g_a function gradient at step length a, \f$ \phi'(\alpha) \f$ in More-Thuente (1994)
+   * \param[in] g_0 initial function gradiant, \f$ \phi'(0) \f$ in More-Thuente (1994)
+   * \param[in] mu the step length, constant \f$ \mu \f$ in Equation 1.1 [More, Thuente 1994]
+   * \return sufficient decrease derivative
+   */
+  inline double auxiliaryFunction_dPsiMT(double g_a, double g_0, double mu = 1.e-4)
+  {
+    return (g_a - mu * g_0);
+  }
+
+  /** \brief The voxel grid generated from target cloud containing point means and covariances. */
+  TargetGrid target_cells_;
+
+  // double fitness_epsilon_;
+
+  /** \brief The side length of voxels. */
+  float resolution_;
+
+  /** \brief The maximum step length. */
+  double step_size_;
+
+  /** \brief The ratio of outliers of points w.r.t. a normal distribution, Equation 6.7 [Magnusson
+   * 2009]. */
+  double outlier_ratio_;
+
+  /** \brief The normalization constants used fit the point distribution to a normal distribution,
+   * Equation 6.8 [Magnusson 2009]. */
+  double gauss_d1_, gauss_d2_, gauss_d3_;
+
+  /** \brief The probability score of the transform applied to the input cloud, Equation 6.9
+   * and 6.10 [Magnusson 2009]. */
+  double trans_probability_;
+
+  /** \brief Precomputed Angular Gradient
+   *
+   * The precomputed angular derivatives for the jacobian of a transformation vector, Equation 6.19
+   * [Magnusson 2009].
+   */
+  Eigen::Vector3d j_ang_a_, j_ang_b_, j_ang_c_, j_ang_d_, j_ang_e_, j_ang_f_, j_ang_g_, j_ang_h_;
+
+  Eigen::Matrix<float, 8, 4> j_ang;
+
+  /** \brief Precomputed Angular Hessian
+   *
+   * The precomputed angular derivatives for the hessian of a transformation vector, Equation 6.19
+   * [Magnusson 2009].
+   */
+  Eigen::Vector3d h_ang_a2_, h_ang_a3_, h_ang_b2_, h_ang_b3_, h_ang_c2_, h_ang_c3_, h_ang_d1_,
+    h_ang_d2_, h_ang_d3_, h_ang_e1_, h_ang_e2_, h_ang_e3_, h_ang_f1_, h_ang_f2_, h_ang_f3_;
+
+  Eigen::Matrix<float, 16, 4> h_ang;
+
+  /** \brief The first order derivative of the transformation of a point w.r.t. the transform
+   * vector, \f$ J_E \f$ in Equation 6.18 [Magnusson 2009]. */
+  //      Eigen::Matrix<double, 3, 6> point_gradient_;
+
+  /** \brief The second order derivative of the transformation of a point w.r.t. the transform
+   * vector, \f$ H_E \f$ in Equation 6.20 [Magnusson 2009]. */
+  //      Eigen::Matrix<double, 18, 6> point_hessian_;
+
+  int num_threads_;
+
+  Eigen::Matrix<double, 6, 6> hessian_;
+  std::vector<Eigen::Matrix4f, Eigen::aligned_allocator<Eigen::Matrix4f>> transformation_array_;
+  double nearest_voxel_transformation_likelihood_;
+
+  float regularization_scale_factor_;
+  boost::optional<Eigen::Matrix4f> regularization_pose_;
+  Eigen::Vector3f regularization_pose_translation_;
+
+public:
+  NeighborSearchMethod search_method;
+
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+};
+
+}  // namespace pclomp
+
+#endif  // PCLOMP__NDT_OMP_H_
diff --git a/localization/ndt_omp/include/pclomp/ndt_omp_impl.hpp b/localization/ndt_omp/include/pclomp/ndt_omp_impl.hpp
new file mode 100644
index 0000000000000..bf91d8d62b6ef
--- /dev/null
+++ b/localization/ndt_omp/include/pclomp/ndt_omp_impl.hpp
@@ -0,0 +1,1231 @@
+// Copyright 2022 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Point Cloud Library (PCL) - www.pointclouds.org
+ *  Copyright (c) 2010-2011, Willow Garage, Inc.
+ *  Copyright (c) 2012-, Open Perception, Inc.
+ *
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of the copyright holder(s) nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id$
+ *
+ */
+
+#include "ndt_omp.h"
+
+#include <algorithm>
+#include <vector>
+
+#ifndef PCLOMP__NDT_OMP_IMPL_HPP_
+#define PCLOMP__NDT_OMP_IMPL_HPP_
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+pclomp::NormalDistributionsTransform<PointSource, PointTarget>::NormalDistributionsTransform()
+: target_cells_(),
+  resolution_(1.0f),
+  step_size_(0.1),
+  outlier_ratio_(0.55),
+  gauss_d1_(),
+  gauss_d2_(),
+  gauss_d3_(),
+  trans_probability_(),
+  regularization_pose_(boost::none),
+  j_ang_a_(),
+  j_ang_b_(),
+  j_ang_c_(),
+  j_ang_d_(),
+  j_ang_e_(),
+  j_ang_f_(),
+  j_ang_g_(),
+  j_ang_h_(),
+  h_ang_a2_(),
+  h_ang_a3_(),
+  h_ang_b2_(),
+  h_ang_b3_(),
+  h_ang_c2_(),
+  h_ang_c3_(),
+  h_ang_d1_(),
+  h_ang_d2_(),
+  h_ang_d3_(),
+  h_ang_e1_(),
+  h_ang_e2_(),
+  h_ang_e3_(),
+  h_ang_f1_(),
+  h_ang_f2_(),
+  h_ang_f3_()
+{
+  reg_name_ = "NormalDistributionsTransform";
+
+  double gauss_c1, gauss_c2;
+
+  // Initializes the gaussian fitting parameters (eq. 6.8) [Magnusson 2009]
+  gauss_c1 = 10.0 * (1 - outlier_ratio_);
+  gauss_c2 = outlier_ratio_ / pow(resolution_, 3);
+  gauss_d3_ = -log(gauss_c2);
+  gauss_d1_ = -log(gauss_c1 + gauss_c2) - gauss_d3_;
+  gauss_d2_ = -2 * log((-log(gauss_c1 * exp(-0.5) + gauss_c2) - gauss_d3_) / gauss_d1_);
+
+  transformation_epsilon_ = 0.1;
+  max_iterations_ = 35;
+
+  search_method = DIRECT7;
+  num_threads_ = omp_get_max_threads();
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+void pclomp::NormalDistributionsTransform<PointSource, PointTarget>::computeTransformation(
+  PointCloudSource & output, const Eigen::Matrix4f & guess)
+{
+  nr_iterations_ = 0;
+  converged_ = false;
+
+  double gauss_c1, gauss_c2;
+
+  // Initializes the gaussian fitting parameters (eq. 6.8) [Magnusson 2009]
+  gauss_c1 = 10 * (1 - outlier_ratio_);
+  gauss_c2 = outlier_ratio_ / pow(resolution_, 3);
+  gauss_d3_ = -log(gauss_c2);
+  gauss_d1_ = -log(gauss_c1 + gauss_c2) - gauss_d3_;
+  gauss_d2_ = -2 * log((-log(gauss_c1 * exp(-0.5) + gauss_c2) - gauss_d3_) / gauss_d1_);
+
+  if (guess != Eigen::Matrix4f::Identity()) {
+    // Initialise final transformation to the guessed one
+    final_transformation_ = guess;
+    // Apply guessed transformation prior to search for neighbours
+    transformPointCloud(output, output, guess);
+  }
+
+  Eigen::Transform<float, 3, Eigen::Affine, Eigen::ColMajor> eig_transformation;
+  eig_transformation.matrix() = final_transformation_;
+  transformation_array_.clear();
+  transformation_array_.push_back(final_transformation_);
+
+  // Convert initial guess matrix to 6 element transformation vector
+  Eigen::Matrix<double, 6, 1> p, delta_p, score_gradient;
+  Eigen::Vector3f init_translation = eig_transformation.translation();
+  Eigen::Vector3f init_rotation = eig_transformation.rotation().eulerAngles(0, 1, 2);
+  p << init_translation(0), init_translation(1), init_translation(2), init_rotation(0),
+    init_rotation(1), init_rotation(2);
+
+  Eigen::Matrix<double, 6, 6> hessian;
+
+  double score = 0;
+  double delta_p_norm;
+
+  if (regularization_pose_) {
+    Eigen::Transform<float, 3, Eigen::Affine, Eigen::ColMajor> regularization_pose_transformation;
+    regularization_pose_transformation.matrix() = regularization_pose_.get();
+    regularization_pose_translation_ = regularization_pose_transformation.translation();
+  }
+
+  // Calculate derivatives of initial transform vector, subsequent derivative calculations are done
+  // in the step length determination.
+  score = computeDerivatives(score_gradient, hessian, output, p);
+
+  while (!converged_) {
+    // Store previous transformation
+    previous_transformation_ = transformation_;
+
+    // Solve for decent direction using newton method, line 23 in Algorithm 2 [Magnusson 2009]
+    Eigen::JacobiSVD<Eigen::Matrix<double, 6, 6>> sv(
+      hessian, Eigen::ComputeFullU | Eigen::ComputeFullV);
+    // Negative for maximization as opposed to minimization
+    delta_p = sv.solve(-score_gradient);
+
+    // Calculate step length with guaranteed sufficient decrease [More, Thuente 1994]
+    delta_p_norm = delta_p.norm();
+
+    if (delta_p_norm == 0 || delta_p_norm != delta_p_norm) {
+      if (input_->points.empty()) {
+        trans_probability_ = 0.0f;
+      } else {
+        trans_probability_ = score / static_cast<double>(input_->points.size());
+      }
+
+      converged_ = delta_p_norm == delta_p_norm;
+      return;
+    }
+
+    delta_p.normalize();
+    delta_p_norm = computeStepLengthMT(
+      p, delta_p, delta_p_norm, step_size_, transformation_epsilon_ / 2, score, score_gradient,
+      hessian, output);
+    delta_p *= delta_p_norm;
+
+    transformation_ =
+      (Eigen::Translation<float, 3>(
+         static_cast<float>(delta_p(0)), static_cast<float>(delta_p(1)),
+         static_cast<float>(delta_p(2))) *
+       Eigen::AngleAxis<float>(static_cast<float>(delta_p(3)), Eigen::Vector3f::UnitX()) *
+       Eigen::AngleAxis<float>(static_cast<float>(delta_p(4)), Eigen::Vector3f::UnitY()) *
+       Eigen::AngleAxis<float>(static_cast<float>(delta_p(5)), Eigen::Vector3f::UnitZ()))
+        .matrix();
+
+    transformation_array_.push_back(final_transformation_);
+
+    p = p + delta_p;
+
+    // Update Visualizer (untested)
+    if (update_visualizer_ != 0)
+      update_visualizer_(output, std::vector<int>(), *target_, std::vector<int>());
+
+    if (
+      nr_iterations_ > max_iterations_ ||
+      (nr_iterations_ && (std::fabs(delta_p_norm) < transformation_epsilon_))) {
+      converged_ = true;
+    }
+
+    nr_iterations_++;
+  }
+
+  // Store transformation probability. The relative differences within each scan registration are
+  // accurate but the normalization constants need to be modified for it to be globally accurate
+  if (input_->points.empty()) {
+    trans_probability_ = 0.0f;
+  } else {
+    trans_probability_ = score / static_cast<double>(input_->points.size());
+  }
+
+  hessian_ = hessian;
+}
+
+#ifndef _OPENMP
+int omp_get_max_threads() { return 1; }
+int omp_get_thread_num() { return 0; }
+#endif
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+double pclomp::NormalDistributionsTransform<PointSource, PointTarget>::computeDerivatives(
+  Eigen::Matrix<double, 6, 1> & score_gradient, Eigen::Matrix<double, 6, 6> & hessian,
+  PointCloudSource & trans_cloud, Eigen::Matrix<double, 6, 1> & p, bool compute_hessian)
+{
+  score_gradient.setZero();
+  hessian.setZero();
+  double score = 0;
+  int total_neighborhood_count = 0;
+  double nearest_voxel_score = 0;
+  size_t found_neigborhood_voxel_num = 0;
+
+  std::vector<double> scores(input_->points.size());
+  std::vector<double> nearest_voxel_scores(input_->points.size());
+  std::vector<size_t> found_neigborhood_voxel_nums(input_->points.size());
+  std::vector<Eigen::Matrix<double, 6, 1>, Eigen::aligned_allocator<Eigen::Matrix<double, 6, 1>>>
+    score_gradients(input_->points.size());
+  std::vector<Eigen::Matrix<double, 6, 6>, Eigen::aligned_allocator<Eigen::Matrix<double, 6, 6>>>
+    hessians(input_->points.size());
+  std::vector<int> neighborhood_counts(input_->points.size());
+  for (std::size_t i = 0; i < input_->points.size(); i++) {
+    scores[i] = 0;
+    nearest_voxel_scores[i] = 0;
+    found_neigborhood_voxel_nums[i] = 0;
+    score_gradients[i].setZero();
+    hessians[i].setZero();
+    neighborhood_counts[i] = 0;
+  }
+
+  // Precompute Angular Derivatives (eq. 6.19 and 6.21)[Magnusson 2009]
+  computeAngleDerivatives(p);
+
+  std::vector<std::vector<TargetGridLeafConstPtr>> neighborhoods(num_threads_);
+  std::vector<std::vector<float>> distancess(num_threads_);
+
+  // Update gradient and hessian for each point, line 17 in Algorithm 2 [Magnusson 2009]
+#pragma omp parallel for num_threads(num_threads_) schedule(guided, 8)
+  for (std::size_t idx = 0; idx < input_->points.size(); idx++) {
+    int thread_n = omp_get_thread_num();
+
+    // Original Point and Transformed Point
+    PointSource x_pt, x_trans_pt;
+    // Original Point and Transformed Point (for math)
+    Eigen::Vector3d x, x_trans;
+    // Occupied Voxel
+    TargetGridLeafConstPtr cell;
+    // Inverse Covariance of Occupied Voxel
+    Eigen::Matrix3d c_inv;
+
+    // Initialize Point Gradient and Hessian
+    Eigen::Matrix<float, 4, 6> point_gradient_;
+    Eigen::Matrix<float, 24, 6> point_hessian_;
+    point_gradient_.setZero();
+    point_gradient_.block<3, 3>(0, 0).setIdentity();
+    point_hessian_.setZero();
+
+    x_trans_pt = trans_cloud.points[idx];
+
+    auto & neighborhood = neighborhoods[thread_n];
+    auto & distances = distancess[thread_n];
+
+    // Find neighbors (Radius search has been experimentally faster than direct neighbor checking.
+    switch (search_method) {
+      case KDTREE:
+        target_cells_.radiusSearch(x_trans_pt, resolution_, neighborhood, distances);
+        break;
+      case DIRECT26:
+        target_cells_.getNeighborhoodAtPoint(x_trans_pt, neighborhood);
+        break;
+      default:
+      case DIRECT7:
+        target_cells_.getNeighborhoodAtPoint7(x_trans_pt, neighborhood);
+        break;
+      case DIRECT1:
+        target_cells_.getNeighborhoodAtPoint1(x_trans_pt, neighborhood);
+        break;
+    }
+
+    double sum_score_pt = 0;
+    double nearest_voxel_score_pt = 0;
+    Eigen::Matrix<double, 6, 1> score_gradient_pt = Eigen::Matrix<double, 6, 1>::Zero();
+    Eigen::Matrix<double, 6, 6> hessian_pt = Eigen::Matrix<double, 6, 6>::Zero();
+    int neighborhood_count = 0;
+
+    for (typename std::vector<TargetGridLeafConstPtr>::iterator neighborhood_it =
+           neighborhood.begin();
+         neighborhood_it != neighborhood.end(); neighborhood_it++) {
+      cell = *neighborhood_it;
+      x_pt = input_->points[idx];
+      x = Eigen::Vector3d(x_pt.x, x_pt.y, x_pt.z);
+
+      x_trans = Eigen::Vector3d(x_trans_pt.x, x_trans_pt.y, x_trans_pt.z);
+
+      // Denorm point, x_k' in Equations 6.12 and 6.13 [Magnusson 2009]
+      x_trans -= cell->getMean();
+      // Uses precomputed covariance for speed.
+      c_inv = cell->getInverseCov();
+
+      // Compute derivative of transform function w.r.t. transform vector, J_E and H_E in
+      // Equations 6.18 and 6.20 [Magnusson 2009]
+      computePointDerivatives(x, point_gradient_, point_hessian_);
+      // Update score, gradient and hessian, lines 19-21 in Algorithm 2, according to
+      // Equations 6.10, 6.12 and 6.13, respectively [Magnusson 2009]
+      double score_pt = updateDerivatives(
+        score_gradient_pt, hessian_pt, point_gradient_, point_hessian_, x_trans, c_inv,
+        compute_hessian);
+      neighborhood_count++;
+      sum_score_pt += score_pt;
+      if (score_pt > nearest_voxel_score_pt) {
+        nearest_voxel_score_pt = score_pt;
+      }
+    }
+
+    if (!neighborhood.empty()) {
+      ++found_neigborhood_voxel_nums[idx];
+    }
+
+    scores[idx] = sum_score_pt;
+    nearest_voxel_scores[idx] = nearest_voxel_score_pt;
+    score_gradients[idx].noalias() = score_gradient_pt;
+    hessians[idx].noalias() = hessian_pt;
+    neighborhood_counts[idx] += neighborhood_count;
+  }
+
+  // Ensure that the result is invariant against the summing up order
+  for (std::size_t i = 0; i < input_->points.size(); i++) {
+    score += scores[i];
+    nearest_voxel_score += nearest_voxel_scores[i];
+    found_neigborhood_voxel_num += found_neigborhood_voxel_nums[i];
+    score_gradient += score_gradients[i];
+    hessian += hessians[i];
+    total_neighborhood_count += neighborhood_counts[i];
+  }
+
+  if (regularization_pose_) {
+    float regularization_score = 0.0f;
+    Eigen::Matrix<double, 6, 1> regularization_gradient = Eigen::Matrix<double, 6, 1>::Zero();
+    Eigen::Matrix<double, 6, 6> regularization_hessian = Eigen::Matrix<double, 6, 6>::Zero();
+
+    const float dx = regularization_pose_translation_(0) - static_cast<float>(p(0, 0));
+    const float dy = regularization_pose_translation_(1) - static_cast<float>(p(1, 0));
+    const auto sin_yaw = static_cast<float>(sin(p(5, 0)));
+    const auto cos_yaw = static_cast<float>(cos(p(5, 0)));
+    const float longitudinal_distance = dy * sin_yaw + dx * cos_yaw;
+    const auto neighborhood_count_weight = static_cast<float>(total_neighborhood_count);
+
+    regularization_score = -regularization_scale_factor_ * neighborhood_count_weight *
+                           longitudinal_distance * longitudinal_distance;
+
+    regularization_gradient(0, 0) = regularization_scale_factor_ * neighborhood_count_weight *
+                                    2.0f * cos_yaw * longitudinal_distance;
+    regularization_gradient(1, 0) = regularization_scale_factor_ * neighborhood_count_weight *
+                                    2.0f * sin_yaw * longitudinal_distance;
+
+    regularization_hessian(0, 0) =
+      -regularization_scale_factor_ * neighborhood_count_weight * 2.0f * cos_yaw * cos_yaw;
+    regularization_hessian(0, 1) =
+      -regularization_scale_factor_ * neighborhood_count_weight * 2.0f * cos_yaw * sin_yaw;
+    regularization_hessian(1, 1) =
+      -regularization_scale_factor_ * neighborhood_count_weight * 2.0f * sin_yaw * sin_yaw;
+    regularization_hessian(1, 0) = regularization_hessian(0, 1);
+
+    score += regularization_score;
+    score_gradient += regularization_gradient;
+    hessian += regularization_hessian;
+  }
+
+  if (found_neigborhood_voxel_num != 0) {
+    nearest_voxel_transformation_likelihood_ =
+      nearest_voxel_score / static_cast<double>(found_neigborhood_voxel_num);
+  } else {
+    nearest_voxel_transformation_likelihood_ = 0.0;
+  }
+
+  return (score);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+void pclomp::NormalDistributionsTransform<PointSource, PointTarget>::computeAngleDerivatives(
+  Eigen::Matrix<double, 6, 1> & p, bool compute_hessian)
+{
+  // Simplified math for near 0 angles
+  double cx, cy, cz, sx, sy, sz;
+  if (fabs(p(3)) < 10e-5) {
+    // p(3) = 0;
+    cx = 1.0;
+    sx = 0.0;
+  } else {
+    cx = cos(p(3));
+    sx = sin(p(3));
+  }
+  if (fabs(p(4)) < 10e-5) {
+    // p(4) = 0;
+    cy = 1.0;
+    sy = 0.0;
+  } else {
+    cy = cos(p(4));
+    sy = sin(p(4));
+  }
+
+  if (fabs(p(5)) < 10e-5) {
+    // p(5) = 0;
+    cz = 1.0;
+    sz = 0.0;
+  } else {
+    cz = cos(p(5));
+    sz = sin(p(5));
+  }
+
+  // Precomputed angular gradiant components. Letters correspond to Equation 6.19 [Magnusson 2009]
+  j_ang_a_ << (-sx * sz + cx * sy * cz), (-sx * cz - cx * sy * sz), (-cx * cy);
+  j_ang_b_ << (cx * sz + sx * sy * cz), (cx * cz - sx * sy * sz), (-sx * cy);
+  j_ang_c_ << (-sy * cz), sy * sz, cy;
+  j_ang_d_ << sx * cy * cz, (-sx * cy * sz), sx * sy;
+  j_ang_e_ << (-cx * cy * cz), cx * cy * sz, (-cx * sy);
+  j_ang_f_ << (-cy * sz), (-cy * cz), 0;
+  j_ang_g_ << (cx * cz - sx * sy * sz), (-cx * sz - sx * sy * cz), 0;
+  j_ang_h_ << (sx * cz + cx * sy * sz), (cx * sy * cz - sx * sz), 0;
+
+  j_ang.setZero();
+  j_ang.row(0).noalias() =
+    Eigen::Vector4f((-sx * sz + cx * sy * cz), (-sx * cz - cx * sy * sz), (-cx * cy), 0.0f);
+  j_ang.row(1).noalias() =
+    Eigen::Vector4f((cx * sz + sx * sy * cz), (cx * cz - sx * sy * sz), (-sx * cy), 0.0f);
+  j_ang.row(2).noalias() = Eigen::Vector4f((-sy * cz), sy * sz, cy, 0.0f);
+  j_ang.row(3).noalias() = Eigen::Vector4f(sx * cy * cz, (-sx * cy * sz), sx * sy, 0.0f);
+  j_ang.row(4).noalias() = Eigen::Vector4f((-cx * cy * cz), cx * cy * sz, (-cx * sy), 0.0f);
+  j_ang.row(5).noalias() = Eigen::Vector4f((-cy * sz), (-cy * cz), 0, 0.0f);
+  j_ang.row(6).noalias() =
+    Eigen::Vector4f((cx * cz - sx * sy * sz), (-cx * sz - sx * sy * cz), 0, 0.0f);
+  j_ang.row(7).noalias() =
+    Eigen::Vector4f((sx * cz + cx * sy * sz), (cx * sy * cz - sx * sz), 0, 0.0f);
+
+  if (compute_hessian) {
+    // Precomputed angular hessian components. Letters correspond to Equation 6.21 and numbers
+    // correspond to row index [Magnusson 2009]
+    h_ang_a2_ << (-cx * sz - sx * sy * cz), (-cx * cz + sx * sy * sz), sx * cy;
+    h_ang_a3_ << (-sx * sz + cx * sy * cz), (-cx * sy * sz - sx * cz), (-cx * cy);
+
+    h_ang_b2_ << (cx * cy * cz), (-cx * cy * sz), (cx * sy);
+    h_ang_b3_ << (sx * cy * cz), (-sx * cy * sz), (sx * sy);
+
+    h_ang_c2_ << (-sx * cz - cx * sy * sz), (sx * sz - cx * sy * cz), 0;
+    h_ang_c3_ << (cx * cz - sx * sy * sz), (-sx * sy * cz - cx * sz), 0;
+
+    h_ang_d1_ << (-cy * cz), (cy * sz), (sy);
+    h_ang_d2_ << (-sx * sy * cz), (sx * sy * sz), (sx * cy);
+    h_ang_d3_ << (cx * sy * cz), (-cx * sy * sz), (-cx * cy);
+
+    h_ang_e1_ << (sy * sz), (sy * cz), 0;
+    h_ang_e2_ << (-sx * cy * sz), (-sx * cy * cz), 0;
+    h_ang_e3_ << (cx * cy * sz), (cx * cy * cz), 0;
+
+    h_ang_f1_ << (-cy * cz), (cy * sz), 0;
+    h_ang_f2_ << (-cx * sz - sx * sy * cz), (-cx * cz + sx * sy * sz), 0;
+    h_ang_f3_ << (-sx * sz + cx * sy * cz), (-cx * sy * sz - sx * cz), 0;
+
+    h_ang.setZero();
+    h_ang.row(0).noalias() =
+      Eigen::Vector4f((-cx * sz - sx * sy * cz), (-cx * cz + sx * sy * sz), sx * cy, 0.0f);  // a2
+    h_ang.row(1).noalias() = Eigen::Vector4f(
+      (-sx * sz + cx * sy * cz), (-cx * sy * sz - sx * cz), (-cx * cy), 0.0f);  // a3
+
+    h_ang.row(2).noalias() =
+      Eigen::Vector4f((cx * cy * cz), (-cx * cy * sz), (cx * sy), 0.0f);  // b2
+    h_ang.row(3).noalias() =
+      Eigen::Vector4f((sx * cy * cz), (-sx * cy * sz), (sx * sy), 0.0f);  // b3
+
+    h_ang.row(4).noalias() =
+      Eigen::Vector4f((-sx * cz - cx * sy * sz), (sx * sz - cx * sy * cz), 0, 0.0f);  // c2
+    h_ang.row(5).noalias() =
+      Eigen::Vector4f((cx * cz - sx * sy * sz), (-sx * sy * cz - cx * sz), 0, 0.0f);  // c3
+
+    h_ang.row(6).noalias() = Eigen::Vector4f((-cy * cz), (cy * sz), (sy), 0.0f);  // d1
+    h_ang.row(7).noalias() =
+      Eigen::Vector4f((-sx * sy * cz), (sx * sy * sz), (sx * cy), 0.0f);  // d2
+    h_ang.row(8).noalias() =
+      Eigen::Vector4f((cx * sy * cz), (-cx * sy * sz), (-cx * cy), 0.0f);  // d3
+
+    h_ang.row(9).noalias() = Eigen::Vector4f((sy * sz), (sy * cz), 0, 0.0f);               // e1
+    h_ang.row(10).noalias() = Eigen::Vector4f((-sx * cy * sz), (-sx * cy * cz), 0, 0.0f);  // e2
+    h_ang.row(11).noalias() = Eigen::Vector4f((cx * cy * sz), (cx * cy * cz), 0, 0.0f);    // e3
+
+    h_ang.row(12).noalias() = Eigen::Vector4f((-cy * cz), (cy * sz), 0, 0.0f);  // f1
+    h_ang.row(13).noalias() =
+      Eigen::Vector4f((-cx * sz - sx * sy * cz), (-cx * cz + sx * sy * sz), 0, 0.0f);  // f2
+    h_ang.row(14).noalias() =
+      Eigen::Vector4f((-sx * sz + cx * sy * cz), (-cx * sy * sz - sx * cz), 0, 0.0f);  // f3
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+void pclomp::NormalDistributionsTransform<PointSource, PointTarget>::computePointDerivatives(
+  Eigen::Vector3d & x, Eigen::Matrix<float, 4, 6> & point_gradient_,
+  Eigen::Matrix<float, 24, 6> & point_hessian_, bool compute_hessian) const
+{
+  Eigen::Vector4f x4(x[0], x[1], x[2], 0.0f);
+
+  // Calculate first derivative of Transformation Equation 6.17 w.r.t. transform vector p.
+  // Derivative w.r.t. ith element of transform vector corresponds to column i, Equation 6.18
+  // and 6.19 [Magnusson 2009]
+  Eigen::Matrix<float, 8, 1> x_j_ang = j_ang * x4;
+
+  point_gradient_(1, 3) = x_j_ang[0];
+  point_gradient_(2, 3) = x_j_ang[1];
+  point_gradient_(0, 4) = x_j_ang[2];
+  point_gradient_(1, 4) = x_j_ang[3];
+  point_gradient_(2, 4) = x_j_ang[4];
+  point_gradient_(0, 5) = x_j_ang[5];
+  point_gradient_(1, 5) = x_j_ang[6];
+  point_gradient_(2, 5) = x_j_ang[7];
+
+  if (compute_hessian) {
+    Eigen::Matrix<float, 16, 1> x_h_ang = h_ang * x4;
+
+    // Vectors from Equation 6.21 [Magnusson 2009]
+    Eigen::Vector4f a(0, x_h_ang[0], x_h_ang[1], 0.0f);
+    Eigen::Vector4f b(0, x_h_ang[2], x_h_ang[3], 0.0f);
+    Eigen::Vector4f c(0, x_h_ang[4], x_h_ang[5], 0.0f);
+    Eigen::Vector4f d(x_h_ang[6], x_h_ang[7], x_h_ang[8], 0.0f);
+    Eigen::Vector4f e(x_h_ang[9], x_h_ang[10], x_h_ang[11], 0.0f);
+    Eigen::Vector4f f(x_h_ang[12], x_h_ang[13], x_h_ang[14], 0.0f);
+
+    // Calculate second derivative of Transformation Equation 6.17 w.r.t. transform vector p.
+    // Derivative w.r.t. ith and jth elements of transform vector corresponds to the 3x1 block
+    // matrix starting at (3i,j), Equation 6.20 and 6.21 [Magnusson 2009]
+    point_hessian_.block<4, 1>((9 / 3) * 4, 3) = a;
+    point_hessian_.block<4, 1>((12 / 3) * 4, 3) = b;
+    point_hessian_.block<4, 1>((15 / 3) * 4, 3) = c;
+    point_hessian_.block<4, 1>((9 / 3) * 4, 4) = b;
+    point_hessian_.block<4, 1>((12 / 3) * 4, 4) = d;
+    point_hessian_.block<4, 1>((15 / 3) * 4, 4) = e;
+    point_hessian_.block<4, 1>((9 / 3) * 4, 5) = c;
+    point_hessian_.block<4, 1>((12 / 3) * 4, 5) = e;
+    point_hessian_.block<4, 1>((15 / 3) * 4, 5) = f;
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+void pclomp::NormalDistributionsTransform<PointSource, PointTarget>::computePointDerivatives(
+  Eigen::Vector3d & x, Eigen::Matrix<double, 3, 6> & point_gradient_,
+  Eigen::Matrix<double, 18, 6> & point_hessian_, bool compute_hessian) const
+{
+  // Calculate first derivative of Transformation Equation 6.17 w.r.t. transform vector p.
+  // Derivative w.r.t. ith element of transform vector corresponds to column i, Equation 6.18
+  // and 6.19 [Magnusson 2009]
+  point_gradient_(1, 3) = x.dot(j_ang_a_);
+  point_gradient_(2, 3) = x.dot(j_ang_b_);
+  point_gradient_(0, 4) = x.dot(j_ang_c_);
+  point_gradient_(1, 4) = x.dot(j_ang_d_);
+  point_gradient_(2, 4) = x.dot(j_ang_e_);
+  point_gradient_(0, 5) = x.dot(j_ang_f_);
+  point_gradient_(1, 5) = x.dot(j_ang_g_);
+  point_gradient_(2, 5) = x.dot(j_ang_h_);
+
+  if (compute_hessian) {
+    // Vectors from Equation 6.21 [Magnusson 2009]
+    Eigen::Vector3d a, b, c, d, e, f;
+
+    a << 0, x.dot(h_ang_a2_), x.dot(h_ang_a3_);
+    b << 0, x.dot(h_ang_b2_), x.dot(h_ang_b3_);
+    c << 0, x.dot(h_ang_c2_), x.dot(h_ang_c3_);
+    d << x.dot(h_ang_d1_), x.dot(h_ang_d2_), x.dot(h_ang_d3_);
+    e << x.dot(h_ang_e1_), x.dot(h_ang_e2_), x.dot(h_ang_e3_);
+    f << x.dot(h_ang_f1_), x.dot(h_ang_f2_), x.dot(h_ang_f3_);
+
+    // Calculate second derivative of Transformation Equation 6.17 w.r.t. transform vector p.
+    // Derivative w.r.t. ith and jth elements of transform vector corresponds to the 3x1 block
+    // matrix starting at (3i,j), Equation 6.20 and 6.21 [Magnusson 2009]
+    point_hessian_.block<3, 1>(9, 3) = a;
+    point_hessian_.block<3, 1>(12, 3) = b;
+    point_hessian_.block<3, 1>(15, 3) = c;
+    point_hessian_.block<3, 1>(9, 4) = b;
+    point_hessian_.block<3, 1>(12, 4) = d;
+    point_hessian_.block<3, 1>(15, 4) = e;
+    point_hessian_.block<3, 1>(9, 5) = c;
+    point_hessian_.block<3, 1>(12, 5) = e;
+    point_hessian_.block<3, 1>(15, 5) = f;
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+double pclomp::NormalDistributionsTransform<PointSource, PointTarget>::updateDerivatives(
+  Eigen::Matrix<double, 6, 1> & score_gradient, Eigen::Matrix<double, 6, 6> & hessian,
+  const Eigen::Matrix<float, 4, 6> & point_gradient4,
+  const Eigen::Matrix<float, 24, 6> & point_hessian_, const Eigen::Vector3d & x_trans,
+  const Eigen::Matrix3d & c_inv, bool compute_hessian) const
+{
+  Eigen::Matrix<float, 1, 4> x_trans4(x_trans[0], x_trans[1], x_trans[2], 0.0f);
+  Eigen::Matrix4f c_inv4 = Eigen::Matrix4f::Zero();
+  c_inv4.topLeftCorner(3, 3) = c_inv.cast<float>();
+
+  float gauss_d2 = gauss_d2_;
+
+  // e^(-d_2/2 * (x_k - mu_k)^T Sigma_k^-1 (x_k - mu_k)) Equation 6.9 [Magnusson 2009]
+  float e_x_cov_x = exp(-gauss_d2 * x_trans4.dot(x_trans4 * c_inv4) * 0.5f);
+  // Calculate probability of transformed points existence, Equation 6.9 [Magnusson 2009]
+  float score_inc = -gauss_d1_ * e_x_cov_x;
+
+  e_x_cov_x = gauss_d2 * e_x_cov_x;
+
+  // Error checking for invalid values.
+  if (e_x_cov_x > 1 || e_x_cov_x < 0 || e_x_cov_x != e_x_cov_x) return (0);
+
+  // Reusable portion of Equation 6.12 and 6.13 [Magnusson 2009]
+  e_x_cov_x *= gauss_d1_;
+
+  Eigen::Matrix<float, 4, 6> c_inv4_x_point_gradient4 = c_inv4 * point_gradient4;
+  Eigen::Matrix<float, 6, 1> x_trans4_dot_c_inv4_x_point_gradient4 =
+    x_trans4 * c_inv4_x_point_gradient4;
+
+  score_gradient.noalias() += (e_x_cov_x * x_trans4_dot_c_inv4_x_point_gradient4).cast<double>();
+
+  if (compute_hessian) {
+    Eigen::Matrix<float, 1, 4> x_trans4_x_c_inv4 = x_trans4 * c_inv4;
+    Eigen::Matrix<float, 6, 6> point_gradient4_colj_dot_c_inv4_x_point_gradient4_col_i =
+      point_gradient4.transpose() * c_inv4_x_point_gradient4;
+    Eigen::Matrix<float, 6, 1> x_trans4_dot_c_inv4_x_ext_point_hessian_4ij;
+
+    for (int i = 0; i < 6; i++) {
+      // Sigma_k^-1 d(T(x,p))/dpi, Reusable portion of Equation 6.12 and 6.13 [Magnusson 2009]
+      // Update gradient, Equation 6.12 [Magnusson 2009]
+      x_trans4_dot_c_inv4_x_ext_point_hessian_4ij.noalias() =
+        x_trans4_x_c_inv4 * point_hessian_.block<4, 6>(i * 4, 0);
+
+      for (int j = 0; j < hessian.cols(); j++) {
+        // Update hessian, Equation 6.13 [Magnusson 2009]
+        hessian(i, j) +=
+          e_x_cov_x * (-gauss_d2 * x_trans4_dot_c_inv4_x_point_gradient4(i) *
+                         x_trans4_dot_c_inv4_x_point_gradient4(j) +
+                       x_trans4_dot_c_inv4_x_ext_point_hessian_4ij(j) +
+                       point_gradient4_colj_dot_c_inv4_x_point_gradient4_col_i(j, i));
+      }
+    }
+  }
+
+  return (score_inc);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+void pclomp::NormalDistributionsTransform<PointSource, PointTarget>::computeHessian(
+  Eigen::Matrix<double, 6, 6> & hessian, PointCloudSource & trans_cloud,
+  Eigen::Matrix<double, 6, 1> &)
+{
+  // Original Point and Transformed Point
+  PointSource x_pt, x_trans_pt;
+  // Original Point and Transformed Point (for math)
+  Eigen::Vector3d x, x_trans;
+  // Occupied Voxel
+  TargetGridLeafConstPtr cell;
+  // Inverse Covariance of Occupied Voxel
+  Eigen::Matrix3d c_inv;
+
+  // Initialize Point Gradient and Hessian
+  Eigen::Matrix<double, 3, 6> point_gradient_;
+  Eigen::Matrix<double, 18, 6> point_hessian_;
+  point_gradient_.setZero();
+  point_gradient_.block<3, 3>(0, 0).setIdentity();
+  point_hessian_.setZero();
+
+  hessian.setZero();
+
+  // Precompute Angular Derivatives unnecessary because only used after regular derivative
+  // calculation
+
+  // Update hessian for each point, line 17 in Algorithm 2 [Magnusson 2009]
+  for (size_t idx = 0; idx < input_->points.size(); idx++) {
+    x_trans_pt = trans_cloud.points[idx];
+
+    // Find neighbors (Radius search has been experimentally faster than direct neighbor checking.
+    std::vector<TargetGridLeafConstPtr> neighborhood;
+    std::vector<float> distances;
+    switch (search_method) {
+      case KDTREE:
+        target_cells_.radiusSearch(x_trans_pt, resolution_, neighborhood, distances);
+        break;
+      case DIRECT26:
+        target_cells_.getNeighborhoodAtPoint(x_trans_pt, neighborhood);
+        break;
+      default:
+      case DIRECT7:
+        target_cells_.getNeighborhoodAtPoint7(x_trans_pt, neighborhood);
+        break;
+      case DIRECT1:
+        target_cells_.getNeighborhoodAtPoint1(x_trans_pt, neighborhood);
+        break;
+    }
+
+    for (typename std::vector<TargetGridLeafConstPtr>::iterator neighborhood_it =
+           neighborhood.begin();
+         neighborhood_it != neighborhood.end(); neighborhood_it++) {
+      cell = *neighborhood_it;
+
+      {
+        x_pt = input_->points[idx];
+        x = Eigen::Vector3d(x_pt.x, x_pt.y, x_pt.z);
+
+        x_trans = Eigen::Vector3d(x_trans_pt.x, x_trans_pt.y, x_trans_pt.z);
+
+        // Denorm point, x_k' in Equations 6.12 and 6.13 [Magnusson 2009]
+        x_trans -= cell->getMean();
+        // Uses precomputed covariance for speed.
+        c_inv = cell->getInverseCov();
+
+        // Compute derivative of transform function w.r.t. transform vector, J_E and H_E in
+        // Equations 6.18 and 6.20 [Magnusson 2009]
+        computePointDerivatives(x, point_gradient_, point_hessian_);
+        // Update hessian, lines 21 in Algorithm 2, according to Equations 6.10, 6.12 and 6.13,
+        // respectively [Magnusson 2009]
+        updateHessian(hessian, point_gradient_, point_hessian_, x_trans, c_inv);
+      }
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+void pclomp::NormalDistributionsTransform<PointSource, PointTarget>::updateHessian(
+  Eigen::Matrix<double, 6, 6> & hessian, const Eigen::Matrix<double, 3, 6> & point_gradient_,
+  const Eigen::Matrix<double, 18, 6> & point_hessian_, const Eigen::Vector3d & x_trans,
+  const Eigen::Matrix3d & c_inv) const
+{
+  Eigen::Vector3d cov_dxd_pi;
+  // e^(-d_2/2 * (x_k - mu_k)^T Sigma_k^-1 (x_k - mu_k)) Equation 6.9 [Magnusson 2009]
+  double e_x_cov_x = gauss_d2_ * exp(-gauss_d2_ * x_trans.dot(c_inv * x_trans) / 2);
+
+  // Error checking for invalid values.
+  if (e_x_cov_x > 1 || e_x_cov_x < 0 || e_x_cov_x != e_x_cov_x) return;
+
+  // Reusable portion of Equation 6.12 and 6.13 [Magnusson 2009]
+  e_x_cov_x *= gauss_d1_;
+
+  for (int i = 0; i < 6; i++) {
+    // Sigma_k^-1 d(T(x,p))/dpi, Reusable portion of Equation 6.12 and 6.13 [Magnusson 2009]
+    cov_dxd_pi = c_inv * point_gradient_.col(i);
+
+    for (int j = 0; j < hessian.cols(); j++) {
+      // Update hessian, Equation 6.13 [Magnusson 2009]
+      hessian(i, j) += e_x_cov_x * (-gauss_d2_ * x_trans.dot(cov_dxd_pi) *
+                                      x_trans.dot(c_inv * point_gradient_.col(j)) +
+                                    x_trans.dot(c_inv * point_hessian_.block<3, 1>(3 * i, j)) +
+                                    point_gradient_.col(j).dot(cov_dxd_pi));
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+bool pclomp::NormalDistributionsTransform<PointSource, PointTarget>::updateIntervalMT(
+  double & a_l, double & f_l, double & g_l, double & a_u, double & f_u, double & g_u, double a_t,
+  double f_t, double g_t)
+{
+  // Case U1 in Update Algorithm and Case a in Modified Update Algorithm [More, Thuente 1994]
+  if (f_t > f_l) {
+    a_u = a_t;
+    f_u = f_t;
+    g_u = g_t;
+    return (false);
+  } else if (g_t * (a_l - a_t) > 0) {  // Case U2 in Update Algorithm and Case b in Modified Update
+                                       // Algorithm [More, Thuente 1994]
+    a_l = a_t;
+    f_l = f_t;
+    g_l = g_t;
+    return (false);
+  } else if (g_t * (a_l - a_t) < 0) {  // Case U3 in Update Algorithm and Case c in Modified Update
+                                       // Algorithm [More, Thuente 1994]
+    a_u = a_l;
+    f_u = f_l;
+    g_u = g_l;
+
+    a_l = a_t;
+    f_l = f_t;
+    g_l = g_t;
+    return (false);
+  } else {  // Interval Converged
+    return (true);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+double pclomp::NormalDistributionsTransform<PointSource, PointTarget>::trialValueSelectionMT(
+  double a_l, double f_l, double g_l, double a_u, double f_u, double g_u, double a_t, double f_t,
+  double g_t)
+{
+  // Case 1 in Trial Value Selection [More, Thuente 1994]
+  if (f_t > f_l) {
+    // Calculate the minimizer of the cubic that interpolates f_l, f_t, g_l and g_t
+    // Equation 2.4.52 [Sun, Yuan 2006]
+    double z = 3 * (f_t - f_l) / (a_t - a_l) - g_t - g_l;
+    double w = std::sqrt(z * z - g_t * g_l);
+    // Equation 2.4.56 [Sun, Yuan 2006]
+    double a_c = a_l + (a_t - a_l) * (w - g_l - z) / (g_t - g_l + 2 * w);
+
+    // Calculate the minimizer of the quadratic that interpolates f_l, f_t and g_l
+    // Equation 2.4.2 [Sun, Yuan 2006]
+    double a_q = a_l - 0.5 * (a_l - a_t) * g_l / (g_l - (f_l - f_t) / (a_l - a_t));
+
+    if (std::fabs(a_c - a_l) < std::fabs(a_q - a_l))
+      return (a_c);
+    else
+      return (0.5 * (a_q + a_c));
+  } else if (g_t * g_l < 0) {  // Case 2 in Trial Value Selection [More, Thuente 1994]
+    // Calculate the minimizer of the cubic that interpolates f_l, f_t, g_l and g_t
+    // Equation 2.4.52 [Sun, Yuan 2006]
+    double z = 3 * (f_t - f_l) / (a_t - a_l) - g_t - g_l;
+    double w = std::sqrt(z * z - g_t * g_l);
+    // Equation 2.4.56 [Sun, Yuan 2006]
+    double a_c = a_l + (a_t - a_l) * (w - g_l - z) / (g_t - g_l + 2 * w);
+
+    // Calculate the minimizer of the quadratic that interpolates f_l, g_l and g_t
+    // Equation 2.4.5 [Sun, Yuan 2006]
+    double a_s = a_l - (a_l - a_t) / (g_l - g_t) * g_l;
+
+    if (std::fabs(a_c - a_t) >= std::fabs(a_s - a_t))
+      return (a_c);
+    else
+      return (a_s);
+  } else if (std::fabs(g_t) <= std::fabs(g_l)) {  // Case 3 in Trial Value Selection [More, Thuente
+                                                  // 1994]
+    // Calculate the minimizer of the cubic that interpolates f_l, f_t, g_l and g_t
+    // Equation 2.4.52 [Sun, Yuan 2006]
+    double z = 3 * (f_t - f_l) / (a_t - a_l) - g_t - g_l;
+    double w = std::sqrt(z * z - g_t * g_l);
+    double a_c = a_l + (a_t - a_l) * (w - g_l - z) / (g_t - g_l + 2 * w);
+
+    // Calculate the minimizer of the quadratic that interpolates g_l and g_t
+    // Equation 2.4.5 [Sun, Yuan 2006]
+    double a_s = a_l - (a_l - a_t) / (g_l - g_t) * g_l;
+
+    double a_t_next;
+
+    if (std::fabs(a_c - a_t) < std::fabs(a_s - a_t))
+      a_t_next = a_c;
+    else
+      a_t_next = a_s;
+
+    if (a_t > a_l)
+      return (std::min(a_t + 0.66 * (a_u - a_t), a_t_next));
+    else
+      return (std::max(a_t + 0.66 * (a_u - a_t), a_t_next));
+  } else {  // Case 4 in Trial Value Selection [More, Thuente 1994]
+    // Calculate the minimizer of the cubic that interpolates f_u, f_t, g_u and g_t
+    // Equation 2.4.52 [Sun, Yuan 2006]
+    double z = 3 * (f_t - f_u) / (a_t - a_u) - g_t - g_u;
+    double w = std::sqrt(z * z - g_t * g_u);
+    // Equation 2.4.56 [Sun, Yuan 2006]
+    return (a_u + (a_t - a_u) * (w - g_u - z) / (g_t - g_u + 2 * w));
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointSource, typename PointTarget>
+double pclomp::NormalDistributionsTransform<PointSource, PointTarget>::computeStepLengthMT(
+  const Eigen::Matrix<double, 6, 1> & x, Eigen::Matrix<double, 6, 1> & step_dir, double step_init,
+  double step_max, double step_min, double & score, Eigen::Matrix<double, 6, 1> & score_gradient,
+  Eigen::Matrix<double, 6, 6> & hessian, PointCloudSource & trans_cloud)
+{
+  // Set the value of phi(0), Equation 1.3 [More, Thuente 1994]
+  double phi_0 = -score;
+  // Set the value of phi'(0), Equation 1.3 [More, Thuente 1994]
+  double d_phi_0 = -(score_gradient.dot(step_dir));
+
+  Eigen::Matrix<double, 6, 1> x_t;
+
+  if (d_phi_0 >= 0) {
+    // Not a decent direction
+    if (d_phi_0 == 0) {
+      return 0;
+    } else {
+      // Reverse step direction and calculate optimal step.
+      d_phi_0 *= -1;
+      step_dir *= -1;
+    }
+  }
+
+  // The Search Algorithm for T(mu) [More, Thuente 1994]
+
+  int max_step_iterations = 10;
+  int step_iterations = 0;
+
+  // Sufficient decrease constant, Equation 1.1 [More, Thuete 1994]
+  double mu = 1.e-4;
+  // Curvature condition constant, Equation 1.2 [More, Thuete 1994]
+  double nu = 0.9;
+
+  // Initial endpoints of Interval I,
+  double a_l = 0, a_u = 0;
+
+  // Auxiliary function psi is used until I is determined ot be a closed interval, Equation 2.1
+  // [More, Thuente 1994]
+  double f_l = auxiliaryFunction_PsiMT(a_l, phi_0, phi_0, d_phi_0, mu);
+  double g_l = auxiliaryFunction_dPsiMT(d_phi_0, d_phi_0, mu);
+
+  double f_u = auxiliaryFunction_PsiMT(a_u, phi_0, phi_0, d_phi_0, mu);
+  double g_u = auxiliaryFunction_dPsiMT(d_phi_0, d_phi_0, mu);
+
+  // Check used to allow More-Thuente step length calculation to be skipped by making step_min ==
+  // step_max
+  bool interval_converged = (step_max - step_min) < 0, open_interval = true;
+
+  double a_t = step_init;
+  a_t = std::min(a_t, step_max);
+  a_t = std::max(a_t, step_min);
+
+  x_t = x + step_dir * a_t;
+
+  final_transformation_ =
+    (Eigen::Translation<float, 3>(
+       static_cast<float>(x_t(0)), static_cast<float>(x_t(1)), static_cast<float>(x_t(2))) *
+     Eigen::AngleAxis<float>(static_cast<float>(x_t(3)), Eigen::Vector3f::UnitX()) *
+     Eigen::AngleAxis<float>(static_cast<float>(x_t(4)), Eigen::Vector3f::UnitY()) *
+     Eigen::AngleAxis<float>(static_cast<float>(x_t(5)), Eigen::Vector3f::UnitZ()))
+      .matrix();
+
+  // New transformed point cloud
+  transformPointCloud(*input_, trans_cloud, final_transformation_);
+
+  // Updates score, gradient and hessian.  Hessian calculation is unnecessary but testing showed
+  // that most step calculations use the initial step suggestion and recalculation the reusable
+  // portions of the hessian would intail more computation time.
+  score = computeDerivatives(score_gradient, hessian, trans_cloud, x_t, true);
+
+  // Calculate phi(alpha_t)
+  double phi_t = -score;
+  // Calculate phi'(alpha_t)
+  double d_phi_t = -(score_gradient.dot(step_dir));
+
+  // Calculate psi(alpha_t)
+  double psi_t = auxiliaryFunction_PsiMT(a_t, phi_t, phi_0, d_phi_0, mu);
+  // Calculate psi'(alpha_t)
+  double d_psi_t = auxiliaryFunction_dPsiMT(d_phi_t, d_phi_0, mu);
+
+  // Iterate until max number of iterations, interval convergence or a value satisfies the
+  // sufficient decrease, Equation 1.1, and curvature condition, Equation 1.2 [More, Thuente 1994]
+  while (
+    !interval_converged && step_iterations < max_step_iterations &&
+    !(psi_t <= 0 /*Sufficient Decrease*/ && d_phi_t <= -nu * d_phi_0 /*Curvature Condition*/)) {
+    // Use auxiliary function if interval I is not closed
+    if (open_interval) {
+      a_t = trialValueSelectionMT(a_l, f_l, g_l, a_u, f_u, g_u, a_t, psi_t, d_psi_t);
+    } else {
+      a_t = trialValueSelectionMT(a_l, f_l, g_l, a_u, f_u, g_u, a_t, phi_t, d_phi_t);
+    }
+
+    a_t = std::min(a_t, step_max);
+    a_t = std::max(a_t, step_min);
+
+    x_t = x + step_dir * a_t;
+
+    final_transformation_ =
+      (Eigen::Translation<float, 3>(
+         static_cast<float>(x_t(0)), static_cast<float>(x_t(1)), static_cast<float>(x_t(2))) *
+       Eigen::AngleAxis<float>(static_cast<float>(x_t(3)), Eigen::Vector3f::UnitX()) *
+       Eigen::AngleAxis<float>(static_cast<float>(x_t(4)), Eigen::Vector3f::UnitY()) *
+       Eigen::AngleAxis<float>(static_cast<float>(x_t(5)), Eigen::Vector3f::UnitZ()))
+        .matrix();
+
+    // New transformed point cloud
+    // Done on final cloud to prevent wasted computation
+    transformPointCloud(*input_, trans_cloud, final_transformation_);
+
+    // Updates score, gradient. Values stored to prevent wasted computation.
+    score = computeDerivatives(score_gradient, hessian, trans_cloud, x_t, false);
+
+    // Calculate phi(alpha_t+)
+    phi_t = -score;
+    // Calculate phi'(alpha_t+)
+    d_phi_t = -(score_gradient.dot(step_dir));
+
+    // Calculate psi(alpha_t+)
+    psi_t = auxiliaryFunction_PsiMT(a_t, phi_t, phi_0, d_phi_0, mu);
+    // Calculate psi'(alpha_t+)
+    d_psi_t = auxiliaryFunction_dPsiMT(d_phi_t, d_phi_0, mu);
+
+    // Check if I is now a closed interval
+    if (open_interval && (psi_t <= 0 && d_psi_t >= 0)) {
+      open_interval = false;
+
+      // Converts f_l and g_l from psi to phi
+      f_l = f_l + phi_0 - mu * d_phi_0 * a_l;
+      g_l = g_l + mu * d_phi_0;
+
+      // Converts f_u and g_u from psi to phi
+      f_u = f_u + phi_0 - mu * d_phi_0 * a_u;
+      g_u = g_u + mu * d_phi_0;
+    }
+
+    if (open_interval) {
+      // Update interval end points using Updating Algorithm [More, Thuente 1994]
+      interval_converged = updateIntervalMT(a_l, f_l, g_l, a_u, f_u, g_u, a_t, psi_t, d_psi_t);
+    } else {
+      // Update interval end points using Modified Updating Algorithm [More, Thuente 1994]
+      interval_converged = updateIntervalMT(a_l, f_l, g_l, a_u, f_u, g_u, a_t, phi_t, d_phi_t);
+    }
+
+    step_iterations++;
+  }
+
+  // If inner loop was run then hessian needs to be calculated.
+  // Hessian is unnecessary for step length determination but gradients are required
+  // so derivative and transform data is stored for the next iteration.
+  if (step_iterations) computeHessian(hessian, trans_cloud, x_t);
+
+  return (a_t);
+}
+
+template <typename PointSource, typename PointTarget>
+double pclomp::NormalDistributionsTransform<PointSource, PointTarget>::calculateScore(
+  const PointCloudSource & trans_cloud) const
+{
+  double score = 0;
+
+  for (std::size_t idx = 0; idx < trans_cloud.points.size(); idx++) {
+    PointSource x_trans_pt = trans_cloud.points[idx];
+
+    // Find neighbors (Radius search has been experimentally faster than direct neighbor checking.
+    std::vector<TargetGridLeafConstPtr> neighborhood;
+    std::vector<float> distances;
+    switch (search_method) {
+      case KDTREE:
+        target_cells_.radiusSearch(x_trans_pt, resolution_, neighborhood, distances);
+        break;
+      case DIRECT26:
+        target_cells_.getNeighborhoodAtPoint(x_trans_pt, neighborhood);
+        break;
+      default:
+      case DIRECT7:
+        target_cells_.getNeighborhoodAtPoint7(x_trans_pt, neighborhood);
+        break;
+      case DIRECT1:
+        target_cells_.getNeighborhoodAtPoint1(x_trans_pt, neighborhood);
+        break;
+    }
+
+    for (typename std::vector<TargetGridLeafConstPtr>::iterator neighborhood_it =
+           neighborhood.begin();
+         neighborhood_it != neighborhood.end(); neighborhood_it++) {
+      TargetGridLeafConstPtr cell = *neighborhood_it;
+
+      Eigen::Vector3d x_trans = Eigen::Vector3d(x_trans_pt.x, x_trans_pt.y, x_trans_pt.z);
+
+      // Denorm point, x_k' in Equations 6.12 and 6.13 [Magnusson 2009]
+      x_trans -= cell->getMean();
+      // Uses precomputed covariance for speed.
+      Eigen::Matrix3d c_inv = cell->getInverseCov();
+
+      // e^(-d_2/2 * (x_k - mu_k)^T Sigma_k^-1 (x_k - mu_k)) Equation 6.9 [Magnusson 2009]
+      double e_x_cov_x = exp(-gauss_d2_ * x_trans.dot(c_inv * x_trans) / 2);
+      // Calculate probability of transformed points existence, Equation 6.9 [Magnusson 2009]
+      double score_inc = -gauss_d1_ * e_x_cov_x - gauss_d3_;
+
+      score += score_inc / neighborhood.size();
+    }
+  }
+
+  double output_score = 0;
+  if (!trans_cloud.points.empty()) {
+    output_score = (score) / static_cast<double>(trans_cloud.size());
+  }
+  return output_score;
+}
+
+template <typename PointSource, typename PointTarget>
+double
+pclomp::NormalDistributionsTransform<PointSource, PointTarget>::calculateTransformationProbability(
+  const PointCloudSource & trans_cloud) const
+{
+  double score = 0;
+
+  for (std::size_t idx = 0; idx < trans_cloud.points.size(); idx++) {
+    PointSource x_trans_pt = trans_cloud.points[idx];
+
+    // Find neighbors (Radius search has been experimentally faster than direct neighbor checking.
+    std::vector<TargetGridLeafConstPtr> neighborhood;
+    std::vector<float> distances;
+    switch (search_method) {
+      case KDTREE:
+        target_cells_.radiusSearch(x_trans_pt, resolution_, neighborhood, distances);
+        break;
+      case DIRECT26:
+        target_cells_.getNeighborhoodAtPoint(x_trans_pt, neighborhood);
+        break;
+      default:
+      case DIRECT7:
+        target_cells_.getNeighborhoodAtPoint7(x_trans_pt, neighborhood);
+        break;
+      case DIRECT1:
+        target_cells_.getNeighborhoodAtPoint1(x_trans_pt, neighborhood);
+        break;
+    }
+
+    for (typename std::vector<TargetGridLeafConstPtr>::iterator neighborhood_it =
+           neighborhood.begin();
+         neighborhood_it != neighborhood.end(); neighborhood_it++) {
+      TargetGridLeafConstPtr cell = *neighborhood_it;
+
+      Eigen::Vector3d x_trans = Eigen::Vector3d(x_trans_pt.x, x_trans_pt.y, x_trans_pt.z);
+
+      // Denorm point, x_k' in Equations 6.12 and 6.13 [Magnusson 2009]
+      x_trans -= cell->getMean();
+      // Uses precomputed covariance for speed.
+      Eigen::Matrix3d c_inv = cell->getInverseCov();
+
+      // e^(-d_2/2 * (x_k - mu_k)^T Sigma_k^-1 (x_k - mu_k)) Equation 6.9 [Magnusson 2009]
+      double e_x_cov_x = exp(-gauss_d2_ * x_trans.dot(c_inv * x_trans) / 2);
+      // Calculate probability of transformed points existence, Equation 6.9 [Magnusson 2009]
+      double score_inc = -gauss_d1_ * e_x_cov_x;
+
+      score += score_inc;
+    }
+  }
+
+  double output_score = 0;
+  if (!trans_cloud.points.empty()) {
+    output_score = (score) / static_cast<double>(trans_cloud.points.size());
+  }
+  return output_score;
+}
+
+template <typename PointSource, typename PointTarget>
+double pclomp::NormalDistributionsTransform<PointSource, PointTarget>::
+  calculateNearestVoxelTransformationLikelihood(const PointCloudSource & trans_cloud) const
+{
+  double nearest_voxel_score = 0;
+  size_t found_neigborhood_voxel_num = 0;
+
+  for (std::size_t idx = 0; idx < trans_cloud.points.size(); idx++) {
+    double nearest_voxel_score_pt = 0;
+    PointSource x_trans_pt = trans_cloud.points[idx];
+
+    // Find neighbors (Radius search has been experimentally faster than direct neighbor checking.
+    std::vector<TargetGridLeafConstPtr> neighborhood;
+    std::vector<float> distances;
+    switch (search_method) {
+      case KDTREE:
+        target_cells_.radiusSearch(x_trans_pt, resolution_, neighborhood, distances);
+        break;
+      case DIRECT26:
+        target_cells_.getNeighborhoodAtPoint(x_trans_pt, neighborhood);
+        break;
+      default:
+      case DIRECT7:
+        target_cells_.getNeighborhoodAtPoint7(x_trans_pt, neighborhood);
+        break;
+      case DIRECT1:
+        target_cells_.getNeighborhoodAtPoint1(x_trans_pt, neighborhood);
+        break;
+    }
+
+    for (typename std::vector<TargetGridLeafConstPtr>::iterator neighborhood_it =
+           neighborhood.begin();
+         neighborhood_it != neighborhood.end(); neighborhood_it++) {
+      TargetGridLeafConstPtr cell = *neighborhood_it;
+
+      Eigen::Vector3d x_trans = Eigen::Vector3d(x_trans_pt.x, x_trans_pt.y, x_trans_pt.z);
+
+      // Denorm point, x_k' in Equations 6.12 and 6.13 [Magnusson 2009]
+      x_trans -= cell->getMean();
+      // Uses precomputed covariance for speed.
+      Eigen::Matrix3d c_inv = cell->getInverseCov();
+
+      // e^(-d_2/2 * (x_k - mu_k)^T Sigma_k^-1 (x_k - mu_k)) Equation 6.9 [Magnusson 2009]
+      double e_x_cov_x = exp(-gauss_d2_ * x_trans.dot(c_inv * x_trans) / 2);
+      // Calculate probability of transformed points existence, Equation 6.9 [Magnusson 2009]
+      double score_inc = -gauss_d1_ * e_x_cov_x;
+
+      if (score_inc > nearest_voxel_score_pt) {
+        nearest_voxel_score_pt = score_inc;
+      }
+    }
+
+    if (!neighborhood.empty()) {
+      ++found_neigborhood_voxel_num;
+      nearest_voxel_score += nearest_voxel_score_pt;
+    }
+  }
+
+  double output_score = 0;
+  if (found_neigborhood_voxel_num != 0) {
+    output_score = nearest_voxel_score / static_cast<double>(found_neigborhood_voxel_num);
+  }
+  return output_score;
+}
+
+#endif  // PCLOMP__NDT_OMP_IMPL_HPP_
diff --git a/localization/ndt_omp/include/pclomp/voxel_grid_covariance_omp.h b/localization/ndt_omp/include/pclomp/voxel_grid_covariance_omp.h
new file mode 100644
index 0000000000000..c8f842d3ff6ce
--- /dev/null
+++ b/localization/ndt_omp/include/pclomp/voxel_grid_covariance_omp.h
@@ -0,0 +1,518 @@
+// Copyright 2022 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Point Cloud Library (PCL) - www.pointclouds.org
+ *  Copyright (c) 2010-2011, Willow Garage, Inc.
+ *
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of the copyright holder(s) nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef PCLOMP__VOXEL_GRID_COVARIANCE_OMP_H_
+#define PCLOMP__VOXEL_GRID_COVARIANCE_OMP_H_
+
+// clang-format off
+#include <pcl/pcl_macros.h>
+#include <pcl/filters/boost.h>
+#include <pcl/filters/voxel_grid.h>
+#include <map>
+#include <unordered_map>
+#include <pcl/point_types.h>
+#include <pcl/kdtree/kdtree_flann.h>
+#include <vector>
+
+// clang-format on
+
+namespace pclomp
+{
+/** \brief A searchable voxel structure containing the mean and covariance of the data.
+ * \note For more information please see
+ * <b>Magnusson, M. (2009). The Three-Dimensional Normal-Distributions Transform —
+ * an Efficient Representation for Registration, Surface Analysis, and Loop Detection.
+ * PhD thesis, Orebro University. Orebro Studies in Technology 36</b>
+ * \author Brian Okorn (Space and Naval Warfare Systems Center Pacific)
+ */
+template <typename PointT>
+class VoxelGridCovariance : public pcl::VoxelGrid<PointT>
+{
+protected:
+  using pcl::VoxelGrid<PointT>::filter_name_;
+  using pcl::VoxelGrid<PointT>::getClassName;
+  using pcl::VoxelGrid<PointT>::input_;
+  using pcl::VoxelGrid<PointT>::indices_;
+  using pcl::VoxelGrid<PointT>::filter_limit_negative_;
+  using pcl::VoxelGrid<PointT>::filter_limit_min_;
+  using pcl::VoxelGrid<PointT>::filter_limit_max_;
+  using pcl::VoxelGrid<PointT>::filter_field_name_;
+
+  using pcl::VoxelGrid<PointT>::downsample_all_data_;
+  using pcl::VoxelGrid<PointT>::leaf_layout_;
+  using pcl::VoxelGrid<PointT>::save_leaf_layout_;
+  using pcl::VoxelGrid<PointT>::leaf_size_;
+  using pcl::VoxelGrid<PointT>::min_b_;
+  using pcl::VoxelGrid<PointT>::max_b_;
+  using pcl::VoxelGrid<PointT>::inverse_leaf_size_;
+  using pcl::VoxelGrid<PointT>::div_b_;
+  using pcl::VoxelGrid<PointT>::divb_mul_;
+
+  typedef typename pcl::traits::fieldList<PointT>::type FieldList;
+  typedef typename pcl::Filter<PointT>::PointCloud PointCloud;
+  typedef typename PointCloud::Ptr PointCloudPtr;
+  typedef typename PointCloud::ConstPtr PointCloudConstPtr;
+
+public:
+#if PCL_VERSION >= PCL_VERSION_CALC(1, 10, 0)
+  typedef pcl::shared_ptr<pcl::VoxelGrid<PointT> > Ptr;
+  typedef pcl::shared_ptr<const pcl::VoxelGrid<PointT> > ConstPtr;
+#else
+  typedef boost::shared_ptr<pcl::VoxelGrid<PointT> > Ptr;
+  typedef boost::shared_ptr<const pcl::VoxelGrid<PointT> > ConstPtr;
+#endif
+
+  /** \brief Simple structure to hold a centroid, covariance and the number of points in a leaf.
+   * Inverse covariance, eigen vectors and eigen values are precomputed. */
+  struct Leaf
+  {
+    /** \brief Constructor.
+     * Sets \ref nr_points, \ref icov_, \ref mean_ and \ref evals_ to 0 and \ref cov_ and \ref
+     * evecs_ to the identity matrix
+     */
+    Leaf()
+    : nr_points(0),
+      mean_(Eigen::Vector3d::Zero()),
+      centroid(),
+      cov_(Eigen::Matrix3d::Identity()),
+      icov_(Eigen::Matrix3d::Zero()),
+      evecs_(Eigen::Matrix3d::Identity()),
+      evals_(Eigen::Vector3d::Zero())
+    {
+    }
+
+    /** \brief Get the voxel covariance.
+     * \return covariance matrix
+     */
+    Eigen::Matrix3d getCov() const { return (cov_); }
+
+    /** \brief Get the inverse of the voxel covariance.
+     * \return inverse covariance matrix
+     */
+    Eigen::Matrix3d getInverseCov() const { return (icov_); }
+
+    /** \brief Get the voxel centroid.
+     * \return centroid
+     */
+    Eigen::Vector3d getMean() const { return (mean_); }
+
+    /** \brief Get the eigen vectors of the voxel covariance.
+     * \note Order corresponds with \ref getEvals
+     * \return matrix whose columns contain eigen vectors
+     */
+    Eigen::Matrix3d getEvecs() const { return (evecs_); }
+
+    /** \brief Get the eigen values of the voxel covariance.
+     * \note Order corresponds with \ref getEvecs
+     * \return vector of eigen values
+     */
+    Eigen::Vector3d getEvals() const { return (evals_); }
+
+    /** \brief Get the number of points contained by this voxel.
+     * \return number of points
+     */
+    int getPointCount() const { return (nr_points); }
+
+    /** \brief Number of points contained by voxel */
+    int nr_points;
+
+    /** \brief 3D voxel centroid */
+    Eigen::Vector3d mean_;
+
+    /** \brief Nd voxel centroid
+     * \note Differs from \ref mean_ when color data is used
+     */
+    Eigen::VectorXf centroid;
+
+    /** \brief Voxel covariance matrix */
+    Eigen::Matrix3d cov_;
+
+    /** \brief Inverse of voxel covariance matrix */
+    Eigen::Matrix3d icov_;
+
+    /** \brief Eigen vectors of voxel covariance matrix */
+    Eigen::Matrix3d evecs_;
+
+    /** \brief Eigen values of voxel covariance matrix */
+    Eigen::Vector3d evals_;
+  };
+
+  /** \brief Pointer to VoxelGridCovariance leaf structure */
+  typedef Leaf * LeafPtr;
+
+  /** \brief Const pointer to VoxelGridCovariance leaf structure */
+  typedef const Leaf * LeafConstPtr;
+
+  typedef std::map<size_t, Leaf> Map;
+
+public:
+  /** \brief Constructor.
+   * Sets \ref leaf_size_ to 0 and \ref searchable_ to false.
+   */
+  VoxelGridCovariance()
+  : searchable_(true),
+    min_points_per_voxel_(6),
+    min_covar_eigvalue_mult_(0.01),
+    leaves_(),
+    voxel_centroids_(),
+    voxel_centroids_leaf_indices_(),
+    kdtree_()
+  {
+    downsample_all_data_ = false;
+    save_leaf_layout_ = false;
+    leaf_size_.setZero();
+    min_b_.setZero();
+    max_b_.setZero();
+    filter_name_ = "VoxelGridCovariance";
+  }
+
+  /** \brief Set the minimum number of points required for a cell to be used (must be 3 or greater
+   * for covariance calculation). \param[in] min_points_per_voxel the minimum number of points for
+   * required for a voxel to be used
+   */
+  inline void setMinPointPerVoxel(int min_points_per_voxel)
+  {
+    if (min_points_per_voxel > 2) {
+      min_points_per_voxel_ = min_points_per_voxel;
+    } else {
+      PCL_WARN(
+        "%s: Covariance calculation requires at least 3 points, setting Min Point per Voxel to 3 ",
+        this->getClassName().c_str());
+      min_points_per_voxel_ = 3;
+    }
+  }
+
+  /** \brief Get the minimum number of points required for a cell to be used.
+   * \return the minimum number of points for required for a voxel to be used
+   */
+  inline int getMinPointPerVoxel() { return min_points_per_voxel_; }
+
+  /** \brief Set the minimum allowable ratio between eigenvalues to prevent singular covariance
+   * matrices. \param[in] min_covar_eigvalue_mult the minimum allowable ratio between eigenvalues
+   */
+  inline void setCovEigValueInflationRatio(double min_covar_eigvalue_mult)
+  {
+    min_covar_eigvalue_mult_ = min_covar_eigvalue_mult;
+  }
+
+  /** \brief Get the minimum allowable ratio between eigenvalues to prevent singular covariance
+   * matrices. \return the minimum allowable ratio between eigenvalues
+   */
+  inline double getCovEigValueInflationRatio() { return min_covar_eigvalue_mult_; }
+
+  /** \brief Filter cloud and initializes voxel structure.
+   * \param[out] output cloud containing centroids of voxels containing a sufficient number of
+   * points \param[in] searchable flag if voxel structure is searchable, if true then kdtree is
+   * built
+   */
+  inline void filter(PointCloud & output, bool searchable = false)
+  {
+    searchable_ = searchable;
+    applyFilter(output);
+
+    voxel_centroids_ = PointCloudPtr(new PointCloud(output));
+
+    if (searchable_ && voxel_centroids_->size() > 0) {
+      // Initiates kdtree of the centroids of voxels containing a sufficient number of points
+      kdtree_.setInputCloud(voxel_centroids_);
+    }
+  }
+
+  /** \brief Initializes voxel structure.
+   * \param[in] searchable flag if voxel structure is searchable, if true then kdtree is built
+   */
+  inline void filter(bool searchable = false)
+  {
+    searchable_ = searchable;
+    voxel_centroids_ = PointCloudPtr(new PointCloud);
+    applyFilter(*voxel_centroids_);
+
+    if (searchable_ && voxel_centroids_->size() > 0) {
+      // Initiates kdtree of the centroids of voxels containing a sufficient number of points
+      kdtree_.setInputCloud(voxel_centroids_);
+    }
+  }
+
+  /** \brief Get the voxel containing point p.
+   * \param[in] index the index of the leaf structure node
+   * \return const pointer to leaf structure
+   */
+  inline LeafConstPtr getLeaf(int index)
+  {
+    auto leaf_iter = leaves_.find(index);
+    if (leaf_iter != leaves_.end()) {
+      LeafConstPtr ret(&(leaf_iter->second));
+      return ret;
+    } else {
+      return NULL;
+    }
+  }
+
+  /** \brief Get the voxel containing point p.
+   * \param[in] p the point to get the leaf structure at
+   * \return const pointer to leaf structure
+   */
+  inline LeafConstPtr getLeaf(PointT & p)
+  {
+    // Generate index associated with p
+    int ijk0 = static_cast<int>(floor(p.x * inverse_leaf_size_[0]) - min_b_[0]);
+    int ijk1 = static_cast<int>(floor(p.y * inverse_leaf_size_[1]) - min_b_[1]);
+    int ijk2 = static_cast<int>(floor(p.z * inverse_leaf_size_[2]) - min_b_[2]);
+
+    // Compute the centroid leaf index
+    int idx = ijk0 * divb_mul_[0] + ijk1 * divb_mul_[1] + ijk2 * divb_mul_[2];
+
+    // Find leaf associated with index
+    auto leaf_iter = leaves_.find(idx);
+    if (leaf_iter != leaves_.end()) {
+      // If such a leaf exists return the pointer to the leaf structure
+      LeafConstPtr ret(&(leaf_iter->second));
+      return ret;
+    } else {
+      return NULL;
+    }
+  }
+
+  /** \brief Get the voxel containing point p.
+   * \param[in] p the point to get the leaf structure at
+   * \return const pointer to leaf structure
+   */
+  inline LeafConstPtr getLeaf(Eigen::Vector3f & p)
+  {
+    // Generate index associated with p
+    int ijk0 = static_cast<int>(floor(p[0] * inverse_leaf_size_[0]) - min_b_[0]);
+    int ijk1 = static_cast<int>(floor(p[1] * inverse_leaf_size_[1]) - min_b_[1]);
+    int ijk2 = static_cast<int>(floor(p[2] * inverse_leaf_size_[2]) - min_b_[2]);
+
+    // Compute the centroid leaf index
+    int idx = ijk0 * divb_mul_[0] + ijk1 * divb_mul_[1] + ijk2 * divb_mul_[2];
+
+    // Find leaf associated with index
+    auto leaf_iter = leaves_.find(idx);
+    if (leaf_iter != leaves_.end()) {
+      // If such a leaf exists return the pointer to the leaf structure
+      LeafConstPtr ret(&(leaf_iter->second));
+      return ret;
+    } else {
+      return NULL;
+    }
+  }
+
+  /** \brief Get the voxels surrounding point p, not including the voxel containing point p.
+   * \note Only voxels containing a sufficient number of points are used (slower than radius search
+   * in practice). \param[in] reference_point the point to get the leaf structure at \param[out]
+   * neighbors \return number of neighbors found
+   */
+  int getNeighborhoodAtPoint(
+    const Eigen::MatrixXi &, const PointT & reference_point,
+    std::vector<LeafConstPtr> & neighbors) const;
+  int getNeighborhoodAtPoint(
+    const PointT & reference_point, std::vector<LeafConstPtr> & neighbors) const;
+  int getNeighborhoodAtPoint7(
+    const PointT & reference_point, std::vector<LeafConstPtr> & neighbors) const;
+  int getNeighborhoodAtPoint1(
+    const PointT & reference_point, std::vector<LeafConstPtr> & neighbors) const;
+
+  /** \brief Get the leaf structure map
+   * \return a map containing all leaves
+   */
+  inline const Map & getLeaves() { return leaves_; }
+
+  /** \brief Get a pointcloud containing the voxel centroids
+   * \note Only voxels containing a sufficient number of points are used.
+   * \return a map containing all leaves
+   */
+  inline PointCloudPtr getCentroids() { return voxel_centroids_; }
+
+  /** \brief Get a cloud to visualize each voxels normal distribution.
+   * \param[out] cell_cloud a cloud created by sampling the normal distributions of each voxel
+   */
+  void getDisplayCloud(pcl::PointCloud<pcl::PointXYZ> & cell_cloud);
+
+  /** \brief Search for the k-nearest occupied voxels for the given query point.
+   * \note Only voxels containing a sufficient number of points are used.
+   * \param[in] point the given query point
+   * \param[in] k the number of neighbors to search for
+   * \param[out] k_leaves the resultant leaves of the neighboring points
+   * \param[out] k_sqr_distances the resultant squared distances to the neighboring points
+   * \return number of neighbors found
+   */
+  int nearestKSearch(
+    const PointT & point, int k, std::vector<LeafConstPtr> & k_leaves,
+    std::vector<float> & k_sqr_distances)
+  {
+    k_leaves.clear();
+
+    // Check if kdtree has been built
+    if (!searchable_) {
+      PCL_WARN("%s: Not Searchable", this->getClassName().c_str());
+      return 0;
+    }
+
+    // Find k-nearest neighbors in the occupied voxel centroid cloud
+    std::vector<int> k_indices;
+    k = kdtree_.nearestKSearch(point, k, k_indices, k_sqr_distances);
+
+    // Find leaves corresponding to neighbors
+    k_leaves.reserve(k);
+    for (std::vector<int>::iterator iter = k_indices.begin(); iter != k_indices.end(); iter++) {
+      k_leaves.push_back(&leaves_[voxel_centroids_leaf_indices_[*iter]]);
+    }
+    return k;
+  }
+
+  /** \brief Search for the k-nearest occupied voxels for the given query point.
+   * \note Only voxels containing a sufficient number of points are used.
+   * \param[in] cloud the given query point
+   * \param[in] index the index
+   * \param[in] k the number of neighbors to search for
+   * \param[out] k_leaves the resultant leaves of the neighboring points
+   * \param[out] k_sqr_distances the resultant squared distances to the neighboring points
+   * \return number of neighbors found
+   */
+  inline int nearestKSearch(
+    const PointCloud & cloud, int index, int k, std::vector<LeafConstPtr> & k_leaves,
+    std::vector<float> & k_sqr_distances)
+  {
+    if (index >= static_cast<int>(cloud.points.size()) || index < 0) return (0);
+    return (nearestKSearch(cloud.points[index], k, k_leaves, k_sqr_distances));
+  }
+
+  /** \brief Search for all the nearest occupied voxels of the query point in a given radius.
+   * \note Only voxels containing a sufficient number of points are used.
+   * \param[in] point the given query point
+   * \param[in] radius the radius of the sphere bounding all of p_q's neighbors
+   * \param[out] k_leaves the resultant leaves of the neighboring points
+   * \param[out] k_sqr_distances the resultant squared distances to the neighboring points
+   * \param[in] max_nn
+   * \return number of neighbors found
+   */
+  int radiusSearch(
+    const PointT & point, double radius, std::vector<LeafConstPtr> & k_leaves,
+    std::vector<float> & k_sqr_distances, unsigned int max_nn = 0) const
+  {
+    k_leaves.clear();
+
+    // Check if kdtree has been built
+    if (!searchable_) {
+      PCL_WARN("%s: Not Searchable", this->getClassName().c_str());
+      return 0;
+    }
+
+    // Find neighbors within radius in the occupied voxel centroid cloud
+    std::vector<int> k_indices;
+    int k = kdtree_.radiusSearch(point, radius, k_indices, k_sqr_distances, max_nn);
+
+    // Find leaves corresponding to neighbors
+    k_leaves.reserve(k);
+    for (std::vector<int>::iterator iter = k_indices.begin(); iter != k_indices.end(); iter++) {
+      auto leaf = leaves_.find(voxel_centroids_leaf_indices_[*iter]);
+      if (leaf == leaves_.end()) {
+        std::cerr << "error : could not find the leaf corresponding to the voxel" << std::endl;
+        std::cin.ignore(1);
+      }
+      k_leaves.push_back(&(leaf->second));
+    }
+    return k;
+  }
+
+  /** \brief Search for all the nearest occupied voxels of the query point in a given radius.
+   * \note Only voxels containing a sufficient number of points are used.
+   * \param[in] cloud the given query point
+   * \param[in] index a valid index in cloud representing a valid (i.e., finite) query point
+   * \param[in] radius the radius of the sphere bounding all of p_q's neighbors
+   * \param[out] k_leaves the resultant leaves of the neighboring points
+   * \param[out] k_sqr_distances the resultant squared distances to the neighboring points
+   * \param[in] max_nn
+   * \return number of neighbors found
+   */
+  inline int radiusSearch(
+    const PointCloud & cloud, int index, double radius, std::vector<LeafConstPtr> & k_leaves,
+    std::vector<float> & k_sqr_distances, unsigned int max_nn = 0) const
+  {
+    if (index >= static_cast<int>(cloud.points.size()) || index < 0) return (0);
+    return (radiusSearch(cloud.points[index], radius, k_leaves, k_sqr_distances, max_nn));
+  }
+
+protected:
+  /** \brief Filter cloud and initializes voxel structure.
+   * \param[out] output cloud containing centroids of voxels containing a sufficient number of
+   * points
+   */
+  void applyFilter(PointCloud & output);
+
+  /** \brief Flag to determine if voxel structure is searchable. */
+  bool searchable_;
+
+  /** \brief Minimum points contained with in a voxel to allow it to be usable. */
+  int min_points_per_voxel_;
+
+  /** \brief Minimum allowable ratio between eigenvalues to prevent singular covariance matrices. */
+  double min_covar_eigvalue_mult_;
+
+  /** \brief Voxel structure containing all leaf nodes (includes voxels with less than a sufficient
+   * number of points). */
+  Map leaves_;
+
+  /** \brief Point cloud containing centroids of voxels containing at least minimum number of
+   * points. */
+  PointCloudPtr voxel_centroids_;
+
+  /** \brief Indices of leaf structures associated with each point in \ref voxel_centroids_ (used
+   * for searching). */
+  std::vector<int> voxel_centroids_leaf_indices_;
+
+  /** \brief KdTree generated using \ref voxel_centroids_ (used for searching). */
+  pcl::KdTreeFLANN<PointT> kdtree_;
+};
+}  // namespace pclomp
+
+#endif  // PCLOMP__VOXEL_GRID_COVARIANCE_OMP_H_
diff --git a/localization/ndt_omp/include/pclomp/voxel_grid_covariance_omp_impl.hpp b/localization/ndt_omp/include/pclomp/voxel_grid_covariance_omp_impl.hpp
new file mode 100644
index 0000000000000..ed9c0273551e0
--- /dev/null
+++ b/localization/ndt_omp/include/pclomp/voxel_grid_covariance_omp_impl.hpp
@@ -0,0 +1,493 @@
+// Copyright 2022 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Point Cloud Library (PCL) - www.pointclouds.org
+ *  Copyright (c) 2010-2011, Willow Garage, Inc.
+ *
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of the copyright holder(s) nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef PCLOMP__VOXEL_GRID_COVARIANCE_OMP_IMPL_HPP_
+#define PCLOMP__VOXEL_GRID_COVARIANCE_OMP_IMPL_HPP_
+
+#include "voxel_grid_covariance_omp.h"
+
+#include <Eigen/Cholesky>
+#include <Eigen/Dense>
+
+#include <pcl/common/common.h>
+#include <pcl/filters/boost.h>
+
+#include <limits>
+#include <vector>
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointT>
+void pclomp::VoxelGridCovariance<PointT>::applyFilter(PointCloud & output)
+{
+  voxel_centroids_leaf_indices_.clear();
+
+  // Has the input dataset been set already?
+  if (!input_) {
+    PCL_WARN("[pcl::%s::applyFilter] No input dataset given!\n", getClassName().c_str());
+    output.width = output.height = 0;
+    output.points.clear();
+    return;
+  }
+
+  // Copy the header (and thus the frame_id) + allocate enough space for points
+  output.height = 1;       // downsampling breaks the organized structure
+  output.is_dense = true;  // we filter out invalid points
+  output.points.clear();
+
+  Eigen::Vector4f min_p, max_p;
+  // Get the minimum and maximum dimensions
+  if (!filter_field_name_.empty())  // If we don't want to process the entire cloud...
+    pcl::getMinMax3D<PointT>(
+      input_, filter_field_name_, static_cast<float>(filter_limit_min_),
+      static_cast<float>(filter_limit_max_), min_p, max_p, filter_limit_negative_);
+  else
+    pcl::getMinMax3D<PointT>(*input_, min_p, max_p);
+
+  // Check that the leaf size is not too small, given the size of the data
+  int64_t dx = static_cast<int64_t>((max_p[0] - min_p[0]) * inverse_leaf_size_[0]) + 1;
+  int64_t dy = static_cast<int64_t>((max_p[1] - min_p[1]) * inverse_leaf_size_[1]) + 1;
+  int64_t dz = static_cast<int64_t>((max_p[2] - min_p[2]) * inverse_leaf_size_[2]) + 1;
+
+  if ((dx * dy * dz) > std::numeric_limits<int32_t>::max()) {
+    PCL_WARN(
+      "[pcl::%s::applyFilter] Leaf size is too small for the input dataset. Integer indices would "
+      "overflow.",
+      getClassName().c_str());
+    output.clear();
+    return;
+  }
+
+  // Compute the minimum and maximum bounding box values
+  min_b_[0] = static_cast<int>(floor(min_p[0] * inverse_leaf_size_[0]));
+  max_b_[0] = static_cast<int>(floor(max_p[0] * inverse_leaf_size_[0]));
+  min_b_[1] = static_cast<int>(floor(min_p[1] * inverse_leaf_size_[1]));
+  max_b_[1] = static_cast<int>(floor(max_p[1] * inverse_leaf_size_[1]));
+  min_b_[2] = static_cast<int>(floor(min_p[2] * inverse_leaf_size_[2]));
+  max_b_[2] = static_cast<int>(floor(max_p[2] * inverse_leaf_size_[2]));
+
+  // Compute the number of divisions needed along all axis
+  div_b_ = max_b_ - min_b_ + Eigen::Vector4i::Ones();
+  div_b_[3] = 0;
+
+  // Clear the leaves
+  leaves_.clear();
+  //  leaves_.reserve(8192);
+
+  // Set up the division multiplier
+  divb_mul_ = Eigen::Vector4i(1, div_b_[0], div_b_[0] * div_b_[1], 0);
+
+  int centroid_size = 4;
+
+  if (downsample_all_data_) centroid_size = boost::mpl::size<FieldList>::value;
+
+  // ---[ RGB special case
+  std::vector<pcl::PCLPointField> fields;
+  int rgba_index = -1;
+  rgba_index = pcl::getFieldIndex<PointT>("rgb", fields);
+  if (rgba_index == -1) rgba_index = pcl::getFieldIndex<PointT>("rgba", fields);
+  if (rgba_index >= 0) {
+    rgba_index = fields[rgba_index].offset;
+    centroid_size += 3;
+  }
+
+  // If we don't want to process the entire cloud, but rather filter points far away from the
+  // viewpoint first...
+  if (!filter_field_name_.empty()) {
+    // Get the distance field index
+    std::vector<pcl::PCLPointField> fields;
+    int distance_idx = pcl::getFieldIndex<PointT>(filter_field_name_, fields);
+    if (distance_idx == -1)
+      PCL_WARN(
+        "[pcl::%s::applyFilter] Invalid filter field name. Index is %d.\n", getClassName().c_str(),
+        distance_idx);
+
+    // First pass: go over all points and insert them into the right leaf
+    for (size_t cp = 0; cp < input_->points.size(); ++cp) {
+      if (!input_->is_dense)
+        // Check if the point is invalid
+        if (
+          !std::isfinite(input_->points[cp].x) || !std::isfinite(input_->points[cp].y) ||
+          !std::isfinite(input_->points[cp].z))
+          continue;
+
+      // Get the distance value
+      const uint8_t * pt_data = reinterpret_cast<const uint8_t *>(&input_->points[cp]);
+      float distance_value = 0;
+      memcpy(&distance_value, pt_data + fields[distance_idx].offset, sizeof(float));
+
+      if (filter_limit_negative_) {
+        // Use a threshold for cutting out points which inside the interval
+        if ((distance_value < filter_limit_max_) && (distance_value > filter_limit_min_)) continue;
+      } else {
+        // Use a threshold for cutting out points which are too close/far away
+        if ((distance_value > filter_limit_max_) || (distance_value < filter_limit_min_)) continue;
+      }
+
+      int ijk0 = static_cast<int>(
+        floor(input_->points[cp].x * inverse_leaf_size_[0]) - static_cast<float>(min_b_[0]));
+      int ijk1 = static_cast<int>(
+        floor(input_->points[cp].y * inverse_leaf_size_[1]) - static_cast<float>(min_b_[1]));
+      int ijk2 = static_cast<int>(
+        floor(input_->points[cp].z * inverse_leaf_size_[2]) - static_cast<float>(min_b_[2]));
+
+      // Compute the centroid leaf index
+      int idx = ijk0 * divb_mul_[0] + ijk1 * divb_mul_[1] + ijk2 * divb_mul_[2];
+
+      Leaf & leaf = leaves_[idx];
+      if (leaf.nr_points == 0) {
+        leaf.centroid.resize(centroid_size);
+        leaf.centroid.setZero();
+      }
+
+      Eigen::Vector3d pt3d(input_->points[cp].x, input_->points[cp].y, input_->points[cp].z);
+      // Accumulate point sum for centroid calculation
+      leaf.mean_ += pt3d;
+      // Accumulate x*xT for single pass covariance calculation
+      leaf.cov_ += pt3d * pt3d.transpose();
+
+      // Do we need to process all the fields?
+      if (!downsample_all_data_) {
+        Eigen::Vector4f pt(input_->points[cp].x, input_->points[cp].y, input_->points[cp].z, 0);
+        leaf.centroid.template head<4>() += pt;
+      } else {
+        // Copy all the fields
+        Eigen::VectorXf centroid = Eigen::VectorXf::Zero(centroid_size);
+        // ---[ RGB special case
+        if (rgba_index >= 0) {
+          // fill r/g/b data
+          int rgb;
+          memcpy(
+            &rgb, reinterpret_cast<const char *>(&input_->points[cp]) + rgba_index, sizeof(int));
+          centroid[centroid_size - 3] = static_cast<float>((rgb >> 16) & 0x0000ff);
+          centroid[centroid_size - 2] = static_cast<float>((rgb >> 8) & 0x0000ff);
+          centroid[centroid_size - 1] = static_cast<float>((rgb)&0x0000ff);
+        }
+        pcl::for_each_type<FieldList>(
+          pcl::NdCopyPointEigenFunctor<PointT>(input_->points[cp], centroid));
+        leaf.centroid += centroid;
+      }
+      ++leaf.nr_points;
+    }
+  } else {  // No distance filtering, process all data
+    // First pass: go over all points and insert them into the right leaf
+    for (size_t cp = 0; cp < input_->points.size(); ++cp) {
+      if (!input_->is_dense)
+        // Check if the point is invalid
+        if (
+          !std::isfinite(input_->points[cp].x) || !std::isfinite(input_->points[cp].y) ||
+          !std::isfinite(input_->points[cp].z))
+          continue;
+
+      int ijk0 = static_cast<int>(
+        floor(input_->points[cp].x * inverse_leaf_size_[0]) - static_cast<float>(min_b_[0]));
+      int ijk1 = static_cast<int>(
+        floor(input_->points[cp].y * inverse_leaf_size_[1]) - static_cast<float>(min_b_[1]));
+      int ijk2 = static_cast<int>(
+        floor(input_->points[cp].z * inverse_leaf_size_[2]) - static_cast<float>(min_b_[2]));
+
+      // Compute the centroid leaf index
+      int idx = ijk0 * divb_mul_[0] + ijk1 * divb_mul_[1] + ijk2 * divb_mul_[2];
+
+      // int idx = (((input_->points[cp].getArray4fmap () * inverse_leaf_size_).template cast<int>
+      // ()).matrix () - min_b_).dot (divb_mul_);
+      Leaf & leaf = leaves_[idx];
+      if (leaf.nr_points == 0) {
+        leaf.centroid.resize(centroid_size);
+        leaf.centroid.setZero();
+      }
+
+      Eigen::Vector3d pt3d(input_->points[cp].x, input_->points[cp].y, input_->points[cp].z);
+      // Accumulate point sum for centroid calculation
+      leaf.mean_ += pt3d;
+      // Accumulate x*xT for single pass covariance calculation
+      leaf.cov_ += pt3d * pt3d.transpose();
+
+      // Do we need to process all the fields?
+      if (!downsample_all_data_) {
+        Eigen::Vector4f pt(input_->points[cp].x, input_->points[cp].y, input_->points[cp].z, 0);
+        leaf.centroid.template head<4>() += pt;
+      } else {
+        // Copy all the fields
+        Eigen::VectorXf centroid = Eigen::VectorXf::Zero(centroid_size);
+        // ---[ RGB special case
+        if (rgba_index >= 0) {
+          // Fill r/g/b data, assuming that the order is BGRA
+          int rgb;
+          memcpy(
+            &rgb, reinterpret_cast<const char *>(&input_->points[cp]) + rgba_index, sizeof(int));
+          centroid[centroid_size - 3] = static_cast<float>((rgb >> 16) & 0x0000ff);
+          centroid[centroid_size - 2] = static_cast<float>((rgb >> 8) & 0x0000ff);
+          centroid[centroid_size - 1] = static_cast<float>((rgb)&0x0000ff);
+        }
+        pcl::for_each_type<FieldList>(
+          pcl::NdCopyPointEigenFunctor<PointT>(input_->points[cp], centroid));
+        leaf.centroid += centroid;
+      }
+      ++leaf.nr_points;
+    }
+  }
+
+  // Second pass: go over all leaves and compute centroids and covariance matrices
+  output.points.reserve(leaves_.size());
+  if (searchable_) voxel_centroids_leaf_indices_.reserve(leaves_.size());
+  int cp = 0;
+  if (save_leaf_layout_) leaf_layout_.resize(div_b_[0] * div_b_[1] * div_b_[2], -1);
+
+  // Eigen values and vectors calculated to prevent near singular matrices
+  Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> eigensolver;
+  Eigen::Matrix3d eigen_val;
+  Eigen::Vector3d pt_sum;
+
+  // Eigen values less than a threshold of max eigen value are inflated to a set fraction of the max
+  // eigen value.
+  double min_covar_eigvalue;
+
+  for (auto it = leaves_.begin(); it != leaves_.end(); ++it) {
+    // Normalize the centroid
+    Leaf & leaf = it->second;
+
+    // Normalize the centroid
+    leaf.centroid /= static_cast<float>(leaf.nr_points);
+    // Point sum used for single pass covariance calculation
+    pt_sum = leaf.mean_;
+    // Normalize mean
+    leaf.mean_ /= leaf.nr_points;
+
+    // If the voxel contains sufficient points, its covariance is calculated and is added to the
+    // voxel centroids and output clouds. Points with less than the minimum points will have a can
+    // not be accurately approximated using a normal distribution.
+    if (leaf.nr_points >= min_points_per_voxel_) {
+      if (save_leaf_layout_) leaf_layout_[it->first] = cp++;
+
+      output.push_back(PointT());
+
+      // Do we need to process all the fields?
+      if (!downsample_all_data_) {
+        output.points.back().x = leaf.centroid[0];
+        output.points.back().y = leaf.centroid[1];
+        output.points.back().z = leaf.centroid[2];
+      } else {
+        pcl::for_each_type<FieldList>(
+          pcl::NdCopyEigenPointFunctor<PointT>(leaf.centroid, output.back()));
+        // ---[ RGB special case
+        if (rgba_index >= 0) {
+          // pack r/g/b into rgb
+          float r = leaf.centroid[centroid_size - 3], g = leaf.centroid[centroid_size - 2],
+                b = leaf.centroid[centroid_size - 1];
+          int rgb =
+            (static_cast<int>(r)) << 16 | (static_cast<int>(g)) << 8 | (static_cast<int>(b));
+          memcpy(reinterpret_cast<char *>(&output.points.back()) + rgba_index, &rgb, sizeof(float));
+        }
+      }
+
+      // Stores the voxel indices for fast access searching
+      if (searchable_) voxel_centroids_leaf_indices_.push_back(static_cast<int>(it->first));
+
+      // Single pass covariance calculation
+      leaf.cov_ = (leaf.cov_ - 2 * (pt_sum * leaf.mean_.transpose())) / leaf.nr_points +
+                  leaf.mean_ * leaf.mean_.transpose();
+      leaf.cov_ *= (leaf.nr_points - 1.0) / leaf.nr_points;
+
+      // Normalize Eigen Val such that max no more than 100x min.
+      eigensolver.compute(leaf.cov_);
+      eigen_val = eigensolver.eigenvalues().asDiagonal();
+      leaf.evecs_ = eigensolver.eigenvectors();
+
+      if (eigen_val(0, 0) < 0 || eigen_val(1, 1) < 0 || eigen_val(2, 2) <= 0) {
+        leaf.nr_points = -1;
+        continue;
+      }
+
+      // Avoids matrices near singularities (eq 6.11)[Magnusson 2009]
+
+      min_covar_eigvalue = min_covar_eigvalue_mult_ * eigen_val(2, 2);
+      if (eigen_val(0, 0) < min_covar_eigvalue) {
+        eigen_val(0, 0) = min_covar_eigvalue;
+
+        if (eigen_val(1, 1) < min_covar_eigvalue) {
+          eigen_val(1, 1) = min_covar_eigvalue;
+        }
+
+        leaf.cov_ = leaf.evecs_ * eigen_val * leaf.evecs_.inverse();
+      }
+      leaf.evals_ = eigen_val.diagonal();
+
+      leaf.icov_ = leaf.cov_.inverse();
+      if (
+        leaf.icov_.maxCoeff() == std::numeric_limits<float>::infinity() ||
+        leaf.icov_.minCoeff() == -std::numeric_limits<float>::infinity()) {
+        leaf.nr_points = -1;
+      }
+    }
+  }
+
+  output.width = static_cast<uint32_t>(output.points.size());
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointT>
+int pclomp::VoxelGridCovariance<PointT>::getNeighborhoodAtPoint(
+  const Eigen::MatrixXi & relative_coordinates, const PointT & reference_point,
+  std::vector<LeafConstPtr> & neighbors) const
+{
+  neighbors.clear();
+
+  // Find displacement coordinates
+  Eigen::Vector4i ijk(
+    static_cast<int>(floor(reference_point.x / leaf_size_[0])),
+    static_cast<int>(floor(reference_point.y / leaf_size_[1])),
+    static_cast<int>(floor(reference_point.z / leaf_size_[2])), 0);
+  Eigen::Array4i diff2min = min_b_ - ijk;
+  Eigen::Array4i diff2max = max_b_ - ijk;
+  neighbors.reserve(relative_coordinates.cols());
+
+  // Check each neighbor to see if it is occupied and contains sufficient points
+  // Slower than radius search because needs to check 26 indices
+  for (int ni = 0; ni < relative_coordinates.cols(); ni++) {
+    Eigen::Vector4i displacement =
+      (Eigen::Vector4i() << relative_coordinates.col(ni), 0).finished();
+    // Checking if the specified cell is in the grid
+    if ((diff2min <= displacement.array()).all() && (diff2max >= displacement.array()).all()) {
+      auto leaf_iter = leaves_.find(((ijk + displacement - min_b_).dot(divb_mul_)));
+      if (leaf_iter != leaves_.end() && leaf_iter->second.nr_points >= min_points_per_voxel_) {
+        LeafConstPtr leaf = &(leaf_iter->second);
+        neighbors.push_back(leaf);
+      }
+    }
+  }
+
+  return (static_cast<int>(neighbors.size()));
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointT>
+int pclomp::VoxelGridCovariance<PointT>::getNeighborhoodAtPoint(
+  const PointT & reference_point, std::vector<LeafConstPtr> & neighbors) const
+{
+  neighbors.clear();
+
+  // Find displacement coordinates
+  Eigen::MatrixXi relative_coordinates = pcl::getAllNeighborCellIndices();
+  return getNeighborhoodAtPoint(relative_coordinates, reference_point, neighbors);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointT>
+int pclomp::VoxelGridCovariance<PointT>::getNeighborhoodAtPoint7(
+  const PointT & reference_point, std::vector<LeafConstPtr> & neighbors) const
+{
+  neighbors.clear();
+
+  Eigen::MatrixXi relative_coordinates(3, 7);
+  relative_coordinates.setZero();
+  relative_coordinates(0, 1) = 1;
+  relative_coordinates(0, 2) = -1;
+  relative_coordinates(1, 3) = 1;
+  relative_coordinates(1, 4) = -1;
+  relative_coordinates(2, 5) = 1;
+  relative_coordinates(2, 6) = -1;
+
+  return getNeighborhoodAtPoint(relative_coordinates, reference_point, neighbors);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointT>
+int pclomp::VoxelGridCovariance<PointT>::getNeighborhoodAtPoint1(
+  const PointT & reference_point, std::vector<LeafConstPtr> & neighbors) const
+{
+  neighbors.clear();
+  return getNeighborhoodAtPoint(Eigen::MatrixXi::Zero(3, 1), reference_point, neighbors);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+template <typename PointT>
+void pclomp::VoxelGridCovariance<PointT>::getDisplayCloud(
+  pcl::PointCloud<pcl::PointXYZ> & cell_cloud)
+{
+  cell_cloud.clear();
+
+  int pnt_per_cell = 1000;
+  boost::mt19937 rng;
+  boost::normal_distribution<> nd(0.0, leaf_size_.head(3).norm());
+  boost::variate_generator<boost::mt19937 &, boost::normal_distribution<> > var_nor(rng, nd);
+
+  Eigen::LLT<Eigen::Matrix3d> llt_of_cov;
+  Eigen::Matrix3d cholesky_decomp;
+  Eigen::Vector3d cell_mean;
+  Eigen::Vector3d rand_point;
+  Eigen::Vector3d dist_point;
+
+  // Generate points for each occupied voxel with sufficient points.
+  for (auto it = leaves_.begin(); it != leaves_.end(); ++it) {
+    Leaf & leaf = it->second;
+
+    if (leaf.nr_points >= min_points_per_voxel_) {
+      cell_mean = leaf.mean_;
+      llt_of_cov.compute(leaf.cov_);
+      cholesky_decomp = llt_of_cov.matrixL();
+
+      // Random points generated by sampling the normal distribution
+      // given by voxel mean and covariance matrix
+      for (int i = 0; i < pnt_per_cell; i++) {
+        rand_point = Eigen::Vector3d(var_nor(), var_nor(), var_nor());
+        dist_point = cell_mean + cholesky_decomp * rand_point;
+        cell_cloud.push_back(pcl::PointXYZ(
+          static_cast<float>(dist_point(0)), static_cast<float>(dist_point(1)),
+          static_cast<float>(dist_point(2))));
+      }
+    }
+  }
+}
+
+#define PCL_INSTANTIATE_VoxelGridCovariance(T) \
+  template class PCL_EXPORTS pcl::VoxelGridCovariance<T>;
+
+#endif  // PCLOMP__VOXEL_GRID_COVARIANCE_OMP_IMPL_HPP_
diff --git a/localization/ndt_omp/package.xml b/localization/ndt_omp/package.xml
new file mode 100644
index 0000000000000..02049a39f7bdc
--- /dev/null
+++ b/localization/ndt_omp/package.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>ndt_omp</name>
+  <version>0.0.0</version>
+  <description>OpenMP boosted NDT and GICP algorithms</description>
+  <maintainer email="yamato.ando@tier4.jp">Yamato Ando</maintainer>
+  <maintainer email="kento.yabuuchi.2@tier4.jp">Kento Yabuuchi</maintainer>
+  <maintainer email="koji.minoda@tier4.jp">Koji Minoda</maintainer>
+  <author email="koide@aisl.cs.tut.ac.jp">koide</author>
+  <license>Apache License 2.0</license>
+  <!-- The original package is BSD-2 License -->
+
+  <buildtool_depend condition="$ROS_VERSION == 1">catkin</buildtool_depend>
+  <buildtool_depend condition="$ROS_VERSION == 2">ament_cmake_auto</buildtool_depend>
+
+  <depend condition="$ROS_VERSION == 1">pcl_ros</depend>
+  <depend condition="$ROS_VERSION == 1">roscpp</depend>
+
+  <depend condition="$ROS_VERSION == 2">libpcl-all-dev</depend>
+
+  <export>
+    <build_type condition="$ROS_VERSION == 1">catkin</build_type>
+    <build_type condition="$ROS_VERSION == 2">ament_cmake</build_type>
+  </export>
+</package>
diff --git a/localization/ndt_omp/src/pclomp/ndt_omp.cpp b/localization/ndt_omp/src/pclomp/ndt_omp.cpp
new file mode 100644
index 0000000000000..5f75bf315b2a2
--- /dev/null
+++ b/localization/ndt_omp/src/pclomp/ndt_omp.cpp
@@ -0,0 +1,20 @@
+// Copyright 2022 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pclomp/ndt_omp_impl.hpp>
+
+#include <pclomp/ndt_omp.h>
+
+template class pclomp::NormalDistributionsTransform<pcl::PointXYZ, pcl::PointXYZ>;
+template class pclomp::NormalDistributionsTransform<pcl::PointXYZI, pcl::PointXYZI>;
diff --git a/localization/ndt_omp/src/pclomp/voxel_grid_covariance_omp.cpp b/localization/ndt_omp/src/pclomp/voxel_grid_covariance_omp.cpp
new file mode 100644
index 0000000000000..1b7b24c708eae
--- /dev/null
+++ b/localization/ndt_omp/src/pclomp/voxel_grid_covariance_omp.cpp
@@ -0,0 +1,20 @@
+// Copyright 2022 Autoware Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pclomp/voxel_grid_covariance_omp_impl.hpp>
+
+#include <pclomp/voxel_grid_covariance_omp.h>
+
+template class pclomp::VoxelGridCovariance<pcl::PointXYZ>;
+template class pclomp::VoxelGridCovariance<pcl::PointXYZI>;