diff --git a/common/cuda_utils/CMakeLists.txt b/common/cuda_utils/CMakeLists.txt
new file mode 100644
index 0000000000000..c0115bc9f3b89
--- /dev/null
+++ b/common/cuda_utils/CMakeLists.txt
@@ -0,0 +1,22 @@
+cmake_minimum_required(VERSION 3.5)
+project(cuda_utils)
+
+find_package(autoware_cmake REQUIRED)
+autoware_package()
+
+find_package(CUDA REQUIRED)
+
+install(
+  DIRECTORY include/${PROJECT_NAME}/
+  DESTINATION include/${PROJECT_NAME}
+)
+
+if(BUILD_TESTING)
+  find_package(ament_lint_auto REQUIRED)
+  ament_lint_auto_find_test_dependencies()
+endif()
+
+ament_export_include_directories(include)
+ament_export_dependencies(CUDA)
+
+ament_auto_package()
diff --git a/common/cuda_utils/include/cuda_utils/cuda_check_error.hpp b/common/cuda_utils/include/cuda_utils/cuda_check_error.hpp
new file mode 100644
index 0000000000000..8c2d96eee5f13
--- /dev/null
+++ b/common/cuda_utils/include/cuda_utils/cuda_check_error.hpp
@@ -0,0 +1,44 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This code is licensed under CC0 1.0 Universal (Public Domain).
+// You can use this without any limitation.
+// https://creativecommons.org/publicdomain/zero/1.0/deed.en
+// borrowed from https://proc-cpuinfo.fixstars.com/2019/02/cuda_smart_pointer/
+
+#ifndef CUDA_UTILS__CUDA_CHECK_ERROR_HPP_
+#define CUDA_UTILS__CUDA_CHECK_ERROR_HPP_
+
+#include <cuda_runtime_api.h>
+
+#include <sstream>
+#include <stdexcept>
+
+namespace cuda_utils
+{
+template <typename F, typename N>
+void cuda_check_error(const ::cudaError_t e, F && f, N && n)
+{
+  if (e != ::cudaSuccess) {
+    std::stringstream s;
+    s << ::cudaGetErrorName(e) << " (" << e << ")@" << f << "#L" << n << ": "
+      << ::cudaGetErrorString(e);
+    throw std::runtime_error{s.str()};
+  }
+}
+}  // namespace cuda_utils
+
+#define CHECK_CUDA_ERROR(e) (cuda_utils::cuda_check_error(e, __FILE__, __LINE__))
+
+#endif  // CUDA_UTILS__CUDA_CHECK_ERROR_HPP_
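The template above stringifies any failing `cudaError_t` together with its source location and throws. A minimal usage sketch (illustrative only, not part of this diff):

```cpp
// Sketch (not part of this diff): wrapping raw CUDA runtime calls with
// CHECK_CUDA_ERROR so failures surface as std::runtime_error with file/line info.
#include "cuda_utils/cuda_check_error.hpp"

#include <cstdio>

int main()
{
  try {
    void * p = nullptr;
    CHECK_CUDA_ERROR(::cudaMalloc(&p, 1024));   // throws on failure
    CHECK_CUDA_ERROR(::cudaMemset(p, 0, 1024));
    CHECK_CUDA_ERROR(::cudaFree(p));
  } catch (const std::runtime_error & e) {
    // e.g. "cudaErrorMemoryAllocation (2)@main.cu#L12: out of memory"
    std::fprintf(stderr, "%s\n", e.what());
    return 1;
  }
  return 0;
}
```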
diff --git a/common/cuda_utils/include/cuda_utils/cuda_unique_ptr.hpp b/common/cuda_utils/include/cuda_utils/cuda_unique_ptr.hpp
new file mode 100644
index 0000000000000..6e01a00d6681c
--- /dev/null
+++ b/common/cuda_utils/include/cuda_utils/cuda_unique_ptr.hpp
@@ -0,0 +1,81 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This code is licensed under CC0 1.0 Universal (Public Domain).
+// You can use this without any limitation.
+// https://creativecommons.org/publicdomain/zero/1.0/deed.en
+// borrowed from https://proc-cpuinfo.fixstars.com/2019/02/cuda_smart_pointer/
+
+#ifndef CUDA_UTILS__CUDA_UNIQUE_PTR_HPP_
+#define CUDA_UTILS__CUDA_UNIQUE_PTR_HPP_
+
+#include "cuda_utils/cuda_check_error.hpp"
+
+#include <cuda_runtime_api.h>
+
+#include <memory>
+#include <type_traits>
+
+namespace cuda_utils
+{
+struct CudaDeleter
+{
+  void operator()(void * p) const { CHECK_CUDA_ERROR(::cudaFree(p)); }
+};
+
+template <typename T>
+using CudaUniquePtr = std::unique_ptr<T, CudaDeleter>;
+
+template <typename T>
+typename std::enable_if_t<std::is_array<T>::value, CudaUniquePtr<T>> make_unique(
+  const std::size_t n)
+{
+  using U = typename std::remove_extent_t<T>;
+  U * p;
+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(U) * n));
+  return CudaUniquePtr<T>{p};
+}
+
+template <typename T>
+CudaUniquePtr<T> make_unique()
+{
+  T * p;
+  CHECK_CUDA_ERROR(::cudaMalloc(reinterpret_cast<void **>(&p), sizeof(T)));
+  return CudaUniquePtr<T>{p};
+}
+
+struct CudaDeleterHost
+{
+  void operator()(void * p) const { CHECK_CUDA_ERROR(::cudaFreeHost(p)); }
+};
+
+template <typename T>
+using CudaUniquePtrHost = std::unique_ptr<T, CudaDeleterHost>;
+
+template <typename T>
+typename std::enable_if_t<std::is_array<T>::value, CudaUniquePtrHost<T>> make_unique_host(
+  const std::size_t n, unsigned int flag)
+{
+  using U = typename std::remove_extent_t<T>;
+  U * p;
+  CHECK_CUDA_ERROR(::cudaHostAlloc(reinterpret_cast<void **>(&p), sizeof(U) * n, flag));
+  return CudaUniquePtrHost<T>{p};
+}
+
+template <typename T>
+CudaUniquePtrHost<T> make_unique_host(unsigned int flag = cudaHostAllocDefault)
+{
+  T * p;
+  CHECK_CUDA_ERROR(::cudaHostAlloc(reinterpret_cast<void **>(&p), sizeof(T), flag));
+  return CudaUniquePtrHost<T>{p};
+}
+}  // namespace cuda_utils
+
+#endif  // CUDA_UTILS__CUDA_UNIQUE_PTR_HPP_
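A usage sketch for the allocation helpers above (illustrative, not part of this diff): `make_unique<T[]>` returns device memory released via `cudaFree`, while `make_unique_host<T[]>` returns page-locked host memory released via `cudaFreeHost`, which is the typical pairing for fast host-device copies.

```cpp
// Sketch (not part of this diff): pairing a pinned host buffer with a device
// buffer allocated through cuda_utils, then copying between them.
#include "cuda_utils/cuda_check_error.hpp"
#include "cuda_utils/cuda_unique_ptr.hpp"

int main()
{
  constexpr std::size_t n = 256;
  // Device memory, freed automatically via ::cudaFree
  auto device = cuda_utils::make_unique<float[]>(n);
  // Page-locked host memory, freed automatically via ::cudaFreeHost
  auto host = cuda_utils::make_unique_host<float[]>(n, cudaHostAllocDefault);

  for (std::size_t i = 0; i < n; ++i) {
    host[i] = static_cast<float>(i);
  }
  CHECK_CUDA_ERROR(
    ::cudaMemcpy(device.get(), host.get(), sizeof(float) * n, cudaMemcpyHostToDevice));
  return 0;
}  // both buffers are released here, in reverse declaration order
```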
diff --git a/common/cuda_utils/include/cuda_utils/stream_unique_ptr.hpp b/common/cuda_utils/include/cuda_utils/stream_unique_ptr.hpp
new file mode 100644
index 0000000000000..0f3a1bc6ccebf
--- /dev/null
+++ b/common/cuda_utils/include/cuda_utils/stream_unique_ptr.hpp
@@ -0,0 +1,52 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This code is licensed under CC0 1.0 Universal (Public Domain).
+// You can use this without any limitation.
+// https://creativecommons.org/publicdomain/zero/1.0/deed.en
+// borrowed from https://proc-cpuinfo.fixstars.com/2019/02/cuda_smart_pointer/
+
+#ifndef CUDA_UTILS__STREAM_UNIQUE_PTR_HPP_
+#define CUDA_UTILS__STREAM_UNIQUE_PTR_HPP_
+
+#include <cuda_runtime_api.h>
+
+#include <memory>
+
+namespace cuda_utils
+{
+struct StreamDeleter
+{
+  void operator()(cudaStream_t * stream)
+  {
+    if (stream) {
+      cudaStreamDestroy(*stream);
+      delete stream;
+    }
+  }
+};
+
+using StreamUniquePtr = std::unique_ptr<cudaStream_t, StreamDeleter>;
+
+inline StreamUniquePtr makeCudaStream(const uint32_t flags = cudaStreamDefault)
+{
+  StreamUniquePtr stream(new cudaStream_t, StreamDeleter());
+  if (cudaStreamCreateWithFlags(stream.get(), flags) != cudaSuccess) {
+    stream.reset(nullptr);
+  }
+  return stream;
+}
+}  // namespace cuda_utils
+
+#endif  // CUDA_UTILS__STREAM_UNIQUE_PTR_HPP_
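A usage sketch for `makeCudaStream` (illustrative, not part of this diff): the returned `StreamUniquePtr` owns both the heap-allocated `cudaStream_t` handle and the underlying stream, and `makeCudaStream` returns an empty pointer when stream creation fails, so callers should check it before dereferencing.

```cpp
// Sketch (not part of this diff): creating a non-blocking stream and issuing
// an asynchronous copy on it. The stream is destroyed automatically on scope exit.
#include "cuda_utils/cuda_unique_ptr.hpp"
#include "cuda_utils/stream_unique_ptr.hpp"

int main()
{
  cuda_utils::StreamUniquePtr stream = cuda_utils::makeCudaStream(cudaStreamNonBlocking);
  if (!stream) {
    return 1;  // creation failed; the pointer is empty
  }
  auto device = cuda_utils::make_unique<float[]>(64);
  float host[64] = {};
  ::cudaMemcpyAsync(device.get(), host, sizeof(host), cudaMemcpyHostToDevice, *stream);
  ::cudaStreamSynchronize(*stream);
  return 0;
}
```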
diff --git a/common/cuda_utils/package.xml b/common/cuda_utils/package.xml
new file mode 100644
index 0000000000000..53127da0c1ff6
--- /dev/null
+++ b/common/cuda_utils/package.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>cuda_utils</name>
+  <version>0.0.1</version>
+  <description>cuda utility library</description>
+
+  <author>Daisuke Nishimatsu</author>
+  <maintainer>Daisuke Nishimatsu</maintainer>
+  <maintainer>Manato Hirabayashi</maintainer>
+
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+
+  <build_depend>autoware_cmake</build_depend>
+
+  <test_depend>ament_lint_auto</test_depend>
+  <test_depend>autoware_lint_common</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/common/tensorrt_common/CMakeLists.txt b/common/tensorrt_common/CMakeLists.txt
new file mode 100644
index 0000000000000..49a2e40dd8234
--- /dev/null
+++ b/common/tensorrt_common/CMakeLists.txt
@@ -0,0 +1,54 @@
+cmake_minimum_required(VERSION 3.5)
+project(tensorrt_common)
+
+find_package(autoware_cmake REQUIRED)
+autoware_package()
+
+find_package(CUDA REQUIRED)
+find_package(CUDNN REQUIRED)
+find_package(TENSORRT REQUIRED)
+
+if(NOT (${CUDA_FOUND} AND ${CUDNN_FOUND} AND ${TENSORRT_FOUND}))
+  message(WARNING "cuda, cudnn, tensorrt libraries are not found")
+  return()
+endif()
+
+cuda_add_library(${PROJECT_NAME} SHARED
+  src/tensorrt_common.cpp
+)
+
+ament_target_dependencies(${PROJECT_NAME}
+  rclcpp
+)
+
+target_include_directories(${PROJECT_NAME}
+  PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:include>
+    ${TENSORRT_INCLUDE_DIRS}
+)
+
+target_link_libraries(${PROJECT_NAME}
+  ${TENSORRT_LIBRARIES}
+  stdc++fs
+)
+
+target_compile_definitions(${PROJECT_NAME} PRIVATE
+  TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+)
+
+list(APPEND ${PROJECT_NAME}_LIBRARIES "${PROJECT_NAME}")
+
+if(BUILD_TESTING)
+  find_package(ament_lint_auto REQUIRED)
+  ament_lint_auto_find_test_dependencies()
+endif()
+
+ament_export_include_directories(include)
+ament_export_dependencies(CUDA)
+ament_export_dependencies(cudnn_cmake_module)
+ament_export_dependencies(CUDNN)
+ament_export_dependencies(tensorrt_cmake_module)
+ament_export_dependencies(TENSORRT)
+
+ament_auto_package()
diff --git a/common/tensorrt_common/include/tensorrt_common/tensorrt_common.hpp b/common/tensorrt_common/include/tensorrt_common/tensorrt_common.hpp
new file mode 100644
index 0000000000000..bb271f0689002
--- /dev/null
+++ b/common/tensorrt_common/include/tensorrt_common/tensorrt_common.hpp
@@ -0,0 +1,134 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TENSORRT_COMMON__TENSORRT_COMMON_HPP_
+#define TENSORRT_COMMON__TENSORRT_COMMON_HPP_
+
+#include <rclcpp/rclcpp.hpp>
+
+#include <NvInfer.h>
+#include <NvOnnxParser.h>
+
+#if (defined(_MSC_VER) or (defined(__GNUC__) and (7 <= __GNUC__)))
+#include <filesystem>
+namespace fs = ::std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace fs = ::std::experimental::filesystem;
+#endif
+
+#include <array>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tensorrt_common
+{
+class Logger : public nvinfer1::ILogger  // NOLINT
+{
+public:
+  Logger() : Logger(Severity::kINFO) {}
+
+  explicit Logger(Severity severity) : reportable_severity_(severity) {}
+
+  void log(Severity severity, const char * msg) noexcept override
+  {
+    // suppress messages with severity enum value greater than the reportable
+    if (severity > reportable_severity_) {
+      return;
+    }
+
+    switch (severity) {
+      case Severity::kINTERNAL_ERROR:
+        RCLCPP_ERROR_STREAM(logger_, msg);
+        break;
+      case Severity::kERROR:
+        RCLCPP_ERROR_STREAM(logger_, msg);
+        break;
+      case Severity::kWARNING:
+        RCLCPP_WARN_STREAM(logger_, msg);
+        break;
+      case Severity::kINFO:
+        RCLCPP_INFO_STREAM(logger_, msg);
+        break;
+      default:
+        RCLCPP_INFO_STREAM(logger_, msg);
+        break;
+    }
+  }
+
+  Severity reportable_severity_{Severity::kWARNING};
+  rclcpp::Logger logger_{rclcpp::get_logger("tensorrt_common")};
+};
+
+template <typename T>
+struct InferDeleter  // NOLINT
+{
+  void operator()(T * obj) const
+  {
+    if (obj) {
+#if TENSORRT_VERSION_MAJOR >= 8
+      delete obj;
+#else
+      obj->destroy();
+#endif
+    }
+  }
+};
+
+template <typename T>
+using TrtUniquePtr = std::unique_ptr<T, InferDeleter<T>>;
+
+using BatchConfig = std::array<int32_t, 3>;
+
+class TrtCommon  // NOLINT
+{
+public:
+  TrtCommon(
+    const std::string & model_path, const std::string & precision,
+    std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator = nullptr,
+    const BatchConfig & batch_config = {1, 1, 1}, const size_t max_workspace_size = (16 << 20),
+    const std::vector<std::string> & plugin_paths = {});
+
+  bool loadEngine(const std::string & engine_file_path);
+  bool buildEngineFromOnnx(
+    const std::string & onnx_file_path, const std::string & output_engine_file_path);
+  void setup();
+
+  bool isInitialized();
+
+  nvinfer1::Dims getBindingDimensions(const int32_t index) const;
+  int32_t getNbBindings();
+  bool setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const;
+  bool enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed);
+
+private:
+  Logger logger_;
+  fs::path model_file_path_;
+  TrtUniquePtr<nvinfer1::IRuntime> runtime_;
+  TrtUniquePtr<nvinfer1::ICudaEngine> engine_;
+  TrtUniquePtr<nvinfer1::IExecutionContext> context_;
+  std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator_;
+
+  nvinfer1::Dims input_dims_;
+  nvinfer1::Dims output_dims_;
+  std::string precision_;
+  BatchConfig batch_config_;
+  size_t max_workspace_size_;
+  bool is_initialized_{false};
+};
+
+}  // namespace tensorrt_common
+
+#endif  // TENSORRT_COMMON__TENSORRT_COMMON_HPP_
diff --git a/common/tensorrt_common/package.xml b/common/tensorrt_common/package.xml
new file mode 100644
index 0000000000000..6e8a0b042dd70
--- /dev/null
+++ b/common/tensorrt_common/package.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>tensorrt_common</name>
+  <version>0.0.1</version>
+  <description>tensorrt utility wrapper</description>
+
+  <author>Taichi Higashide</author>
+  <author>Daisuke Nishimatsu</author>
+  <maintainer>Daisuke Nishimatsu</maintainer>
+  <maintainer>Manato Hirabayashi</maintainer>
+
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+  <buildtool_depend>cudnn_cmake_module</buildtool_depend>
+  <buildtool_depend>tensorrt_cmake_module</buildtool_depend>
+
+  <build_depend>autoware_cmake</build_depend>
+
+  <depend>rclcpp</depend>
+
+  <test_depend>ament_lint_auto</test_depend>
+  <test_depend>autoware_lint_common</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
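For orientation, a sketch of the intended `TrtCommon` call sequence (illustrative, not part of this diff; the model path is hypothetical): construct with a model path and precision, call `setup()`, which loads a cached `.engine` next to the `.onnx` file or builds one on first run, then check `isInitialized()` before binding buffers.

```cpp
// Sketch (not part of this diff): basic TrtCommon lifecycle.
#include <tensorrt_common/tensorrt_common.hpp>

#include <iostream>

int main()
{
  tensorrt_common::TrtCommon trt("model.onnx", "fp16");  // hypothetical model path
  trt.setup();  // loads or builds "model.engine" beside the ONNX file
  if (!trt.isInitialized()) {
    std::cerr << "failed to load or build engine" << std::endl;
    return 1;
  }
  const auto dims = trt.getBindingDimensions(0);
  std::cout << "input binding has " << dims.nbDims << " dimensions" << std::endl;
  // Inference would then bind device buffers and call trt.enqueueV2(...).
  return 0;
}
```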
diff --git a/common/tensorrt_common/src/tensorrt_common.cpp b/common/tensorrt_common/src/tensorrt_common.cpp
new file mode 100644
index 0000000000000..50028f51332e6
--- /dev/null
+++ b/common/tensorrt_common/src/tensorrt_common.cpp
@@ -0,0 +1,222 @@
+// Copyright 2020 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <tensorrt_common/tensorrt_common.hpp>
+
+#include <NvInferPlugin.h>
+#include <dlfcn.h>
+
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+
+namespace tensorrt_common
+{
+
+TrtCommon::TrtCommon(
+  const std::string & model_path, const std::string & precision,
+  std::unique_ptr<nvinfer1::IInt8Calibrator> calibrator,
+  const tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size,
+  const std::vector<std::string> & plugin_paths)
+: model_file_path_(model_path),
+  calibrator_(std::move(calibrator)),
+  precision_(precision),
+  batch_config_(batch_config),
+  max_workspace_size_(max_workspace_size)
+{
+  for (const auto & plugin_path : plugin_paths) {
+    int32_t flags{RTLD_LAZY};
+#if ENABLE_ASAN
+    // https://github.com/google/sanitizers/issues/89
+    // asan doesn't handle module unloading correctly and there are no plans on doing
+    // so. In order to get proper stack traces, don't delete the shared library on
+    // close so that asan can resolve the symbols correctly.
+    flags |= RTLD_NODELETE;
+#endif  // ENABLE_ASAN
+    void * handle = dlopen(plugin_path.c_str(), flags);
+    if (!handle) {
+      logger_.log(nvinfer1::ILogger::Severity::kERROR, "Could not load plugin library");
+    }
+  }
+  runtime_ = TrtUniquePtr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(logger_));
+  initLibNvInferPlugins(&logger_, "");
+}
+
+void TrtCommon::setup()
+{
+  if (!fs::exists(model_file_path_)) {
+    is_initialized_ = false;
+    return;
+  }
+  if (model_file_path_.extension() == ".engine") {
+    loadEngine(model_file_path_);
+  } else if (model_file_path_.extension() == ".onnx") {
+    fs::path cache_engine_path{model_file_path_};
+    cache_engine_path.replace_extension("engine");
+    if (fs::exists(cache_engine_path)) {
+      loadEngine(cache_engine_path);
+    } else {
+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "Start build engine");
+      buildEngineFromOnnx(model_file_path_, cache_engine_path);
+      logger_.log(nvinfer1::ILogger::Severity::kINFO, "End build engine");
+    }
+  } else {
+    is_initialized_ = false;
+    return;
+  }
+
+  context_ = TrtUniquePtr<nvinfer1::IExecutionContext>(engine_->createExecutionContext());
+  if (!context_) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create context");
+    is_initialized_ = false;
+    return;
+  }
+
+  is_initialized_ = true;
+}
+
+bool TrtCommon::loadEngine(const std::string & engine_file_path)
+{
+  std::ifstream engine_file(engine_file_path, std::ios::binary);
+  std::stringstream engine_buffer;
+  engine_buffer << engine_file.rdbuf();
+  std::string engine_str = engine_buffer.str();
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(runtime_->deserializeCudaEngine(
+    reinterpret_cast<const void *>(engine_str.data()), engine_str.size()));
+  return true;
+}
+
+bool TrtCommon::buildEngineFromOnnx(
+  const std::string & onnx_file_path, const std::string & output_engine_file_path)
+{
+  auto builder = TrtUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger_));
+  if (!builder) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create builder");
+    return false;
+  }
+
+  const auto explicitBatch =
+    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+
+  auto network =
+    TrtUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
+  if (!network) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create network");
+    return false;
+  }
+
+  auto config = TrtUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
+  if (!config) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create builder config");
+    return false;
+  }
+
+  if (precision_ == "fp16" || precision_ == "int8") {
+    config->setFlag(nvinfer1::BuilderFlag::kFP16);
+  }
+#if (NV_TENSORRT_MAJOR * 1000) + (NV_TENSORRT_MINOR * 100) + NV_TENSORRT_PATCH >= 8400
+  config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
+#else
+  config->setMaxWorkspaceSize(max_workspace_size_);
+#endif
+
+  auto parser = TrtUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger_));
+  if (!parser->parseFromFile(
+        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR))) {
+    return false;
+  }
+
+  const auto input = network->getInput(0);
+  const auto input_dims = input->getDimensions();
+  const auto input_channel = input_dims.d[1];
+  const auto input_height = input_dims.d[2];
+  const auto input_width = input_dims.d[3];
+
+  auto profile = builder->createOptimizationProfile();
+  profile->setDimensions(
+    network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMIN,
+    nvinfer1::Dims4{batch_config_.at(0), input_channel, input_height, input_width});
+  profile->setDimensions(
+    network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kOPT,
+    nvinfer1::Dims4{batch_config_.at(1), input_channel, input_height, input_width});
+  profile->setDimensions(
+    network->getInput(0)->getName(), nvinfer1::OptProfileSelector::kMAX,
+    nvinfer1::Dims4{batch_config_.at(2), input_channel, input_height, input_width});
+  config->addOptimizationProfile(profile);
+
+  if (precision_ == "int8" && calibrator_) {
+    config->setFlag(nvinfer1::BuilderFlag::kINT8);
+    config->setInt8Calibrator(calibrator_.get());
+  }
+
+#if TENSORRT_VERSION_MAJOR >= 8
+  auto plan =
+    TrtUniquePtr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
+  if (!plan) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create host memory");
+    return false;
+  }
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(
+    runtime_->deserializeCudaEngine(plan->data(), plan->size()));
+#else
+  engine_ = TrtUniquePtr<nvinfer1::ICudaEngine>(builder->buildEngineWithConfig(*network, *config));
+#endif
+
+  if (!engine_) {
+    logger_.log(nvinfer1::ILogger::Severity::kERROR, "Failed to create engine");
+    return false;
+  }
+
+  // save engine
+#if TENSORRT_VERSION_MAJOR < 8
+  auto data = TrtUniquePtr<nvinfer1::IHostMemory>(engine_->serialize());
+#endif
+  std::ofstream file;
+  file.open(output_engine_file_path, std::ios::binary | std::ios::out);
+  if (!file.is_open()) {
+    return false;
+  }
+#if TENSORRT_VERSION_MAJOR < 8
+  file.write(reinterpret_cast<const char *>(data->data()), data->size());
+#else
+  file.write(reinterpret_cast<const char *>(plan->data()), plan->size());
+#endif
+
+  file.close();
+
+  return true;
+}
+
+bool TrtCommon::isInitialized() { return is_initialized_; }
+
+nvinfer1::Dims TrtCommon::getBindingDimensions(const int32_t index) const
+{
+  return context_->getBindingDimensions(index);
+}
+
+int32_t TrtCommon::getNbBindings() { return engine_->getNbBindings(); }
+
+bool TrtCommon::setBindingDimensions(const int32_t index, const nvinfer1::Dims & dimensions) const
+{
+  return context_->setBindingDimensions(index, dimensions);
+}
+
+bool TrtCommon::enqueueV2(void ** bindings, cudaStream_t stream, cudaEvent_t * input_consumed)
+{
+  return context_->enqueueV2(bindings, stream, input_consumed);
+}
+
+}  // namespace tensorrt_common
diff --git a/perception/tensorrt_yolox/CMakeLists.txt b/perception/tensorrt_yolox/CMakeLists.txt
new file mode 100644
index 0000000000000..6b0719a814893
--- /dev/null
+++ b/perception/tensorrt_yolox/CMakeLists.txt
@@ -0,0 +1,123 @@
+cmake_minimum_required(VERSION 3.5)
+project(tensorrt_yolox)
+
+find_package(autoware_cmake REQUIRED)
+autoware_package()
+
+find_package(OpenCV REQUIRED)
+find_package(OpenMP)
+if(OpenMP_FOUND)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif()
+
+##########
+# Download pretrained model
+set(DATA_PATH "${CMAKE_CURRENT_SOURCE_DIR}/data")
+if(NOT EXISTS "${DATA_PATH}")
+  execute_process(COMMAND mkdir -p ${DATA_PATH})
+endif()
+function(download FILE_NAME FILE_HASH)
+  message(STATUS "Checking and downloading ${FILE_NAME}")
+  set(FILE_PATH ${DATA_PATH}/${FILE_NAME})
+  set(STATUS_CODE 0)
+  message(STATUS "start ${FILE_NAME}")
+  if(EXISTS ${FILE_PATH})
+    message(STATUS "found ${FILE_NAME}")
+    file(MD5 ${FILE_PATH} EXISTING_FILE_HASH)
+    if(${FILE_HASH} STREQUAL ${EXISTING_FILE_HASH})
+      message(STATUS "same ${FILE_NAME}")
+      message(STATUS "File already exists.")
+    else()
+      message(STATUS "diff ${FILE_NAME}")
+      message(STATUS "File hash changed. Downloading now ...")
+      file(DOWNLOAD https://awf.ml.dev.web.auto/perception/models/${FILE_NAME} ${FILE_PATH} STATUS DOWNLOAD_STATUS TIMEOUT 3600)
+      list(GET DOWNLOAD_STATUS 0 STATUS_CODE)
+      list(GET DOWNLOAD_STATUS 1 ERROR_MESSAGE)
+    endif()
+  else()
+    message(STATUS "not found ${FILE_NAME}")
+    message(STATUS "File doesn't exist. Downloading now ...")
+    file(DOWNLOAD https://awf.ml.dev.web.auto/perception/models/${FILE_NAME} ${FILE_PATH} STATUS DOWNLOAD_STATUS TIMEOUT 3600)
+    list(GET DOWNLOAD_STATUS 0 STATUS_CODE)
+    list(GET DOWNLOAD_STATUS 1 ERROR_MESSAGE)
+  endif()
+  if(${STATUS_CODE} EQUAL 0)
+    message(STATUS "Download completed successfully!")
+  else()
+    message(FATAL_ERROR "Error occurred during download: ${ERROR_MESSAGE}")
+  endif()
+endfunction()
+
+download(yolox-tiny.onnx 97028baf73ce55e115599c9c60651b08)
+download(label.txt 9ceadca8b72b6169ee6aabb861fe3e1e)
+
+##########
+# tensorrt_yolox
+ament_auto_add_library(${PROJECT_NAME} SHARED
+  src/tensorrt_yolox.cpp
+)
+
+ament_target_dependencies(${PROJECT_NAME}
+  OpenCV
+)
+
+target_link_libraries(${PROJECT_NAME}
+  ${tensorrt_common_LIBRARIES}
+)
+
+target_compile_definitions(${PROJECT_NAME} PRIVATE
+  TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+)
+
+ament_auto_add_library(yolox_single_image_inference_node SHARED
+  src/yolox_single_image_inference_node.cpp
+)
+
+ament_target_dependencies(yolox_single_image_inference_node
+  OpenCV
+)
+
+target_link_libraries(yolox_single_image_inference_node
+  ${PROJECT_NAME}
+  stdc++fs
+)
+
+target_compile_definitions(yolox_single_image_inference_node PRIVATE
+  TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+)
+
+rclcpp_components_register_node(yolox_single_image_inference_node
+  PLUGIN "tensorrt_yolox::YoloXSingleImageInferenceNode"
+  EXECUTABLE yolox_single_image_inference
+)
+
+ament_auto_add_library(${PROJECT_NAME}_node SHARED
+  src/tensorrt_yolox_node.cpp
+)
+
+ament_target_dependencies(${PROJECT_NAME}_node
+  OpenCV
+)
+
+target_link_libraries(${PROJECT_NAME}_node
+  ${PROJECT_NAME}
+)
+
+target_compile_definitions(${PROJECT_NAME}_node PRIVATE
+  TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR}
+)
+
+rclcpp_components_register_node(${PROJECT_NAME}_node
+  PLUGIN "tensorrt_yolox::TrtYoloXNode"
+  EXECUTABLE ${PROJECT_NAME}_node_exe
+)
+
+if(BUILD_TESTING)
+  find_package(ament_lint_auto REQUIRED)
+  ament_lint_auto_find_test_dependencies()
+endif()
+
+ament_auto_package(INSTALL_TO_SHARE
+  data
+)
diff --git a/perception/tensorrt_yolox/README.md b/perception/tensorrt_yolox/README.md
new file mode 100644
index 0000000000000..aaf51198e4429
--- /dev/null
+++ b/perception/tensorrt_yolox/README.md
@@ -0,0 +1,142 @@
+# tensorrt_yolox
+
+## Purpose
+
+This package detects target objects, e.g., cars, trucks, bicycles, and pedestrians, in an image based on the [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) model.
+
+## Inner-workings / Algorithms
+
+### Cite
+
+Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, Jian Sun, "YOLOX: Exceeding YOLO Series in 2021", arXiv preprint arXiv:2107.08430, 2021 [[ref](https://arxiv.org/abs/2107.08430)]
+
+## Inputs / Outputs
+
+### Input
+
+| Name       | Type                | Description     |
+| ---------- | ------------------- | --------------- |
+| `in/image` | `sensor_msgs/Image` | The input image |
+
+### Output
+
+| Name          | Type                                               | Description                                        |
+| ------------- | -------------------------------------------------- | -------------------------------------------------- |
+| `out/objects` | `tier4_perception_msgs/DetectedObjectsWithFeature` | The detected objects with 2D bounding boxes        |
+| `out/image`   | `sensor_msgs/Image`                                | The image with 2D bounding boxes for visualization |
+
+## Parameters
+
+### Core Parameters
+
+| Name              | Type  | Default Value | Description                                                                                |
+| ----------------- | ----- | ------------- | ------------------------------------------------------------------------------------------ |
+| `score_threshold` | float | 0.3           | If the objectness score is less than this value, the object is ignored in the YOLO layer.  |
+| `nms_threshold`   | float | 0.7           | The IoU threshold for the NMS method                                                        |
+
+**NOTE:** These two parameters are only valid for "plain" models (described later).
+
+### Node Parameters
+
+| Name         | Type   | Default Value | Description                                                         |
+| ------------ | ------ | ------------- | -------------------------------------------------------------------- |
+| `model_path` | string | ""            | The ONNX file path for the YOLOX model                                |
+| `label_path` | string | ""            | The label file path listing the names of detectable objects          |
+| `precision`  | string | "fp32"        | The inference precision: "fp32", "fp16", or "int8"                   |
+
+## Assumptions / Known limits
+
+The label contained in detected 2D bounding boxes (i.e., `out/objects`) will be one of the following:
+
+- CAR
+- PEDESTRIAN ("PERSON" will also be categorized as "PEDESTRIAN")
+- BUS
+- TRUCK
+- BICYCLE
+- MOTORCYCLE
+
+If other labels (case insensitive) are contained in the file specified via the `label_path` parameter,
+those are labeled as `UNKNOWN`, though the detected rectangles are still drawn in the visualization result (`out/image`).
+
+## Onnx model
+
+A sample model (named `yolox-tiny.onnx`) is downloaded automatically during the build process.
+To accelerate non-maximum suppression (NMS), a common post-processing step after object detection inference,
+an `EfficientNMS_TRT` module is attached after the ordinary YOLOX (tiny) network.
+The `EfficientNMS_TRT` module contains fixed values for `score_threshold` and `nms_threshold`,
+hence these parameters are ignored when users specify ONNX models that include this module.
+
+This package accepts both ONNX models with `EfficientNMS_TRT` attached and [models published from the official YOLOX repository](https://github.com/Megvii-BaseDetection/YOLOX/tree/main/demo/ONNXRuntime#download-onnx-models) (referred to below as "plain" models).
+
+All models are automatically converted to TensorRT format.
+The converted files are saved in the same directory as the specified ONNX files
+with the `.engine` filename extension and are reused on subsequent runs.
+The conversion may take a while (typically a few minutes) and inference is blocked
+until it completes, so it will take some time until detection results are published on the first run.
+
+### Package acceptable model generation
+
+To convert your own model saved in PyTorch's `pth` format into ONNX,
+you can use the converter offered by the official repository.
+For convenience, only the procedure is described below.
+Please refer to [the official documentation](https://github.com/Megvii-BaseDetection/YOLOX/tree/main/demo/ONNXRuntime#convert-your-model-to-onnx) for more details.
+
+#### For plain models
+
+1. Install dependencies
+
+   ```shell
+   git clone git@github.com:Megvii-BaseDetection/YOLOX.git
+   cd YOLOX
+   python3 setup.py develop --user
+   ```
+
+2. Convert pth into ONNX
+
+   ```shell
+   python3 tools/export_onnx.py \
+     --output-name YOUR_YOLOX.onnx \
+     -f YOUR_YOLOX.py \
+     -c YOUR_YOLOX.pth
+   ```
+
+#### For EfficientNMS_TRT embedded models
+
+1. Install dependencies
+
+   ```shell
+   git clone git@github.com:Megvii-BaseDetection/YOLOX.git
+   cd YOLOX
+   python3 setup.py develop --user
+   pip3 install git+ssh://git@github.com/wep21/yolox_onnx_modifier.git --user
+   ```

+2. Convert pth into ONNX
+
+   ```shell
+   python3 tools/export_onnx.py \
+     --output-name YOUR_YOLOX.onnx \
+     -f YOUR_YOLOX.py \
+     -c YOUR_YOLOX.pth \
+     --decode_in_inference
+   ```
+
+3. Embed `EfficientNMS_TRT` at the end of YOLOX
+
+   ```shell
+   yolox_onnx_modifier YOUR_YOLOX.onnx -o YOUR_YOLOX_WITH_NMS.onnx
+   ```
+
+## Label file
+
+A sample label file (named `label.txt`) is also downloaded automatically during the build process
+(**NOTE:** This file is incompatible with models that output labels for the COCO dataset (e.g., models from the official YOLOX repository)).
+
+This file describes the correspondence between class indices (integers output from the YOLOX network) and
+class labels (human-readable strings). This package maps class IDs (starting from 0)
+to labels according to the line order in this file, as in the example below.
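An illustrative label file for the classes listed under "Assumptions / Known limits" (a sketch; the actual downloaded `label.txt` may differ). The first line maps to class ID 0, the second to class ID 1, and so on:

```text
CAR
PEDESTRIAN
BUS
TRUCK
BICYCLE
MOTORCYCLE
```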
+
+## Reference repositories
+
+- <https://github.com/Megvii-BaseDetection/YOLOX>
+- <https://github.com/wep21/yolox_onnx_modifier>
diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp
new file mode 100644
index 0000000000000..7f0f64dc89f67
--- /dev/null
+++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp
@@ -0,0 +1,123 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TENSORRT_YOLOX__TENSORRT_YOLOX_HPP_
+#define TENSORRT_YOLOX__TENSORRT_YOLOX_HPP_
+
+#include <cuda_utils/cuda_unique_ptr.hpp>
+#include <cuda_utils/stream_unique_ptr.hpp>
+#include <opencv2/opencv.hpp>
+#include <tensorrt_common/tensorrt_common.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tensorrt_yolox
+{
+using cuda_utils::CudaUniquePtr;
+using cuda_utils::CudaUniquePtrHost;
+using cuda_utils::makeCudaStream;
+using cuda_utils::StreamUniquePtr;
+
+struct Object
+{
+  int32_t x_offset;
+  int32_t y_offset;
+  int32_t height;
+  int32_t width;
+  float score;
+  int32_t type;
+};
+
+using ObjectArray = std::vector<Object>;
+using ObjectArrays = std::vector<ObjectArray>;
+
+struct GridAndStride
+{
+  int grid0;
+  int grid1;
+  int stride;
+};
+
+class TrtYoloX
+{
+public:
+  TrtYoloX(
+    const std::string & model_path, const std::string & precision, const int num_class = 8,
+    const float score_threshold = 0.3, const float nms_threshold = 0.7,
+    const std::string & cache_dir = "",
+    const tensorrt_common::BatchConfig & batch_config = {1, 1, 1},
+    const size_t max_workspace_size = (1 << 30));
+
+  bool doInference(const std::vector<cv::Mat> & images, ObjectArrays & objects);
+
+private:
+  void preprocess(const std::vector<cv::Mat> & images);
+  bool feedforward(const std::vector<cv::Mat> & images, ObjectArrays & objects);
+  bool feedforwardAndDecode(const std::vector<cv::Mat> & images, ObjectArrays & objects);
+  void decodeOutputs(float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const;
+  void generateGridsAndStride(
+    const int target_w, const int target_h, std::vector<int> & strides,
+    std::vector<GridAndStride> & grid_strides) const;
+  void generateYoloxProposals(
+    std::vector<GridAndStride> grid_strides, float * feat_blob, float prob_threshold,
+    ObjectArray & objects) const;
+  void qsortDescentInplace(ObjectArray & faceobjects, int left, int right) const;
+  inline void qsortDescentInplace(ObjectArray & objects) const
+  {
+    if (objects.empty()) {
+      return;
+    }
+    qsortDescentInplace(objects, 0, objects.size() - 1);
+  }
+  inline float intersectionArea(const Object & a, const Object & b) const
+  {
+    cv::Rect a_rect(a.x_offset, a.y_offset, a.width, a.height);
+    cv::Rect b_rect(b.x_offset, b.y_offset, b.width, b.height);
+    cv::Rect_<int> inter = a_rect & b_rect;
+    return inter.area();
+  }
+  void nmsSortedBboxes(
+    const ObjectArray & faceobjects, std::vector<int> & picked, float nms_threshold) const;
+
+  std::unique_ptr<tensorrt_common::TrtCommon> trt_common_;
+
+  std::vector<float> input_h_;
+  CudaUniquePtr<float[]> input_d_;
+  CudaUniquePtr<int32_t[]> out_num_detections_d_;
+  CudaUniquePtr<float[]> out_boxes_d_;
+  CudaUniquePtr<float[]> out_scores_d_;
+  CudaUniquePtr<int32_t[]> out_classes_d_;
+
+  bool needs_output_decode_;
+  size_t out_elem_num_;
+  size_t out_elem_num_per_batch_;
+  CudaUniquePtr<float[]> out_prob_d_;
+
+  StreamUniquePtr stream_{makeCudaStream()};
+
+  int32_t max_detections_;
+  std::vector<float> scales_;
+
+  int num_class_;
+  float score_threshold_;
+  float nms_threshold_;
+
+  CudaUniquePtrHost<float[]> out_prob_h_;
+};
+
+}  // namespace tensorrt_yolox
+
+#endif  // TENSORRT_YOLOX__TENSORRT_YOLOX_HPP_
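A usage sketch for the public API above (illustrative, not part of this diff; file names are hypothetical): `doInference` accepts a batch of `cv::Mat` images and returns one `ObjectArray` per image, regardless of whether the model needs output decoding.

```cpp
// Sketch (not part of this diff): running TrtYoloX on a single image.
#include <tensorrt_yolox/tensorrt_yolox.hpp>

#include <opencv2/opencv.hpp>

#include <iostream>

int main()
{
  tensorrt_yolox::TrtYoloX yolox("yolox-tiny.onnx", "fp16");  // hypothetical model path
  cv::Mat image = cv::imread("test.jpg");                     // hypothetical input image
  tensorrt_yolox::ObjectArrays objects;
  if (!yolox.doInference({image}, objects)) {
    std::cerr << "inference failed" << std::endl;
    return 1;
  }
  for (const auto & object : objects.at(0)) {  // results for the first (only) image
    std::cout << "class " << object.type << " score " << object.score << " at ["
              << object.x_offset << ", " << object.y_offset << ", " << object.width << ", "
              << object.height << "]" << std::endl;
  }
  return 0;
}
```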
diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp
new file mode 100644
index 0000000000000..9a9921b36dcf1
--- /dev/null
+++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp
@@ -0,0 +1,63 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_
+#define TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_
+
+#include <image_transport/image_transport.hpp>
+#include <opencv2/opencv.hpp>
+#include <rclcpp/rclcpp.hpp>
+#include <tensorrt_yolox/tensorrt_yolox.hpp>
+
+#include <autoware_auto_perception_msgs/msg/object_classification.hpp>
+#include <sensor_msgs/msg/image.hpp>
+#include <tier4_perception_msgs/msg/detected_objects_with_feature.hpp>
+
+#include <cv_bridge/cv_bridge.h>
+
+#include <chrono>
+#include <fstream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tensorrt_yolox
+{
+using LabelMap = std::map<int, std::string>;
+
+class TrtYoloXNode : public rclcpp::Node
+{
+public:
+  explicit TrtYoloXNode(const rclcpp::NodeOptions & node_options);
+
+private:
+  void onConnect();
+  void onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg);
+  bool readLabelFile(const std::string & label_path);
+
+  image_transport::Publisher image_pub_;
+  rclcpp::Publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>::SharedPtr
+    objects_pub_;
+
+  image_transport::Subscriber image_sub_;
+
+  rclcpp::TimerBase::SharedPtr timer_;
+
+  LabelMap label_map_;
+  std::unique_ptr<tensorrt_yolox::TrtYoloX> trt_yolox_;
+};
+
+}  // namespace tensorrt_yolox
+
+#endif  // TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_
diff --git a/perception/tensorrt_yolox/package.xml b/perception/tensorrt_yolox/package.xml
new file mode 100644
index 0000000000000..dcd04c264ea95
--- /dev/null
+++ b/perception/tensorrt_yolox/package.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>tensorrt_yolox</name>
+  <version>0.0.1</version>
+  <description>tensorrt library implementation for yolox</description>
+
+  <author>Daisuke Nishimatsu</author>
+  <maintainer>Daisuke Nishimatsu</maintainer>
+  <maintainer>Manato Hirabayashi</maintainer>
+
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+  <buildtool_depend>cudnn_cmake_module</buildtool_depend>
+  <buildtool_depend>tensorrt_cmake_module</buildtool_depend>
+
+  <buildtool_export_depend>cudnn_cmake_module</buildtool_export_depend>
+  <buildtool_export_depend>tensorrt_cmake_module</buildtool_export_depend>
+
+  <build_depend>autoware_cmake</build_depend>
+
+  <depend>autoware_auto_perception_msgs</depend>
+  <depend>cuda_utils</depend>
+  <depend>cv_bridge</depend>
+  <depend>image_transport</depend>
+  <depend>libopencv-dev</depend>
+  <depend>rclcpp</depend>
+  <depend>rclcpp_components</depend>
+  <depend>sensor_msgs</depend>
+  <depend>tensorrt_common</depend>
+  <depend>tier4_perception_msgs</depend>
+
+  <test_depend>ament_lint_auto</test_depend>
+  <test_depend>autoware_lint_common</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp
new file mode 100644
index 0000000000000..191a3832d028d
--- /dev/null
+++ b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp
@@ -0,0 +1,390 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cuda_utils/cuda_check_error.hpp"
+#include "cuda_utils/cuda_unique_ptr.hpp"
+
+#include <tensorrt_yolox/tensorrt_yolox.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace tensorrt_yolox
+{
+TrtYoloX::TrtYoloX(
+  const std::string & model_path, const std::string & precision, const int num_class,
+  const float score_threshold, const float nms_threshold,
+  [[maybe_unused]] const std::string & cache_dir,
+  const tensorrt_common::BatchConfig & batch_config, const size_t max_workspace_size)
+{
+  trt_common_ = std::make_unique<tensorrt_common::TrtCommon>(
+    model_path, precision, nullptr, batch_config, max_workspace_size);
+  trt_common_->setup();
+
+  if (!trt_common_->isInitialized()) {
+    return;
+  }
+
+  // Judge whether decoding output is required
+  // Plain models require decoding, while models with the EfficientNMS_TRT module don't.
+  // If getNbBindings == 5, the model contains EfficientNMS_TRT
+  switch (trt_common_->getNbBindings()) {
+    case 2:
+      // The specified model is a plain one.
+      // Bindings are: [input, output]
+      needs_output_decode_ = true;
+      // The following three values are considered only when the specified model is a plain one
+      num_class_ = num_class;
+      score_threshold_ = score_threshold;
+      nms_threshold_ = nms_threshold;
+      break;
+    case 5:
+      // The specified model contains EfficientNMS_TRT.
+      // Bindings are: [input, detection_classes, detection_scores, detection_boxes,
+      // num_detections]
+      needs_output_decode_ = false;
+      break;
+    default: {
+      std::stringstream s;
+      s << "\"" << model_path << "\" is an unsupported format";
+      throw std::runtime_error{s.str()};
+    }
+  }
+
+  // GPU memory allocation
+  const auto input_dims = trt_common_->getBindingDimensions(0);
+  const auto input_size =
+    std::accumulate(input_dims.d + 1, input_dims.d + input_dims.nbDims, 1, std::multiplies<int>());
+  if (needs_output_decode_) {
+    const auto output_dims = trt_common_->getBindingDimensions(1);
+    input_d_ = cuda_utils::make_unique<float[]>(batch_config[2] * input_size);
+    out_elem_num_ = std::accumulate(
+      output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies<int>());
+    out_elem_num_per_batch_ = static_cast<size_t>(out_elem_num_ / batch_config[2]);
+    out_prob_d_ = cuda_utils::make_unique<float[]>(out_elem_num_);
+    out_prob_h_ = cuda_utils::make_unique_host<float[]>(out_elem_num_, cudaHostAllocPortable);
+  } else {
+    const auto out_scores_dims = trt_common_->getBindingDimensions(3);
+    max_detections_ = out_scores_dims.d[1];
+    input_d_ = cuda_utils::make_unique<float[]>(batch_config[2] * input_size);
+    out_num_detections_d_ = cuda_utils::make_unique<int32_t[]>(batch_config[2]);
+    out_boxes_d_ = cuda_utils::make_unique<float[]>(batch_config[2] * max_detections_ * 4);
+    out_scores_d_ = cuda_utils::make_unique<float[]>(batch_config[2] * max_detections_);
+    out_classes_d_ = cuda_utils::make_unique<int32_t[]>(batch_config[2] * max_detections_);
+  }
+}
+
+void TrtYoloX::preprocess(const std::vector<cv::Mat> & images)
+{
+  const auto batch_size = images.size();
+  auto input_dims = trt_common_->getBindingDimensions(0);
+  input_dims.d[0] = batch_size;
+  trt_common_->setBindingDimensions(0, input_dims);
+  const float input_height = static_cast<float>(input_dims.d[2]);
+  const float input_width = static_cast<float>(input_dims.d[3]);
+  std::vector<cv::Mat> dst_images;
+  scales_.clear();
+  for (const auto & image : images) {
+    cv::Mat dst_image;
+    const float scale = std::min(input_width / image.cols, input_height / image.rows);
+    scales_.emplace_back(scale);
+    const auto scale_size = cv::Size(image.cols * scale, image.rows * scale);
+    cv::resize(image, dst_image, scale_size, 0, 0, cv::INTER_CUBIC);
+    const auto bottom = input_height - dst_image.rows;
+    const auto right = input_width - dst_image.cols;
+    copyMakeBorder(dst_image, dst_image, 0, bottom, 0, right, cv::BORDER_CONSTANT, {114, 114, 114});
+    dst_images.emplace_back(dst_image);
+  }
+  const auto chw_images =
+    cv::dnn::blobFromImages(dst_images, 1.0, cv::Size(), cv::Scalar(), false, false, CV_32F);
+
+  const auto data_length = chw_images.total();
+  input_h_.reserve(data_length);
+  const auto flat = chw_images.reshape(1, data_length);
+  input_h_ = chw_images.isContinuous() ? flat : flat.clone();
+}
+
+bool TrtYoloX::doInference(const std::vector<cv::Mat> & images, ObjectArrays & objects)
+{
+  if (!trt_common_->isInitialized()) {
+    return false;
+  }
+
+  preprocess(images);
+
+  CHECK_CUDA_ERROR(cudaMemcpy(
+    input_d_.get(), input_h_.data(), input_h_.size() * sizeof(float), cudaMemcpyHostToDevice));
+
+  if (needs_output_decode_) {
+    return feedforwardAndDecode(images, objects);
+  } else {
+    return feedforward(images, objects);
+  }
+}
+
+// This method is assumed to be called when the specified YOLOX model contains
+// an EfficientNMS_TRT module.
+bool TrtYoloX::feedforward(const std::vector<cv::Mat> & images, ObjectArrays & objects)
+{
+  std::vector<void *> buffers = {
+    input_d_.get(), out_num_detections_d_.get(), out_boxes_d_.get(), out_scores_d_.get(),
+    out_classes_d_.get()};
+
+  trt_common_->enqueueV2(buffers.data(), *stream_, nullptr);
+
+  const auto batch_size = images.size();
+  auto out_num_detections = std::make_unique<int32_t[]>(batch_size);
+  auto out_boxes = std::make_unique<float[]>(4 * batch_size * max_detections_);
+  auto out_scores = std::make_unique<float[]>(batch_size * max_detections_);
+  auto out_classes = std::make_unique<int32_t[]>(batch_size * max_detections_);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_num_detections.get(), out_num_detections_d_.get(), sizeof(int32_t) * batch_size,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_boxes.get(), out_boxes_d_.get(), sizeof(float) * 4 * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_scores.get(), out_scores_d_.get(), sizeof(float) * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_classes.get(), out_classes_d_.get(), sizeof(int32_t) * batch_size * max_detections_,
+    cudaMemcpyDeviceToHost, *stream_));
+  cudaStreamSynchronize(*stream_);
+  objects.clear();
+  for (size_t i = 0; i < batch_size; ++i) {
+    const size_t num_detection = static_cast<size_t>(out_num_detections[i]);
+    ObjectArray object_array;
+    object_array.reserve(num_detection);
+    for (size_t j = 0; j < num_detection; ++j) {
+      Object object{};
+      const auto x1 = out_boxes[i * max_detections_ * 4 + j * 4] / scales_[i];
+      const auto y1 = out_boxes[i * max_detections_ * 4 + j * 4 + 1] / scales_[i];
+      const auto x2 = out_boxes[i * max_detections_ * 4 + j * 4 + 2] / scales_[i];
+      const auto y2 = out_boxes[i * max_detections_ * 4 + j * 4 + 3] / scales_[i];
+      object.x_offset = std::clamp(static_cast<int32_t>(x1), 0, images[i].cols);
+      object.y_offset = std::clamp(static_cast<int32_t>(y1), 0, images[i].rows);
+      object.width = static_cast<int32_t>(std::max(0.0F, x2 - x1));
+      object.height = static_cast<int32_t>(std::max(0.0F, y2 - y1));
+      object.score = out_scores[i * max_detections_ + j];
+      object.type = out_classes[i * max_detections_ + j];
+      object_array.emplace_back(object);
+    }
+    objects.emplace_back(object_array);
+  }
+  return true;
+}
+
+bool TrtYoloX::feedforwardAndDecode(const std::vector<cv::Mat> & images, ObjectArrays & objects)
+{
+  std::vector<void *> buffers = {input_d_.get(), out_prob_d_.get()};
+
+  trt_common_->enqueueV2(buffers.data(), *stream_, nullptr);
+
+  const auto batch_size = images.size();
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost,
+    *stream_));
+  cudaStreamSynchronize(*stream_);
+  objects.clear();
+
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto image_size = images[i].size();
+    float * batch_prob = out_prob_h_.get() + (i * out_elem_num_per_batch_);
+    ObjectArray object_array;
+    decodeOutputs(batch_prob, object_array, scales_[i], image_size);
+    objects.emplace_back(object_array);
+  }
+  return true;
+}
+
+void TrtYoloX::decodeOutputs(
+  float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const
+{
+  ObjectArray proposals;
+  auto input_dims = trt_common_->getBindingDimensions(0);
+  const float input_height = static_cast<float>(input_dims.d[2]);
+  const float input_width = static_cast<float>(input_dims.d[3]);
+  std::vector<int> strides = {8, 16, 32};
+  std::vector<GridAndStride> grid_strides;
+  generateGridsAndStride(input_width, input_height, strides, grid_strides);
+  generateYoloxProposals(grid_strides, prob, score_threshold_, proposals);
+
+  qsortDescentInplace(proposals);
+
+  std::vector<int> picked;
+  nmsSortedBboxes(proposals, picked, nms_threshold_);
+
+  int count = static_cast<int>(picked.size());
+  objects.resize(count);
+  for (int i = 0; i < count; i++) {
+    objects[i] = proposals[picked[i]];
+
+    // adjust offset to original unpadded
+    float x0 = (objects[i].x_offset) / scale;
+    float y0 = (objects[i].y_offset) / scale;
+    float x1 = (objects[i].x_offset + objects[i].width) / scale;
+    float y1 = (objects[i].y_offset + objects[i].height) / scale;
+
+    // clip
+    x0 = std::clamp(x0, 0.f, static_cast<float>(img_size.width - 1));
+    y0 = std::clamp(y0, 0.f, static_cast<float>(img_size.height - 1));
+    x1 = std::clamp(x1, 0.f, static_cast<float>(img_size.width - 1));
+    y1 = std::clamp(y1, 0.f, static_cast<float>(img_size.height - 1));
+
+    objects[i].x_offset = x0;
+    objects[i].y_offset = y0;
+    objects[i].width = x1 - x0;
+    objects[i].height = y1 - y0;
+  }
+}
+
+void TrtYoloX::generateGridsAndStride(
+  const int target_w, const int target_h, std::vector<int> & strides,
+  std::vector<GridAndStride> & grid_strides) const
+{
+  for (auto stride : strides) {
+    int num_grid_w = target_w / stride;
+    int num_grid_h = target_h / stride;
+    for (int g1 = 0; g1 < num_grid_h; g1++) {
+      for (int g0 = 0; g0 < num_grid_w; g0++) {
+        grid_strides.push_back(GridAndStride{g0, g1, stride});
+      }
+    }
+  }
+}
+
+void TrtYoloX::generateYoloxProposals(
+  std::vector<GridAndStride> grid_strides, float * feat_blob, float prob_threshold,
+  ObjectArray & objects) const
+{
+  const int num_anchors = grid_strides.size();
+
+  for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) {
+    const int grid0 = grid_strides[anchor_idx].grid0;
+    const int grid1 = grid_strides[anchor_idx].grid1;
+    const int stride = grid_strides[anchor_idx].stride;
+
+    const int basic_pos = anchor_idx * (num_class_ + 5);
+
+    // yolox/models/yolo_head.py decode logic
+    // To apply this logic, the YOLOX head must output raw values
+    // (i.e., `decode_in_inference` should be False)
+    float x_center = (feat_blob[basic_pos + 0] + grid0) * stride;
+    float y_center = (feat_blob[basic_pos + 1] + grid1) * stride;
+    float w = std::exp(feat_blob[basic_pos + 2]) * stride;
+    float h = std::exp(feat_blob[basic_pos + 3]) * stride;
+    float x0 = x_center - w * 0.5f;
+    float y0 = y_center - h * 0.5f;
+
+    float box_objectness = feat_blob[basic_pos + 4];
+    for (int class_idx = 0; class_idx < num_class_; class_idx++) {
+      float box_cls_score = feat_blob[basic_pos + 5 + class_idx];
+      float box_prob = box_objectness * box_cls_score;
+      if (box_prob > prob_threshold) {
+        Object obj;
+        obj.x_offset = x0;
+        obj.y_offset = y0;
+        obj.height = h;
+        obj.width = w;
+        obj.type = class_idx;
+        obj.score = box_prob;
+
+        objects.push_back(obj);
+      }
+    }  // class loop
+  }    // point anchor loop
+}
+
+void TrtYoloX::qsortDescentInplace(ObjectArray & faceobjects, int left, int right) const
+{
+  int i = left;
+  int j = right;
+  float p = faceobjects[(left + right) / 2].score;
+
+  while (i <= j) {
+    while (faceobjects[i].score > p) {
+      i++;
+    }
+
+    while (faceobjects[j].score < p) {
+      j--;
+    }
+
+    if (i <= j) {
+      // swap
+      std::swap(faceobjects[i], faceobjects[j]);
+
+      i++;
+      j--;
+    }
+  }
+
+#pragma omp parallel sections
+  {
+#pragma omp section
+    {
+      if (left < j) {
+        qsortDescentInplace(faceobjects, left, j);
+      }
+    }
+#pragma omp section
+    {
+      if (i < right) {
+        qsortDescentInplace(faceobjects, i, right);
+      }
+    }
+  }
+}
+
+void TrtYoloX::nmsSortedBboxes(
+  const ObjectArray & faceobjects, std::vector<int> & picked, float nms_threshold) const
+{
+  picked.clear();
+  const int n = faceobjects.size();
+
+  std::vector<float> areas(n);
+  for (int i = 0; i < n; i++) {
+    cv::Rect rect(
+      faceobjects[i].x_offset, faceobjects[i].y_offset, faceobjects[i].width,
+      faceobjects[i].height);
+    areas[i] = rect.area();
+  }
+
+  for (int i = 0; i < n; i++) {
+    const Object & a = faceobjects[i];
+
+    int keep = 1;
+    for (int j = 0; j < static_cast<int>(picked.size()); j++) {
+      const Object & b = faceobjects[picked[j]];
+
+      // intersection over union
+      float inter_area = intersectionArea(a, b);
+      float union_area = areas[i] + areas[picked[j]] - inter_area;
+      // float IoU = inter_area / union_area
+      if (inter_area / union_area > nms_threshold) {
+        keep = 0;
+      }
+    }
+
+    if (keep) {
+      picked.push_back(i);
+    }
+  }
+}
+
+}  // namespace tensorrt_yolox
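For reference, the rejection test in `nmsSortedBboxes` is plain intersection-over-union: a candidate is suppressed when `inter_area / union_area` exceeds `nms_threshold`. A standalone sanity check of that arithmetic (a sketch, not part of this diff):

```cpp
// Sketch (not part of this diff): the IoU computation used by nmsSortedBboxes,
// checked on two hand-picked boxes.
#include <opencv2/core.hpp>

#include <iostream>

int main()
{
  const cv::Rect a(0, 0, 100, 100);
  const cv::Rect b(50, 0, 100, 100);
  const float inter = (a & b).area();             // 50 * 100 = 5000
  const float uni = a.area() + b.area() - inter;  // 10000 + 10000 - 5000 = 15000
  // 0.333...; with the default nms_threshold of 0.7, both boxes would be kept
  std::cout << "IoU = " << inter / uni << std::endl;
  return 0;
}
```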
diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
new file mode 100644
index 0000000000000..1cc3f6d5ec2cc
--- /dev/null
+++ b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
@@ -0,0 +1,155 @@
+// Copyright 2022 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <tensorrt_yolox/tensorrt_yolox_node.hpp>
+
+#include <rclcpp_components/register_node_macro.hpp>
+
+#include <fstream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tensorrt_yolox
+{
+TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
+: Node("tensorrt_yolox", node_options)
+{
+  using std::placeholders::_1;
+  using std::chrono_literals::operator""ms;
+
+  std::string model_path = declare_parameter("model_path", "");
+  std::string label_path = declare_parameter("label_path", "");
+  std::string precision = declare_parameter("precision", "fp32");
+  // Objects with a score lower than this value will be ignored.
+  // This threshold will be ignored if the specified model contains an EfficientNMS_TRT module
+  float score_threshold = declare_parameter("score_threshold", 0.3);
+  // Detection results will be ignored if their IoU is over this value.
+  // This threshold will be ignored if the specified model contains an EfficientNMS_TRT module
+  float nms_threshold = declare_parameter("nms_threshold", 0.7);
+
+  if (!readLabelFile(label_path)) {
+    RCLCPP_ERROR(this->get_logger(), "Could not find label file");
+    rclcpp::shutdown();
+  }
+  trt_yolox_ = std::make_unique<tensorrt_yolox::TrtYoloX>(
+    model_path, precision, label_map_.size(), score_threshold, nms_threshold);
+
+  timer_ =
+    rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtYoloXNode::onConnect, this));
+
+  objects_pub_ = this->create_publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>(
+    "~/out/objects", 1);
+  image_pub_ = image_transport::create_publisher(this, "~/out/image");
+}
+
+void TrtYoloXNode::onConnect()
+{
+  using std::placeholders::_1;
+  if (
+    objects_pub_->get_subscription_count() == 0 &&
+    objects_pub_->get_intra_process_subscription_count() == 0 &&
+    image_pub_.getNumSubscribers() == 0) {
+    image_sub_.shutdown();
+  } else if (!image_sub_) {
+    image_sub_ = image_transport::create_subscription(
+      this, "~/in/image", std::bind(&TrtYoloXNode::onImage, this, _1), "raw",
+      rmw_qos_profile_sensor_data);
+  }
+}
+
+void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
+{
+  using Label = autoware_auto_perception_msgs::msg::ObjectClassification;
+
+  tier4_perception_msgs::msg::DetectedObjectsWithFeature out_objects;
+
+  cv_bridge::CvImagePtr in_image_ptr;
+  try {
+    in_image_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::BGR8);
+  } catch (cv_bridge::Exception & e) {
+    RCLCPP_ERROR(this->get_logger(), "cv_bridge exception: %s", e.what());
+    return;
+  }
+  const auto width = in_image_ptr->image.cols;
+  const auto height = in_image_ptr->image.rows;
+
+  tensorrt_yolox::ObjectArrays objects;
+  if (!trt_yolox_->doInference({in_image_ptr->image}, objects)) {
+    RCLCPP_WARN(this->get_logger(), "Failed to run inference");
+    return;
+  }
+  for (const auto & yolox_object : objects.at(0)) {
+    tier4_perception_msgs::msg::DetectedObjectWithFeature object;
+    object.feature.roi.x_offset = yolox_object.x_offset;
+    object.feature.roi.y_offset = yolox_object.y_offset;
+    object.feature.roi.width = yolox_object.width;
+    object.feature.roi.height = yolox_object.height;
+    object.object.classification.emplace_back(autoware_auto_perception_msgs::build