From b9f118de9f3594adbd73c693f6b61436cb003489 Mon Sep 17 00:00:00 2001 From: ito-san Date: Mon, 23 May 2022 12:13:23 +0900 Subject: [PATCH 1/4] fix(system_monitor): move top command execution to a timer Signed-off-by: ito-san --- .../process_monitor/process_monitor.hpp | 11 +++- .../src/process_monitor/process_monitor.cpp | 58 +++++++++++++------ 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp index 39bf897116b0a..9212156b61d87 100644 --- a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp +++ b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp @@ -110,6 +110,11 @@ class ProcessMonitor : public rclcpp::Node std::vector> * tasks, const std::string & message, const std::string & error_command, const std::string & content); + /** + * @brief timer callback to execute top command + */ + void onTimer(); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name @@ -118,7 +123,11 @@ class ProcessMonitor : public rclcpp::Node std::vector> load_tasks_; //!< @brief list of diagnostics tasks for high load procs std::vector> - memory_tasks_; //!< @brief list of diagnostics tasks for high memory procs + memory_tasks_; //!< @brief list of diagnostics tasks for high memory procs + rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to execute top command + std::string top_output_; //!< @brief output from top command + bool is_top_error_; //!< @brief flag if an error occurs + float elapsed_ms_; //!< @brief Execution time of top command }; #endif // SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_ diff --git a/system/system_monitor/src/process_monitor/process_monitor.cpp b/system/system_monitor/src/process_monitor/process_monitor.cpp index 661753196128e..39edb3bcc256a 100644 --- a/system/system_monitor/src/process_monitor/process_monitor.cpp +++ b/system/system_monitor/src/process_monitor/process_monitor.cpp @@ -33,6 +33,8 @@ ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options) updater_(this), num_of_procs_(declare_parameter("num_of_procs", 5)) { + using namespace std::literals::chrono_literals; + int index; gethostname(hostname_, sizeof(hostname_)); @@ -50,34 +52,25 @@ ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options) memory_tasks_.push_back(task); updater_.add(*task); } + + // Start timer to execute top command + timer_ = rclcpp::create_timer(this, get_clock(), 1s, std::bind(&ProcessMonitor::onTimer, this)); } void ProcessMonitor::update() { updater_.force_update(); } void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrapper & stat) { - // Remember start time to measure elapsed time - const auto t_start = SystemMonitorUtility::startMeasurement(); + std::string str = top_output_; - bp::ipstream is_err; - bp::ipstream is_out; - std::ostringstream os; - - // Get processes - bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err); - c.wait(); - if (c.exit_code() != 0) { - is_err >> os.rdbuf(); + if (is_top_error_) { stat.summary(DiagStatus::ERROR, "top error"); - stat.add("top", os.str().c_str()); - setErrorContent(&load_tasks_, "top error", "top", os.str().c_str()); - setErrorContent(&memory_tasks_, "top error", "top", os.str().c_str()); + stat.add("top", str); + setErrorContent(&load_tasks_, "top error", "top", str); + setErrorContent(&memory_tasks_, "top error", "top", str); return; } - is_out >> os.rdbuf(); - std::string str = os.str(); - // Get task summary getTasksSummary(stat, str); // Remove header @@ -89,8 +82,7 @@ void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrappe // Get high memory processes getHighMemoryProcesses(str); - // Measure elapsed time since start time and report - SystemMonitorUtility::stopMeasurement(t_start, stat); + stat.addf("execution time", "%f ms", elapsed_ms_); } void ProcessMonitor::getTasksSummary( @@ -310,5 +302,33 @@ void ProcessMonitor::setErrorContent( } } +void ProcessMonitor::onTimer() +{ + is_top_error_ = false; + + // Remember start time to measure elapsed time + const auto t_start = std::chrono::high_resolution_clock::now(); + + bp::ipstream is_err; + bp::ipstream is_out; + std::ostringstream os; + + // Get processes + bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err); + c.wait(); + if (c.exit_code() != 0) { + is_top_error_ = true; + is_err >> os.rdbuf(); + top_output_ = os.str(); + return; + } + + is_out >> os.rdbuf(); + top_output_ = os.str(); + + const auto t_end = std::chrono::high_resolution_clock::now(); + elapsed_ms_ = std::chrono::duration(t_end - t_start).count(); +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(ProcessMonitor) From fa9328ac2f8e9638a314642cfc6edfc80da7d3ec Mon Sep 17 00:00:00 2001 From: ito-san Date: Mon, 23 May 2022 12:17:31 +0900 Subject: [PATCH 2/4] removed unnecessary update method Signed-off-by: ito-san --- .../system_monitor/process_monitor/process_monitor.hpp | 5 ----- .../system_monitor/src/process_monitor/process_monitor.cpp | 2 -- 2 files changed, 7 deletions(-) diff --git a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp index 9212156b61d87..919f7795034cd 100644 --- a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp +++ b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp @@ -41,11 +41,6 @@ class ProcessMonitor : public rclcpp::Node */ explicit ProcessMonitor(const rclcpp::NodeOptions & options); - /** - * @brief Update the diagnostic state - */ - void update(); - protected: using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus; diff --git a/system/system_monitor/src/process_monitor/process_monitor.cpp b/system/system_monitor/src/process_monitor/process_monitor.cpp index 39edb3bcc256a..5d69bc731bc9d 100644 --- a/system/system_monitor/src/process_monitor/process_monitor.cpp +++ b/system/system_monitor/src/process_monitor/process_monitor.cpp @@ -57,8 +57,6 @@ ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options) timer_ = rclcpp::create_timer(this, get_clock(), 1s, std::bind(&ProcessMonitor::onTimer, this)); } -void ProcessMonitor::update() { updater_.force_update(); } - void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrapper & stat) { std::string str = top_output_; From 8f02967aa28a9694b398c56013636a91e10b5bcc Mon Sep 17 00:00:00 2001 From: ito-san Date: Mon, 23 May 2022 14:02:11 +0900 Subject: [PATCH 3/4] use tier4_autoware_utils::StopWatch Signed-off-by: ito-san --- .../system_monitor/process_monitor/process_monitor.hpp | 2 +- system/system_monitor/package.xml | 1 + .../src/process_monitor/process_monitor.cpp | 10 ++++++---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp index 919f7795034cd..48529b2d7d88f 100644 --- a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp +++ b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp @@ -122,7 +122,7 @@ class ProcessMonitor : public rclcpp::Node rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to execute top command std::string top_output_; //!< @brief output from top command bool is_top_error_; //!< @brief flag if an error occurs - float elapsed_ms_; //!< @brief Execution time of top command + double elapsed_ms_; //!< @brief Execution time of top command }; #endif // SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_ diff --git a/system/system_monitor/package.xml b/system/system_monitor/package.xml index 559dc9c072592..01ef6d6f9a3a8 100644 --- a/system/system_monitor/package.xml +++ b/system/system_monitor/package.xml @@ -18,6 +18,7 @@ rclcpp rclcpp_components std_msgs + tier4_autoware_utils tier4_external_api_msgs chrony diff --git a/system/system_monitor/src/process_monitor/process_monitor.cpp b/system/system_monitor/src/process_monitor/process_monitor.cpp index 5d69bc731bc9d..7213a621bd25e 100644 --- a/system/system_monitor/src/process_monitor/process_monitor.cpp +++ b/system/system_monitor/src/process_monitor/process_monitor.cpp @@ -21,6 +21,8 @@ #include "system_monitor/system_monitor_utility.hpp" +#include + #include #include @@ -304,8 +306,9 @@ void ProcessMonitor::onTimer() { is_top_error_ = false; - // Remember start time to measure elapsed time - const auto t_start = std::chrono::high_resolution_clock::now(); + // Start to measure elapsed time + tier4_autoware_utils::StopWatch stop_watch; + stop_watch.tic("execution_time"); bp::ipstream is_err; bp::ipstream is_out; @@ -324,8 +327,7 @@ void ProcessMonitor::onTimer() is_out >> os.rdbuf(); top_output_ = os.str(); - const auto t_end = std::chrono::high_resolution_clock::now(); - elapsed_ms_ = std::chrono::duration(t_end - t_start).count(); + elapsed_ms_ = stop_watch.toc("execution_time"); } #include From d2151975376fcaadd100b590db9ba5bf4ae0ef98 Mon Sep 17 00:00:00 2001 From: ito-san Date: Fri, 27 May 2022 10:10:54 +0900 Subject: [PATCH 4/4] Ensure thread-safe Signed-off-by: ito-san --- .../process_monitor/process_monitor.hpp | 3 ++ .../src/process_monitor/process_monitor.cpp | 49 ++++++++++++++----- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp index 48529b2d7d88f..802d905d6bfd5 100644 --- a/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp +++ b/system/system_monitor/include/system_monitor/process_monitor/process_monitor.hpp @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -123,6 +124,8 @@ class ProcessMonitor : public rclcpp::Node std::string top_output_; //!< @brief output from top command bool is_top_error_; //!< @brief flag if an error occurs double elapsed_ms_; //!< @brief Execution time of top command + std::mutex mutex_; //!< @brief mutex for output from top command + rclcpp::CallbackGroup::SharedPtr timer_callback_group_; //!< @brief Callback Group }; #endif // SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_ diff --git a/system/system_monitor/src/process_monitor/process_monitor.cpp b/system/system_monitor/src/process_monitor/process_monitor.cpp index 7213a621bd25e..57e1aaf20f571 100644 --- a/system/system_monitor/src/process_monitor/process_monitor.cpp +++ b/system/system_monitor/src/process_monitor/process_monitor.cpp @@ -33,7 +33,8 @@ ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options) : Node("process_monitor", options), updater_(this), - num_of_procs_(declare_parameter("num_of_procs", 5)) + num_of_procs_(declare_parameter("num_of_procs", 5)), + is_top_error_(false) { using namespace std::literals::chrono_literals; @@ -56,14 +57,25 @@ ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options) } // Start timer to execute top command - timer_ = rclcpp::create_timer(this, get_clock(), 1s, std::bind(&ProcessMonitor::onTimer, this)); + timer_callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive); + timer_ = rclcpp::create_timer( + this, get_clock(), 1s, std::bind(&ProcessMonitor::onTimer, this), timer_callback_group_); } void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrapper & stat) { - std::string str = top_output_; + // thread-safe read + std::string str; + bool is_top_error; + double elapsed_ms; + { + std::lock_guard lock(mutex_); + str = top_output_; + is_top_error = is_top_error_; + elapsed_ms = elapsed_ms_; + } - if (is_top_error_) { + if (is_top_error) { stat.summary(DiagStatus::ERROR, "top error"); stat.add("top", str); setErrorContent(&load_tasks_, "top error", "top", str); @@ -71,6 +83,13 @@ void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrappe return; } + // If top still not running + if (str.empty()) { + // Send OK tentatively + stat.summary(DiagStatus::OK, "starting up"); + return; + } + // Get task summary getTasksSummary(stat, str); // Remove header @@ -82,7 +101,7 @@ void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrappe // Get high memory processes getHighMemoryProcesses(str); - stat.addf("execution time", "%f ms", elapsed_ms_); + stat.addf("execution time", "%f ms", elapsed_ms); } void ProcessMonitor::getTasksSummary( @@ -304,7 +323,7 @@ void ProcessMonitor::setErrorContent( void ProcessMonitor::onTimer() { - is_top_error_ = false; + bool is_top_error = false; // Start to measure elapsed time tier4_autoware_utils::StopWatch stop_watch; @@ -317,17 +336,23 @@ void ProcessMonitor::onTimer() // Get processes bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err); c.wait(); + if (c.exit_code() != 0) { - is_top_error_ = true; + is_top_error = true; is_err >> os.rdbuf(); - top_output_ = os.str(); - return; + } else { + is_out >> os.rdbuf(); } - is_out >> os.rdbuf(); - top_output_ = os.str(); + const double elapsed_ms = stop_watch.toc("execution_time"); - elapsed_ms_ = stop_watch.toc("execution_time"); + // thread-safe copy + { + std::lock_guard lock(mutex_); + top_output_ = os.str(); + is_top_error_ = is_top_error; + elapsed_ms_ = elapsed_ms; + } } #include