Skip to content

Commit

Permalink
fix(system_monitor): move top command execution to a timer (autowaref…
Browse files Browse the repository at this point in the history
…oundation#948)

* fix(system_monitor): move top command execution to  a timer

Signed-off-by: ito-san <fumihito.ito@tier4.jp>

* removed unnecessary update method

Signed-off-by: ito-san <fumihito.ito@tier4.jp>

* use tier4_autoware_utils::StopWatch

Signed-off-by: ito-san <fumihito.ito@tier4.jp>

* Ensure thread-safe

Signed-off-by: ito-san <fumihito.ito@tier4.jp>
Signed-off-by: ktro2828 <kotaro.uetake@tier4.jp>
  • Loading branch information
ito-san authored and ktro2828 committed Jun 7, 2022
1 parent 73afd02 commit 0a274b6
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <boost/process.hpp>

#include <memory>
#include <mutex>
#include <string>
#include <vector>

Expand All @@ -41,11 +42,6 @@ class ProcessMonitor : public rclcpp::Node
*/
explicit ProcessMonitor(const rclcpp::NodeOptions & options);

/**
* @brief Update the diagnostic state
*/
void update();

protected:
using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus;

Expand Down Expand Up @@ -110,6 +106,11 @@ class ProcessMonitor : public rclcpp::Node
std::vector<std::shared_ptr<DiagTask>> * tasks, const std::string & message,
const std::string & error_command, const std::string & content);

/**
* @brief timer callback to execute top command
*/
void onTimer();

diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics

char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name
Expand All @@ -118,7 +119,13 @@ class ProcessMonitor : public rclcpp::Node
std::vector<std::shared_ptr<DiagTask>>
load_tasks_; //!< @brief list of diagnostics tasks for high load procs
std::vector<std::shared_ptr<DiagTask>>
memory_tasks_; //!< @brief list of diagnostics tasks for high memory procs
memory_tasks_; //!< @brief list of diagnostics tasks for high memory procs
rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to execute top command
std::string top_output_; //!< @brief output from top command
bool is_top_error_; //!< @brief flag if an error occurs
double elapsed_ms_; //!< @brief Execution time of top command
std::mutex mutex_; //!< @brief mutex for output from top command
rclcpp::CallbackGroup::SharedPtr timer_callback_group_; //!< @brief Callback Group
};

#endif // SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_
1 change: 1 addition & 0 deletions system/system_monitor/package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<depend>rclcpp</depend>
<depend>rclcpp_components</depend>
<depend>std_msgs</depend>
<depend>tier4_autoware_utils</depend>
<depend>tier4_external_api_msgs</depend>

<exec_depend>chrony</exec_depend>
Expand Down
87 changes: 66 additions & 21 deletions system/system_monitor/src/process_monitor/process_monitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

#include "system_monitor/system_monitor_utility.hpp"

#include <tier4_autoware_utils/system/stop_watch.hpp>

#include <fmt/format.h>

#include <memory>
Expand All @@ -31,8 +33,11 @@
ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options)
: Node("process_monitor", options),
updater_(this),
num_of_procs_(declare_parameter<int>("num_of_procs", 5))
num_of_procs_(declare_parameter<int>("num_of_procs", 5)),
is_top_error_(false)
{
using namespace std::literals::chrono_literals;

int index;

gethostname(hostname_, sizeof(hostname_));
Expand All @@ -50,33 +55,40 @@ ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options)
memory_tasks_.push_back(task);
updater_.add(*task);
}
}

void ProcessMonitor::update() { updater_.force_update(); }
// Start timer to execute top command
timer_callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive);
timer_ = rclcpp::create_timer(
this, get_clock(), 1s, std::bind(&ProcessMonitor::onTimer, this), timer_callback_group_);
}

void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrapper & stat)
{
// Remember start time to measure elapsed time
const auto t_start = SystemMonitorUtility::startMeasurement();

bp::ipstream is_err;
bp::ipstream is_out;
std::ostringstream os;
// thread-safe read
std::string str;
bool is_top_error;
double elapsed_ms;
{
std::lock_guard<std::mutex> lock(mutex_);
str = top_output_;
is_top_error = is_top_error_;
elapsed_ms = elapsed_ms_;
}

// Get processes
bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err);
c.wait();
if (c.exit_code() != 0) {
is_err >> os.rdbuf();
if (is_top_error) {
stat.summary(DiagStatus::ERROR, "top error");
stat.add("top", os.str().c_str());
setErrorContent(&load_tasks_, "top error", "top", os.str().c_str());
setErrorContent(&memory_tasks_, "top error", "top", os.str().c_str());
stat.add("top", str);
setErrorContent(&load_tasks_, "top error", "top", str);
setErrorContent(&memory_tasks_, "top error", "top", str);
return;
}

is_out >> os.rdbuf();
std::string str = os.str();
// If top still not running
if (str.empty()) {
// Send OK tentatively
stat.summary(DiagStatus::OK, "starting up");
return;
}

// Get task summary
getTasksSummary(stat, str);
Expand All @@ -89,8 +101,7 @@ void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrappe
// Get high memory processes
getHighMemoryProcesses(str);

// Measure elapsed time since start time and report
SystemMonitorUtility::stopMeasurement(t_start, stat);
stat.addf("execution time", "%f ms", elapsed_ms);
}

void ProcessMonitor::getTasksSummary(
Expand Down Expand Up @@ -310,5 +321,39 @@ void ProcessMonitor::setErrorContent(
}
}

void ProcessMonitor::onTimer()
{
bool is_top_error = false;

// Start to measure elapsed time
tier4_autoware_utils::StopWatch<std::chrono::milliseconds> stop_watch;
stop_watch.tic("execution_time");

bp::ipstream is_err;
bp::ipstream is_out;
std::ostringstream os;

// Get processes
bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err);
c.wait();

if (c.exit_code() != 0) {
is_top_error = true;
is_err >> os.rdbuf();
} else {
is_out >> os.rdbuf();
}

const double elapsed_ms = stop_watch.toc("execution_time");

// thread-safe copy
{
std::lock_guard<std::mutex> lock(mutex_);
top_output_ = os.str();
is_top_error_ = is_top_error;
elapsed_ms_ = elapsed_ms;
}
}

#include <rclcpp_components/register_node_macro.hpp>
RCLCPP_COMPONENTS_REGISTER_NODE(ProcessMonitor)

0 comments on commit 0a274b6

Please sign in to comment.