Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(system_monitor): move top command execution to a timer #948

Merged
merged 4 commits into from
May 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <boost/process.hpp>

#include <memory>
#include <mutex>
#include <string>
#include <vector>

Expand All @@ -41,11 +42,6 @@ class ProcessMonitor : public rclcpp::Node
*/
explicit ProcessMonitor(const rclcpp::NodeOptions & options);

/**
* @brief Update the diagnostic state
*/
void update();

protected:
using DiagStatus = diagnostic_msgs::msg::DiagnosticStatus;

Expand Down Expand Up @@ -110,6 +106,11 @@ class ProcessMonitor : public rclcpp::Node
std::vector<std::shared_ptr<DiagTask>> * tasks, const std::string & message,
const std::string & error_command, const std::string & content);

/**
* @brief timer callback to execute top command
*/
void onTimer();

diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics

char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name
Expand All @@ -118,7 +119,13 @@ class ProcessMonitor : public rclcpp::Node
std::vector<std::shared_ptr<DiagTask>>
load_tasks_; //!< @brief list of diagnostics tasks for high load procs
std::vector<std::shared_ptr<DiagTask>>
memory_tasks_; //!< @brief list of diagnostics tasks for high memory procs
memory_tasks_; //!< @brief list of diagnostics tasks for high memory procs
rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to execute top command
std::string top_output_; //!< @brief output from top command
bool is_top_error_; //!< @brief flag if an error occurs
double elapsed_ms_; //!< @brief Execution time of top command
std::mutex mutex_; //!< @brief mutex for output from top command
rclcpp::CallbackGroup::SharedPtr timer_callback_group_; //!< @brief Callback Group
};

#endif // SYSTEM_MONITOR__PROCESS_MONITOR__PROCESS_MONITOR_HPP_
1 change: 1 addition & 0 deletions system/system_monitor/package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<depend>rclcpp</depend>
<depend>rclcpp_components</depend>
<depend>std_msgs</depend>
<depend>tier4_autoware_utils</depend>
<depend>tier4_external_api_msgs</depend>

<exec_depend>chrony</exec_depend>
Expand Down
87 changes: 66 additions & 21 deletions system/system_monitor/src/process_monitor/process_monitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

#include "system_monitor/system_monitor_utility.hpp"

#include <tier4_autoware_utils/system/stop_watch.hpp>

#include <fmt/format.h>

#include <memory>
Expand All @@ -31,8 +33,11 @@
ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options)
: Node("process_monitor", options),
updater_(this),
num_of_procs_(declare_parameter<int>("num_of_procs", 5))
num_of_procs_(declare_parameter<int>("num_of_procs", 5)),
is_top_error_(false)
{
using namespace std::literals::chrono_literals;

int index;

gethostname(hostname_, sizeof(hostname_));
Expand All @@ -50,33 +55,40 @@ ProcessMonitor::ProcessMonitor(const rclcpp::NodeOptions & options)
memory_tasks_.push_back(task);
updater_.add(*task);
}
}

void ProcessMonitor::update() { updater_.force_update(); }
// Start timer to execute top command
timer_callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive);
timer_ = rclcpp::create_timer(
this, get_clock(), 1s, std::bind(&ProcessMonitor::onTimer, this), timer_callback_group_);
}

void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrapper & stat)
{
// Remember start time to measure elapsed time
const auto t_start = SystemMonitorUtility::startMeasurement();

bp::ipstream is_err;
bp::ipstream is_out;
std::ostringstream os;
// thread-safe read
std::string str;
bool is_top_error;
double elapsed_ms;
{
std::lock_guard<std::mutex> lock(mutex_);
str = top_output_;
is_top_error = is_top_error_;
elapsed_ms = elapsed_ms_;
}

// Get processes
bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err);
c.wait();
if (c.exit_code() != 0) {
is_err >> os.rdbuf();
if (is_top_error) {
stat.summary(DiagStatus::ERROR, "top error");
stat.add("top", os.str().c_str());
setErrorContent(&load_tasks_, "top error", "top", os.str().c_str());
setErrorContent(&memory_tasks_, "top error", "top", os.str().c_str());
stat.add("top", str);
setErrorContent(&load_tasks_, "top error", "top", str);
setErrorContent(&memory_tasks_, "top error", "top", str);
return;
}

is_out >> os.rdbuf();
std::string str = os.str();
// If top still not running
if (str.empty()) {
// Send OK tentatively
stat.summary(DiagStatus::OK, "starting up");
return;
}

// Get task summary
getTasksSummary(stat, str);
Expand All @@ -89,8 +101,7 @@ void ProcessMonitor::monitorProcesses(diagnostic_updater::DiagnosticStatusWrappe
// Get high memory processes
getHighMemoryProcesses(str);

// Measure elapsed time since start time and report
SystemMonitorUtility::stopMeasurement(t_start, stat);
stat.addf("execution time", "%f ms", elapsed_ms);
}

void ProcessMonitor::getTasksSummary(
Expand Down Expand Up @@ -310,5 +321,39 @@ void ProcessMonitor::setErrorContent(
}
}

void ProcessMonitor::onTimer()
{
bool is_top_error = false;

// Start to measure elapsed time
tier4_autoware_utils::StopWatch<std::chrono::milliseconds> stop_watch;
stop_watch.tic("execution_time");

bp::ipstream is_err;
bp::ipstream is_out;
std::ostringstream os;

// Get processes
bp::child c("top -bn1 -o %CPU -w 128", bp::std_out > is_out, bp::std_err > is_err);
c.wait();

if (c.exit_code() != 0) {
is_top_error = true;
is_err >> os.rdbuf();
} else {
is_out >> os.rdbuf();
}

const double elapsed_ms = stop_watch.toc("execution_time");

// thread-safe copy
{
std::lock_guard<std::mutex> lock(mutex_);
top_output_ = os.str();
is_top_error_ = is_top_error;
elapsed_ms_ = elapsed_ms;
}
}

#include <rclcpp_components/register_node_macro.hpp>
RCLCPP_COMPONENTS_REGISTER_NODE(ProcessMonitor)