From a1e6dbb99e60c8397d5ff9b7b8b1b839a5d0c0e2 Mon Sep 17 00:00:00 2001 From: Andrew Battat Date: Thu, 2 Jan 2025 23:32:40 +0000 Subject: [PATCH] Add datadir metrics --- rs/ic_os/fstrim_tool/src/lib.rs | 24 ++++++--- rs/ic_os/fstrim_tool/src/metrics/mod.rs | 72 ++++++++++++++++++++++--- 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/rs/ic_os/fstrim_tool/src/lib.rs b/rs/ic_os/fstrim_tool/src/lib.rs index af39d6c6f5d..1bbe877bf5e 100644 --- a/rs/ic_os/fstrim_tool/src/lib.rs +++ b/rs/ic_os/fstrim_tool/src/lib.rs @@ -61,7 +61,12 @@ fn write_metrics_using_tmp_file(metrics: &FsTrimMetrics, metrics_filename: &str) .context("Failed to write metrics to file") } -fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) -> Result<()> { +fn update_metrics( + elapsed: Duration, + is_success: bool, + metrics_filename: &str, + is_datadir: bool, +) -> Result<()> { let mut metrics = parse_existing_metrics_from_file(metrics_filename) .unwrap_or_else(|e| { eprintln!("error parsing existing metrics: {}", e); @@ -71,7 +76,13 @@ fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) - eprintln!("no existing metrics found"); FsTrimMetrics::default() }); - metrics.update(is_success, elapsed)?; + + if is_datadir { + metrics.update_datadir(is_success, elapsed)?; + } else { + metrics.update(is_success, elapsed)?; + } + write_metrics_using_tmp_file(&metrics, metrics_filename) } @@ -101,14 +112,13 @@ pub fn fstrim_tool( let start = std::time::Instant::now(); let res_target = run_command(command, &target); let elapsed_target = start.elapsed(); - update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename)?; + update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename, false)?; if !datadir_target.is_empty() && !is_node_assigned() { - // TODO observability changes needed, expand the metrics logic - // let start_datadir = std::time::Instant::now(); + let start = std::time::Instant::now(); let res_datadir = run_command(command, &datadir_target); - // let elapsed_datadir = start_datadir.elapsed(); - // update_metrics(elapsed_datadir, res_datadir.is_ok(), &metrics_filename)?; + let elapsed = start.elapsed(); + update_metrics(elapsed, res_datadir.is_ok(), &metrics_filename, true)?; res_target.and(res_datadir) } else { res_target diff --git a/rs/ic_os/fstrim_tool/src/metrics/mod.rs b/rs/ic_os/fstrim_tool/src/metrics/mod.rs index 5e42bce8d83..33c6aca6c62 100644 --- a/rs/ic_os/fstrim_tool/src/metrics/mod.rs +++ b/rs/ic_os/fstrim_tool/src/metrics/mod.rs @@ -8,11 +8,20 @@ const METRICS_LAST_RUN_DURATION_MILLISECONDS: &str = "fstrim_last_run_duration_m const METRICS_LAST_RUN_SUCCESS: &str = "fstrim_last_run_success"; const METRICS_RUNS_TOTAL: &str = "fstrim_runs_total"; +const METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR: &str = + "fstrim_datadir_last_run_duration_milliseconds"; +const METRICS_LAST_RUN_SUCCESS_DATADIR: &str = "fstrim_datadir_last_run_success"; +const METRICS_RUNS_TOTAL_DATADIR: &str = "fstrim_datadir_runs_total"; + #[derive(Debug)] pub struct FsTrimMetrics { pub last_duration_milliseconds: f64, pub last_run_success: bool, pub total_runs: f64, + + pub last_duration_milliseconds_datadir: f64, + pub last_run_success_datadir: bool, + pub total_runs_datadir: f64, } impl Default for FsTrimMetrics { @@ -21,6 +30,10 @@ impl Default for FsTrimMetrics { last_duration_milliseconds: 0f64, last_run_success: true, total_runs: 0f64, + + last_duration_milliseconds_datadir: 0f64, + last_run_success_datadir: true, + total_runs_datadir: 0f64, } } } @@ -33,6 +46,13 @@ impl FsTrimMetrics { Ok(()) } + pub(crate) fn update_datadir(&mut self, success: bool, duration: Duration) -> Result<()> { + self.total_runs_datadir += 1f64; + self.last_run_success_datadir = success; + self.last_duration_milliseconds_datadir = duration.as_millis() as f64; + Ok(()) + } + pub fn to_p8s_metrics_string(&self) -> String { format!( "# HELP fstrim_last_run_duration_milliseconds Duration of last run of fstrim in milliseconds\n\ @@ -43,16 +63,31 @@ impl FsTrimMetrics { fstrim_last_run_success {}\n\ # HELP fstrim_runs_total Total number of runs of fstrim\n\ # TYPE fstrim_runs_total counter\n\ - fstrim_runs_total {}\n", + fstrim_runs_total {}\n\ + # HELP fstrim_datadir_last_run_duration_milliseconds Duration of last run of fstrim on datadir in milliseconds\n\ + # TYPE fstrim_datadir_last_run_duration_milliseconds gauge\n\ + fstrim_datadir_last_run_duration_milliseconds {}\n\ + # HELP fstrim_datadir_last_run_success Success status of last run of fstrim on datadir (success: 1, failure: 0)\n\ + # TYPE fstrim_datadir_last_run_success gauge\n\ + fstrim_datadir_last_run_success {}\n\ + # HELP fstrim_datadir_runs_total Total number of runs of fstrim on datadir\n\ + # TYPE fstrim_datadir_runs_total counter\n\ + fstrim_datadir_runs_total {}\n", to_go_f64(self.last_duration_milliseconds), if self.last_run_success { "1" } else { "0" }, to_go_f64(self.total_runs), - ).to_string() + to_go_f64(self.last_duration_milliseconds_datadir), + if self.last_run_success_datadir { "1" } else { "0" }, + to_go_f64(self.total_runs_datadir), + ) + .to_string() } fn are_valid(&self) -> bool { is_f64_finite_and_0_or_larger(self.total_runs) && is_f64_finite_and_0_or_larger(self.last_duration_milliseconds) + && is_f64_finite_and_0_or_larger(self.total_runs_datadir) + && is_f64_finite_and_0_or_larger(self.last_duration_milliseconds_datadir) } } @@ -102,27 +137,41 @@ where let mut last_duration_milliseconds: Option = None; let mut last_run_success: Option = None; let mut total_runs: Option = None; + + // Default datadir fields (we treat them as optional in the metrics file) + let mut datadir_last_duration_milliseconds: f64 = 0f64; + let mut datadir_last_run_success: bool = true; + let mut datadir_total_runs: f64 = 0f64; + for line_or_err in lines { let line = line_or_err.map_err(|e| format_err!("failed to read line: {}", e))?; match line.split(' ').collect::>()[..] { ["#", ..] => continue, [key, value] => match key { METRICS_LAST_RUN_DURATION_MILLISECONDS => { - let _ = last_duration_milliseconds - .get_or_insert(parse_metrics_value(key, value)?); + last_duration_milliseconds.get_or_insert(parse_metrics_value(key, value)?); } METRICS_LAST_RUN_SUCCESS => { - let _ = - last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64); + last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64); } METRICS_RUNS_TOTAL => { - let _ = total_runs.get_or_insert(parse_metrics_value(key, value)?); + total_runs.get_or_insert(parse_metrics_value(key, value)?); + } + METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR => { + datadir_last_duration_milliseconds = parse_metrics_value(key, value)?; + } + METRICS_LAST_RUN_SUCCESS_DATADIR => { + datadir_last_run_success = parse_metrics_value(key, value)? > 0f64; + } + METRICS_RUNS_TOTAL_DATADIR => { + datadir_total_runs = parse_metrics_value(key, value)?; } _ => return Err(format_err!("unknown metric key: {}", key)), }, _ => return Err(format_err!("invalid metric line: {:?}", line)), } } + let metrics = FsTrimMetrics { last_duration_milliseconds: last_duration_milliseconds.ok_or(format_err!( "missing metric: {}", @@ -131,6 +180,9 @@ where last_run_success: last_run_success .ok_or(format_err!("missing metric: {}", METRICS_LAST_RUN_SUCCESS))?, total_runs: total_runs.ok_or(format_err!("missing metric: {}", METRICS_RUNS_TOTAL))?, + last_duration_milliseconds_datadir: datadir_last_duration_milliseconds, + last_run_success_datadir: datadir_last_run_success, + total_runs_datadir: datadir_total_runs, }; if !metrics.are_valid() { return Err(format_err!("parsed metrics are invalid")); @@ -148,6 +200,12 @@ impl PartialEq for FsTrimMetrics { other.last_duration_milliseconds, ) && (self.last_run_success == other.last_run_success) + && f64_approx_eq( + self.last_duration_milliseconds_datadir, + other.last_duration_milliseconds_datadir, + ) + && (self.last_run_success_datadir == other.last_run_success_datadir) + && f64_approx_eq(self.total_runs_datadir, other.total_runs_datadir) } }