Skip to content

Commit

Permalink
feat(node): add fstrim datadir observability (#3322)
Browse files Browse the repository at this point in the history
NODE-1537

Node decommissioning stage 2 was rolled out in a hurry so that we could
make the holiday-release cutoff: #2953

These changes add the missing observability logic.

Main commits:

- [Add datadir metrics](a1e6dbb)
- [Update unit tests](513ba80)


Screenshot of metrics:
<img width="1448" alt="image"
src="https://github.com/user-attachments/assets/a80cecd7-cde7-444b-85b6-a973a10331ab"
/>
  • Loading branch information
andrewbattat authored Jan 14, 2025
1 parent 323b72f commit 572afbc
Show file tree
Hide file tree
Showing 8 changed files with 481 additions and 244 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions rs/ic_os/fstrim_tool/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ DEV_DEPENDENCIES = [
"@crate_index//:assert_matches",
"@crate_index//:predicates",
"@crate_index//:rand",
"@crate_index//:regex",
"@crate_index//:tempfile",
]

Expand Down
1 change: 1 addition & 0 deletions rs/ic_os/fstrim_tool/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ assert_matches = { workspace = true }
ic-crypto-test-utils-reproducible-rng = { path = "../../crypto/test_utils/reproducible_rng" }
predicates = { workspace = true }
rand = { workspace = true }
regex = { workspace = true }
tempfile = { workspace = true }
24 changes: 17 additions & 7 deletions rs/ic_os/fstrim_tool/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,12 @@ fn write_metrics_using_tmp_file(metrics: &FsTrimMetrics, metrics_filename: &str)
.context("Failed to write metrics to file")
}

fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) -> Result<()> {
fn update_metrics(
elapsed: Duration,
is_success: bool,
metrics_filename: &str,
is_datadir: bool,
) -> Result<()> {
let mut metrics = parse_existing_metrics_from_file(metrics_filename)
.unwrap_or_else(|e| {
eprintln!("error parsing existing metrics: {}", e);
Expand All @@ -71,7 +76,13 @@ fn update_metrics(elapsed: Duration, is_success: bool, metrics_filename: &str) -
eprintln!("no existing metrics found");
FsTrimMetrics::default()
});
metrics.update(is_success, elapsed)?;

if is_datadir {
metrics.update_datadir(is_success, elapsed)?;
} else {
metrics.update(is_success, elapsed)?;
}

write_metrics_using_tmp_file(&metrics, metrics_filename)
}

Expand Down Expand Up @@ -101,14 +112,13 @@ pub fn fstrim_tool(
let start = std::time::Instant::now();
let res_target = run_command(command, &target);
let elapsed_target = start.elapsed();
update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename)?;
update_metrics(elapsed_target, res_target.is_ok(), &metrics_filename, false)?;

if !datadir_target.is_empty() && !is_node_assigned() {
// TODO observability changes needed, expand the metrics logic
// let start_datadir = std::time::Instant::now();
let start = std::time::Instant::now();
let res_datadir = run_command(command, &datadir_target);
// let elapsed_datadir = start_datadir.elapsed();
// update_metrics(elapsed_datadir, res_datadir.is_ok(), &metrics_filename)?;
let elapsed = start.elapsed();
update_metrics(elapsed, res_datadir.is_ok(), &metrics_filename, true)?;
res_target.and(res_datadir)
} else {
res_target
Expand Down
88 changes: 76 additions & 12 deletions rs/ic_os/fstrim_tool/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,20 @@ const METRICS_LAST_RUN_DURATION_MILLISECONDS: &str = "fstrim_last_run_duration_m
const METRICS_LAST_RUN_SUCCESS: &str = "fstrim_last_run_success";
const METRICS_RUNS_TOTAL: &str = "fstrim_runs_total";

const METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR: &str =
"fstrim_datadir_last_run_duration_milliseconds";
const METRICS_LAST_RUN_SUCCESS_DATADIR: &str = "fstrim_datadir_last_run_success";
const METRICS_RUNS_TOTAL_DATADIR: &str = "fstrim_datadir_runs_total";

/// Metrics describing fstrim runs, tracked separately for the primary target
/// and for the data directory ("datadir") target.
///
/// Counters and durations are stored as `f64`; the success flags are plain
/// booleans.
#[derive(Debug)]
pub struct FsTrimMetrics {
// Duration of the last fstrim run, in milliseconds.
pub last_duration_milliseconds: f64,
// Whether the last fstrim run succeeded.
pub last_run_success: bool,
// Total number of fstrim runs.
pub total_runs: f64,

// Duration of the last fstrim run on the datadir, in milliseconds.
pub last_duration_milliseconds_datadir: f64,
// Whether the last fstrim run on the datadir succeeded.
pub last_run_success_datadir: bool,
// Total number of fstrim runs on the datadir.
pub total_runs_datadir: f64,
}

impl Default for FsTrimMetrics {
Expand All @@ -21,6 +30,10 @@ impl Default for FsTrimMetrics {
last_duration_milliseconds: 0f64,
last_run_success: true,
total_runs: 0f64,

last_duration_milliseconds_datadir: 0f64,
last_run_success_datadir: true,
total_runs_datadir: 0f64,
}
}
}
Expand All @@ -33,26 +46,54 @@ impl FsTrimMetrics {
Ok(())
}

/// Records the outcome of one fstrim run against the data directory.
///
/// Overwrites the datadir "last run" duration (whole milliseconds, stored as
/// `f64`) and success flag, and increments the datadir run counter by one.
/// The non-datadir fields are left untouched. Always returns `Ok(())`.
pub(crate) fn update_datadir(&mut self, success: bool, duration: Duration) -> Result<()> {
    let elapsed_ms = duration.as_millis() as f64;

    self.last_duration_milliseconds_datadir = elapsed_ms;
    self.last_run_success_datadir = success;
    self.total_runs_datadir = self.total_runs_datadir + 1f64;

    Ok(())
}

pub fn to_p8s_metrics_string(&self) -> String {
let fstrim_last_run_duration_milliseconds = to_go_f64(self.last_duration_milliseconds);
let fstrim_last_run_success = if self.last_run_success { "1" } else { "0" };
let fstrim_runs_total = to_go_f64(self.total_runs);

let fstrim_datadir_last_run_duration_milliseconds =
to_go_f64(self.last_duration_milliseconds_datadir);
let fstrim_datadir_last_run_success = if self.last_run_success_datadir {
"1"
} else {
"0"
};
let fstrim_datadir_runs_total = to_go_f64(self.total_runs_datadir);

format!(
"# HELP fstrim_last_run_duration_milliseconds Duration of last run of fstrim in milliseconds\n\
# TYPE fstrim_last_run_duration_milliseconds gauge\n\
fstrim_last_run_duration_milliseconds {}\n\
fstrim_last_run_duration_milliseconds {fstrim_last_run_duration_milliseconds}\n\
# HELP fstrim_last_run_success Success status of last run of fstrim (success: 1, failure: 0)\n\
# TYPE fstrim_last_run_success gauge\n\
fstrim_last_run_success {}\n\
fstrim_last_run_success {fstrim_last_run_success}\n\
# HELP fstrim_runs_total Total number of runs of fstrim\n\
# TYPE fstrim_runs_total counter\n\
fstrim_runs_total {}\n",
to_go_f64(self.last_duration_milliseconds),
if self.last_run_success { "1" } else { "0" },
to_go_f64(self.total_runs),
).to_string()
fstrim_runs_total {fstrim_runs_total}\n\
# HELP fstrim_datadir_last_run_duration_milliseconds Duration of last run of fstrim on datadir in milliseconds\n\
# TYPE fstrim_datadir_last_run_duration_milliseconds gauge\n\
fstrim_datadir_last_run_duration_milliseconds {fstrim_datadir_last_run_duration_milliseconds}\n\
# HELP fstrim_datadir_last_run_success Success status of last run of fstrim on datadir (success: 1, failure: 0)\n\
# TYPE fstrim_datadir_last_run_success gauge\n\
fstrim_datadir_last_run_success {fstrim_datadir_last_run_success}\n\
# HELP fstrim_datadir_runs_total Total number of runs of fstrim on datadir\n\
# TYPE fstrim_datadir_runs_total counter\n\
fstrim_datadir_runs_total {fstrim_datadir_runs_total}\n"
)
}

/// Reports whether every numeric metric passes
/// `is_f64_finite_and_0_or_larger`.
///
/// The values are checked in order (total runs, last duration, datadir total
/// runs, datadir last duration), stopping at the first one that fails. The
/// boolean success flags are not checked.
fn are_valid(&self) -> bool {
    let numeric_fields = [
        self.total_runs,
        self.last_duration_milliseconds,
        self.total_runs_datadir,
        self.last_duration_milliseconds_datadir,
    ];

    numeric_fields
        .iter()
        .all(|&value| is_f64_finite_and_0_or_larger(value))
}
}

Expand Down Expand Up @@ -102,27 +143,41 @@ where
let mut last_duration_milliseconds: Option<f64> = None;
let mut last_run_success: Option<bool> = None;
let mut total_runs: Option<f64> = None;

// Default datadir fields (we treat them as optional in the metrics file)
let mut datadir_last_duration_milliseconds: f64 = 0f64;
let mut datadir_last_run_success: bool = true;
let mut datadir_total_runs: f64 = 0f64;

for line_or_err in lines {
let line = line_or_err.map_err(|e| format_err!("failed to read line: {}", e))?;
match line.split(' ').collect::<Vec<_>>()[..] {
["#", ..] => continue,
[key, value] => match key {
METRICS_LAST_RUN_DURATION_MILLISECONDS => {
let _ = last_duration_milliseconds
.get_or_insert(parse_metrics_value(key, value)?);
last_duration_milliseconds.get_or_insert(parse_metrics_value(key, value)?);
}
METRICS_LAST_RUN_SUCCESS => {
let _ =
last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64);
last_run_success.get_or_insert(parse_metrics_value(key, value)? > 0f64);
}
METRICS_RUNS_TOTAL => {
let _ = total_runs.get_or_insert(parse_metrics_value(key, value)?);
total_runs.get_or_insert(parse_metrics_value(key, value)?);
}
METRICS_LAST_RUN_DURATION_MILLISECONDS_DATADIR => {
datadir_last_duration_milliseconds = parse_metrics_value(key, value)?;
}
METRICS_LAST_RUN_SUCCESS_DATADIR => {
datadir_last_run_success = parse_metrics_value(key, value)? > 0f64;
}
METRICS_RUNS_TOTAL_DATADIR => {
datadir_total_runs = parse_metrics_value(key, value)?;
}
_ => return Err(format_err!("unknown metric key: {}", key)),
},
_ => return Err(format_err!("invalid metric line: {:?}", line)),
}
}

let metrics = FsTrimMetrics {
last_duration_milliseconds: last_duration_milliseconds.ok_or(format_err!(
"missing metric: {}",
Expand All @@ -131,6 +186,9 @@ where
last_run_success: last_run_success
.ok_or(format_err!("missing metric: {}", METRICS_LAST_RUN_SUCCESS))?,
total_runs: total_runs.ok_or(format_err!("missing metric: {}", METRICS_RUNS_TOTAL))?,
last_duration_milliseconds_datadir: datadir_last_duration_milliseconds,
last_run_success_datadir: datadir_last_run_success,
total_runs_datadir: datadir_total_runs,
};
if !metrics.are_valid() {
return Err(format_err!("parsed metrics are invalid"));
Expand All @@ -148,6 +206,12 @@ impl PartialEq for FsTrimMetrics {
other.last_duration_milliseconds,
)
&& (self.last_run_success == other.last_run_success)
&& f64_approx_eq(
self.last_duration_milliseconds_datadir,
other.last_duration_milliseconds_datadir,
)
&& (self.last_run_success_datadir == other.last_run_success_datadir)
&& f64_approx_eq(self.total_runs_datadir, other.total_runs_datadir)
}
}

Expand Down
Loading

0 comments on commit 572afbc

Please sign in to comment.