From a4b16da9b52e13696ee9da82b9d7ef573c06177d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 22 Jun 2024 10:41:10 -0700 Subject: [PATCH] feat(eclssd): try to reset sensors to clear errors --- lib/eclss/src/error.rs | 15 ++++++ lib/eclss/src/metrics.rs | 7 +++ lib/eclss/src/sensor.rs | 98 ++++++++++++++++++++++------------- lib/eclss/src/sensor/sgp30.rs | 71 +++++++++++++++---------- 4 files changed, 128 insertions(+), 63 deletions(-) diff --git a/lib/eclss/src/error.rs b/lib/eclss/src/error.rs index 6bc6ea6..60c13f3 100644 --- a/lib/eclss/src/error.rs +++ b/lib/eclss/src/error.rs @@ -13,6 +13,17 @@ pub trait SensorError { Some(_) => sensor::Status::OtherI2cError, } } + + /// Returns `true` if the error may be able to be cleared by + /// performing a sensor reset. + /// + /// If this returns `true`, the sensor driver will attempt a sensor reset + /// rather than backing off. + /// + /// By default, this method returns `false`. + fn should_reset(&self) -> bool { + false + } } pub trait Context { @@ -56,6 +67,10 @@ impl SensorError for EclssError { fn i2c_error(&self) -> Option { self.error.i2c_error() } + + fn should_reset(&self) -> bool { + self.error.should_reset() + } } impl fmt::Display for EclssError { diff --git a/lib/eclss/src/metrics.rs b/lib/eclss/src/metrics.rs index 4749c20..cb91317 100644 --- a/lib/eclss/src/metrics.rs +++ b/lib/eclss/src/metrics.rs @@ -36,6 +36,8 @@ pub struct SensorMetrics { pub pm_count: GaugeFamily<'static, PM_COUNT_METRICS, DiameterLabel>, #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_metric"))] pub sensor_errors: CounterFamily<'static, SENSORS, SensorName>, + #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_metric"))] + pub sensor_reset_count: CounterFamily<'static, SENSORS, SensorName>, } macro_rules! count_features { ($($feature:literal),*) => {{ @@ -129,6 +131,10 @@ impl SensorMetrics { sensor_errors: MetricBuilder::new("sensor_error_count") .with_help("Count of I2C errors that occurred while talking to a sensor") .build_labeled::<_, SensorName, SENSORS>(), + + sensor_reset_count: MetricBuilder::new("sensor_reset_count") + .with_help("The number of times a sensor was reset successfully") + .build_labeled::<_, SensorName, SENSORS>(), } } @@ -146,6 +152,7 @@ impl SensorMetrics { self.pm_conc.fmt_metric(f)?; self.pm_count.fmt_metric(f)?; self.sensor_errors.fmt_metric(f)?; + self.sensor_reset_count.fmt_metric(f)?; Ok(()) } } diff --git a/lib/eclss/src/sensor.rs b/lib/eclss/src/sensor.rs index 2f2f54f..0395dc2 100644 --- a/lib/eclss/src/sensor.rs +++ b/lib/eclss/src/sensor.rs @@ -66,7 +66,7 @@ impl Eclss { name = "sensor", level = tracing::Level::INFO, skip(self, retry_backoff, delay, sensor), - fields(message = %S::NAME) + fields(sensor = %S::NAME) ) )] pub async fn run_sensor( @@ -101,48 +101,72 @@ impl Eclss { .sensor_errors .register(S::NAME) .ok_or("insufficient space in sensor errors metric")?; - - let mut attempts = 0; - while let Err(error) = { - status.set_status(Status::Initializing); - sensor.init().await - } { - status.set_status(error.as_status()); - errors.fetch_add(1); - attempts += 1; - warn!( - %error, - "failed to initialize {} (attempt {attempts}): {error}", - S::NAME - ); - - if Some(attempts) == max_init_attempts { - error!( - "Giving up on {} after {attempts} failed initialization attempts!", + let resets = self + .metrics + .sensor_reset_count + .register(S::NAME) + .ok_or("insufficient space in sensor reset count metric")?; + let mut has_come_up = false; + 'initialize: loop { + let mut attempts = 0; + let what_are_we_doing = if has_come_up { "initialize" } else { "reset" }; + while let Err(error) = { + status.set_status(Status::Initializing); + sensor.init().await + } { + status.set_status(error.as_status()); + errors.fetch_add(1); + attempts += 1; + warn!( + %error, + "failed to {what_are_we_doing} {} (attempt {attempts}): {error}", S::NAME ); - return Err("failed to initialize sensor after maximum attempts"); - } - - backoff.wait(&mut delay).await; - } - backoff.reset(); - info!("initialized {}", S::NAME); + if Some(attempts) == max_init_attempts { + error!( + "Giving up on {} after {attempts} attempts to {what_are_we_doing}", + S::NAME + ); + return Err("failed to initialize sensor after maximum attempts"); + } - loop { - delay.delay_ms(poll_interval.as_millis() as u32).await; - while let Err(error) = sensor.poll().await { - warn!( - %error, - retry_in = ?backoff.current(), - "failed to poll {}, retrying: {error}", S::NAME - ); - status.set_status(error.as_status()); - errors.fetch_add(1); backoff.wait(&mut delay).await; } - status.set_status(Status::Up); + + backoff.reset(); + if has_come_up { + resets.fetch_add(1); + info!("successfully reset {}", S::NAME); + } else { + info!("initialized {}", S::NAME); + has_come_up = true; + } + + loop { + delay.delay_ms(poll_interval.as_millis() as u32).await; + while let Err(error) = sensor.poll().await { + warn!( + %error, + retry_in = ?backoff.current(), + "failed to poll {}, retrying: {error}", S::NAME + ); + status.set_status(error.as_status()); + errors.fetch_add(1); + + if error.should_reset() { + tracing::info!( + %error, + "attempting to clear {} error by resetting...", + S::NAME, + ); + continue 'initialize; + } else { + backoff.wait(&mut delay).await; + } + } + status.set_status(Status::Up); + } } } } diff --git a/lib/eclss/src/sensor/sgp30.rs b/lib/eclss/src/sensor/sgp30.rs index 46e4c89..3313dcf 100644 --- a/lib/eclss/src/sensor/sgp30.rs +++ b/lib/eclss/src/sensor/sgp30.rs @@ -20,6 +20,7 @@ pub struct Sgp30 { eco2: &'static Gauge, abs_humidity: &'static tinymetrics::GaugeFamily<'static, HUMIDITY_METRICS, SensorName>, calibration_polls: usize, + last_good_baseline: Option, } /// Wrapper type to add a `Display` implementation to the `sgp30` crate's error @@ -47,6 +48,7 @@ where eco2: metrics.eco2_ppm.register(NAME).unwrap(), abs_humidity: &metrics.abs_humidity_grams_m3, calibration_polls: 0, + last_good_baseline: None, } } } @@ -65,9 +67,9 @@ where // The SGP30 must be polled every second in order to ensure that the dynamic // baseline calibration algorithm works correctly. Performing a measurement // takes 12 ms, reading the raw H2 and ETOH signals takes 25 ms, and - // setting the humidity compensation takes up to 10 ms, so - // we poll every 1000ms - 12ms - 10ms - 25ms = 953 ms. - const POLL_INTERVAL: Duration = Duration::from_millis(1000 - 12 - 10 - 25); + // setting the humidity compensation and/or reading the baseline takes up to + // 10 ms, so we poll every 1000ms - 12ms - 10ms - 10ms - 25ms = 943 ms. + const POLL_INTERVAL: Duration = Duration::from_millis(1000 - 12 - 10 - 10 - 25); type Error = EclssError>; async fn init(&mut self) -> Result<(), Self::Error> { @@ -91,11 +93,20 @@ where if !selftest { return Err(Sgp30Error::SelfTestFailed.into()); } + self.sensor - .init() + .force_init() .await .context("error initializing SGP30")?; + if let Some(ref baseline) = self.last_good_baseline { + tracing::info!("Setting {NAME} baseline to {baseline:?}"); + self.sensor + .set_baseline(baseline) + .await + .context("error setting SGP30 baseline")?; + } + Ok(()) } @@ -112,20 +123,23 @@ where } } }); - let baseline = if let Some(h) = abs_h { + + if let Some(h) = abs_h { self.sensor .set_humidity(Some(&h)) .await .context("error setting humidity for SGP30")?; - None - } else { - let baseline = self - .sensor - .get_baseline() - .await - .context("error reading SGP30 baseline")?; - Some(baseline) - }; + } + + let baseline = self + .sensor + .get_baseline() + .await + .map_err(|e| { + let error = Sgp30Error::from(e); + tracing::warn!(%error, "{NAME}: error reading baseline: {error}"); + }) + .ok(); let sgp30::Measurement { tvoc_ppb, @@ -138,16 +152,16 @@ where tracing::debug!("{NAME}: CO₂eq: {co2eq_ppm} ppm, TVOC: {tvoc_ppb} ppb"); - let sgp30::RawSignals { h2, ethanol } = self + match self .sensor .measure_raw_signals() .await - .context("error reading SGP30 raw signals")?; - - tracing::debug!("{NAME}: H₂: {h2}, Ethanol: {ethanol}"); - - if let Some(ref baseline) = baseline { - tracing::trace!("{NAME}: baseline: {baseline:?}"); + .map_err(Sgp30Error::from) + { + Ok(sgp30::RawSignals { ethanol, h2 }) => { + tracing::debug!("{NAME}: H₂: {h2}, Ethanol: {ethanol}"); + } + Err(error) => tracing::warn!(%error, "{NAME}: error reading raw signals: {error}"), } // Skip updating metrics until calibration completes. @@ -165,16 +179,17 @@ where // maximum value. This generally seems to indicate that the sensor // is misbehaving, so just bail out and give it some time to settle down. if tvoc_ppb == MAX_TVOC { - tracing::warn!("{NAME} TVOC measurement saturated, ignoring it"); - // TODO(eliza): consider returning an error here? But, the sensor - // wants to be polled exactly once per second, so it might not like - // it if we back off... - return Ok(()); + return Err(Sgp30Error::Saturated.into()); } self.tvoc.set_value(tvoc_ppb as f64); self.eco2.set_value(co2eq_ppm as f64); + if let Some(baseline) = baseline { + tracing::trace!("{NAME}: baseline: {baseline:?}"); + self.last_good_baseline = Some(baseline); + } + Ok(()) } } @@ -193,6 +208,10 @@ impl SensorError for Sgp30Error { _ => None, } } + + fn should_reset(&self) -> bool { + matches!(self, Self::Saturated) + } } impl fmt::Display for Sgp30Error {