Skip to content

Commit

Permalink
feat(eclssd): try to reset sensors to clear errors
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkw committed Jun 22, 2024
1 parent 047dae2 commit a4b16da
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 63 deletions.
15 changes: 15 additions & 0 deletions lib/eclss/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ pub trait SensorError {
Some(_) => sensor::Status::OtherI2cError,
}
}

/// Returns `true` if the error may be able to be cleared by
/// performing a sensor reset.
///
/// If this returns `true`, the sensor driver will attempt a sensor reset
/// rather than backing off.
///
/// By default, this method returns `false`.
fn should_reset(&self) -> bool {
false
}
}

pub trait Context<T, E> {
Expand Down Expand Up @@ -56,6 +67,10 @@ impl<E: SensorError> SensorError for EclssError<E> {
fn i2c_error(&self) -> Option<i2c::ErrorKind> {
self.error.i2c_error()
}

fn should_reset(&self) -> bool {
self.error.should_reset()
}
}

impl<E: fmt::Display> fmt::Display for EclssError<E> {
Expand Down
7 changes: 7 additions & 0 deletions lib/eclss/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ pub struct SensorMetrics {
pub pm_count: GaugeFamily<'static, PM_COUNT_METRICS, DiameterLabel>,
#[cfg_attr(feature = "serde", serde(serialize_with = "serialize_metric"))]
pub sensor_errors: CounterFamily<'static, SENSORS, SensorName>,
#[cfg_attr(feature = "serde", serde(serialize_with = "serialize_metric"))]
pub sensor_reset_count: CounterFamily<'static, SENSORS, SensorName>,
}
macro_rules! count_features {
($($feature:literal),*) => {{
Expand Down Expand Up @@ -129,6 +131,10 @@ impl SensorMetrics {
sensor_errors: MetricBuilder::new("sensor_error_count")
.with_help("Count of I2C errors that occurred while talking to a sensor")
.build_labeled::<_, SensorName, SENSORS>(),

sensor_reset_count: MetricBuilder::new("sensor_reset_count")
.with_help("The number of times a sensor was reset successfully")
.build_labeled::<_, SensorName, SENSORS>(),
}
}

Expand All @@ -146,6 +152,7 @@ impl SensorMetrics {
self.pm_conc.fmt_metric(f)?;
self.pm_count.fmt_metric(f)?;
self.sensor_errors.fmt_metric(f)?;
self.sensor_reset_count.fmt_metric(f)?;
Ok(())
}
}
Expand Down
98 changes: 61 additions & 37 deletions lib/eclss/src/sensor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ impl<I, const SENSORS: usize> Eclss<I, { SENSORS }> {
name = "sensor",
level = tracing::Level::INFO,
skip(self, retry_backoff, delay, sensor),
fields(message = %S::NAME)
fields(sensor = %S::NAME)
)
)]
pub async fn run_sensor<S>(
Expand Down Expand Up @@ -101,48 +101,72 @@ impl<I, const SENSORS: usize> Eclss<I, { SENSORS }> {
.sensor_errors
.register(S::NAME)
.ok_or("insufficient space in sensor errors metric")?;

let mut attempts = 0;
while let Err(error) = {
status.set_status(Status::Initializing);
sensor.init().await
} {
status.set_status(error.as_status());
errors.fetch_add(1);
attempts += 1;
warn!(
%error,
"failed to initialize {} (attempt {attempts}): {error}",
S::NAME
);

if Some(attempts) == max_init_attempts {
error!(
"Giving up on {} after {attempts} failed initialization attempts!",
let resets = self
.metrics
.sensor_reset_count
.register(S::NAME)
.ok_or("insufficient space in sensor reset count metric")?;
let mut has_come_up = false;
'initialize: loop {
let mut attempts = 0;
let what_are_we_doing = if has_come_up { "initialize" } else { "reset" };
while let Err(error) = {
status.set_status(Status::Initializing);
sensor.init().await
} {
status.set_status(error.as_status());
errors.fetch_add(1);
attempts += 1;
warn!(
%error,
"failed to {what_are_we_doing} {} (attempt {attempts}): {error}",
S::NAME
);
return Err("failed to initialize sensor after maximum attempts");
}

backoff.wait(&mut delay).await;
}

backoff.reset();
info!("initialized {}", S::NAME);
if Some(attempts) == max_init_attempts {
error!(
"Giving up on {} after {attempts} attempts to {what_are_we_doing}",
S::NAME
);
return Err("failed to initialize sensor after maximum attempts");
}

loop {
delay.delay_ms(poll_interval.as_millis() as u32).await;
while let Err(error) = sensor.poll().await {
warn!(
%error,
retry_in = ?backoff.current(),
"failed to poll {}, retrying: {error}", S::NAME
);
status.set_status(error.as_status());
errors.fetch_add(1);
backoff.wait(&mut delay).await;
}
status.set_status(Status::Up);

backoff.reset();
if has_come_up {
resets.fetch_add(1);
info!("successfully reset {}", S::NAME);
} else {
info!("initialized {}", S::NAME);
has_come_up = true;
}

loop {
delay.delay_ms(poll_interval.as_millis() as u32).await;
while let Err(error) = sensor.poll().await {
warn!(
%error,
retry_in = ?backoff.current(),
"failed to poll {}, retrying: {error}", S::NAME
);
status.set_status(error.as_status());
errors.fetch_add(1);

if error.should_reset() {
tracing::info!(
%error,
"attempting to clear {} error by resetting...",
S::NAME,
);
continue 'initialize;
} else {
backoff.wait(&mut delay).await;
}
}
status.set_status(Status::Up);
}
}
}
}
Expand Down
71 changes: 45 additions & 26 deletions lib/eclss/src/sensor/sgp30.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pub struct Sgp30<I: 'static, D> {
eco2: &'static Gauge,
abs_humidity: &'static tinymetrics::GaugeFamily<'static, HUMIDITY_METRICS, SensorName>,
calibration_polls: usize,
last_good_baseline: Option<sgp30::Baseline>,
}

/// Wrapper type to add a `Display` implementation to the `sgp30` crate's error
Expand Down Expand Up @@ -47,6 +48,7 @@ where
eco2: metrics.eco2_ppm.register(NAME).unwrap(),
abs_humidity: &metrics.abs_humidity_grams_m3,
calibration_polls: 0,
last_good_baseline: None,
}
}
}
Expand All @@ -65,9 +67,9 @@ where
// The SGP30 must be polled every second in order to ensure that the dynamic
// baseline calibration algorithm works correctly. Performing a measurement
// takes 12 ms, reading the raw H2 and ETOH signals takes 25 ms, and
// setting the humidity compensation takes up to 10 ms, so
// we poll every 1000ms - 12ms - 10ms - 25ms = 953 ms.
const POLL_INTERVAL: Duration = Duration::from_millis(1000 - 12 - 10 - 25);
// setting the humidity compensation and/or reading the baseline takes up to
// 10 ms, so we poll every 1000ms - 12ms - 10ms - 10ms - 25ms = 943 ms.
const POLL_INTERVAL: Duration = Duration::from_millis(1000 - 12 - 10 - 10 - 25);
type Error = EclssError<Sgp30Error<I::Error>>;

async fn init(&mut self) -> Result<(), Self::Error> {
Expand All @@ -91,11 +93,20 @@ where
if !selftest {
return Err(Sgp30Error::SelfTestFailed.into());
}

self.sensor
.init()
.force_init()
.await
.context("error initializing SGP30")?;

if let Some(ref baseline) = self.last_good_baseline {
tracing::info!("Setting {NAME} baseline to {baseline:?}");
self.sensor
.set_baseline(baseline)
.await
.context("error setting SGP30 baseline")?;
}

Ok(())
}

Expand All @@ -112,20 +123,23 @@ where
}
}
});
let baseline = if let Some(h) = abs_h {

if let Some(h) = abs_h {
self.sensor
.set_humidity(Some(&h))
.await
.context("error setting humidity for SGP30")?;
None
} else {
let baseline = self
.sensor
.get_baseline()
.await
.context("error reading SGP30 baseline")?;
Some(baseline)
};
}

let baseline = self
.sensor
.get_baseline()
.await
.map_err(|e| {
let error = Sgp30Error::from(e);
tracing::warn!(%error, "{NAME}: error reading baseline: {error}");
})
.ok();

let sgp30::Measurement {
tvoc_ppb,
Expand All @@ -138,16 +152,16 @@ where

tracing::debug!("{NAME}: CO₂eq: {co2eq_ppm} ppm, TVOC: {tvoc_ppb} ppb");

let sgp30::RawSignals { h2, ethanol } = self
match self
.sensor
.measure_raw_signals()
.await
.context("error reading SGP30 raw signals")?;

tracing::debug!("{NAME}: H₂: {h2}, Ethanol: {ethanol}");

if let Some(ref baseline) = baseline {
tracing::trace!("{NAME}: baseline: {baseline:?}");
.map_err(Sgp30Error::from)
{
Ok(sgp30::RawSignals { ethanol, h2 }) => {
tracing::debug!("{NAME}: H₂: {h2}, Ethanol: {ethanol}");
}
Err(error) => tracing::warn!(%error, "{NAME}: error reading raw signals: {error}"),
}

// Skip updating metrics until calibration completes.
Expand All @@ -165,16 +179,17 @@ where
// maximum value. This generally seems to indicate that the sensor
// is misbehaving, so just bail out and give it some time to settle down.
if tvoc_ppb == MAX_TVOC {
tracing::warn!("{NAME} TVOC measurement saturated, ignoring it");
// TODO(eliza): consider returning an error here? But, the sensor
// wants to be polled exactly once per second, so it might not like
// it if we back off...
return Ok(());
return Err(Sgp30Error::Saturated.into());
}

self.tvoc.set_value(tvoc_ppb as f64);
self.eco2.set_value(co2eq_ppm as f64);

if let Some(baseline) = baseline {
tracing::trace!("{NAME}: baseline: {baseline:?}");
self.last_good_baseline = Some(baseline);
}

Ok(())
}
}
Expand All @@ -193,6 +208,10 @@ impl<E: i2c::Error> SensorError for Sgp30Error<E> {
_ => None,
}
}

fn should_reset(&self) -> bool {
matches!(self, Self::Saturated)
}
}

impl<E: fmt::Display> fmt::Display for Sgp30Error<E> {
Expand Down

0 comments on commit a4b16da

Please sign in to comment.