From f9b231b878ffee221d21f39f9d08419535bca428 Mon Sep 17 00:00:00 2001
From: Roderick <rbovee@gmail.com>
Date: Fri, 22 Jul 2022 20:03:16 -0700
Subject: [PATCH] Support Chemstation "31" format and bump to 0.3.1

Closes #34.
---
 entab-cli/Cargo.toml                         |   4 +-
 entab-js/Cargo.toml                          |   2 +-
 entab-py/Cargo.toml                          |   2 +-
 entab-r/Cargo.toml                           |   2 +-
 entab-r/DESCRIPTION                          |   2 +-
 entab/Cargo.toml                             |   2 +-
 entab/src/filetype.rs                        |   9 +-
 entab/src/parsers/agilent/chemstation.rs     | 210 +++++++++++++------
 entab/src/parsers/agilent/chemstation_new.rs |  14 --
 entab/src/parsers/agilent/mod.rs             |   6 +-
 entab/src/readers.rs                         |   3 +
 11 files changed, 168 insertions(+), 88 deletions(-)

diff --git a/entab-cli/Cargo.toml b/entab-cli/Cargo.toml
index 005097c..a352ba3 100644
--- a/entab-cli/Cargo.toml
+++ b/entab-cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "entab-cli"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["Roderick <rbovee@gmail.com>"]
 edition = "2018"
 description = "Record-format file reader CLI"
@@ -11,7 +11,7 @@ categories = ["command-line-utilities", "parsing", "science"]
 
 [dependencies]
 clap = { version = "3.1.5", features = ["cargo"] }
-entab = { path = "../entab", version = "0.3.0" }
+entab = { path = "../entab", version = "0.3.1" }
 memchr = "2.4"
 memmap2 = { version = "0.5.3", optional = true }
 
diff --git a/entab-js/Cargo.toml b/entab-js/Cargo.toml
index 80f7d7a..e62aadc 100644
--- a/entab-js/Cargo.toml
+++ b/entab-js/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "entab-js"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["Roderick <rbovee@gmail.com>"]
 license = "MIT"
 description = "Record-format file reader"
diff --git a/entab-py/Cargo.toml b/entab-py/Cargo.toml
index 73708b8..d9f2567 100644
--- a/entab-py/Cargo.toml
+++ b/entab-py/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "entab-py"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["Roderick <rbovee@gmail.com>"]
 license = "MIT"
 description = "Record-format file reader"
diff --git a/entab-r/Cargo.toml b/entab-r/Cargo.toml
index a697041..3128f89 100644
--- a/entab-r/Cargo.toml
+++ b/entab-r/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "entab-r"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["Roderick <rbovee@gmail.com>"]
 edition = "2018"
 
diff --git a/entab-r/DESCRIPTION b/entab-r/DESCRIPTION
index 4b47abd..08f1308 100644
--- a/entab-r/DESCRIPTION
+++ b/entab-r/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: entab
 Type: Package
 Title: Entab
-Version: 0.3.0
+Version: 0.3.1
 Author: Roderick
 Maintainer: Roderick <rbovee@gmail.com>
 Description: Entab is a record-format file reader.
diff --git a/entab/Cargo.toml b/entab/Cargo.toml
index 8a84b67..d9d7528 100644
--- a/entab/Cargo.toml
+++ b/entab/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "entab"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["Roderick <rbovee@gmail.com>"]
 edition = "2018"
 description = "Record-format file reader"
diff --git a/entab/src/filetype.rs b/entab/src/filetype.rs
index 08a6b3f..a2c2845 100644
--- a/entab/src/filetype.rs
+++ b/entab/src/filetype.rs
@@ -33,6 +33,8 @@ pub enum FileType {
     // chemoinformatics
     /// Agilent format used for MS-MS trace data
     AgilentMsMsScan, // bin   0x01, 0x01
+    /// Agilent format used for UV-visible array data
+    AgilentChemstationDad,
     /// Agilent format used for flame ionization trace data
     AgilentChemstationFid,
     /// Agilent format used for mass spectrometry trace data
@@ -99,6 +101,7 @@ impl FileType {
                 b"BAM\x01" => return FileType::Bam,
                 b"@HD\t" | b"@SQ\t" => return FileType::Sam,
                 b"\x2Escf" => return FileType::Scf,
+                [0x02, 0x33, 0x31, 0x00] => return FileType::AgilentChemstationDad,
                 [0x02, 0x38, 0x31, 0x00] => return FileType::AgilentChemstationFid,
                 [0x01, 0x32, 0x00, 0x00] => return FileType::AgilentChemstationMs,
                 [0x02, 0x33, 0x30, 0x00] => return FileType::AgilentChemstationMwd,
@@ -165,7 +168,10 @@ impl FileType {
             "scf" => &[FileType::Scf],
             "sd" => &[FileType::AgilentMasshunterDadHeader],
             "sp" => &[FileType::AgilentMasshunterDad],
-            "uv" => &[FileType::AgilentChemstationUv],
+            "uv" => &[
+                FileType::AgilentChemstationDad,
+                FileType::AgilentChemstationUv,
+            ],
             "xz" => &[FileType::Lzma],
             "zstd" => &[FileType::Zstd],
             "ztr" => &[FileType::Ztr],
@@ -179,6 +185,7 @@ impl FileType {
     /// If a file is unsupported, an error will be returned.
     pub fn to_parser_name<'a>(&self, hint: Option<&'a str>) -> Result<&'a str, EtError> {
         Ok(match (self, hint) {
+            (FileType::AgilentChemstationDad, None) => "chemstation_dad",
             (FileType::AgilentChemstationFid, None) => "chemstation_fid",
             (FileType::AgilentChemstationMs, None) => "chemstation_ms",
             (FileType::AgilentChemstationMwd, None) => "chemstation_mwd",
diff --git a/entab/src/parsers/agilent/chemstation.rs b/entab/src/parsers/agilent/chemstation.rs
index 997f3d2..f837275 100644
--- a/entab/src/parsers/agilent/chemstation.rs
+++ b/entab/src/parsers/agilent/chemstation.rs
@@ -81,25 +81,32 @@ impl<'r> From<&ChemstationMetadata> for BTreeMap<String, Value<'r>> {
     }
 }
 
-fn get_metadata(header: &[u8]) -> Result<ChemstationMetadata, EtError> {
-    if header.len() < 652 {
+fn get_metadata(header: &[u8], has_signal: bool) -> Result<ChemstationMetadata, EtError> {
+    if has_signal && header.len() < 652 {
         return Err(
             EtError::from("Chemstation header needs to be at least 648 bytes long").incomplete(),
         );
+    } else if !has_signal && header.len() < 512 {
+        return Err(
+            EtError::from("Chemstation header needs to be at least 512 bytes long").incomplete(),
+        );
     }
     let start_time = f64::from(i32::extract(&header[282..], &Endian::Big)?) / 60000.;
     let end_time = f64::from(i32::extract(&header[286..], &Endian::Big)?) / 60000.;
 
-    let offset_correction = f64::extract(&header[636..], &Endian::Big)?;
-    let mult_correction = f64::extract(&header[644..], &Endian::Big)?;
+    let mut offset_correction = 0.;
+    let mut mult_correction = 1.;
+    let mut signal_name = "";
+    if has_signal {
+        offset_correction = f64::extract(&header[636..], &Endian::Big)?;
+        mult_correction = f64::extract(&header[644..], &Endian::Big)?;
 
-    let signal_name_len = usize::from(header[596]);
-    if signal_name_len > 40 {
-        return Err("Invalid signal name length".into());
+        let signal_name_len = usize::from(header[596]);
+        if signal_name_len > 40 {
+            return Err("Invalid signal name length".into());
+        }
+        signal_name = str::from_utf8(&header[597..597 + signal_name_len])?.trim();
     }
-    let signal_name = str::from_utf8(&header[597..597 + signal_name_len])?
-        .trim()
-        .to_string();
 
     let sample_len = usize::from(header[24]);
     if sample_len > 60 {
@@ -164,7 +171,7 @@ fn get_metadata(header: &[u8]) -> Result<ChemstationMetadata, EtError> {
     Ok(ChemstationMetadata {
         start_time,
         end_time,
-        signal_name,
+        signal_name: signal_name.to_string(),
         offset_correction,
         mult_correction,
         sequence,
@@ -213,7 +220,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationFidState {
     }
 
     fn get(&mut self, rb: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
-        let metadata = get_metadata(rb)?;
+        let metadata = get_metadata(rb, true)?;
         // offset the current time back one step so it'll be right after the first time that parse
         self.cur_time = metadata.start_time - CHEMSTATION_TIME_STEP;
         self.cur_intensity = 0.;
@@ -312,7 +319,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMsState {
     }
 
     fn get(&mut self, buffer: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
-        let metadata = get_metadata(buffer)?;
+        let metadata = get_metadata(buffer, true)?;
         let n_scans = u32::extract(&buffer[278..], &Endian::Big)? as usize;
 
         self.n_scans_left = n_scans;
@@ -430,7 +437,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMwdState {
     }
 
     fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
-        let metadata = get_metadata(buf)?;
+        let metadata = get_metadata(buf, true)?;
 
         self.n_wvs_left = 0;
         // offset the current time back one step so it'll be right after the first time that parse
@@ -514,55 +521,134 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMwdRecord<'s> {
     }
 }
 
-// scratch with offsets for info in different files
-
-// FID - 02 38 31 00 ("81") (missing 01 38 00 00)
-// MWD - 02 33 30 00 ("30")
-// MS - 01 32 00 00 ("2") (missing 02 32 30?)
-// (possibly also 03 31 37 39 and 03 31 38 31 ?)
-//  - 5 - "GC / MS Data File" or other?
-//  - 24 - Sample Name
-//  - 86 - Sample Description?
-//  - 148 - Operator Name
-//  - 178 - Run Date
-//  - 208 - Instrument Name
-//  - 218 - LC or GC
-//  - 228 - Method Name
-//  - 252 - Sequence? (u16)
-//  - 254 - Vial? (u16)
-//  - 256 - Replicate? (u16)
-//  - 260 - TIC Offset? (i32)
-//  * 264 - FID/MWD - 512 byte header chunks // 2 + 1
-//  - 264 - MS - total header bytes // 2 + 1
-//  - 272 - Normalization offset? (i32)
-//  * 282 - Start Time (i32)
-//  * 286 - End Time (i32)
-//  M 322 - Collection software?
-//  M 355 - Software Version?
-//  - 368 - "GC / MS Data File" as utf16
-//  M 405 - Another Version?
-//  - 448 - MS - Instrument name as utf16
-//  - 530 - lower end of mz/wv range?
-//  - 532 - upper end of mz/wv range?
-//  - 576 - MS - "GC"
-//  - 580 - Units
-//  M 596 - Channel Info (str)
-//  - 616 - MS - Method directory
-//  - 644 - (f32/64?)
-//  - 5768 - MS - data start (GC)
-
-// LC - 03 31 33 31 ("131")
-//  * 264 - 512 byte header chunks // 2 + 1
-//  ? 278 - Number of Records
-//  - 858 - Sample Name
-//  - 1880 - Operator Name
-//  - 2391 - Run Date
-//  - 2492 - Instrument Name
-//  - 2533 - "LC"
-//  - 2574 - Method Name
-//  - 3093 - Units
-//   4096 - data start?
+#[derive(Clone, Debug, Default)]
+/// Internal state for the `ChemstationDadRecord` parser
+pub struct ChemstationDadState {
+    n_scans_left: usize,
+    n_bytes_left: usize,
+    cur_time: f64,
+    cur_intensity: f64,
+    cur_wv: f64,
+    wv_step: f64,
+    metadata: ChemstationMetadata,
+}
+
+impl StateMetadata for ChemstationDadState {
+    fn metadata(&self) -> BTreeMap<String, Value> {
+        (&self.metadata).into()
+    }
+
+    fn header(&self) -> Vec<&str> {
+        vec!["time", "wavelength", "intensity"]
+    }
+}
+
+impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationDadState {
+    type State = ();
+
+    fn parse(
+        rb: &[u8],
+        _eof: bool,
+        consumed: &mut usize,
+        _state: &mut Self::State,
+    ) -> Result<bool, EtError> {
+        *consumed += read_agilent_header(rb, false)?;
+        Ok(true)
+    }
 
+    fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
+        let metadata = get_metadata(buf, false)?;
+        let n_scans = u32::extract(&buf[278..], &Endian::Big)? as usize;
+
+        self.n_scans_left = n_scans;
+        self.metadata = metadata;
+        Ok(())
+    }
+}
+
+#[derive(Clone, Copy, Debug, Default)]
+/// A single point from a UV-visible array (e.g. diode array detector) trace
+pub struct ChemstationDadRecord {
+    /// The time recorded at
+    pub time: f64,
+    /// The wavelength recorded at
+    pub wavelength: f64,
+    /// The intensity recorded
+    pub intensity: f64,
+}
+
+impl_record!(ChemstationDadRecord: time, wavelength, intensity);
+
+impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationDadRecord {
+    type State = ChemstationDadState;
+
+    fn parse(
+        rb: &[u8],
+        _eof: bool,
+        consumed: &mut usize,
+        state: &mut Self::State,
+    ) -> Result<bool, EtError> {
+        if state.n_scans_left == 0 {
+            return Ok(false);
+        }
+        let con = &mut 0;
+        let mut n_scans_left = state.n_scans_left;
+        let mut n_bytes_left = state.n_bytes_left;
+        if n_bytes_left == 0 {
+            let scan_type = extract::<u16>(rb, con, &mut Endian::Little)?;
+            if scan_type != 67 {
+                // I'm not sure we ever hit this (tracking `n_scans_left` should prevent it), but
+                // sometimes there's a different type of scan (68) at the end which starts a stream
+                // of u16, u32, u32 data; the two u32s appear to increment independently and the
+                // u16 is either 80 or 81 ~95% of the time and a number in the 50s-60s otherwise.
+                return Ok(false);
+            }
+            n_bytes_left =
+                usize::from(extract::<u16>(rb, con, &mut Endian::Little)?.saturating_sub(22));
+            state.cur_time = f64::from(extract::<u32>(rb, con, &mut Endian::Little)?);
+            state.cur_wv = f64::from(extract::<u16>(rb, con, &mut Endian::Little)?);
+            let _ = extract::<u16>(rb, con, &mut Endian::Little)?; // the end wavelength
+            state.wv_step = f64::from(extract::<u16>(rb, con, &mut Endian::Little)?);
+            let _ = extract::<&[u8]>(rb, con, &mut 8)?;
+            state.cur_intensity = 0.;
+            if n_bytes_left == 0 {
+                // TODO: consume the rest of the file so this can't accidentally repeat?
+                return Ok(false);
+            }
+            n_scans_left -= 1;
+        } else {
+            state.cur_wv += state.wv_step;
+        }
+
+        let intensity: i16 = extract(rb, con, &mut Endian::Little)?;
+        if intensity == -32768 {
+            state.cur_intensity = f64::from(extract::<i32>(rb, con, &mut Endian::Little)?);
+            state.n_bytes_left = n_bytes_left.saturating_sub(6);
+        } else {
+            state.cur_intensity += f64::from(intensity);
+            state.n_bytes_left = n_bytes_left.saturating_sub(2);
+        }
+
+        state.n_scans_left = n_scans_left;
+        *consumed += *con;
+        Ok(true)
+    }
+
+    fn get(&mut self, _rb: &'b [u8], state: &'s Self::State) -> Result<(), EtError> {
+        self.wavelength = state.cur_wv / 20.;
+        self.time = state.cur_time / 60_000.;
+        self.intensity = state.cur_intensity / 2000.;
+        Ok(())
+    }
+}
+
+impl_reader!(
+    ChemstationDadReader,
+    ChemstationDadRecord,
+    ChemstationDadRecord,
+    ChemstationDadState,
+    ()
+);
 impl_reader!(
     ChemstationFidReader,
     ChemstationFidRecord,
diff --git a/entab/src/parsers/agilent/chemstation_new.rs b/entab/src/parsers/agilent/chemstation_new.rs
index 206a5f1..db9f231 100644
--- a/entab/src/parsers/agilent/chemstation_new.rs
+++ b/entab/src/parsers/agilent/chemstation_new.rs
@@ -224,20 +224,6 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationUvRecord {
     }
 }
 
-// scratch with offsets for info in different files
-
-// LC - 03 31 33 31 ("131")
-//  * 264 - 512 byte header chunks // 2 + 1
-//  ? 278 - Number of Records
-//  - 858 - Sample Name
-//  - 1880 - Operator Name
-//  - 2391 - Run Date
-//  - 2492 - Instrument Name
-//  - 2533 - "LC"
-//  - 2574 - Method Name
-//  - 3093 - Units
-//   4096 - data start?
-
 impl_reader!(
     ChemstationUvReader,
     ChemstationUvRecord,
diff --git a/entab/src/parsers/agilent/mod.rs b/entab/src/parsers/agilent/mod.rs
index 73d3e56..3af3637 100644
--- a/entab/src/parsers/agilent/mod.rs
+++ b/entab/src/parsers/agilent/mod.rs
@@ -1,11 +1,9 @@
-// TODO: finish and reenable this
-// /// Readers for instrument telemetry data generated by Chemstation
-// pub mod chemstation_reg;
 /// Readers for formats generated by the GC/LC control software Chemstation
 pub mod chemstation;
 /// Readers for newer formats generated by the GC/LC control software Chemstation
 pub mod chemstation_new;
-// /// Reader for Chemstation's logging files
+// TODO: finish and reenable this
+// /// Readers for instrument telemetry data generated by Chemstation
 // pub mod chemstation_reg;
 /// Readers for formats generated by the GC/LC control software Masshunter
 #[cfg(feature = "std")]
diff --git a/entab/src/readers.rs b/entab/src/readers.rs
index 4282d84..65fdb31 100644
--- a/entab/src/readers.rs
+++ b/entab/src/readers.rs
@@ -41,6 +41,9 @@ fn _get_reader<'n, 'p, 'r>(
 ) -> Result<(Box<dyn RecordReader + 'r>, &'n str), EtError> {
     let reader: Box<dyn RecordReader + 'r> = match parser_name {
         "bam" => Box::new(parsers::sam::BamReader::new(rb, None)?),
+        "chemstation_dad" => Box::new(parsers::agilent::chemstation::ChemstationDadReader::new(
+            rb, None,
+        )?),
         "chemstation_fid" => Box::new(parsers::agilent::chemstation::ChemstationFidReader::new(
             rb, None,
         )?),