From f9b231b878ffee221d21f39f9d08419535bca428 Mon Sep 17 00:00:00 2001 From: Roderick Date: Fri, 22 Jul 2022 20:03:16 -0700 Subject: [PATCH] Support Chemstation "31" format and bump to 0.3.1 Closes #34. --- entab-cli/Cargo.toml | 4 +- entab-js/Cargo.toml | 2 +- entab-py/Cargo.toml | 2 +- entab-r/Cargo.toml | 2 +- entab-r/DESCRIPTION | 2 +- entab/Cargo.toml | 2 +- entab/src/filetype.rs | 9 +- entab/src/parsers/agilent/chemstation.rs | 210 +++++++++++++------ entab/src/parsers/agilent/chemstation_new.rs | 14 -- entab/src/parsers/agilent/mod.rs | 6 +- entab/src/readers.rs | 3 + 11 files changed, 168 insertions(+), 88 deletions(-) diff --git a/entab-cli/Cargo.toml b/entab-cli/Cargo.toml index 005097c..a352ba3 100644 --- a/entab-cli/Cargo.toml +++ b/entab-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "entab-cli" -version = "0.3.0" +version = "0.3.1" authors = ["Roderick "] edition = "2018" description = "Record-format file reader CLI" @@ -11,7 +11,7 @@ categories = ["command-line-utilities", "parsing", "science"] [dependencies] clap = { version = "3.1.5", features = ["cargo"] } -entab = { path = "../entab", version = "0.3.0" } +entab = { path = "../entab", version = "0.3.1" } memchr = "2.4" memmap2 = { version = "0.5.3", optional = true } diff --git a/entab-js/Cargo.toml b/entab-js/Cargo.toml index 80f7d7a..e62aadc 100644 --- a/entab-js/Cargo.toml +++ b/entab-js/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "entab-js" -version = "0.3.0" +version = "0.3.1" authors = ["Roderick "] license = "MIT" description = "Record-format file reader" diff --git a/entab-py/Cargo.toml b/entab-py/Cargo.toml index 73708b8..d9f2567 100644 --- a/entab-py/Cargo.toml +++ b/entab-py/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "entab-py" -version = "0.3.0" +version = "0.3.1" authors = ["Roderick "] license = "MIT" description = "Record-format file reader" diff --git a/entab-r/Cargo.toml b/entab-r/Cargo.toml index a697041..3128f89 100644 --- a/entab-r/Cargo.toml +++ b/entab-r/Cargo.toml @@ -1,6 
+1,6 @@ [package] name = "entab-r" -version = "0.3.0" +version = "0.3.1" authors = ["Roderick "] edition = "2018" diff --git a/entab-r/DESCRIPTION b/entab-r/DESCRIPTION index 4b47abd..08f1308 100644 --- a/entab-r/DESCRIPTION +++ b/entab-r/DESCRIPTION @@ -1,7 +1,7 @@ Package: entab Type: Package Title: Entab -Version: 0.3.0 +Version: 0.3.1 Author: Roderick Maintainer: Roderick Description: Entab is a record-format file reader. diff --git a/entab/Cargo.toml b/entab/Cargo.toml index 8a84b67..d9d7528 100644 --- a/entab/Cargo.toml +++ b/entab/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "entab" -version = "0.3.0" +version = "0.3.1" authors = ["Roderick "] edition = "2018" description = "Record-format file reader" diff --git a/entab/src/filetype.rs b/entab/src/filetype.rs index 08a6b3f..a2c2845 100644 --- a/entab/src/filetype.rs +++ b/entab/src/filetype.rs @@ -33,6 +33,8 @@ pub enum FileType { // chemoinformatics /// Agilent format used for MS-MS trace data AgilentMsMsScan, // bin 0x01, 0x01 + /// Agilent format used for UV-visible array data + AgilentChemstationDad, /// Agilent format used for flame ionization trace data AgilentChemstationFid, /// Agilent format used for mass spectrometry trace data @@ -99,6 +101,7 @@ impl FileType { b"BAM\x01" => return FileType::Bam, b"@HD\t" | b"@SQ\t" => return FileType::Sam, b"\x2Escf" => return FileType::Scf, + [0x02, 0x33, 0x31, 0x00] => return FileType::AgilentChemstationDad, [0x02, 0x38, 0x31, 0x00] => return FileType::AgilentChemstationFid, [0x01, 0x32, 0x00, 0x00] => return FileType::AgilentChemstationMs, [0x02, 0x33, 0x30, 0x00] => return FileType::AgilentChemstationMwd, @@ -165,7 +168,10 @@ impl FileType { "scf" => &[FileType::Scf], "sd" => &[FileType::AgilentMasshunterDadHeader], "sp" => &[FileType::AgilentMasshunterDad], - "uv" => &[FileType::AgilentChemstationUv], + "uv" => &[ + FileType::AgilentChemstationDad, + FileType::AgilentChemstationUv, + ], "xz" => &[FileType::Lzma], "zstd" => &[FileType::Zstd], "ztr" => 
&[FileType::Ztr], @@ -179,6 +185,7 @@ impl FileType { /// If a file is unsupported, an error will be returned. pub fn to_parser_name<'a>(&self, hint: Option<&'a str>) -> Result<&'a str, EtError> { Ok(match (self, hint) { + (FileType::AgilentChemstationDad, None) => "chemstation_dad", (FileType::AgilentChemstationFid, None) => "chemstation_fid", (FileType::AgilentChemstationMs, None) => "chemstation_ms", (FileType::AgilentChemstationMwd, None) => "chemstation_mwd", diff --git a/entab/src/parsers/agilent/chemstation.rs b/entab/src/parsers/agilent/chemstation.rs index 997f3d2..f837275 100644 --- a/entab/src/parsers/agilent/chemstation.rs +++ b/entab/src/parsers/agilent/chemstation.rs @@ -81,25 +81,32 @@ impl<'r> From<&ChemstationMetadata> for BTreeMap> { } } -fn get_metadata(header: &[u8]) -> Result { - if header.len() < 652 { +fn get_metadata(header: &[u8], has_signal: bool) -> Result { + if has_signal && header.len() < 652 { return Err( EtError::from("Chemstation header needs to be at least 648 bytes long").incomplete(), ); + } else if !has_signal && header.len() < 512 { + return Err( + EtError::from("Chemstation header needs to be at least 512 bytes long").incomplete(), + ); } let start_time = f64::from(i32::extract(&header[282..], &Endian::Big)?) / 60000.; let end_time = f64::from(i32::extract(&header[286..], &Endian::Big)?) 
/ 60000.; - let offset_correction = f64::extract(&header[636..], &Endian::Big)?; - let mult_correction = f64::extract(&header[644..], &Endian::Big)?; + let mut offset_correction = 0.; + let mut mult_correction = 1.; + let mut signal_name = ""; + if has_signal { + offset_correction = f64::extract(&header[636..], &Endian::Big)?; + mult_correction = f64::extract(&header[644..], &Endian::Big)?; - let signal_name_len = usize::from(header[596]); - if signal_name_len > 40 { - return Err("Invalid signal name length".into()); + let signal_name_len = usize::from(header[596]); + if signal_name_len > 40 { + return Err("Invalid signal name length".into()); + } + signal_name = str::from_utf8(&header[597..597 + signal_name_len])?.trim(); } - let signal_name = str::from_utf8(&header[597..597 + signal_name_len])? - .trim() - .to_string(); let sample_len = usize::from(header[24]); if sample_len > 60 { @@ -164,7 +171,7 @@ fn get_metadata(header: &[u8]) -> Result { Ok(ChemstationMetadata { start_time, end_time, - signal_name, + signal_name: signal_name.to_string(), offset_correction, mult_correction, sequence, @@ -213,7 +220,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationFidState { } fn get(&mut self, rb: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { - let metadata = get_metadata(rb)?; + let metadata = get_metadata(rb, true)?; // offset the current time back one step so it'll be right after the first time that parse self.cur_time = metadata.start_time - CHEMSTATION_TIME_STEP; self.cur_intensity = 0.; @@ -312,7 +319,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMsState { } fn get(&mut self, buffer: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { - let metadata = get_metadata(buffer)?; + let metadata = get_metadata(buffer, true)?; let n_scans = u32::extract(&buffer[278..], &Endian::Big)? 
as usize; self.n_scans_left = n_scans; @@ -430,7 +437,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMwdState { } fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { - let metadata = get_metadata(buf)?; + let metadata = get_metadata(buf, true)?; self.n_wvs_left = 0; // offset the current time back one step so it'll be right after the first time that parse @@ -514,55 +521,134 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMwdRecord<'s> { } } -// scratch with offsets for info in different files - -// FID - 02 38 31 00 ("81") (missing 01 38 00 00) -// MWD - 02 33 30 00 ("30") -// MS - 01 32 00 00 ("2") (missing 02 32 30?) -// (possibly also 03 31 37 39 and 03 31 38 31 ?) -// - 5 - "GC / MS Data File" or other? -// - 24 - Sample Name -// - 86 - Sample Description? -// - 148 - Operator Name -// - 178 - Run Date -// - 208 - Instrument Name -// - 218 - LC or GC -// - 228 - Method Name -// - 252 - Sequence? (u16) -// - 254 - Vial? (u16) -// - 256 - Replicate? (u16) -// - 260 - TIC Offset? (i32) -// * 264 - FID/MWD - 512 byte header chunks // 2 + 1 -// - 264 - MS - total header bytes // 2 + 1 -// - 272 - Normalization offset? (i32) -// * 282 - Start Time (i32) -// * 286 - End Time (i32) -// M 322 - Collection software? -// M 355 - Software Version? -// - 368 - "GC / MS Data File" as utf16 -// M 405 - Another Version? -// - 448 - MS - Instrument name as utf16 -// - 530 - lower end of mz/wv range? -// - 532 - upper end of mz/wv range? -// - 576 - MS - "GC" -// - 580 - Units -// M 596 - Channel Info (str) -// - 616 - MS - Method directory -// - 644 - (f32/64?) -// - 5768 - MS - data start (GC) - -// LC - 03 31 33 31 ("131") -// * 264 - 512 byte header chunks // 2 + 1 -// ? 278 - Number of Records -// - 858 - Sample Name -// - 1880 - Operator Name -// - 2391 - Run Date -// - 2492 - Instrument Name -// - 2533 - "LC" -// - 2574 - Method Name -// - 3093 - Units -// 4096 - data start? 
+#[derive(Clone, Debug, Default)]
+/// Internal state for the `ChemstationDadRecord` parser
+pub struct ChemstationDadState {
+    n_scans_left: usize, // scan headers not yet read; seeded from the u32 at header offset 278
+    n_bytes_left: usize, // payload bytes still unread in the scan currently being decoded
+    cur_time: f64, // raw time of the current scan (divided by 60_000 when emitted)
+    cur_intensity: f64, // raw running intensity (divided by 2000 when emitted)
+    cur_wv: f64, // raw wavelength of the current point (divided by 20 when emitted)
+    wv_step: f64, // raw wavelength increment between consecutive points of one scan
+    metadata: ChemstationMetadata,
+}
+
+impl StateMetadata for ChemstationDadState {
+    fn metadata(&self) -> BTreeMap<String, Value> {
+        (&self.metadata).into()
+    }
+
+    fn header(&self) -> Vec<&str> {
+        vec!["time", "wavelength", "intensity"]
+    }
+}
+
+impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationDadState {
+    type State = ();
+
+    fn parse(
+        rb: &[u8],
+        _eof: bool,
+        consumed: &mut usize,
+        _state: &mut Self::State,
+    ) -> Result<bool, EtError> {
+        *consumed += read_agilent_header(rb, false)?; // header size comes from the shared helper
+        Ok(true)
+    }
+    fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> {
+        let metadata = get_metadata(buf, false)?; // `false`: no signal block is read for DAD
+        let n_scans = u32::extract(&buf[278..], &Endian::Big)? as usize; // big-endian scan count
+
+        self.n_scans_left = n_scans;
+        self.metadata = metadata;
+        Ok(())
+    }
+}
+
+#[derive(Clone, Copy, Debug, Default)]
+/// A single point from an e.g. 
diode array detector trace
+pub struct ChemstationDadRecord {
+    /// The time recorded at
+    pub time: f64,
+    /// The wavelength recorded at
+    pub wavelength: f64,
+    /// The intensity recorded
+    pub intensity: f64,
+}
+
+impl_record!(ChemstationDadRecord: time, wavelength, intensity);
+
+impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationDadRecord {
+    type State = ChemstationDadState;
+
+    fn parse(
+        rb: &[u8],
+        _eof: bool,
+        consumed: &mut usize,
+        state: &mut Self::State,
+    ) -> Result<bool, EtError> {
+        if state.n_scans_left == 0 && state.n_bytes_left == 0 { // keep draining the final scan
+            return Ok(false);
+        }
+        let con = &mut 0;
+        let mut n_scans_left = state.n_scans_left;
+        let mut n_bytes_left = state.n_bytes_left;
+        if n_bytes_left == 0 {
+            let scan_type = extract::<u16>(rb, con, &mut Endian::Little)?; // only 67 is decoded
+            if scan_type != 67 {
+                // i'm not sure we ever hit this (tracking the n_scans_left should prevent it), but
+                // sometimes there's a different type of scan (68) at the end which starts a stream
+                // of u16, u32, u32 data; the u32's appear to both increment separately and the u16
+                // is either 80 or 81 ~95% of the time and a number in the 50s-60s otherwise.
+                return Ok(false);
+            }
+            n_bytes_left =
+                usize::from(extract::<u16>(rb, con, &mut Endian::Little)?.saturating_sub(22));
+            state.cur_time = f64::from(extract::<u32>(rb, con, &mut Endian::Little)?);
+            state.cur_wv = f64::from(extract::<u16>(rb, con, &mut Endian::Little)?);
+            let _ = extract::<u16>(rb, con, &mut Endian::Little)?; // the end wavelength
+            state.wv_step = f64::from(extract::<u16>(rb, con, &mut Endian::Little)?);
+            let _ = extract::<&[u8]>(rb, con, &mut 8)?; // 8 unused bytes; header totals 22
+            state.cur_intensity = 0.;
+            if n_bytes_left == 0 {
+                // TODO: consume the rest of the file so this can't accidentally repeat?
+                return Ok(false);
+            }
+            n_scans_left -= 1;
+        } else {
+            state.cur_wv += state.wv_step; // NOTE(review): not undone if the read below fails
+        }
+
+        let intensity: i16 = extract(rb, con, &mut Endian::Little)?; // delta from previous point
+        if intensity == -32768 {
+            state.cur_intensity = f64::from(extract::<i32>(rb, con, &mut Endian::Little)?);
+            state.n_bytes_left = n_bytes_left.saturating_sub(6);
+        } else {
+            state.cur_intensity += f64::from(intensity);
+            state.n_bytes_left = n_bytes_left.saturating_sub(2);
+        }
+
+        state.n_scans_left = n_scans_left;
+        *consumed += *con;
+        Ok(true)
+    }
+
+    fn get(&mut self, _rb: &'b [u8], state: &'s Self::State) -> Result<(), EtError> {
+        self.wavelength = state.cur_wv / 20.; // raw value is in 1/20ths (presumably of a nm)
+        self.time = state.cur_time / 60_000.; // same scaling get_metadata applies to start_time
+        self.intensity = state.cur_intensity / 2000.;
+        Ok(())
+    }
+}
+
+impl_reader!(
+    ChemstationDadReader,
+    ChemstationDadRecord,
+    ChemstationDadRecord,
+    ChemstationDadState,
+    ()
+);
 impl_reader!(
     ChemstationFidReader,
     ChemstationFidRecord,
diff --git a/entab/src/parsers/agilent/chemstation_new.rs b/entab/src/parsers/agilent/chemstation_new.rs
index 206a5f1..db9f231 100644
--- a/entab/src/parsers/agilent/chemstation_new.rs
+++ b/entab/src/parsers/agilent/chemstation_new.rs
@@ -224,20 +224,6 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationUvRecord {
     }
 }
 
-// scratch with offsets for info in different files
-
-// LC - 03 31 33 31 ("131")
-// * 264 - 512 byte header chunks // 2 + 1
-// ? 278 - Number of Records
-// - 858 - Sample Name
-// - 1880 - Operator Name
-// - 2391 - Run Date
-// - 2492 - Instrument Name
-// - 2533 - "LC"
-// - 2574 - Method Name
-// - 3093 - Units
-// 4096 - data start? 
- impl_reader!( ChemstationUvReader, ChemstationUvRecord, diff --git a/entab/src/parsers/agilent/mod.rs b/entab/src/parsers/agilent/mod.rs index 73d3e56..3af3637 100644 --- a/entab/src/parsers/agilent/mod.rs +++ b/entab/src/parsers/agilent/mod.rs @@ -1,11 +1,9 @@ -// TODO: finish and reenable this -// /// Readers for instrument telemetry data generated by Chemstation -// pub mod chemstation_reg; /// Readers for formats generated by the GC/LC control software Chemstation pub mod chemstation; /// Readers for newer formats generated by the GC/LC control software Chemstation pub mod chemstation_new; -// /// Reader for Chemstation's logging files +// TODO: finish and reenable this +// /// Readers for instrument telemetry data generated by Chemstation // pub mod chemstation_reg; /// Readers for formats generated by the GC/LC control software Masshunter #[cfg(feature = "std")] diff --git a/entab/src/readers.rs b/entab/src/readers.rs index 4282d84..65fdb31 100644 --- a/entab/src/readers.rs +++ b/entab/src/readers.rs @@ -41,6 +41,9 @@ fn _get_reader<'n, 'p, 'r>( ) -> Result<(Box, &'n str), EtError> { let reader: Box = match parser_name { "bam" => Box::new(parsers::sam::BamReader::new(rb, None)?), + "chemstation_dad" => Box::new(parsers::agilent::chemstation::ChemstationDadReader::new( + rb, None, + )?), "chemstation_fid" => Box::new(parsers::agilent::chemstation::ChemstationFidReader::new( rb, None, )?),