From 2b08951c54dc066b33b32ad5b3656afda4f08d07 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 30 Apr 2024 10:29:50 +0200 Subject: [PATCH 01/13] fix: use array instead of Vec for Codon representation --- Cargo.toml | 1 + src/sequences.rs | 75 ++++++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6a984ca..19e762e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ serde = { version = "1.0", features = ["derive"] } thiserror = "1.0" indexmap = { version = "2", features = ["serde"] } biocommons-bioutils = "0.1.0" +ahash = "0.8.11" [dev-dependencies] anyhow = "1.0" diff --git a/src/sequences.rs b/src/sequences.rs index 5a183d2..b53fccb 100644 --- a/src/sequences.rs +++ b/src/sequences.rs @@ -2,8 +2,8 @@ //! //! Partially ported over from `bioutils.sequences`. +use ahash::AHashMap; use md5::{Digest, Md5}; -use rustc_hash::FxHashMap; pub use crate::sequences::error::Error; @@ -713,42 +713,48 @@ lazy_static::lazy_static! 
{ ("YTR", "L"), ]; - static ref AA1_TO_AA3: FxHashMap<&'static [u8], &'static str> = { - let mut m = FxHashMap::default(); + static ref AA1_TO_AA3: AHashMap<&'static [u8], &'static str> = { + let mut m = AHashMap::default(); for (aa3, aa1) in AA3_TO_AA1_VEC.iter() { m.insert(aa1.as_bytes(), *aa3); } m }; - static ref AA3_TO_AA1: FxHashMap<&'static [u8], &'static str> = { - let mut m = FxHashMap::default(); + static ref AA3_TO_AA1: AHashMap<&'static [u8], &'static str> = { + let mut m = AHashMap::default(); for (aa3, aa1) in AA3_TO_AA1_VEC.iter() { m.insert(aa3.as_bytes(), *aa1); } m }; - static ref DNA_TO_AA1_LUT: FxHashMap, u8> = { - let mut m = FxHashMap::default(); + static ref DNA_TO_AA1_LUT: AHashMap = { + let mut m = AHashMap::default(); for (dna, aa1) in DNA_TO_AA1_LUT_VEC.iter() { - m.insert(Vec::from(dna.as_bytes()), aa1.as_bytes()[0]); + assert_eq!(dna.len(), 3); + let d = dna.as_bytes(); + m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); } m }; - static ref DNA_TO_AA1_SEC: FxHashMap, u8> = { - let mut m = FxHashMap::default(); + static ref DNA_TO_AA1_SEC: AHashMap = { + let mut m = AHashMap::default(); for (dna, aa1) in DNA_TO_AA1_SEC_VEC.iter() { - m.insert(Vec::from(dna.as_bytes()), aa1.as_bytes()[0]); + assert_eq!(dna.len(), 3); + let d = dna.as_bytes(); + m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); } m }; - static ref DNA_TO_AA1_CHRMT_VERTEBRATE: FxHashMap, u8> = { - let mut m = FxHashMap::default(); + static ref DNA_TO_AA1_CHRMT_VERTEBRATE: AHashMap = { + let mut m = AHashMap::default(); for (dna, aa1) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC.iter() { - m.insert(Vec::from(dna.as_bytes()), aa1.as_bytes()[0]); + assert_eq!(dna.len(), 3); + let d = dna.as_bytes(); + m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); } m }; @@ -926,6 +932,8 @@ fn looks_like_aa3_p(seq: &str) -> bool { seq.len() % 3 == 0 && seq.chars().nth(1).map(|c| c.is_lowercase()).unwrap_or(true) } +type Codon = [u8; 3]; + /// Allow translation of `&[u8]` DNA codons to `u8` amino acids. 
/// /// We use separate structs here to encapsulate getting the lazy static global data. @@ -940,10 +948,10 @@ struct CodonTranslator { /// Mapping from 2bit DNA codon to amino acid 1-letter ASCII. codon_2bit_to_aa1: &'static [u8; 64], /// Mapping from DNA 2-bit to amino acid 1-letter ASCII including degenerate codons. - full_dna_to_aa1: &'static FxHashMap, u8>, + full_dna_to_aa1: &'static AHashMap, /// Buffer. - codon: Vec, + codon: Codon, } impl CodonTranslator { @@ -965,7 +973,7 @@ impl CodonTranslator { TranslationTable::VertebrateMitochondrial => &DNA_TO_AA1_CHRMT_VERTEBRATE, }, - codon: Vec::with_capacity(3), + codon: [0; 3], } } @@ -981,31 +989,31 @@ impl CodonTranslator { pub fn translate(&mut self, codon: &[u8]) -> Result { // Normalize (to upper case etc.) codon. self.normalize_codon(codon); - // Attempt fast translation of codon. - if let Some(aa) = self.codon_to_aa1(&self.codon) { - return Ok(aa); - } - if let Some(aa) = self.full_dna_to_aa1.get(&self.codon) { + + let translation = self + // Attempt fast translation of codon + .codon_to_aa1(&self.codon) // Fast translation fails, but slower hash map succeeded. - Ok(*aa) - } else { + .or_else(|| self.full_dna_to_aa1.get(&self.codon).copied()) // If this contains an ambiguous code, set aa to X, otherwise, throw error - for c in codon.iter() { - if self.iupac_ambiguity_codes.contains(c) { - return Ok(b'X'); - } - } - return Err(Error::UndefinedCodon( + .or_else(|| { + codon + .iter() + .any(|c| self.iupac_ambiguity_codes.contains(c)) + .then_some(b'X') + }); + translation.ok_or_else(|| { + Error::UndefinedCodon( std::str::from_utf8(codon) .expect("cannot decode UTF-8") .to_owned(), - )); - } + ) + }) } fn dna3_to_2bit(&self, c: &[u8]) -> Option { let mut result = 0; - for i in c.iter().take(3) { + for i in &c[..3] { result <<= 2; let tmp = self.dna_ascii_to_2bit[*i as usize]; if tmp == 255 { @@ -1018,7 +1026,6 @@ impl CodonTranslator { /// Helper function to extract normalized codon to `self.codon`. 
fn normalize_codon(&mut self, codon: &[u8]) { - self.codon.resize(3, 0); for (i, c) in codon.iter().enumerate() { self.codon[i] = self.dna_ascii_map[*c as usize]; } From 5ae5d05c675906e72aa3b46f3a98a51de3824407 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 May 2024 12:13:36 +0200 Subject: [PATCH 02/13] don't allocate until necessary --- src/mapper/alignment.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs index 3456b76..195932e 100644 --- a/src/mapper/alignment.rs +++ b/src/mapper/alignment.rs @@ -22,6 +22,7 @@ // n. -2 -1 ! 1 2 3 4 5 6 7 8 9 // g. ... 123 124 125 126 127 128 129 130 131 132 133 ... +use std::iter::once; use std::sync::Arc; use crate::{ @@ -162,21 +163,18 @@ impl Mapper { // exons are adjacent. Assert that here. let mut sorted_exons = tx_exons.clone(); sorted_exons - .sort_by(|a, b| a.ord.partial_cmp(&b.ord).expect("comparison failed / NaN?")); - let offenders = sorted_exons - .windows(2) - .filter(|pair| { - let lhs = &pair[0]; - let rhs = &pair[1]; - lhs.tx_end_i != rhs.tx_start_i - }) - .collect::>(); - if !offenders.is_empty() { + .sort_unstable_by(|a, b| a.ord.partial_cmp(&b.ord).expect("comparison failed / NaN?")); + let mut offenders = sorted_exons.windows(2).filter(|pair| { + let lhs = &pair[0]; + let rhs = &pair[1]; + lhs.tx_end_i != rhs.tx_start_i + }); + if let Some(offender) = offenders.next() { return Err(Error::NonAdjacentExons( tx_ac.to_string(), alt_ac.to_string(), alt_aln_method.to_string(), - format!("{:?}", offenders), + format!("{:?}", (once(offender).chain(offenders)).collect::>()), )); } From 191fc8d8800727bf33046dc94b4e4c4ba84af66b Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 May 2024 15:27:19 +0200 Subject: [PATCH 03/13] reduce usage of lazy_static, use consts instead where possible, copy instead of reference lookup tables --- src/sequences.rs | 1292 +++++++++++++++++++++++----------------------- 1 file 
changed, 652 insertions(+), 640 deletions(-) diff --git a/src/sequences.rs b/src/sequences.rs index b53fccb..dda64dc 100644 --- a/src/sequences.rs +++ b/src/sequences.rs @@ -83,46 +83,54 @@ pub fn revcomp(seq: &str) -> String { .to_string() } -lazy_static::lazy_static! { - /// Mapping for DNA characters for normalization. - static ref DNA_ASCII_MAP: [u8; 256] = { - let mut result = [0; 256]; - - for c in 0..=255 { - if c == b'u' || c == b'U' { - result[c as usize] = b'T'; - } else if c.is_ascii_lowercase() { - result[c as usize] = c.to_ascii_uppercase(); - } else { - result[c as usize] = c; - } - } - - result - }; -} - -lazy_static::lazy_static! { - static ref DNA_ASCII_TO_2BIT: [u8; 256] = { - let mut result = [255; 256]; - - result[b'A' as usize] = 0; - result[b'a' as usize] = 0; - - result[b'C' as usize] = 1; - result[b'c' as usize] = 1; - - result[b'G' as usize] = 2; - result[b'g' as usize] = 2; - - result[b'T' as usize] = 3; - result[b't' as usize] = 3; - result[b'U' as usize] = 3; - result[b'u' as usize] = 3; - - result - }; -} +/// Mapping for DNA characters for normalization. 
+/// Built via +/// ``` +/// let mut result = [0; 256]; +/// for c in 0..=255 { +/// if c == b'u' || c == b'U' { +/// result[c as usize] = b'T'; +/// } else if c.is_ascii_lowercase() { +/// result[c as usize] = c.to_ascii_uppercase(); +/// } else { +/// result[c as usize] = c; +/// } +/// } +/// ``` +/// Could probably be done in build.rs +const DNA_ASCII_MAP: [u8; 256] = [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, + 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 65, + 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 84, 86, 87, 88, 89, + 90, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, + 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, + 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, + 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, + 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, + 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +]; +const DNA_ASCII_TO_2BIT: [u8; 256] = { + let mut result = [255; 256]; + + result[b'A' as usize] = 0; + result[b'a' as usize] = 0; + + result[b'C' as usize] = 1; + result[b'c' as usize] = 1; + + result[b'G' as usize] = 2; + result[b'g' as usize] = 2; + + result[b'T' as usize] = 3; + result[b't' as usize] = 3; + result[b'U' as usize] = 3; + result[b'u' as usize] = 3; + result +}; fn dna3_to_2bit(c: &[u8]) -> Option { let mut result = 0; @@ 
-137,582 +145,598 @@ fn dna3_to_2bit(c: &[u8]) -> Option { Some(result) } -lazy_static::lazy_static! { - pub static ref AA3_TO_AA1_VEC: Vec<(&'static str, &'static str)> = vec![ - ("Ala", "A"), - ("Arg", "R"), - ("Asn", "N"), - ("Asp", "D"), - ("Cys", "C"), - ("Gln", "Q"), - ("Glu", "E"), - ("Gly", "G"), - ("His", "H"), - ("Ile", "I"), - ("Leu", "L"), - ("Lys", "K"), - ("Met", "M"), - ("Phe", "F"), - ("Pro", "P"), - ("Ser", "S"), - ("Thr", "T"), - ("Trp", "W"), - ("Tyr", "Y"), - ("Val", "V"), - ("Xaa", "X"), - ("Ter", "*"), - ("Sec", "U"), - ]; - - /// NCBI standard translation table. - pub static ref DNA_TO_AA1_LUT_VEC: Vec<(&'static str, &'static str)> = vec![ - ("AAA", "K"), - ("AAC", "N"), - ("AAG", "K"), - ("AAT", "N"), - ("ACA", "T"), - ("ACC", "T"), - ("ACG", "T"), - ("ACT", "T"), - ("AGA", "R"), - ("AGC", "S"), - ("AGG", "R"), - ("AGT", "S"), - ("ATA", "I"), - ("ATC", "I"), - ("ATG", "M"), - ("ATT", "I"), - ("CAA", "Q"), - ("CAC", "H"), - ("CAG", "Q"), - ("CAT", "H"), - ("CCA", "P"), - ("CCC", "P"), - ("CCG", "P"), - ("CCT", "P"), - ("CGA", "R"), - ("CGC", "R"), - ("CGG", "R"), - ("CGT", "R"), - ("CTA", "L"), - ("CTC", "L"), - ("CTG", "L"), - ("CTT", "L"), - ("GAA", "E"), - ("GAC", "D"), - ("GAG", "E"), - ("GAT", "D"), - ("GCA", "A"), - ("GCC", "A"), - ("GCG", "A"), - ("GCT", "A"), - ("GGA", "G"), - ("GGC", "G"), - ("GGG", "G"), - ("GGT", "G"), - ("GTA", "V"), - ("GTC", "V"), - ("GTG", "V"), - ("GTT", "V"), - ("TAA", "*"), - ("TAC", "Y"), - ("TAG", "*"), - ("TAT", "Y"), - ("TCA", "S"), - ("TCC", "S"), - ("TCG", "S"), - ("TCT", "S"), - // caveat lector - ("TGA", "*"), - ("TGC", "C"), - ("TGG", "W"), - ("TGT", "C"), - ("TTA", "L"), - ("TTC", "F"), - ("TTG", "L"), - ("TTT", "F"), - // degenerate codons - ("AAR", "K"), - ("AAY", "N"), - ("ACB", "T"), - ("ACD", "T"), - ("ACH", "T"), - ("ACK", "T"), - ("ACM", "T"), - ("ACN", "T"), - ("ACR", "T"), - ("ACS", "T"), - ("ACV", "T"), - ("ACW", "T"), - ("ACY", "T"), - ("AGR", "R"), - ("AGY", "S"), - ("ATH", "I"), - 
("ATM", "I"), - ("ATW", "I"), - ("ATY", "I"), - ("CAR", "Q"), - ("CAY", "H"), - ("CCB", "P"), - ("CCD", "P"), - ("CCH", "P"), - ("CCK", "P"), - ("CCM", "P"), - ("CCN", "P"), - ("CCR", "P"), - ("CCS", "P"), - ("CCV", "P"), - ("CCW", "P"), - ("CCY", "P"), - ("CGB", "R"), - ("CGD", "R"), - ("CGH", "R"), - ("CGK", "R"), - ("CGM", "R"), - ("CGN", "R"), - ("CGR", "R"), - ("CGS", "R"), - ("CGV", "R"), - ("CGW", "R"), - ("CGY", "R"), - ("CTB", "L"), - ("CTD", "L"), - ("CTH", "L"), - ("CTK", "L"), - ("CTM", "L"), - ("CTN", "L"), - ("CTR", "L"), - ("CTS", "L"), - ("CTV", "L"), - ("CTW", "L"), - ("CTY", "L"), - ("GAR", "E"), - ("GAY", "D"), - ("GCB", "A"), - ("GCD", "A"), - ("GCH", "A"), - ("GCK", "A"), - ("GCM", "A"), - ("GCN", "A"), - ("GCR", "A"), - ("GCS", "A"), - ("GCV", "A"), - ("GCW", "A"), - ("GCY", "A"), - ("GGB", "G"), - ("GGD", "G"), - ("GGH", "G"), - ("GGK", "G"), - ("GGM", "G"), - ("GGN", "G"), - ("GGR", "G"), - ("GGS", "G"), - ("GGV", "G"), - ("GGW", "G"), - ("GGY", "G"), - ("GTB", "V"), - ("GTD", "V"), - ("GTH", "V"), - ("GTK", "V"), - ("GTM", "V"), - ("GTN", "V"), - ("GTR", "V"), - ("GTS", "V"), - ("GTV", "V"), - ("GTW", "V"), - ("GTY", "V"), - ("MGA", "R"), - ("MGG", "R"), - ("MGR", "R"), - ("TAR", "*"), - ("TAY", "Y"), - ("TCB", "S"), - ("TCD", "S"), - ("TCH", "S"), - ("TCK", "S"), - ("TCM", "S"), - ("TCN", "S"), - ("TCR", "S"), - ("TCS", "S"), - ("TCV", "S"), - ("TCW", "S"), - ("TCY", "S"), - ("TGY", "C"), - ("TRA", "*"), - ("TTR", "L"), - ("TTY", "F"), - ("YTA", "L"), - ("YTG", "L"), - ("YTR", "L"), - ]; - - /// Translation table for selenocysteine. 
- pub static ref DNA_TO_AA1_SEC_VEC: Vec<(&'static str, &'static str)> = vec![ - ("AAA", "K"), - ("AAC", "N"), - ("AAG", "K"), - ("AAT", "N"), - ("ACA", "T"), - ("ACC", "T"), - ("ACG", "T"), - ("ACT", "T"), - ("AGA", "R"), - ("AGC", "S"), - ("AGG", "R"), - ("AGT", "S"), - ("ATA", "I"), - ("ATC", "I"), - ("ATG", "M"), - ("ATT", "I"), - ("CAA", "Q"), - ("CAC", "H"), - ("CAG", "Q"), - ("CAT", "H"), - ("CCA", "P"), - ("CCC", "P"), - ("CCG", "P"), - ("CCT", "P"), - ("CGA", "R"), - ("CGC", "R"), - ("CGG", "R"), - ("CGT", "R"), - ("CTA", "L"), - ("CTC", "L"), - ("CTG", "L"), - ("CTT", "L"), - ("GAA", "E"), - ("GAC", "D"), - ("GAG", "E"), - ("GAT", "D"), - ("GCA", "A"), - ("GCC", "A"), - ("GCG", "A"), - ("GCT", "A"), - ("GGA", "G"), - ("GGC", "G"), - ("GGG", "G"), - ("GGT", "G"), - ("GTA", "V"), - ("GTC", "V"), - ("GTG", "V"), - ("GTT", "V"), - ("TAA", "*"), - ("TAC", "Y"), - ("TAG", "*"), - ("TAT", "Y"), - ("TCA", "S"), - ("TCC", "S"), - ("TCG", "S"), - ("TCT", "S"), - // caveat lector - ("TGA", "U"), - ("TGC", "C"), - ("TGG", "W"), - ("TGT", "C"), - ("TTA", "L"), - ("TTC", "F"), - ("TTG", "L"), - ("TTT", "F"), - // degenerate codons - ("AAR", "K"), - ("AAY", "N"), - ("ACB", "T"), - ("ACD", "T"), - ("ACH", "T"), - ("ACK", "T"), - ("ACM", "T"), - ("ACN", "T"), - ("ACR", "T"), - ("ACS", "T"), - ("ACV", "T"), - ("ACW", "T"), - ("ACY", "T"), - ("AGR", "R"), - ("AGY", "S"), - ("ATH", "I"), - ("ATM", "I"), - ("ATW", "I"), - ("ATY", "I"), - ("CAR", "Q"), - ("CAY", "H"), - ("CCB", "P"), - ("CCD", "P"), - ("CCH", "P"), - ("CCK", "P"), - ("CCM", "P"), - ("CCN", "P"), - ("CCR", "P"), - ("CCS", "P"), - ("CCV", "P"), - ("CCW", "P"), - ("CCY", "P"), - ("CGB", "R"), - ("CGD", "R"), - ("CGH", "R"), - ("CGK", "R"), - ("CGM", "R"), - ("CGN", "R"), - ("CGR", "R"), - ("CGS", "R"), - ("CGV", "R"), - ("CGW", "R"), - ("CGY", "R"), - ("CTB", "L"), - ("CTD", "L"), - ("CTH", "L"), - ("CTK", "L"), - ("CTM", "L"), - ("CTN", "L"), - ("CTR", "L"), - ("CTS", "L"), - ("CTV", "L"), - ("CTW", "L"), - 
("CTY", "L"), - ("GAR", "E"), - ("GAY", "D"), - ("GCB", "A"), - ("GCD", "A"), - ("GCH", "A"), - ("GCK", "A"), - ("GCM", "A"), - ("GCN", "A"), - ("GCR", "A"), - ("GCS", "A"), - ("GCV", "A"), - ("GCW", "A"), - ("GCY", "A"), - ("GGB", "G"), - ("GGD", "G"), - ("GGH", "G"), - ("GGK", "G"), - ("GGM", "G"), - ("GGN", "G"), - ("GGR", "G"), - ("GGS", "G"), - ("GGV", "G"), - ("GGW", "G"), - ("GGY", "G"), - ("GTB", "V"), - ("GTD", "V"), - ("GTH", "V"), - ("GTK", "V"), - ("GTM", "V"), - ("GTN", "V"), - ("GTR", "V"), - ("GTS", "V"), - ("GTV", "V"), - ("GTW", "V"), - ("GTY", "V"), - ("MGA", "R"), - ("MGG", "R"), - ("MGR", "R"), - ("TAR", "*"), - ("TAY", "Y"), - ("TCB", "S"), - ("TCD", "S"), - ("TCH", "S"), - ("TCK", "S"), - ("TCM", "S"), - ("TCN", "S"), - ("TCR", "S"), - ("TCS", "S"), - ("TCV", "S"), - ("TCW", "S"), - ("TCY", "S"), - ("TGY", "C"), - ("TRA", "*"), - ("TTR", "L"), - ("TTY", "F"), - ("YTA", "L"), - ("YTG", "L"), - ("YTR", "L"), - ]; - - /// Vertebrate mitochondrial code, cf. https://en.wikipedia.org/wiki/Vertebrate_mitochondrial_code - pub static ref DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: Vec<(&'static str, &'static str)> = vec![ - ("AAA", "K"), - ("AAC", "N"), - ("AAG", "K"), - ("AAT", "N"), - ("ACA", "T"), - ("ACC", "T"), - ("ACG", "T"), - ("ACT", "T"), - // caveat lector - ("AGA", "*"), - ("AGC", "S"), - // caveat lector - ("AGG", "*"), - ("AGT", "S"), - // caveat lector - ("ATA", "M"), - ("ATC", "I"), - ("ATG", "M"), - ("ATT", "I"), - ("CAA", "Q"), - ("CAC", "H"), - ("CAG", "Q"), - ("CAT", "H"), - ("CCA", "P"), - ("CCC", "P"), - ("CCG", "P"), - ("CCT", "P"), - ("CGA", "R"), - ("CGC", "R"), - ("CGG", "R"), - ("CGT", "R"), - ("CTA", "L"), - ("CTC", "L"), - ("CTG", "L"), - ("CTT", "L"), - ("GAA", "E"), - ("GAC", "D"), - ("GAG", "E"), - ("GAT", "D"), - ("GCA", "A"), - ("GCC", "A"), - ("GCG", "A"), - ("GCT", "A"), - ("GGA", "G"), - ("GGC", "G"), - ("GGG", "G"), - ("GGT", "G"), - ("GTA", "V"), - ("GTC", "V"), - ("GTG", "V"), - ("GTT", "V"), - ("TAA", "*"), - ("TAC", "Y"), 
- ("TAG", "*"), - ("TAT", "Y"), - ("TCA", "S"), - ("TCC", "S"), - ("TCG", "S"), - ("TCT", "S"), - // caveat lector - ("TGA", "W"), - ("TGC", "C"), - ("TGG", "W"), - ("TGT", "C"), - ("TTA", "L"), - ("TTC", "F"), - ("TTG", "L"), - ("TTT", "F"), - // degenerate codons - ("AAR", "K"), - ("AAY", "N"), - ("ACB", "T"), - ("ACD", "T"), - ("ACH", "T"), - ("ACK", "T"), - ("ACM", "T"), - ("ACN", "T"), - ("ACR", "T"), - ("ACS", "T"), - ("ACV", "T"), - ("ACW", "T"), - ("ACY", "T"), - ("AGR", "R"), - ("AGY", "S"), - ("ATH", "I"), - ("ATM", "I"), - ("ATW", "I"), - ("ATY", "I"), - ("CAR", "Q"), - ("CAY", "H"), - ("CCB", "P"), - ("CCD", "P"), - ("CCH", "P"), - ("CCK", "P"), - ("CCM", "P"), - ("CCN", "P"), - ("CCR", "P"), - ("CCS", "P"), - ("CCV", "P"), - ("CCW", "P"), - ("CCY", "P"), - ("CGB", "R"), - ("CGD", "R"), - ("CGH", "R"), - ("CGK", "R"), - ("CGM", "R"), - ("CGN", "R"), - ("CGR", "R"), - ("CGS", "R"), - ("CGV", "R"), - ("CGW", "R"), - ("CGY", "R"), - ("CTB", "L"), - ("CTD", "L"), - ("CTH", "L"), - ("CTK", "L"), - ("CTM", "L"), - ("CTN", "L"), - ("CTR", "L"), - ("CTS", "L"), - ("CTV", "L"), - ("CTW", "L"), - ("CTY", "L"), - ("GAR", "E"), - ("GAY", "D"), - ("GCB", "A"), - ("GCD", "A"), - ("GCH", "A"), - ("GCK", "A"), - ("GCM", "A"), - ("GCN", "A"), - ("GCR", "A"), - ("GCS", "A"), - ("GCV", "A"), - ("GCW", "A"), - ("GCY", "A"), - ("GGB", "G"), - ("GGD", "G"), - ("GGH", "G"), - ("GGK", "G"), - ("GGM", "G"), - ("GGN", "G"), - ("GGR", "G"), - ("GGS", "G"), - ("GGV", "G"), - ("GGW", "G"), - ("GGY", "G"), - ("GTB", "V"), - ("GTD", "V"), - ("GTH", "V"), - ("GTK", "V"), - ("GTM", "V"), - ("GTN", "V"), - ("GTR", "V"), - ("GTS", "V"), - ("GTV", "V"), - ("GTW", "V"), - ("GTY", "V"), - ("MGA", "R"), - ("MGG", "R"), - ("MGR", "R"), - ("TAR", "*"), - ("TAY", "Y"), - ("TCB", "S"), - ("TCD", "S"), - ("TCH", "S"), - ("TCK", "S"), - ("TCM", "S"), - ("TCN", "S"), - ("TCR", "S"), - ("TCS", "S"), - ("TCV", "S"), - ("TCW", "S"), - ("TCY", "S"), - ("TGY", "C"), - ("TRA", "*"), - ("TTR", "L"), - 
("TTY", "F"), - ("YTA", "L"), - ("YTG", "L"), - ("YTR", "L"), - ]; +pub const AA3_TO_AA1_VEC: &[(&str, &str)] = &[ + ("Ala", "A"), + ("Arg", "R"), + ("Asn", "N"), + ("Asp", "D"), + ("Cys", "C"), + ("Gln", "Q"), + ("Glu", "E"), + ("Gly", "G"), + ("His", "H"), + ("Ile", "I"), + ("Leu", "L"), + ("Lys", "K"), + ("Met", "M"), + ("Phe", "F"), + ("Pro", "P"), + ("Ser", "S"), + ("Thr", "T"), + ("Trp", "W"), + ("Tyr", "Y"), + ("Val", "V"), + ("Xaa", "X"), + ("Ter", "*"), + ("Sec", "U"), +]; + +const DNA_TO_AA1_LUT_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + ("AGA", "R"), + ("AGC", "S"), + ("AGG", "R"), + ("AGT", "S"), + ("ATA", "I"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "*"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", 
"P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + ("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + +/// Translation table for selenocysteine. 
+const DNA_TO_AA1_SEC_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + ("AGA", "R"), + ("AGC", "S"), + ("AGG", "R"), + ("AGT", "S"), + ("ATA", "I"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + ("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "U"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + ("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + 
("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + +/// Vertebrate mitochondrial code, cf. https://en.wikipedia.org/wiki/Vertebrate_mitochondrial_code +const DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: &[(&str, &str)] = &[ + ("AAA", "K"), + ("AAC", "N"), + ("AAG", "K"), + ("AAT", "N"), + ("ACA", "T"), + ("ACC", "T"), + ("ACG", "T"), + ("ACT", "T"), + // caveat lector + ("AGA", "*"), + ("AGC", "S"), + // caveat lector + ("AGG", "*"), + ("AGT", "S"), + // caveat lector + ("ATA", "M"), + ("ATC", "I"), + ("ATG", "M"), + ("ATT", "I"), + ("CAA", "Q"), + ("CAC", "H"), + ("CAG", "Q"), + ("CAT", "H"), + ("CCA", "P"), + ("CCC", "P"), + ("CCG", "P"), + ("CCT", "P"), + ("CGA", "R"), + ("CGC", "R"), + ("CGG", "R"), + ("CGT", "R"), + ("CTA", "L"), + ("CTC", "L"), + ("CTG", "L"), + ("CTT", "L"), + ("GAA", "E"), + ("GAC", "D"), + ("GAG", "E"), + ("GAT", "D"), + ("GCA", "A"), + ("GCC", "A"), + ("GCG", "A"), + ("GCT", "A"), + ("GGA", "G"), + ("GGC", "G"), + ("GGG", "G"), + ("GGT", "G"), + ("GTA", "V"), + ("GTC", "V"), + ("GTG", "V"), + ("GTT", "V"), + ("TAA", "*"), + ("TAC", "Y"), + ("TAG", "*"), + ("TAT", "Y"), + ("TCA", "S"), + ("TCC", "S"), + 
("TCG", "S"), + ("TCT", "S"), + // caveat lector + ("TGA", "W"), + ("TGC", "C"), + ("TGG", "W"), + ("TGT", "C"), + ("TTA", "L"), + ("TTC", "F"), + ("TTG", "L"), + ("TTT", "F"), + // degenerate codons + ("AAR", "K"), + ("AAY", "N"), + ("ACB", "T"), + ("ACD", "T"), + ("ACH", "T"), + ("ACK", "T"), + ("ACM", "T"), + ("ACN", "T"), + ("ACR", "T"), + ("ACS", "T"), + ("ACV", "T"), + ("ACW", "T"), + ("ACY", "T"), + ("AGR", "R"), + ("AGY", "S"), + ("ATH", "I"), + ("ATM", "I"), + ("ATW", "I"), + ("ATY", "I"), + ("CAR", "Q"), + ("CAY", "H"), + ("CCB", "P"), + ("CCD", "P"), + ("CCH", "P"), + ("CCK", "P"), + ("CCM", "P"), + ("CCN", "P"), + ("CCR", "P"), + ("CCS", "P"), + ("CCV", "P"), + ("CCW", "P"), + ("CCY", "P"), + ("CGB", "R"), + ("CGD", "R"), + ("CGH", "R"), + ("CGK", "R"), + ("CGM", "R"), + ("CGN", "R"), + ("CGR", "R"), + ("CGS", "R"), + ("CGV", "R"), + ("CGW", "R"), + ("CGY", "R"), + ("CTB", "L"), + ("CTD", "L"), + ("CTH", "L"), + ("CTK", "L"), + ("CTM", "L"), + ("CTN", "L"), + ("CTR", "L"), + ("CTS", "L"), + ("CTV", "L"), + ("CTW", "L"), + ("CTY", "L"), + ("GAR", "E"), + ("GAY", "D"), + ("GCB", "A"), + ("GCD", "A"), + ("GCH", "A"), + ("GCK", "A"), + ("GCM", "A"), + ("GCN", "A"), + ("GCR", "A"), + ("GCS", "A"), + ("GCV", "A"), + ("GCW", "A"), + ("GCY", "A"), + ("GGB", "G"), + ("GGD", "G"), + ("GGH", "G"), + ("GGK", "G"), + ("GGM", "G"), + ("GGN", "G"), + ("GGR", "G"), + ("GGS", "G"), + ("GGV", "G"), + ("GGW", "G"), + ("GGY", "G"), + ("GTB", "V"), + ("GTD", "V"), + ("GTH", "V"), + ("GTK", "V"), + ("GTM", "V"), + ("GTN", "V"), + ("GTR", "V"), + ("GTS", "V"), + ("GTV", "V"), + ("GTW", "V"), + ("GTY", "V"), + ("MGA", "R"), + ("MGG", "R"), + ("MGR", "R"), + ("TAR", "*"), + ("TAY", "Y"), + ("TCB", "S"), + ("TCD", "S"), + ("TCH", "S"), + ("TCK", "S"), + ("TCM", "S"), + ("TCN", "S"), + ("TCR", "S"), + ("TCS", "S"), + ("TCV", "S"), + ("TCW", "S"), + ("TCY", "S"), + ("TGY", "C"), + ("TRA", "*"), + ("TTR", "L"), + ("TTY", "F"), + ("YTA", "L"), + ("YTG", "L"), + ("YTR", "L"), +]; + 
+/// Generated via: +/// ``` +/// let mut result = [0; 64]; +/// for (i, (dna3, aa1)) in DNA_TO_AA1_LUT_VEC.iter().enumerate() { +/// if i > 63 { +/// break; // skip degenerate codons +/// } +/// let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); +/// result[dna3_2bit as usize] = aa1.as_bytes()[0]; +/// ``` +/// +const CODON_2BIT_TO_AA1_LUT: [u8; 64] = [ + 75, 78, 75, 78, 84, 84, 84, 84, 82, 83, 82, 83, 73, 73, 77, 73, 81, 72, 81, 72, 80, 80, 80, 80, + 82, 82, 82, 82, 76, 76, 76, 76, 69, 68, 69, 68, 65, 65, 65, 65, 71, 71, 71, 71, 86, 86, 86, 86, + 42, 89, 42, 89, 83, 83, 83, 83, 42, 67, 87, 67, 76, 70, 76, 70, +]; +lazy_static::lazy_static! { static ref AA1_TO_AA3: AHashMap<&'static [u8], &'static str> = { let mut m = AHashMap::default(); for (aa3, aa1) in AA3_TO_AA1_VEC.iter() { @@ -731,7 +755,7 @@ lazy_static::lazy_static! { static ref DNA_TO_AA1_LUT: AHashMap = { let mut m = AHashMap::default(); - for (dna, aa1) in DNA_TO_AA1_LUT_VEC.iter() { + for (dna, aa1) in DNA_TO_AA1_LUT_VEC { assert_eq!(dna.len(), 3); let d = dna.as_bytes(); m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); @@ -741,7 +765,7 @@ lazy_static::lazy_static! { static ref DNA_TO_AA1_SEC: AHashMap = { let mut m = AHashMap::default(); - for (dna, aa1) in DNA_TO_AA1_SEC_VEC.iter() { + for (dna, aa1) in DNA_TO_AA1_SEC_VEC { assert_eq!(dna.len(), 3); let d = dna.as_bytes(); m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); @@ -751,7 +775,7 @@ lazy_static::lazy_static! { static ref DNA_TO_AA1_CHRMT_VERTEBRATE: AHashMap = { let mut m = AHashMap::default(); - for (dna, aa1) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC.iter() { + for (dna, aa1) in DNA_TO_AA1_CHRMT_VERTEBRATE_VEC { assert_eq!(dna.len(), 3); let d = dna.as_bytes(); m.insert([d[0], d[1], d[2]], aa1.as_bytes()[0]); @@ -759,18 +783,6 @@ lazy_static::lazy_static! 
{ m }; - static ref CODON_2BIT_TO_AA1_LUT: [u8; 64] = { - let mut result = [0; 64]; - for (i, (dna3, aa1)) in DNA_TO_AA1_LUT_VEC.iter().enumerate() { - if i > 63 { - break; // skip degenerate codons - } - let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); - result[dna3_2bit as usize] = aa1.as_bytes()[0]; - } - result - }; - static ref CODON_2BIT_TO_AA1_SEC: [u8; 64] = { let mut result = [0; 64]; for (i, (dna3, aa1)) in DNA_TO_AA1_SEC_VEC.iter().enumerate() { @@ -796,7 +808,7 @@ lazy_static::lazy_static! { }; } -static IUPAC_AMBIGUITY_CODES: &[u8] = b"BDHVNUWSMKRYZ"; +const IUPAC_AMBIGUITY_CODES: [u8; 13] = *b"BDHVNUWSMKRYZ"; /// Allow selection of translation table. #[derive( @@ -939,14 +951,14 @@ type Codon = [u8; 3]; /// We use separate structs here to encapsulate getting the lazy static global data. struct CodonTranslator { /// Mapping for "normalizing" DNA ASCII character (to upper case and `U -> T`). - dna_ascii_map: &'static [u8; 256], + dna_ascii_map: [u8; 256], /// Mapping from DNA ASCII to 2-bit representation. - dna_ascii_to_2bit: &'static [u8; 256], + dna_ascii_to_2bit: [u8; 256], /// IUPAC ambiguity codes. - iupac_ambiguity_codes: &'static [u8], + iupac_ambiguity_codes: [u8; 13], /// Mapping from 2bit DNA codon to amino acid 1-letter ASCII. - codon_2bit_to_aa1: &'static [u8; 64], + codon_2bit_to_aa1: [u8; 64], /// Mapping from DNA 2-bit to amino acid 1-letter ASCII including degenerate codons. full_dna_to_aa1: &'static AHashMap, @@ -958,14 +970,14 @@ impl CodonTranslator { /// Initialize the struct. 
pub fn new(table: TranslationTable) -> Self { Self { - dna_ascii_map: &DNA_ASCII_MAP, - dna_ascii_to_2bit: &DNA_ASCII_TO_2BIT, + dna_ascii_map: DNA_ASCII_MAP, + dna_ascii_to_2bit: DNA_ASCII_TO_2BIT, iupac_ambiguity_codes: IUPAC_AMBIGUITY_CODES, codon_2bit_to_aa1: match table { - TranslationTable::Standard => &CODON_2BIT_TO_AA1_LUT, - TranslationTable::Selenocysteine => &CODON_2BIT_TO_AA1_SEC, - TranslationTable::VertebrateMitochondrial => &CODON_2BIT_TO_AA1_CHRMT_VERTEBRATE, + TranslationTable::Standard => CODON_2BIT_TO_AA1_LUT, + TranslationTable::Selenocysteine => *CODON_2BIT_TO_AA1_SEC, + TranslationTable::VertebrateMitochondrial => *CODON_2BIT_TO_AA1_CHRMT_VERTEBRATE, }, full_dna_to_aa1: match table { TranslationTable::Standard => &DNA_TO_AA1_LUT, From 413d4cf27a576bf8c302d2acbf7968bbbaa10a95 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Wed, 8 May 2024 16:59:15 +0200 Subject: [PATCH 04/13] cache calls to alignment::Mapper::new --- Cargo.toml | 1 + src/data/error.rs | 5 +- src/data/interface.rs | 1 + src/data/uta.rs | 133 ++++++++++++++++++++-------------------- src/mapper/alignment.rs | 3 +- src/mapper/altseq.rs | 20 ++++++ src/mapper/cigar.rs | 2 +- src/mapper/error.rs | 4 +- src/mapper/variant.rs | 79 ++++++++++++++++++++---- src/normalizer.rs | 2 +- src/parser/error.rs | 2 +- src/sequences.rs | 4 +- src/validator/error.rs | 2 +- 13 files changed, 169 insertions(+), 89 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 19e762e..23606b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ thiserror = "1.0" indexmap = { version = "2", features = ["serde"] } biocommons-bioutils = "0.1.0" ahash = "0.8.11" +cached = "0.50.0" [dev-dependencies] anyhow = "1.0" diff --git a/src/data/error.rs b/src/data/error.rs index 182ee1b..5addf8d 100644 --- a/src/data/error.rs +++ b/src/data/error.rs @@ -1,12 +1,13 @@ //! Error type definition. +use std::sync::Arc; use thiserror::Error; /// Error type for data. 
-#[derive(Error, Debug)] +#[derive(Error, Debug, Clone)] pub enum Error { #[error("UTA Postgres access error")] - UtaPostgresError(#[from] postgres::Error), + UtaPostgresError(#[from] Arc), #[error("sequence operation failed")] SequenceOperationFailed(#[from] crate::sequences::Error), #[error("problem with seqrepo access")] diff --git a/src/data/interface.rs b/src/data/interface.rs index b5a14fe..dbde9d6 100644 --- a/src/data/interface.rs +++ b/src/data/interface.rs @@ -5,6 +5,7 @@ use indexmap::IndexMap; use crate::{data::error::Error, sequences::TranslationTable}; use biocommons_bioutils::assemblies::Assembly; +use cached::proc_macro::cached; /// Information about a gene. /// diff --git a/src/data/uta.rs b/src/data/uta.rs index a0879dc..87d18fe 100644 --- a/src/data/uta.rs +++ b/src/data/uta.rs @@ -6,10 +6,11 @@ use indexmap::IndexMap; use postgres::{Client, NoTls, Row}; use quick_cache::sync::Cache; use std::fmt::Debug; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use crate::sequences::{seq_md5, TranslationTable}; use biocommons_bioutils::assemblies::{Assembly, ASSEMBLY_INFOS}; +use postgres::fallible_iterator::FallibleIterator; use crate::data::{ error::Error, interface, interface::GeneInfoRecord, interface::TxExonsRecord, @@ -42,15 +43,15 @@ impl TryFrom for GeneInfoRecord { type Error = Error; fn try_from(row: Row) -> Result { - let aliases: String = row.try_get("aliases")?; + let aliases: String = row.try_get("aliases").map_err(|e| Arc::new(e))?; let aliases = aliases.split(',').map(|s| s.to_owned()).collect::>(); Ok(Self { - hgnc: row.try_get("hgnc")?, - maploc: row.try_get("maploc")?, - descr: row.try_get("descr")?, - summary: row.try_get("summary")?, + hgnc: row.try_get("hgnc").map_err(|e| Arc::new(e))?, + maploc: row.try_get("maploc").map_err(|e| Arc::new(e))?, + descr: row.try_get("descr").map_err(|e| Arc::new(e))?, + summary: row.try_get("summary").map_err(|e| Arc::new(e))?, aliases, - added: row.try_get("added")?, + added: 
row.try_get("added").map_err(|e| Arc::new(e))?, }) } } @@ -60,8 +61,8 @@ impl TryFrom for TxSimilarityRecord { fn try_from(row: Row) -> Result { Ok(Self { - tx_ac1: row.try_get("tx_ac1")?, - tx_ac2: row.try_get("tx_ac2")?, + tx_ac1: row.try_get("tx_ac1").map_err(|e| Arc::new(e))?, + tx_ac2: row.try_get("tx_ac2").map_err(|e| Arc::new(e))?, hgnc_eq: row.try_get("hgnc_eq").unwrap_or(false), cds_eq: row.try_get("cds_eq").unwrap_or(false), es_fp_eq: row.try_get("es_fp_eq").unwrap_or(false), @@ -76,24 +77,24 @@ impl TryFrom for TxExonsRecord { fn try_from(row: Row) -> Result { Ok(Self { - hgnc: row.try_get("hgnc")?, - tx_ac: row.try_get("tx_ac")?, - alt_ac: row.try_get("alt_ac")?, - alt_aln_method: row.try_get("alt_aln_method")?, - alt_strand: row.try_get("alt_strand")?, - ord: row.try_get("ord")?, - tx_start_i: row.try_get("tx_start_i")?, - tx_end_i: row.try_get("tx_end_i")?, - alt_start_i: row.try_get("alt_start_i")?, - alt_end_i: row.try_get("alt_end_i")?, - cigar: row.try_get("cigar")?, - tx_aseq: row.try_get("tx_aseq")?, - alt_aseq: row.try_get("alt_aseq")?, - tx_exon_set_id: row.try_get("tx_exon_set_id")?, - alt_exon_set_id: row.try_get("alt_exon_set_id")?, - tx_exon_id: row.try_get("tx_exon_id")?, - alt_exon_id: row.try_get("alt_exon_id")?, - exon_aln_id: row.try_get("exon_aln_id")?, + hgnc: row.try_get("hgnc").map_err(|e| Arc::new(e))?, + tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, + alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, + alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, + alt_strand: row.try_get("alt_strand").map_err(|e| Arc::new(e))?, + ord: row.try_get("ord").map_err(|e| Arc::new(e))?, + tx_start_i: row.try_get("tx_start_i").map_err(|e| Arc::new(e))?, + tx_end_i: row.try_get("tx_end_i").map_err(|e| Arc::new(e))?, + alt_start_i: row.try_get("alt_start_i").map_err(|e| Arc::new(e))?, + alt_end_i: row.try_get("alt_end_i").map_err(|e| Arc::new(e))?, + cigar: row.try_get("cigar").map_err(|e| Arc::new(e))?, + 
tx_aseq: row.try_get("tx_aseq").map_err(|e| Arc::new(e))?, + alt_aseq: row.try_get("alt_aseq").map_err(|e| Arc::new(e))?, + tx_exon_set_id: row.try_get("tx_exon_set_id").map_err(|e| Arc::new(e))?, + alt_exon_set_id: row.try_get("alt_exon_set_id").map_err(|e| Arc::new(e))?, + tx_exon_id: row.try_get("tx_exon_id").map_err(|e| Arc::new(e))?, + alt_exon_id: row.try_get("alt_exon_id").map_err(|e| Arc::new(e))?, + exon_aln_id: row.try_get("exon_aln_id").map_err(|e| Arc::new(e))?, }) } } @@ -103,12 +104,12 @@ impl TryFrom for TxForRegionRecord { fn try_from(row: Row) -> Result { Ok(Self { - tx_ac: row.try_get("tx_ac")?, - alt_ac: row.try_get("alt_ac")?, - alt_strand: row.try_get("alt_strand")?, - alt_aln_method: row.try_get("alt_aln_method")?, - start_i: row.try_get("start_i")?, - end_i: row.try_get("end_i")?, + tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, + alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, + alt_strand: row.try_get("alt_strand").map_err(|e| Arc::new(e))?, + alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, + start_i: row.try_get("start_i").map_err(|e| Arc::new(e))?, + end_i: row.try_get("end_i").map_err(|e| Arc::new(e))?, }) } } @@ -126,15 +127,15 @@ impl TryFrom for TxIdentityInfo { type Error = Error; fn try_from(row: Row) -> Result { - let hgnc = row.try_get("hgnc")?; + let hgnc = row.try_get("hgnc").map_err(|e| Arc::new(e))?; let is_selenoprotein = SELENOPROTEIN_SYMBOLS.contains(&hgnc); Ok(Self { - tx_ac: row.try_get("tx_ac")?, - alt_ac: row.try_get("alt_ac")?, - alt_aln_method: row.try_get("alt_aln_method")?, - cds_start_i: row.try_get("cds_start_i")?, - cds_end_i: row.try_get("cds_end_i")?, - lengths: row.try_get("lengths")?, + tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, + alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, + alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, + cds_start_i: row.try_get("cds_start_i").map_err(|e| Arc::new(e))?, + cds_end_i: 
row.try_get("cds_end_i").map_err(|e| Arc::new(e))?, + lengths: row.try_get("lengths").map_err(|e| Arc::new(e))?, hgnc: hgnc.to_string(), // UTA database does not support selenoproteins (yet). translation_table: if is_selenoprotein { @@ -151,12 +152,12 @@ impl TryFrom for TxInfoRecord { fn try_from(row: Row) -> Result { Ok(Self { - hgnc: row.try_get("hgnc")?, - cds_start_i: row.try_get("cds_start_i")?, - cds_end_i: row.try_get("cds_end_i")?, - tx_ac: row.try_get("tx_ac")?, - alt_ac: row.try_get("alt_ac")?, - alt_aln_method: row.try_get("alt_aln_method")?, + hgnc: row.try_get("hgnc").map_err(|e| Arc::new(e))?, + cds_start_i: row.try_get("cds_start_i").map_err(|e| Arc::new(e))?, + cds_end_i: row.try_get("cds_end_i").map_err(|e| Arc::new(e))?, + tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, + alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, + alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, }) } } @@ -166,9 +167,9 @@ impl TryFrom for TxMappingOptionsRecord { fn try_from(row: Row) -> Result { Ok(Self { - tx_ac: row.try_get("tx_ac")?, - alt_ac: row.try_get("alt_ac")?, - alt_aln_method: row.try_get("alt_aln_method")?, + tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, + alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, + alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, }) } } @@ -231,7 +232,7 @@ impl Debug for Provider { impl Provider { pub fn with_config(config: &Config) -> Result { let config = config.clone(); - let conn = Mutex::new(Client::connect(&config.db_url, NoTls)?); + let conn = Mutex::new(Client::connect(&config.db_url, NoTls).map_err(|e| Arc::new(e))?); let schema_version = Self::fetch_schema_version( &mut conn.lock().expect("cannot obtain connection lock"), &config.db_schema, @@ -246,7 +247,7 @@ impl Provider { fn fetch_schema_version(conn: &mut Client, db_schema: &str) -> Result { let sql = format!("select key, value from {db_schema}.meta where key = 'schema_version'"); - let row 
= conn.query_one(&sql, &[])?; + let row = conn.query_one(&sql, &[]).map_err(|e| Arc::new(e))?; Ok(row.get("value")) } } @@ -282,7 +283,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&hgnc])? + .query_one(&sql, &[&hgnc]).map_err(|e| Arc::new(e))? .try_into()?; self.caches @@ -305,11 +306,11 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac])?) + .query(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))?) .into_iter() .next() { - let result = Some(row.try_get("pro_ac")?); + let result = Some(row.try_get("pro_ac").map_err(|e| Arc::new(e))?); self.caches .get_pro_ac_for_tx_ac .insert(tx_ac.to_string(), None); @@ -337,8 +338,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&ac])? - .try_get("seq_id")?; + .query_one(&sql, &[&ac]).map_err(|e| Arc::new(e))? + .try_get("seq_id").map_err(|e| Arc::new(e))?; let sql = format!( "SELECT seq FROM {}.seq WHERE seq_id = $1", @@ -348,8 +349,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&seq_id])? - .try_get("seq")?; + .query_one(&sql, &[&seq_id]).map_err(|e| Arc::new(e))? + .try_get("seq").map_err(|e| Arc::new(e))?; let begin = begin.unwrap_or_default(); let end = end @@ -373,7 +374,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&md5])? + .query(&sql, &[&md5]).map_err(|e| Arc::new(e))? { result.push(row.get(0)); } @@ -402,7 +403,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac])? + .query(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))? 
{ result.push(row.try_into()?); } @@ -439,7 +440,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac, &alt_ac, &alt_aln_method])? + .query(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]).map_err(|e| Arc::new(e))? { result.push(row.try_into()?); } @@ -473,7 +474,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&gene])? + .query(&sql, &[&gene]).map_err(|e| Arc::new(e))? { result.push(row.try_into()?); } @@ -517,7 +518,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&alt_ac, &start_i, &end_i])? + .query(&sql, &[&alt_ac, &start_i, &end_i]).map_err(|e| Arc::new(e))? { let record: TxForRegionRecord = row.try_into()?; // NB: The original Python code did not use alt_aln_method in the query either. @@ -547,7 +548,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&tx_ac])? + .query_one(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))? .try_into()?; self.caches @@ -583,7 +584,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&tx_ac, &alt_ac, &alt_aln_method])? + .query_one(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]).map_err(|e| Arc::new(e))? .try_into()?; self.caches.get_tx_info.insert(key, result.clone()); @@ -607,7 +608,7 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac])? + .query(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))? 
{ result.push(row.try_into()?); } diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs index 195932e..7c0fc7f 100644 --- a/src/mapper/alignment.rs +++ b/src/mapper/alignment.rs @@ -116,6 +116,7 @@ impl Default for Config { /// Map HGVS location objects between genomic (g), non-coding (n) and cds (c) /// coordinates according to a CIGAR string. +#[derive(Clone)] pub struct Mapper { /// Configuration for alignment mapping. pub config: Config, @@ -163,7 +164,7 @@ impl Mapper { // exons are adjacent. Assert that here. let mut sorted_exons = tx_exons.clone(); sorted_exons - .sort_unstable_by(|a, b| a.ord.partial_cmp(&b.ord).expect("comparison failed / NaN?")); + .sort_by(|a, b| a.ord.partial_cmp(&b.ord).expect("comparison failed / NaN?")); let mut offenders = sorted_exons.windows(2).filter(|pair| { let lhs = &pair[0]; let rhs = &pair[1]; diff --git a/src/mapper/altseq.rs b/src/mapper/altseq.rs index aab1b7d..52b991d 100644 --- a/src/mapper/altseq.rs +++ b/src/mapper/altseq.rs @@ -2,6 +2,9 @@ use std::{cmp::Ordering, sync::Arc}; +use cached::proc_macro::cached; +use cached::SizedCache; + use crate::{ data::interface::Provider, mapper::error::Error, @@ -28,6 +31,23 @@ pub struct RefTranscriptData { pub translation_table: TranslationTable, } +#[cached( + ty = "SizedCache>", + create = "{ SizedCache::with_size(1000) }", + convert = r#"{ format!("{}{}{}{:?}", + provider.data_version(), + provider.schema_version(), + tx_ac, + pro_ac) }"# +)] +pub(crate) fn ref_transcript_data_cached( + provider: Arc, + tx_ac: &str, + pro_ac: Option<&str>, +) -> Result { + RefTranscriptData::new(provider, tx_ac, pro_ac) +} + impl RefTranscriptData { /// Construct new instance fetching data from the provider.. /// diff --git a/src/mapper/cigar.rs b/src/mapper/cigar.rs index dc4ad36..05732d9 100644 --- a/src/mapper/cigar.rs +++ b/src/mapper/cigar.rs @@ -190,7 +190,7 @@ pub fn parse_cigar_string(input: &str) -> Result { /// bases. 
They often look similar to zero-based, right open coordinates. (But don't call them /// that. It upsets me deeply.) The most important difference is that zero width intervals /// neatly represent insertions between bases (or before or after the sequence). -#[derive(Default, Debug)] +#[derive(Default, Debug, Clone)] pub struct CigarMapper { pub cigar_string: CigarString, pub ref_pos: Vec, diff --git a/src/mapper/error.rs b/src/mapper/error.rs index 36d8351..4c222ec 100644 --- a/src/mapper/error.rs +++ b/src/mapper/error.rs @@ -3,7 +3,7 @@ use thiserror::Error; /// Error type for variant mapping. -#[derive(Error, Debug)] +#[derive(Error, Debug, Clone)] pub enum Error { #[error("validation error")] ValidationFailed(#[from] crate::validator::Error), @@ -99,4 +99,6 @@ pub enum Error { CannotConvertIntervalStart(i32), #[error("cannot convert interval end: {0} to usize")] CannotConvertIntervalEnd(i32), + #[error("general mapper error")] + General, } diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index aa5a41c..6e88a56 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -1,10 +1,12 @@ //! Code for mapping variants between sequences. +use std::ops::Deref; use std::{ops::Range, sync::Arc}; +use cached::proc_macro::cached; +use cached::SizedCache; use log::{debug, info}; -use super::alignment; use crate::{ data::interface::Provider, mapper::Error, @@ -16,7 +18,8 @@ use crate::{ sequences::revcomp, validator::{ValidationLevel, Validator}, }; -use std::ops::Deref; + +use super::alignment; /// Configuration for Mapper. 
/// @@ -121,12 +124,19 @@ impl Mapper { alt_ac: &str, alt_aln_method: &str, ) -> Result { - // TODO: implement caching - alignment::Mapper::new( - &alignment::Config { - strict_bounds: self.config.strict_bounds, - }, + // // TODO: implement caching + // alignment::Mapper::new( + // &alignment::Config { + // strict_bounds: self.config.strict_bounds, + // }, + // self.provider.clone(), + // tx_ac, + // alt_ac, + // alt_aln_method, + // ) + build_alignment_mapper_cached( self.provider.clone(), + self.config.strict_bounds, tx_ac, alt_ac, alt_aln_method, @@ -704,7 +714,7 @@ impl Mapper { var_c.clone() }; - let reference_data = RefTranscriptData::new( + let reference_data = ref_transcript_data_cached( self.provider.clone(), accession.deref(), prot_ac.map(|s| s.to_string()).as_deref(), @@ -941,15 +951,56 @@ impl Mapper { } } +/// An LRU-cached version of `alignment::Mapper::new`. +/// The indirection here is due to the fact that `cached` cannot deal with `self` arguments. +/// The `convert` argument constructs the key to be used in the cache. +/// All of this function's arguments contribute to the key; +/// that is why the supplied provider's `data_version` and `schema_version` +/// should return sensible values which allow distinguishing them from other providers. +/// +/// Because the cached value must implement `Clone` +/// and the type is `Result`, +/// `Error`, too, must implement `Clone`. +/// Sadly, postgres Errors do not do that, so either: +/// 1. wrap non-clonable errors in `Arc` +/// 2. convert the result to an option instead +/// 3.
return a generic error instead +#[cached( + ty = "SizedCache>", + create = "{ SizedCache::with_size(1000) }", + convert = r#"{ format!("{}{}{}{}{}{}", + provider.data_version(), + provider.schema_version(), + strict_bounds, + tx_ac, + alt_ac, + alt_aln_method) }"# +)] +fn build_alignment_mapper_cached( + provider: Arc, + strict_bounds: bool, + tx_ac: &str, + alt_ac: &str, + alt_aln_method: &str, +) -> Result { + alignment::Mapper::new( + &alignment::Config { strict_bounds }, + provider, + tx_ac, + alt_ac, + alt_aln_method, + ) +} #[cfg(test)] mod test { - use anyhow::Error; - use pretty_assertions::assert_eq; - use regex::Regex; use std::{ path::{Path, PathBuf}, str::FromStr, }; + + use anyhow::Error; + use pretty_assertions::assert_eq; + use regex::Regex; use test_log::test; use crate::{ @@ -1101,12 +1152,13 @@ mod test { /// in the `sanity_mock` module. mod sanity_mock { - use anyhow::Error; use std::{ path::{Path, PathBuf}, sync::Arc, }; + use anyhow::Error; + use crate::data::interface; use crate::{ data::interface::TxIdentityInfo, @@ -1685,9 +1737,10 @@ mod test { } mod gcp_tests { - use anyhow::Error; use std::path::Path; + use anyhow::Error; + #[derive(Debug, serde::Deserialize)] pub struct Record { pub id: String, diff --git a/src/normalizer.rs b/src/normalizer.rs index 34faa72..59585ba 100644 --- a/src/normalizer.rs +++ b/src/normalizer.rs @@ -16,7 +16,7 @@ use crate::{ mod error { /// Error type for normalization of HGVS expressins. - #[derive(thiserror::Error, Debug)] + #[derive(thiserror::Error, Debug, Clone)] pub enum Error { #[error("integer conversion failed")] IntegerConversion(#[from] std::num::TryFromIntError), diff --git a/src/parser/error.rs b/src/parser/error.rs index 11ab2eb..f8b50b3 100644 --- a/src/parser/error.rs +++ b/src/parser/error.rs @@ -3,7 +3,7 @@ use thiserror::Error; /// Error type for parsing of HGVS expressions. -#[derive(Error, Debug)] +#[derive(Error, Debug, Clone)] pub enum Error { /// Invalid genome interval. 
#[error("{0} is not a valid genome interval")] diff --git a/src/sequences.rs b/src/sequences.rs index dda64dc..4bca4d5 100644 --- a/src/sequences.rs +++ b/src/sequences.rs @@ -9,7 +9,7 @@ pub use crate::sequences::error::Error; mod error { /// Error type for normalization of HGVS expressins. - #[derive(thiserror::Error, Debug)] + #[derive(thiserror::Error, Debug, Clone)] pub enum Error { #[error("invalid 1-letter aminoacid: {0} at {1}")] InvalidOneLetterAminoAcid(String, String), @@ -1038,7 +1038,7 @@ impl CodonTranslator { /// Helper function to extract normalized codon to `self.codon`. fn normalize_codon(&mut self, codon: &[u8]) { - for (i, c) in codon.iter().enumerate() { + for (i, c) in codon[..3].iter().enumerate() { self.codon[i] = self.dna_ascii_map[*c as usize]; } } diff --git a/src/validator/error.rs b/src/validator/error.rs index 30e6e56..44da712 100644 --- a/src/validator/error.rs +++ b/src/validator/error.rs @@ -3,7 +3,7 @@ use thiserror::Error; /// Error type for validation of HGVS expressions. 
-#[derive(Error, Debug)] +#[derive(Error, Debug, Clone)] pub enum Error { #[error("ref or alt must be non-empty in {0}")] RefOrAltMustBeNonEmpty(String), From 7b9a3ecdb5978b26f19b1b6d365537adb236eab1 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Wed, 8 May 2024 17:02:04 +0200 Subject: [PATCH 05/13] rely on seqrepo-rs with cloneable Error --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 23606b2..3a3d422 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ postgres = { version = "0.19", features = ["with-chrono-0_4"] } quick_cache = "0.4" regex = "1.7" rustc-hash = "1.1" -seqrepo = { version = "0.10", features = ["cached"] } +seqrepo = { git = "https://github.com/varfish-org/seqrepo-rs", rev = "7d8824a0f600e72296aa7e9e5c5143cdba7e1234", features = ["cached"] } serde_json = "1.0" serde = { version = "1.0", features = ["derive"] } thiserror = "1.0" From 843089c4e540731066727827349bfc971710e3e9 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Jun 2024 10:58:02 +0200 Subject: [PATCH 06/13] fmt --- src/data/uta.rs | 42 +++++++++++++++++++++++++++-------------- src/mapper/alignment.rs | 5 ++++- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/data/uta.rs b/src/data/uta.rs index 87d18fe..35cef49 100644 --- a/src/data/uta.rs +++ b/src/data/uta.rs @@ -283,7 +283,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&hgnc]).map_err(|e| Arc::new(e))? + .query_one(&sql, &[&hgnc]) + .map_err(|e| Arc::new(e))? .try_into()?; self.caches @@ -306,7 +307,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))?) + .query(&sql, &[&tx_ac]) + .map_err(|e| Arc::new(e))?) 
.into_iter() .next() { @@ -338,8 +340,10 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&ac]).map_err(|e| Arc::new(e))? - .try_get("seq_id").map_err(|e| Arc::new(e))?; + .query_one(&sql, &[&ac]) + .map_err(|e| Arc::new(e))? + .try_get("seq_id") + .map_err(|e| Arc::new(e))?; let sql = format!( "SELECT seq FROM {}.seq WHERE seq_id = $1", @@ -349,8 +353,10 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&seq_id]).map_err(|e| Arc::new(e))? - .try_get("seq").map_err(|e| Arc::new(e))?; + .query_one(&sql, &[&seq_id]) + .map_err(|e| Arc::new(e))? + .try_get("seq") + .map_err(|e| Arc::new(e))?; let begin = begin.unwrap_or_default(); let end = end @@ -374,7 +380,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&md5]).map_err(|e| Arc::new(e))? + .query(&sql, &[&md5]) + .map_err(|e| Arc::new(e))? { result.push(row.get(0)); } @@ -403,7 +410,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))? + .query(&sql, &[&tx_ac]) + .map_err(|e| Arc::new(e))? { result.push(row.try_into()?); } @@ -440,7 +448,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]).map_err(|e| Arc::new(e))? + .query(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]) + .map_err(|e| Arc::new(e))? { result.push(row.try_into()?); } @@ -474,7 +483,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&gene]).map_err(|e| Arc::new(e))? + .query(&sql, &[&gene]) + .map_err(|e| Arc::new(e))? 
{ result.push(row.try_into()?); } @@ -518,7 +528,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&alt_ac, &start_i, &end_i]).map_err(|e| Arc::new(e))? + .query(&sql, &[&alt_ac, &start_i, &end_i]) + .map_err(|e| Arc::new(e))? { let record: TxForRegionRecord = row.try_into()?; // NB: The original Python code did not use alt_aln_method in the query either. @@ -548,7 +559,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))? + .query_one(&sql, &[&tx_ac]) + .map_err(|e| Arc::new(e))? .try_into()?; self.caches @@ -584,7 +596,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query_one(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]).map_err(|e| Arc::new(e))? + .query_one(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]) + .map_err(|e| Arc::new(e))? .try_into()?; self.caches.get_tx_info.insert(key, result.clone()); @@ -608,7 +621,8 @@ impl interface::Provider for Provider { .conn .lock() .expect("cannot obtain connection lock") - .query(&sql, &[&tx_ac]).map_err(|e| Arc::new(e))? + .query(&sql, &[&tx_ac]) + .map_err(|e| Arc::new(e))? 
{ result.push(row.try_into()?); } diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs index 7c0fc7f..127f39d 100644 --- a/src/mapper/alignment.rs +++ b/src/mapper/alignment.rs @@ -175,7 +175,10 @@ impl Mapper { tx_ac.to_string(), alt_ac.to_string(), alt_aln_method.to_string(), - format!("{:?}", (once(offender).chain(offenders)).collect::>()), + format!( + "{:?}", + (once(offender).chain(offenders)).collect::>() + ), )); } From 80776f291e7521c22fbcdbefc1ed742915d60d47 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Jun 2024 11:14:01 +0200 Subject: [PATCH 07/13] implement data_version and schema_version for mock Provider in tests --- src/data/interface.rs | 1 - src/mapper/variant.rs | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/data/interface.rs b/src/data/interface.rs index dbde9d6..b5a14fe 100644 --- a/src/data/interface.rs +++ b/src/data/interface.rs @@ -5,7 +5,6 @@ use indexmap::IndexMap; use crate::{data::error::Error, sequences::TranslationTable}; use biocommons_bioutils::assemblies::Assembly; -use cached::proc_macro::cached; /// Information about a gene. 
/// diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index 6e88a56..7760d61 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -1164,6 +1164,8 @@ mod test { data::interface::TxIdentityInfo, mapper::variant::{Config, Mapper}, }; + use std::sync::atomic::AtomicUsize; + static PROVIDER_COUNT: AtomicUsize = AtomicUsize::new(0); #[derive(Debug, serde::Deserialize)] struct ProviderRecord { @@ -1174,6 +1176,8 @@ mod test { } pub struct Provider { + data_version: String, + schema_version: String, records: Vec, } @@ -1188,18 +1192,23 @@ mod test { for record in rdr.deserialize() { records.push(record?); } - - Ok(Self { records }) + let number = PROVIDER_COUNT.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let dummy_version = format!("provider_{number}"); + Ok(Self { + records, + data_version: dummy_version.clone(), + schema_version: dummy_version, + }) } } impl interface::Provider for Provider { fn data_version(&self) -> &str { - panic!("for test use only"); + &self.data_version } fn schema_version(&self) -> &str { - panic!("for test use only"); + &self.schema_version } fn get_assembly_map( From 16fdf52c0fc5e183224681c00ec672164a784cbb Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Jun 2024 16:03:13 +0200 Subject: [PATCH 08/13] update seqrepo to v0.10.2 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 3a3d422..cca21fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ postgres = { version = "0.19", features = ["with-chrono-0_4"] } quick_cache = "0.4" regex = "1.7" rustc-hash = "1.1" -seqrepo = { git = "https://github.com/varfish-org/seqrepo-rs", rev = "7d8824a0f600e72296aa7e9e5c5143cdba7e1234", features = ["cached"] } +seqrepo = { version = "0.10.2", features = ["cached"] } serde_json = "1.0" serde = { version = "1.0", features = ["derive"] } thiserror = "1.0" From be821aeab5d191523148a93b063f5b214bf8ec91 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 
Jun 2024 16:06:01 +0200 Subject: [PATCH 09/13] shorter Arc::new error wrapping --- src/data/uta.rs | 131 ++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/src/data/uta.rs b/src/data/uta.rs index 35cef49..0d4fc45 100644 --- a/src/data/uta.rs +++ b/src/data/uta.rs @@ -10,7 +10,6 @@ use std::sync::{Arc, Mutex}; use crate::sequences::{seq_md5, TranslationTable}; use biocommons_bioutils::assemblies::{Assembly, ASSEMBLY_INFOS}; -use postgres::fallible_iterator::FallibleIterator; use crate::data::{ error::Error, interface, interface::GeneInfoRecord, interface::TxExonsRecord, @@ -43,15 +42,15 @@ impl TryFrom for GeneInfoRecord { type Error = Error; fn try_from(row: Row) -> Result { - let aliases: String = row.try_get("aliases").map_err(|e| Arc::new(e))?; + let aliases: String = row.try_get("aliases").map_err(Arc::new)?; let aliases = aliases.split(',').map(|s| s.to_owned()).collect::>(); Ok(Self { - hgnc: row.try_get("hgnc").map_err(|e| Arc::new(e))?, - maploc: row.try_get("maploc").map_err(|e| Arc::new(e))?, - descr: row.try_get("descr").map_err(|e| Arc::new(e))?, - summary: row.try_get("summary").map_err(|e| Arc::new(e))?, + hgnc: row.try_get("hgnc").map_err(Arc::new)?, + maploc: row.try_get("maploc").map_err(Arc::new)?, + descr: row.try_get("descr").map_err(Arc::new)?, + summary: row.try_get("summary").map_err(Arc::new)?, aliases, - added: row.try_get("added").map_err(|e| Arc::new(e))?, + added: row.try_get("added").map_err(Arc::new)?, }) } } @@ -61,8 +60,8 @@ impl TryFrom for TxSimilarityRecord { fn try_from(row: Row) -> Result { Ok(Self { - tx_ac1: row.try_get("tx_ac1").map_err(|e| Arc::new(e))?, - tx_ac2: row.try_get("tx_ac2").map_err(|e| Arc::new(e))?, + tx_ac1: row.try_get("tx_ac1").map_err(Arc::new)?, + tx_ac2: row.try_get("tx_ac2").map_err(Arc::new)?, hgnc_eq: row.try_get("hgnc_eq").unwrap_or(false), cds_eq: row.try_get("cds_eq").unwrap_or(false), es_fp_eq: row.try_get("es_fp_eq").unwrap_or(false), @@ 
-77,24 +76,24 @@ impl TryFrom for TxExonsRecord { fn try_from(row: Row) -> Result { Ok(Self { - hgnc: row.try_get("hgnc").map_err(|e| Arc::new(e))?, - tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, - alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, - alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, - alt_strand: row.try_get("alt_strand").map_err(|e| Arc::new(e))?, - ord: row.try_get("ord").map_err(|e| Arc::new(e))?, - tx_start_i: row.try_get("tx_start_i").map_err(|e| Arc::new(e))?, - tx_end_i: row.try_get("tx_end_i").map_err(|e| Arc::new(e))?, - alt_start_i: row.try_get("alt_start_i").map_err(|e| Arc::new(e))?, - alt_end_i: row.try_get("alt_end_i").map_err(|e| Arc::new(e))?, - cigar: row.try_get("cigar").map_err(|e| Arc::new(e))?, - tx_aseq: row.try_get("tx_aseq").map_err(|e| Arc::new(e))?, - alt_aseq: row.try_get("alt_aseq").map_err(|e| Arc::new(e))?, - tx_exon_set_id: row.try_get("tx_exon_set_id").map_err(|e| Arc::new(e))?, - alt_exon_set_id: row.try_get("alt_exon_set_id").map_err(|e| Arc::new(e))?, - tx_exon_id: row.try_get("tx_exon_id").map_err(|e| Arc::new(e))?, - alt_exon_id: row.try_get("alt_exon_id").map_err(|e| Arc::new(e))?, - exon_aln_id: row.try_get("exon_aln_id").map_err(|e| Arc::new(e))?, + hgnc: row.try_get("hgnc").map_err(Arc::new)?, + tx_ac: row.try_get("tx_ac").map_err(Arc::new)?, + alt_ac: row.try_get("alt_ac").map_err(Arc::new)?, + alt_aln_method: row.try_get("alt_aln_method").map_err(Arc::new)?, + alt_strand: row.try_get("alt_strand").map_err(Arc::new)?, + ord: row.try_get("ord").map_err(Arc::new)?, + tx_start_i: row.try_get("tx_start_i").map_err(Arc::new)?, + tx_end_i: row.try_get("tx_end_i").map_err(Arc::new)?, + alt_start_i: row.try_get("alt_start_i").map_err(Arc::new)?, + alt_end_i: row.try_get("alt_end_i").map_err(Arc::new)?, + cigar: row.try_get("cigar").map_err(Arc::new)?, + tx_aseq: row.try_get("tx_aseq").map_err(Arc::new)?, + alt_aseq: row.try_get("alt_aseq").map_err(Arc::new)?, + tx_exon_set_id: 
row.try_get("tx_exon_set_id").map_err(Arc::new)?, + alt_exon_set_id: row.try_get("alt_exon_set_id").map_err(Arc::new)?, + tx_exon_id: row.try_get("tx_exon_id").map_err(Arc::new)?, + alt_exon_id: row.try_get("alt_exon_id").map_err(Arc::new)?, + exon_aln_id: row.try_get("exon_aln_id").map_err(Arc::new)?, }) } } @@ -104,12 +103,12 @@ impl TryFrom for TxForRegionRecord { fn try_from(row: Row) -> Result { Ok(Self { - tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, - alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, - alt_strand: row.try_get("alt_strand").map_err(|e| Arc::new(e))?, - alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, - start_i: row.try_get("start_i").map_err(|e| Arc::new(e))?, - end_i: row.try_get("end_i").map_err(|e| Arc::new(e))?, + tx_ac: row.try_get("tx_ac").map_err(Arc::new)?, + alt_ac: row.try_get("alt_ac").map_err(Arc::new)?, + alt_strand: row.try_get("alt_strand").map_err(Arc::new)?, + alt_aln_method: row.try_get("alt_aln_method").map_err(Arc::new)?, + start_i: row.try_get("start_i").map_err(Arc::new)?, + end_i: row.try_get("end_i").map_err(Arc::new)?, }) } } @@ -127,15 +126,15 @@ impl TryFrom for TxIdentityInfo { type Error = Error; fn try_from(row: Row) -> Result { - let hgnc = row.try_get("hgnc").map_err(|e| Arc::new(e))?; + let hgnc = row.try_get("hgnc").map_err(Arc::new)?; let is_selenoprotein = SELENOPROTEIN_SYMBOLS.contains(&hgnc); Ok(Self { - tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, - alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, - alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, - cds_start_i: row.try_get("cds_start_i").map_err(|e| Arc::new(e))?, - cds_end_i: row.try_get("cds_end_i").map_err(|e| Arc::new(e))?, - lengths: row.try_get("lengths").map_err(|e| Arc::new(e))?, + tx_ac: row.try_get("tx_ac").map_err(Arc::new)?, + alt_ac: row.try_get("alt_ac").map_err(Arc::new)?, + alt_aln_method: row.try_get("alt_aln_method").map_err(Arc::new)?, + cds_start_i: 
row.try_get("cds_start_i").map_err(Arc::new)?, + cds_end_i: row.try_get("cds_end_i").map_err(Arc::new)?, + lengths: row.try_get("lengths").map_err(Arc::new)?, hgnc: hgnc.to_string(), // UTA database does not support selenoproteins (yet). translation_table: if is_selenoprotein { @@ -152,12 +151,12 @@ impl TryFrom for TxInfoRecord { fn try_from(row: Row) -> Result { Ok(Self { - hgnc: row.try_get("hgnc").map_err(|e| Arc::new(e))?, - cds_start_i: row.try_get("cds_start_i").map_err(|e| Arc::new(e))?, - cds_end_i: row.try_get("cds_end_i").map_err(|e| Arc::new(e))?, - tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, - alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, - alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, + hgnc: row.try_get("hgnc").map_err(Arc::new)?, + cds_start_i: row.try_get("cds_start_i").map_err(Arc::new)?, + cds_end_i: row.try_get("cds_end_i").map_err(Arc::new)?, + tx_ac: row.try_get("tx_ac").map_err(Arc::new)?, + alt_ac: row.try_get("alt_ac").map_err(Arc::new)?, + alt_aln_method: row.try_get("alt_aln_method").map_err(Arc::new)?, }) } } @@ -167,9 +166,9 @@ impl TryFrom for TxMappingOptionsRecord { fn try_from(row: Row) -> Result { Ok(Self { - tx_ac: row.try_get("tx_ac").map_err(|e| Arc::new(e))?, - alt_ac: row.try_get("alt_ac").map_err(|e| Arc::new(e))?, - alt_aln_method: row.try_get("alt_aln_method").map_err(|e| Arc::new(e))?, + tx_ac: row.try_get("tx_ac").map_err(Arc::new)?, + alt_ac: row.try_get("alt_ac").map_err(Arc::new)?, + alt_aln_method: row.try_get("alt_aln_method").map_err(Arc::new)?, }) } } @@ -232,7 +231,7 @@ impl Debug for Provider { impl Provider { pub fn with_config(config: &Config) -> Result { let config = config.clone(); - let conn = Mutex::new(Client::connect(&config.db_url, NoTls).map_err(|e| Arc::new(e))?); + let conn = Mutex::new(Client::connect(&config.db_url, NoTls).map_err(Arc::new)?); let schema_version = Self::fetch_schema_version( &mut conn.lock().expect("cannot obtain connection lock"), 
&config.db_schema, @@ -247,7 +246,7 @@ impl Provider { fn fetch_schema_version(conn: &mut Client, db_schema: &str) -> Result { let sql = format!("select key, value from {db_schema}.meta where key = 'schema_version'"); - let row = conn.query_one(&sql, &[]).map_err(|e| Arc::new(e))?; + let row = conn.query_one(&sql, &[]).map_err(Arc::new)?; Ok(row.get("value")) } } @@ -284,7 +283,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query_one(&sql, &[&hgnc]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? .try_into()?; self.caches @@ -308,11 +307,11 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query(&sql, &[&tx_ac]) - .map_err(|e| Arc::new(e))?) + .map_err(Arc::new)?) .into_iter() .next() { - let result = Some(row.try_get("pro_ac").map_err(|e| Arc::new(e))?); + let result = Some(row.try_get("pro_ac").map_err(Arc::new)?); self.caches .get_pro_ac_for_tx_ac .insert(tx_ac.to_string(), None); @@ -341,9 +340,9 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query_one(&sql, &[&ac]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? .try_get("seq_id") - .map_err(|e| Arc::new(e))?; + .map_err(Arc::new)?; let sql = format!( "SELECT seq FROM {}.seq WHERE seq_id = $1", @@ -354,9 +353,9 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query_one(&sql, &[&seq_id]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? .try_get("seq") - .map_err(|e| Arc::new(e))?; + .map_err(Arc::new)?; let begin = begin.unwrap_or_default(); let end = end @@ -381,7 +380,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query(&sql, &[&md5]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? { result.push(row.get(0)); } @@ -411,7 +410,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query(&sql, &[&tx_ac]) - .map_err(|e| Arc::new(e))? 
+ .map_err(Arc::new)? { result.push(row.try_into()?); } @@ -449,7 +448,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? { result.push(row.try_into()?); } @@ -484,7 +483,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query(&sql, &[&gene]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? { result.push(row.try_into()?); } @@ -529,7 +528,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query(&sql, &[&alt_ac, &start_i, &end_i]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? { let record: TxForRegionRecord = row.try_into()?; // NB: The original Python code did not use alt_aln_method in the query either. @@ -560,7 +559,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query_one(&sql, &[&tx_ac]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? .try_into()?; self.caches @@ -597,7 +596,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query_one(&sql, &[&tx_ac, &alt_ac, &alt_aln_method]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? .try_into()?; self.caches.get_tx_info.insert(key, result.clone()); @@ -622,7 +621,7 @@ impl interface::Provider for Provider { .lock() .expect("cannot obtain connection lock") .query(&sql, &[&tx_ac]) - .map_err(|e| Arc::new(e))? + .map_err(Arc::new)? 
{ result.push(row.try_into()?); } From faa04f810dcbc4707ff791e48042baf49c036ae7 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Jun 2024 16:06:48 +0200 Subject: [PATCH 10/13] clippy lints --- src/data/cdot/json.rs | 10 +++------- src/mapper/altseq.rs | 16 ++++++++-------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/data/cdot/json.rs b/src/data/cdot/json.rs index 7580444..391b07f 100644 --- a/src/data/cdot/json.rs +++ b/src/data/cdot/json.rs @@ -766,13 +766,9 @@ impl TxProvider { .as_ref() .expect("cannot happen; genes without map_location are not imported") .clone(), - descr: gene - .description - .as_ref() - .map(Clone::clone) - .unwrap_or_default(), - summary: gene.summary.as_ref().map(Clone::clone).unwrap_or_default(), - aliases: gene.aliases.as_ref().map(Clone::clone).unwrap_or_default(), + descr: gene.description.clone().unwrap_or_default(), + summary: gene.summary.clone().unwrap_or_default(), + aliases: gene.aliases.clone().unwrap_or_default(), added: NaiveDateTime::default(), }) } diff --git a/src/mapper/altseq.rs b/src/mapper/altseq.rs index 52b991d..08f6ad4 100644 --- a/src/mapper/altseq.rs +++ b/src/mapper/altseq.rs @@ -878,7 +878,7 @@ impl AltSeqToHgvsp { }); aa_end = aa_start.clone(); - reference = "".to_owned(); + "".clone_into(&mut reference); alternative = insertion .chars() .next() @@ -897,7 +897,7 @@ impl AltSeqToHgvsp { }); aa_end = aa_start.clone(); - reference = "".to_owned(); + "".clone_into(&mut reference); alternative = insertion .chars() .next() @@ -928,12 +928,12 @@ impl AltSeqToHgvsp { number: *start, }); aa_end = aa_start.clone(); - reference = "".to_owned(); - alternative = insertion.clone(); + "".clone_into(&mut reference); + alternative.clone_from(insertion); is_sub = true; } else if !deletion.is_empty() { // delins OR deletion OR stop codon at variant position - reference = deletion.clone(); + reference.clone_from(deletion); let end = start + deletion.len() as i32 - 1; aa_start = Some(ProtPos { @@ 
-958,7 +958,7 @@ impl AltSeqToHgvsp { } else { aa_start.clone() }; - alternative = insertion.clone(); + alternative.clone_from(insertion); } else { // deletion OR stop codon at variant position if deletion.len() as i32 + start == self.ref_seq().len() as i32 { @@ -1009,7 +1009,7 @@ impl AltSeqToHgvsp { number: dup_end, }); reference = "".to_string(); - alternative = reference.clone(); + alternative.clone_from(&reference); } else { // is non-dup insertion let start = std::cmp::max(2, *start as usize) - 1; @@ -1024,7 +1024,7 @@ impl AltSeqToHgvsp { number: end as i32, }); reference = "".to_string(); - alternative = insertion.clone(); + alternative.clone_from(insertion); } } else { panic!("Unexpected variant: {:?}", &record); From 36ae6a6a13d764070d77c06067779c7b8635b980 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Jun 2024 16:09:12 +0200 Subject: [PATCH 11/13] clippy lints --- src/mapper/alignment.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs index 127f39d..aa10137 100644 --- a/src/mapper/alignment.rs +++ b/src/mapper/alignment.rs @@ -58,7 +58,7 @@ fn hgvs_to_zbc(i: i32) -> i32 { /// /// The input exons are expected to be in transcript order, and the resulting CIGAR is also /// in transcript order. -pub fn build_tx_cigar(exons: &Vec, strand: i16) -> Result { +pub fn build_tx_cigar(exons: &[TxExonsRecord], strand: i16) -> Result { if exons.is_empty() { return Err(Error::EmptyExons); } From 0a0444166a700bf03a160a0ecf815ec79db5d3e5 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Jun 2024 16:12:32 +0200 Subject: [PATCH 12/13] do not run doctests for sequence.rs generators --- src/sequences.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sequences.rs b/src/sequences.rs index 4bca4d5..636f48b 100644 --- a/src/sequences.rs +++ b/src/sequences.rs @@ -85,7 +85,7 @@ pub fn revcomp(seq: &str) -> String { /// Mapping for DNA characters for normalization. 
/// Built via -/// ``` +/// ```ignore /// let mut result = [0; 256]; /// for c in 0..=255 { /// if c == b'u' || c == b'U' { @@ -720,7 +720,7 @@ const DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: &[(&str, &str)] = &[ ]; /// Generated via: -/// ``` +/// ```ignore /// let mut result = [0; 64]; /// for (i, (dna3, aa1)) in DNA_TO_AA1_LUT_VEC.iter().enumerate() { /// if i > 63 { @@ -728,6 +728,7 @@ const DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: &[(&str, &str)] = &[ /// } /// let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); /// result[dna3_2bit as usize] = aa1.as_bytes()[0]; +/// } /// ``` /// const CODON_2BIT_TO_AA1_LUT: [u8; 64] = [ From 6cc379654eb409af8c2aecb0a95ede794f175be0 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 10 Jun 2024 16:22:25 +0200 Subject: [PATCH 13/13] workaround ignore in doctest by using stringify, see rust-lang/rust/issues/87586 --- src/sequences.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/sequences.rs b/src/sequences.rs index 636f48b..9da74f2 100644 --- a/src/sequences.rs +++ b/src/sequences.rs @@ -85,7 +85,7 @@ pub fn revcomp(seq: &str) -> String { /// Mapping for DNA characters for normalization. 
/// Built via -/// ```ignore +/// ```rust,no_run /// let mut result = [0; 256]; /// for c in 0..=255 { /// if c == b'u' || c == b'U' { @@ -720,15 +720,17 @@ const DNA_TO_AA1_CHRMT_VERTEBRATE_VEC: &[(&str, &str)] = &[ ]; /// Generated via: -/// ```ignore -/// let mut result = [0; 64]; -/// for (i, (dna3, aa1)) in DNA_TO_AA1_LUT_VEC.iter().enumerate() { -/// if i > 63 { -/// break; // skip degenerate codons +/// ```rust,no_run +/// const _: &str = stringify!{ +/// let mut result = [0; 64]; +/// for (i, (dna3, aa1)) in DNA_TO_AA1_LUT_VEC.iter().enumerate() { +/// if i > 63 { +/// break; // skip degenerate codons +/// } +/// let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); +/// result[dna3_2bit as usize] = aa1.as_bytes()[0]; /// } -/// let dna3_2bit = dna3_to_2bit(dna3.as_bytes()).expect("invalid dna3"); -/// result[dna3_2bit as usize] = aa1.as_bytes()[0]; -/// } +/// }; /// ``` /// const CODON_2BIT_TO_AA1_LUT: [u8; 64] = [