diff --git a/README.md b/README.md index d9e8990..9309b6b 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Arguments: Options: -H, --header: Indicates the input file has a header, which will be preserved in the output [Default: False] -o, --output : Path to output file [Default: STDOUT] - + -t, --transcript-version: Indicates that '.'-delimited transcript version information is present in col1 and should be considered during liftover [default: False]. ``` - Liftover prepends 6 columns to the input file, containing the genome coordinates of the transcript features in BED format - All data in the original input are preserved in the output and shifted by 6 columns @@ -59,6 +59,7 @@ Arguments: Options: -H, --header: Indicates the input file has a header, which will be preserved in the output [Default: False] -o, --output : Path to output file [Default: STDOUT] + -t, --transcript-version: Indicates that '.'-delimited transcript version is present in col1 and should be considered during annotation [default: False]. 
``` diff --git a/src/annotate.rs b/src/annotate.rs index 4d21e4e..876577a 100644 --- a/src/annotate.rs +++ b/src/annotate.rs @@ -128,7 +128,7 @@ fn splice_site_distances(tx_coord: u64, splice_sites: &[SpliceSite]) -> (Option< } -pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), Box> { +pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool, has_version: bool) -> Result<(), Box> { // eprintln!("Running the annotate functionality..."); @@ -140,7 +140,7 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), // TODO: implement GFF3 parsing // let default_format = String::from("gtf"); // let format = matches.get_one("format").unwrap_or(&default_format); - let annotations = read_annotation_file(>f_file, true)?; + let annotations = read_annotation_file(>f_file, true, has_version)?; // Print the annotations in a table // eprintln!("Previewing transcript annotations\n"); @@ -173,7 +173,15 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), let line = line.unwrap(); let fields: Vec<&str> = line.split('\t').collect(); let transcript_id_with_version = fields[0]; - let transcript_id = transcript_id_with_version.split('.').next().unwrap(); + + + let transcript_id = if has_version { + transcript_id_with_version + } else { + transcript_id_with_version.split('.').next().unwrap() + }; + + let tx_coord: u64 = fields[1].parse().unwrap(); if let Some(transcript) = transcripts.get(transcript_id) { diff --git a/src/liftover.rs b/src/liftover.rs index a6631e1..eec1c7c 100644 --- a/src/liftover.rs +++ b/src/liftover.rs @@ -8,6 +8,8 @@ use std::collections::HashMap; pub fn convert_transcriptomic_to_genomic_coordinates( site_fields: &[&str], // input: tab-separated transcriptome position fields annotations: &HashMap, // input: parsed annotation object + has_version: bool + ) -> Option { // return type: Option Result<(), Box> { +pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool, 
has_version: bool) -> Result<(), Box> { // TODO: implement format matching for GFF3 file parsing // let default_format = String::from("gtf"); - //let format = matches.get_one("format").unwrap_or(&default_format); + // let format = matches.get_one("format").unwrap_or(&default_format); let gtf_file: String = matches.get_one::("gtf").unwrap().to_string(); let input_file: String = matches.get_one::("input").unwrap().to_string(); @@ -97,7 +102,7 @@ pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool) -> Result<(), // By default, read in the annotations as GTF file // TODO: implement GFF3 parsing - let annotations = read_annotation_file(>f_file, true)?; + let annotations = read_annotation_file(>f_file, true, has_version)?; // Print the annotations in a table // eprintln!("Previewing transcript annotations\n"); @@ -132,7 +137,7 @@ pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool) -> Result<(), if let Err(e) = site_fields[1].parse::() { eprintln!("Error parsing position from line: '{}'\nError: {}", line.trim(), e); } else if let Some(genomic_coordinates) = - convert_transcriptomic_to_genomic_coordinates(&site_fields, &annotations) + convert_transcriptomic_to_genomic_coordinates(&site_fields, &annotations, has_version) { if let Err(_) = writeln!(output_writer, "{}", genomic_coordinates) { break; diff --git a/src/main.rs b/src/main.rs index 5f269a8..0a05ebd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -39,6 +39,13 @@ fn main() { .help("Indicates that the input file has a header in line 1") .action(clap::ArgAction::SetTrue) ) + .arg( + Arg::new("transcript-version") + .short('t') + .long("transcript-version") + .help("Retain transcript version information (. 
delimited) in col 1") + .action(clap::ArgAction::SetTrue) + ) .arg( Arg::new("output") .short('o') @@ -80,6 +87,13 @@ fn main() { .help("Indicates that the input file has a header in line 1") .action(clap::ArgAction::SetTrue) ) + .arg( + Arg::new("transcript-version") + .short('t') + .long("transcript-version") + .help("Retain transcript version information (. delimited) in col 1") + .action(clap::ArgAction::SetTrue) + ) .arg( Arg::new("output") .short('o') @@ -100,11 +114,11 @@ fn main() { // Handle the liftover subcommand if let Some(liftover_matches) = matches.subcommand_matches("liftover") { let has_header = liftover_matches.get_flag("header"); - + let has_version = liftover_matches.get_flag("transcript-version"); eprintln!("Running liftover..."); - if let Err(e) = liftover::run_liftover(liftover_matches, has_header) { + if let Err(e) = liftover::run_liftover(liftover_matches, has_header, has_version) { eprintln!("Error running liftover: {}", e); } } @@ -113,10 +127,11 @@ fn main() { // Handle the annotate subcommand if let Some(annotate_matches) = matches.subcommand_matches("annotate") { let has_header = annotate_matches.get_flag("header"); - + let has_version = annotate_matches.get_flag("transcript-version"); + eprintln!("Running annotate..."); - if let Err(e) = annotate::run_annotate(annotate_matches, has_header) { + if let Err(e) = annotate::run_annotate(annotate_matches, has_header, has_version) { eprintln!("Error running annotate: {}", e); } } diff --git a/src/parse_gtf.rs b/src/parse_gtf.rs index f92b37d..4ce88af 100644 --- a/src/parse_gtf.rs +++ b/src/parse_gtf.rs @@ -72,7 +72,7 @@ pub fn parse_gff_attributes(attributes: &MultiMap) -> HashMap Result, Box> { +pub fn read_annotation_file(file_path: &str, is_gtf: bool, has_version: bool) -> Result, Box> { let mut transcripts: HashMap = HashMap::new(); let mut ignored_features: HashMap = HashMap::new(); let mut skipped_par_genes = HashSet::new(); // Do not read _PAR_ genes @@ -127,14 +127,19 @@ pub fn 
read_annotation_file(file_path: &str, is_gtf: bool) -> Result id.to_string(), - None => { - warn!("Invalid transcript ID format: {}. Skipping...", transcript_id_with_version); - continue; + + let transcript_id = if has_version { + transcript_id_with_version.to_string() + } else { + match transcript_id_with_version.split('.').next() { + Some(id) => id.to_string(), + None => { + warn!("Invalid transcript ID format: {}. Skipping...", transcript_id_with_version); + continue; + } } }; - + debug!("Transcript ID: {}", transcript_id); if *record.start() > *record.end() {