Skip to content

Commit

Permalink
add -t flag to consider trancsript version during liftover and annota…
Browse files Browse the repository at this point in the history
…tion
  • Loading branch information
pre-mRNA committed Apr 29, 2024
1 parent 066e367 commit 4252f07
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 21 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Arguments:
Options:
-H, --header: Indicates the input file has a header, which will be preserved in the output [Default: False]
-o, --output <OUTPUT>: Path to output file [Default: STDOUT]
-t, --transscript-version: Indicates that '.'-delimited transcript version information is present in col1 and should be considered during liftover [default: False].
```
- Liftover prepends 6 columns to the input file, containing the genome coordinates of the transcript features in BED format
- All data in the original input are preserved in the output and shifted by 6 columns
Expand All @@ -59,6 +59,7 @@ Arguments:
Options:
-H, --header: Indicates the input file has a header, which will be preserved in the output [Default: False]
-o, --output <OUTPUT>: Path to output file [Default: STDOUT]
-t, --transscript-version: Indicates that '.'-delimited transcript version is present in col1 and should be considered during annotation [default: False].
```

Expand Down
14 changes: 11 additions & 3 deletions src/annotate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ fn splice_site_distances(tx_coord: u64, splice_sites: &[SpliceSite]) -> (Option<
}


pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(), Box<dyn Error>> {
pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool, has_version: bool) -> Result<(), Box<dyn Error>> {

// eprintln!("Running the annotate functionality...");

Expand All @@ -140,7 +140,7 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
// TODO: implement GFF3 parsing
// let default_format = String::from("gtf");
// let format = matches.get_one("format").unwrap_or(&default_format);
let annotations = read_annotation_file(&gtf_file, true)?;
let annotations = read_annotation_file(&gtf_file, true, has_version)?;

// Print the annotations in a table
// eprintln!("Previewing transcript annotations\n");
Expand Down Expand Up @@ -173,7 +173,15 @@ pub fn run_annotate(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
let line = line.unwrap();
let fields: Vec<&str> = line.split('\t').collect();
let transcript_id_with_version = fields[0];
let transcript_id = transcript_id_with_version.split('.').next().unwrap();


let transcript_id = if has_version {
transcript_id_with_version
} else {
transcript_id_with_version.split('.').next().unwrap()
};


let tx_coord: u64 = fields[1].parse().unwrap();

if let Some(transcript) = transcripts.get(transcript_id) {
Expand Down
17 changes: 11 additions & 6 deletions src/liftover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ use std::collections::HashMap;
pub fn convert_transcriptomic_to_genomic_coordinates(
site_fields: &[&str], // input: tab-separated transcriptome position fields
annotations: &HashMap<String, Transcript>, // input: parsed annotation object
has_version: bool

) -> Option<String> { // return type: Option<String), either Some(String) or None

// Check if there are at least 4 fields in site_fields
Expand All @@ -18,8 +20,11 @@ pub fn convert_transcriptomic_to_genomic_coordinates(
// Extract the transcript ID with version from the first field
let transcript_id_with_version = site_fields[0];

// Remove the version from transcript ID
let transcript_id = transcript_id_with_version.split('.').next().unwrap();
let transcript_id = if has_version {
transcript_id_with_version
} else {
transcript_id_with_version.split('.').next().unwrap()
};

// Parse the transcriptomic position (second field) as u64
let position: u64 = site_fields[1].parse().unwrap();
Expand Down Expand Up @@ -85,19 +90,19 @@ pub fn convert_transcriptomic_to_genomic_coordinates(
None
}

pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool) -> Result<(), Box<dyn Error>> {
pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool, has_version: bool) -> Result<(), Box<dyn Error>> {

// TODO: implement format matching for GFF3 file parsing
// let default_format = String::from("gtf");
//let format = matches.get_one("format").unwrap_or(&default_format);
// let format = matches.get_one("format").unwrap_or(&default_format);

let gtf_file: String = matches.get_one::<String>("gtf").unwrap().to_string();
let input_file: String = matches.get_one::<String>("input").unwrap().to_string();
let output_file: Option<String> = matches.get_one::<String>("output").map(|s: &String| s.to_string());

// By default, read in the annotations as GTF file
// TODO: implement GFF3 parsing
let annotations = read_annotation_file(&gtf_file, true)?;
let annotations = read_annotation_file(&gtf_file, true, has_version)?;

// Print the annotations in a table
// eprintln!("Previewing transcript annotations\n");
Expand Down Expand Up @@ -132,7 +137,7 @@ pub fn run_liftover(matches: &clap::ArgMatches, has_header: bool) -> Result<(),
if let Err(e) = site_fields[1].parse::<u64>() {
eprintln!("Error parsing position from line: '{}'\nError: {}", line.trim(), e);
} else if let Some(genomic_coordinates) =
convert_transcriptomic_to_genomic_coordinates(&site_fields, &annotations)
convert_transcriptomic_to_genomic_coordinates(&site_fields, &annotations, has_version)
{
if let Err(_) = writeln!(output_writer, "{}", genomic_coordinates) {
break;
Expand Down
23 changes: 19 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ fn main() {
.help("Indicates that the input file has a header in line 1")
.action(clap::ArgAction::SetTrue)
)
.arg(
Arg::new("transcript-version")
.short('t')
.long("transcript-version")
.help("Retain transcript version information (. delimited) in col 1")
.action(clap::ArgAction::SetTrue)
)
.arg(
Arg::new("output")
.short('o')
Expand Down Expand Up @@ -80,6 +87,13 @@ fn main() {
.help("Indicates that the input file has a header in line 1")
.action(clap::ArgAction::SetTrue)
)
.arg(
Arg::new("transcript-version")
.short('t')
.long("transcript-version")
.help("Retain transcript version information (. delimited) in col 1")
.action(clap::ArgAction::SetTrue)
)
.arg(
Arg::new("output")
.short('o')
Expand All @@ -100,11 +114,11 @@ fn main() {
// Handle the liftover subcommand
if let Some(liftover_matches) = matches.subcommand_matches("liftover") {
let has_header = liftover_matches.get_flag("header");

let has_version = liftover_matches.get_flag("transcript-version");

eprintln!("Running liftover...");

if let Err(e) = liftover::run_liftover(liftover_matches, has_header) {
if let Err(e) = liftover::run_liftover(liftover_matches, has_header, has_version) {
eprintln!("Error running liftover: {}", e);
}
}
Expand All @@ -113,10 +127,11 @@ fn main() {
// Handle the annotate subcommand
if let Some(annotate_matches) = matches.subcommand_matches("annotate") {
let has_header = annotate_matches.get_flag("header");

let has_version = annotate_matches.get_flag("transcript-version");

eprintln!("Running annotate...");

if let Err(e) = annotate::run_annotate(annotate_matches, has_header) {
if let Err(e) = annotate::run_annotate(annotate_matches, has_header, has_version) {
eprintln!("Error running annotate: {}", e);
}
}
Expand Down
19 changes: 12 additions & 7 deletions src/parse_gtf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ pub fn parse_gff_attributes(attributes: &MultiMap<String, String>) -> HashMap<St

// Read annotation file
// TODO: Implement parsing for GFF3 files, when is_gtf = False
pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<String, Transcript>, Box<dyn std::error::Error>> {
pub fn read_annotation_file(file_path: &str, is_gtf: bool, has_version: bool) -> Result<HashMap<String, Transcript>, Box<dyn std::error::Error>> {
let mut transcripts: HashMap<String, Transcript> = HashMap::new();
let mut ignored_features: HashMap<String, u32> = HashMap::new();
let mut skipped_par_genes = HashSet::new(); // Do not read _PAR_ genes
Expand Down Expand Up @@ -127,14 +127,19 @@ pub fn read_annotation_file(file_path: &str, is_gtf: bool) -> Result<HashMap<Str
continue;
}
};
let transcript_id = match transcript_id_with_version.split('.').next() {
Some(id) => id.to_string(),
None => {
warn!("Invalid transcript ID format: {}. Skipping...", transcript_id_with_version);
continue;

let transcript_id = if has_version {
transcript_id_with_version.to_string()
} else {
match transcript_id_with_version.split('.').next() {
Some(id) => id.to_string(),
None => {
warn!("Invalid transcript ID format: {}. Skipping...", transcript_id_with_version);
continue;
}
}
};

debug!("Transcript ID: {}", transcript_id);

if *record.start() > *record.end() {
Expand Down

0 comments on commit 4252f07

Please sign in to comment.