diff --git a/CHANGELOG.md b/CHANGELOG.md index 00b29d4..de01067 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Change Log +## v0.12.9 - 2025-01-10 + +### Added + +- CR-418 Provide BAM output of assembled SV contig alignments in the joint-call step + - BAM output reflects all assembled SV haplotypes used during genotyping, which can be useful for reviewing SV calls + ## v0.12.8 - 2024-12-13 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index c3ef25e..fab0cff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1765,7 +1765,7 @@ dependencies = [ [[package]] name = "sawfish" -version = "0.12.8" +version = "0.12.9" dependencies = [ "approx", "bio", diff --git a/Cargo.toml b/Cargo.toml index 9af2359..b5c4766 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sawfish" -version = "0.12.8" +version = "0.12.9" authors = ["Chris Saunders "] description = "Structural variant analysis for mapped PacBio HiFi reads" edition = "2021" diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json index 6030546..5968178 100644 --- a/LICENSE-THIRDPARTY.json +++ b/LICENSE-THIRDPARTY.json @@ -1702,7 +1702,7 @@ }, { "name": "sawfish", - "version": "0.12.8", + "version": "0.12.9", "authors": "Chris Saunders ", "repository": null, "license": null, diff --git a/docs/user_guide.md b/docs/user_guide.md index d33f591..057e4c0 100644 --- a/docs/user_guide.md +++ b/docs/user_guide.md @@ -102,7 +102,8 @@ Just as in the single-sample case, note that the reference fasta and all 3 sampl ### Joint call step -The primary user output of the sawfish SV caller is the SV VCF produced by the joint-call step. This file lists all SVs in VCF 4.2 format. Details of the SV representation in this file are provided below. +The primary output of the joint-calling step is the genotyped SV calls for all samples in VCF 4.2 format, written to `${OUTPUT_DIR}/genotyped.sv.vcf.gz`. +Details of the SV representation in this file are provided below. 
#### Quality scores @@ -121,7 +122,7 @@ The following filters may be applied to each VCF record: - `MaxScoringDepth` - Read depth at the SV locus exceeds 1000x, so all scoring and genotyping steps were disabled. - `InvBreakpoint` - This breakpoint is represented as part of a separate VCF inversion record (the inversion record shares the same EVENT ID) -#### SV Types +#### SV types Notes on formatting and representation of SVs are listed below for each major type. @@ -149,7 +150,7 @@ When an inversion is found, a VCF record will be output using the `` symbol #### Overlapping SV formatting -All sawfish SVs are output so that only one allele is described in each VCF record, even if an overlapping SV allele is output at the same locus. The internal SV calling model accounts for up to 2 overlapping alleles per sample during genotyping and quality scoring, reads supporting a 2nd alternate allele at any given locus will be counted as support the reference in output fields such as allele depth (`AD`). This protocol matches standard SV caller formatting conventions. Users interested in a more detailed output format, such as representing overlapping read support on the VCF `<*>` allele can request this for prioritization. +All sawfish SVs are output so that only one allele is described in each VCF record, even if an overlapping SV allele is output at the same locus. The internal SV calling model accounts for up to 2 overlapping alleles per sample during genotyping and quality scoring. Reads which support a 2nd alternate allele at any given locus will be counted as supporting the reference in output fields such as allele depth (`AD`). This protocol matches standard SV caller formatting conventions. Users interested in a more detailed output format, such as representing overlapping read support on the VCF `<*>` allele can request this for prioritization. 
#### Phasing @@ -185,18 +186,53 @@ Note that the number of read QNAME entries should often match the supporting AD ### Discover step -The discover step produces a number of output files in the discover output directory used by sawfish during the subsequent joint calling step. Although these are not intended for direct use, some of the important files are described here: +The discover step produces a number of output files in the discover output directory used by sawfish during the subsequent joint calling step. Although these are not intended for direct use, some of the important files are summarized here: - `assembly.regions.bed` - Describes each region of the genome targeted for assembly. -- `contig.alignment.bam` - This is a BAM file containing the SV locus contigs aligned back to the genome to create candidate SVs for each sample. - `candidate.sv.bcf` - These are the candidate SVs expressed in a simplified format for each sample. These are used as input for joint genotyping together with the aligned candidate contigs. - `discover.settings.json` - Various parameters from the discover step (either user input or default) are recorded in this file. Some of the paths to files like the sample bam and reference fasta will be reused in the joint call step. -### Debug outputs +### Debug outputs from either step -In either run step, the following files are produced to help debug problematic runs: -1. `${OUTPUT_DIR}/sawfish.log` - High level logging output -2. 
`${OUTPUT_DIR}/run_stats.json` - Run statistics and component timings +In either run step, the following files are produced to help debug problematic runs or SV calls: + +- `${OUTPUT_DIR}/sawfish.log` - High level logging output +- `${OUTPUT_DIR}/run_stats.json` - Run statistics and component timings +- `${OUTPUT_DIR}/contig.alignment.bam` - Contigs for assembled SV haplotypes aligned back to the reference genome + +#### Contig alignment format + +SV haplotype contig alignments are output to `${OUTPUT_DIR}/contig.alignment.bam` in either the discover or joint-call steps, and can be useful for reviewing SV calls. For instance, +this file can be viewed in alignment browsers such as IGV. + +Aligned contigs are provided for all single breakpoint SV calls. To find the contig for a given SV, locate the SV's VCF ID field, such as `sawfish:0:2803:1:2`, +and take the prefix from this ID that includes the first three digits, in this case `sawfish:0:2803:1` -- this is the `QNAME` value of the corresponding SV +haplotype alignment(s) in the contig alignment BAM file. + +Contigs are not available for multi-breakpoint events such as inversions. However, contigs are available for each individual breakpoint +comprising the event. + +In addition to standard sequence and alignment information, each contig BAM record includes a custom aux field called `sf` which provides a list of key/value +properties associated with the contig, for instance: + + sf:Z:n_reads:15;hq_range:1500-2311; + +The properties are: + +- `n_reads` - The number of reads used to assemble the contig +- `hq_range` - The high-quality assembled region of the contig, prior to appending any flanking read sequence anchors. 
+ +An example contig alignment bam record is: + +``` +sawfish:0:92:0 0 chr1 1649635 20 211=1X108=2I38=1X201=1X86=20I89=1I12=1X281=6D17=1X65=1D22=1X359=1X129=1X154=53D99=1X210=1X140=1X134=1X73=1X55=1X400=1X186=1X133=1X233=1X44=1D74=1635D9= * 0 0 TCCCTAATGAGAAATAAAGTGTCATGCAAAGAAACCTCACTTCAAAAATTTCACATGAAGCCGGGCACGGAGGCTTATGCCTGTAATCCTAGCACTTTGGGAGGCTGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCAAGGCCATCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCTGGGCGTGGTGGCAGACACCTGTAATCCCAGCTACTAAGCGAGGCTGAGGCAGAAGAATTGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCACTGCACTACAGCCTGGGCAAAAAAAAAAAAAAAAAACCCACGTGAAACTGAAATTAAGGCCGGGCGCGGTGGCTCACGCCTGTAATTCCAGCACTCTGGGAGGCCGAGGTGGGCGGATCACAAGGTCAGATCGGGACCATCCTGGCTAACACGGTGAAACCCCATCTCTACTAAAAATACAAAAAATTAGCTGGGTGTGGTGGCGGGCACCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCGGGAGAATAGCGTGAACCCGGGAGATGGAATTTGCAGTGAGCTGAGATTGCGCCACTGTACTCCAGCCTGGGTGACAAGCAAGACTCCGTCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAAAGAAATTAAATCAAGAACAGTAAATATTTAAATAAATATTTAAATAATGATGTTAACGTTAAGTAATCCTAATTTTTCTTTTTTTTCTTTTTTTTTTTTTTGAGATGGAGTCTTGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCAATCTCGGCTCACTGCAAGCTCCGCCTCCCGTGTTCACACCATTCTCCTGCCTCAGCCTCCCGAGTAGCTGGTACTACAGGCGCCTGCCACCACGCCCGGCTAATTTTTTGTCTTTTTAGTAGAGACGGGGTTCCACCATGTTAGCCAGGATGGTCTTGATGTTCTGACCTCGTGATCTGCCGGCCTCGGCCTCCCAAAGTGTGGGGATTACAGTTGTGAGCCACCGCGCCCGGCCTTTTTTTTTTTTTTTTTAAAAGAGACAGGGTCTCGCTATATTGGCCAGGCTGGTCTTGAACTCCCGGACTCAATTGATCCTCCAAGTGCTGGGATTACAGGCCTGAGCCACTGCACCCAGCCGAATAATCATGATTTTATGTTAAATAAAAAACTTTGAAAATAAAAAACTATCTGCAGTAAGCGTCTAATTATGAAGAAAGAAGAAAAAAGAAAAACAATTCTGCTATCACAGAAGAGCAGAATTGTAATATTCGTCTTTTAAAACTTTTCCATACTGAATAAACTATAATTATCAGTTTTATAATACAAAAATCACTCTTCACAAAGACTACAGAACAAAGCTTTGCTATCAGTGGGCTTCTCCACTGTGCAATGAAGCCACATTAATTAATCAAGTGTATTTATAATCATGACATTTCAATCGGGCTCCAGGTCCAATTTTCCTAACACCCGTAAGAACCTCTTGATGTTGGTACGAGATCAAACTGCTCAAGCCAAACCCATTCTTTGGACTTGAGCAAATACCCATTTTGGGGTGTGTTTTTCTCCTATACTTGTTGAATTCAGGTCATTTTAAATGTAAACAAACTGCTCCCAAACAATATAATGGGGGAGAGAAAACCCCAGAGGAAAAATGGACTAGCCATTCTGAATGGTCTGTGACACACGCACGCTTTCAGCTAGAGTTTGCTCTCTCTGG
TTTTCGGTCTGTGATACACGCATGCTTTCAGCTGGAGTTTGCTCTCTGTAGCCCCTCTGAATGGTCTGTGACACATGCACGCTTTCAGCTAGAGTACTCTCTCTATAGCCCTTCTGAATGGTCTGTGACACACGCATGCTTTCAGCTAGAGTTTGCTCTCTCTGGTTTTCGGTCTGGGACACATGCATGCTTTTAGCTAGAGTTTGCTCTGTATAGCCCTTCTGAACGGTCTGTGACACACGCATGCTTTCAGCTGGAGTTTGCTCTCTATAGCCCCTCTGAATGGTCTGTGACACACGCATGCTTTCAGATAGAGTATTCTCTCTATAGCCCTTCTGAATGGTCTGTAACACACGCAAGCTTTCAGCTAGAGTTTGCTCTCTCTGGTTTTTGGTCTGTGACACACGCATGCTTTTAGCTAGAGTTTGCTCTGTATAGCCCTTCTGAATGGTCTGTGACACATGCATGCTTTCAGCTAGAGTTTGCTCTCTCTGGTTTTCAGTCTGTGACACACACATGCTTTTAGCTAGAGTTTGCTCTGTATAGCCCTTCTGAATGGTCTGTGACACACGCGTGCTTTCAGCTAGAGTTTGCTCTCTCTGGTTTTTGGTCTGTGACACACGCATGCTTTTAGCTAGTTTGCTCTCATAGCCCTTCTGAACGGTCTGTGACACATGCATGCTTTCAGCTATTCTCTCTATAGCCATTGTGAATGGTCTGTGACACACGCACGCTTTCAGCTAGAGTTTGCTCTTTCTGGTTTTTGGTCTGTGACACACGCATGCTTTCAGCTAGAGTTTGCTCTCTCTGGTTTTCGGTCTGTGACGCACGCATGCTTTTAGCTAGAGTATTCTCTCTATAGCCATTCTGAACGGTCTGTGACACACGTATGCTTTCAGCTAGAGTTTGCTTTCTCTGGTTTTTCAGTGGTGCTCTGGGGAAGGCAGAAGAGTAGGAACAGGAAAGAAACCACACTTGAACATGATGTCAAAGAAAGTAAATGCTTCTGTACCCCCTTCTGCTGAATGGCTACGATGCCTACGTTTCTCTTTTCTCTTTTCATCTTTTCTGTGATGAGCTTTTTCTTTCCGAGACATTTGCTGGGGTGGTTTGATGGCCAAAGAATCATCTTCTTCTCCTCTGAAATAAAACACAACAGCACTGCGTCATGCTTGAGAAAGTGCAAAGAAGCATCAGGCTATTATAAGGTTTCTTCAACCCAGAAAAATGCATGATTCAGACAGGAACAAAGCTGAAACATCATTTAAAAAATTACATTAATTCTCCAACTTTAGGCATCTTTTTTTTCTTTTTTTCTTTTTTTTAGACAGTCTCGCTCTGTTGCCCGGGCTGTAGTGGCACGATCTCGGCTCACTGCAATCTCCACCCTCCGGGTTCATGCCATTCTCTTGCCTCAGCCTCCCGAGTAGCTGGGACTACAGGCGCCCGCCGCCACGCTGGCTAATTTTTGTATTTTTAGTAGAGATGGGGTTTTACCATGTTAGCCAGGATGGTCTTGGTCTCCTGACCTCATGATCCGCCCACCTCGGTCTCCCAAAGTGCTGGGATTACAGGCGTGAGCCACTGCGCCCGGCCTGTATTTATTTTTTTGAGACGGAGTCTCGCTCTGTTGCCCAGGCTGGAATGCAGTGGTACGATCTCGGCTCATTGCAACCTCCCCTTCCAGTCCCAGGTTCAAGCAATTCTCCTGCCTCTGCCTCAGGAGTAGCTGGGATTACAGGCATGCGCCACCACACCCGGCTAATTTTTTATTTTTAGTAGAGACGGGGTTTCACCATATTGGTCAGGCTGGTCTCAAACTTGTGACATCATGATCCACCCACCTCG * sf:Z:n_reads:12;hq_range:1500-2103; +``` + +In this case, the VCF record for the corresponding SV call generated from the contig is: + +``` +chr1 
1651421 sawfish:0:92:0:0 GTAGCCCCTCTGAACGGTCTGTGACACACGCATGCTTTCAGCTAGAGTACTCTA G 504 PASS SVTYPE=DEL;END=1651474;SVLEN=-53;HOMLEN=13;HOMSEQ=TAGCCCCTCTGAA GT:GQ:PL:AD 0/1:387:537,0,387:9,12 +``` ## Usage Details diff --git a/src/cli/joint_call.rs b/src/cli/joint_call.rs index f8a9776..9b337fa 100644 --- a/src/cli/joint_call.rs +++ b/src/cli/joint_call.rs @@ -44,6 +44,14 @@ pub struct JointCallSettings { /// #[arg(long)] pub report_supporting_reads: bool, + + /// Disable filtering out unused contigs and trimming alignment artifacts in + /// the aligned contig output. + /// + /// This is only intended for internal debugging cases + /// + #[arg(hide = true, long)] + pub no_contig_cleanup: bool, } /// Validate settings and update to parameters that can't be processed automatically by clap. diff --git a/src/contig_output.rs b/src/contig_output.rs index 12e5aeb..f549d2b 100644 --- a/src/contig_output.rs +++ b/src/contig_output.rs @@ -43,6 +43,7 @@ impl std::fmt::Debug for ContigAlignmentSegment { /// Assembly contig alignment information /// +#[derive(Clone)] pub struct ContigAlignmentInfo { /// Contig source sample pub sample_index: usize, @@ -100,15 +101,15 @@ fn get_sa_tag_segment( ) } -pub fn convert_contig_alignment_to_bam_record( +fn convert_contig_alignment_to_bam_record( chrom_list: &ChromList, - upper_pkg_name: &str, + pkg_name: &str, contig_alignment: ContigAlignmentInfo, ) -> bam::Record { const CONTIG_MAPQ: u8 = 20; let qname = format!( - "{}_contig:{}:{}:{}", - upper_pkg_name, + "{}:{}:{}:{}", + pkg_name, contig_alignment.sample_index, contig_alignment.cluster_index, contig_alignment.assembly_index @@ -122,6 +123,7 @@ pub fn convert_contig_alignment_to_bam_record( contig_alignment.seq.as_slice(), &quals, ); + record.unset_unmapped(); record.set_tid(this_segment.tid); record.set_pos(this_segment.pos); record.set_mapq(CONTIG_MAPQ); @@ -170,12 +172,41 @@ pub fn convert_contig_alignment_to_bam_record( record } +/// Create bam header for the sawfish contig 
alignments +/// +/// This is a simple header containing just the contig info, and the sawfish commandline as a "PG" entry +/// +fn get_contig_alignment_header(chrom_list: &ChromList, pkg_name: &str) -> bam::header::Header { + let mut new_header = bam::header::Header::new(); + + let mut hd_record = bam::header::HeaderRecord::new(b"HD"); + hd_record.push_tag(b"VN", "1.6"); + hd_record.push_tag(b"SO", "coordinate"); + new_header.push_record(&hd_record); + + for chrom_info in chrom_list.data.iter() { + let mut sq_record = bam::header::HeaderRecord::new(b"SQ"); + sq_record.push_tag(b"SN", &chrom_info.label); + sq_record.push_tag(b"LN", chrom_info.length); + new_header.push_record(&sq_record); + } + + let cmdline = std::env::args().collect::>().join(" "); + let mut pg_record = bam::header::HeaderRecord::new(b"PG"); + pg_record.push_tag(b"PN", pkg_name); + pg_record.push_tag(b"ID", format!("{pkg_name}-{PROGRAM_VERSION}")); + pg_record.push_tag(b"VN", PROGRAM_VERSION); + pg_record.push_tag(b"CL", &cmdline); + + new_header.push_record(&pg_record); + new_header +} + /// Completes writing (and closing) the contig bam file itself /// fn write_contig_alignments_bam( filename: &Path, thread_count: usize, - bam_header: bam::HeaderView, chrom_list: &ChromList, contig_alignments: Vec, ) { @@ -183,35 +214,24 @@ fn write_contig_alignments_bam( // Setup new bam writer for debug output: let mut bam_writer = { - let mut output_bam_header: bam::header::Header = - bam::header::Header::from_template(&bam_header); - let cmdline = std::env::args().collect::>().join(" "); - - let mut pg_record = bam::header::HeaderRecord::new(b"PG"); - pg_record.push_tag(b"PN", pkg_name); - pg_record.push_tag(b"ID", format!("{pkg_name}-{PROGRAM_VERSION}")); - pg_record.push_tag(b"VN", PROGRAM_VERSION); - pg_record.push_tag(b"CL", &cmdline); - output_bam_header.push_record(&pg_record); - + let output_bam_header = get_contig_alignment_header(chrom_list, pkg_name); bam::Writer::from_path(filename, 
`&output_bam_header, bam::Format::Bam).unwrap() }; bam_writer.set_threads(thread_count).unwrap(); - let upper_pkg_name = pkg_name.to_uppercase(); - for contig_alignment in contig_alignments.into_iter() { - let record = - convert_contig_alignment_to_bam_record(chrom_list, &upper_pkg_name, contig_alignment); + for contig_alignment in contig_alignments.into_iter() { + let record = convert_contig_alignment_to_bam_record(chrom_list, pkg_name, contig_alignment); bam_writer.write(&record).unwrap(); } } +/// The bam_header should not already have a sawfish PG entry, to avoid conflicting PG entries which cause problems in IGV +/// pub fn write_contig_alignments( output_dir: &Path, thread_count: usize, - bam_header: bam::HeaderView, chrom_list: &ChromList, + contig_alignment_type_label: &str, mut contig_alignments: Vec, ) { // Sort contig alignments @@ -224,17 +244,11 @@ pub fn write_contig_alignments( let filename = output_dir.join(discover::CONTIG_ALIGNMENT_FILENAME); info!( - "Writing contig alignments to bam file: '{}'", + "Writing {contig_alignment_type_label} contig alignments to bam file: '{}'", filename.display() ); - write_contig_alignments_bam( - &filename, - thread_count, - bam_header, - chrom_list, - contig_alignments, - ); + write_contig_alignments_bam(&filename, thread_count, chrom_list, contig_alignments); // Index bam file // diff --git a/src/joint_call/get_refined_svs.rs b/src/joint_call/get_refined_svs.rs index dc7c1ab..70ca751 100644 --- a/src/joint_call/get_refined_svs.rs +++ b/src/joint_call/get_refined_svs.rs @@ -72,7 +72,11 @@ fn get_high_quality_contig_range(record: &bam::Record) -> Option { x } -/// Parse BAM record from sawfish contig alignment file into the sample_assembles structure to use in haplotype merging and joint GT steps +/// Parse BAM record from sawfish contig alignment file into the sample_assemblies structure to use in haplotype merging +/// and joint GT steps +/// +/// Results are accumulated in the sample_assemblies structure. 
The primary index for this struct is cluster_index, so +/// that it can easily be looked up from any SV cluster_index. /// fn update_sample_assemblies_from_bam_record( chrom_list: &ChromList, @@ -94,7 +98,9 @@ fn update_sample_assemblies_from_bam_record( let words = qname.split(':').collect::>(); assert!(words.len() >= 4); - // Ignore sample index + // Ignore sample index. The joint call routine will add its own sample indexes based on the input order + // of the discover directories + // //let sample_index = words[1].parse::().unwrap(); let cluster_index = words[2].parse::().unwrap(); @@ -126,6 +132,7 @@ fn update_sample_assemblies_from_bam_record( chrom_index: record.tid() as usize, contig_alignment, high_quality_contig_range: high_quality_contig_range.clone(), + is_fwd_strand: true, // The contig output format requires fwd-strand for the primary alignment } }; @@ -155,6 +162,7 @@ fn update_sample_assemblies_from_bam_record( chrom_index: segment.chrom_index, contig_alignment, high_quality_contig_range, + is_fwd_strand: segment.is_fwd_strand, }; cluster_assembly .contig_alignments @@ -269,6 +277,11 @@ fn parse_bnd_alt_allele(chrom_list: &ChromList, alt_allele: &[u8], homlen: i64) /// RefinedSV plus auxiliary data which will be added to the SVGroup struct AnnotatedRefinedSV { refined_sv: RefinedSV, + + /// Assembly index or indexes used to associate the contig back to the SV call + /// + /// This provides a way to clarify which contigs are associated with the variant in overlapping contig regions. + /// contig_map: Vec, } @@ -491,6 +504,10 @@ fn process_bcf_record_to_anno_refine_sv( } } +/// Attempt to convert the full set of AnnotatedRefinedSVs from the same cluster_index into a single-sample SVGroup +/// +/// The group_arsvs will always be cleared by this method. 
+/// fn process_rsv_cluster_to_sv_group( contigs: &SampleAssemblies, expected_copy_number_regions: Option<&GenomeRegionsByChromIndex>, @@ -649,6 +666,7 @@ fn process_rsv_cluster_to_sv_group( } } +/// Take a set of annotated refined SVs from the same cluster, and try to convert these into an SVGroup fn process_rsv_cluster( contigs: &SampleAssemblies, expected_copy_number_regions: Option<&GenomeRegionsByChromIndex>, diff --git a/src/joint_call/merge_haplotypes/merge_sv_shared.rs b/src/joint_call/merge_haplotypes/merge_sv_shared.rs new file mode 100644 index 0000000..4d54bb0 --- /dev/null +++ b/src/joint_call/merge_haplotypes/merge_sv_shared.rs @@ -0,0 +1,45 @@ +//! Shared haplotype merging code for both single-region and multi-region cases +//! + +use crate::sv_group::SVGroup; + +#[derive(Debug)] +pub(super) struct AnnoSVGroup<'a> { + pub sv_group: &'a SVGroup, + pub sample_index: usize, +} + +/// Reformat single-sample SV group into multi-sample format +/// +pub(super) fn clone_sv_group_as_multisample( + sample_count: usize, + input_sv_group_info: &AnnoSVGroup, +) -> SVGroup { + let mut multisample_sv_group = input_sv_group_info.sv_group.clone(); + if sample_count != 1 { + // The single sample sv_group input has all information set to a sample_index of 0. + // The following steps move the sample index to its new value for all sample_index + // annotations within SVGroup + // + let haplotype_source_sample_index = input_sv_group_info.sample_index; + + // 1. Move sample_haplotypes to new sample_index value + { + let sample_haplotypes = multisample_sv_group.sample_haplotype_list.pop().unwrap(); + multisample_sv_group.sample_haplotype_list = vec![Vec::new(); sample_count]; + multisample_sv_group.sample_haplotype_list[haplotype_source_sample_index] = + sample_haplotypes; + } + + // 2. Move SV ids to new sample_index value + for rsv in multisample_sv_group.refined_svs.iter_mut() { + rsv.id.sample_index = haplotype_source_sample_index; + } + + // 3. 
Move group haplotypes to the new sample_index value + for group_haplotype in multisample_sv_group.group_haplotypes.iter_mut() { + group_haplotype.hap_id.sample_index = haplotype_source_sample_index; + } + } + multisample_sv_group +} diff --git a/src/joint_call/merge_haplotypes/mod.rs b/src/joint_call/merge_haplotypes/mod.rs index 1b581dd..e43fe19 100644 --- a/src/joint_call/merge_haplotypes/mod.rs +++ b/src/joint_call/merge_haplotypes/mod.rs @@ -1,3 +1,4 @@ +mod merge_sv_shared; mod multi_region; mod single_region; diff --git a/src/joint_call/merge_haplotypes/multi_region.rs b/src/joint_call/merge_haplotypes/multi_region.rs index 45d63f4..98b085c 100644 --- a/src/joint_call/merge_haplotypes/multi_region.rs +++ b/src/joint_call/merge_haplotypes/multi_region.rs @@ -2,6 +2,7 @@ use std::collections::BinaryHeap; use rust_vc_utils::ChromList; +use super::merge_sv_shared::{clone_sv_group_as_multisample, AnnoSVGroup}; use super::{get_duplicate_stats, CandidateSVGroupInfo}; use crate::bio_align_utils::{print_pairwise_alignment, PairwiseAligner}; use crate::breakpoint::Breakpoint; @@ -93,41 +94,6 @@ fn test_for_duplicate_haplotypes(sv_group1: &SVGroup, sv_group2: &SVGroup) -> bo test_for_duplicate_haplotypes_with_alignment(sv_group1, sv_group2, debug_duptest) } -struct AnnoSVGroup<'a> { - sv_group: &'a SVGroup, - sample_index: usize, -} - -/// Reformat single-sample SV group into multi-sample format -/// -fn clone_sv_group_as_multisample( - sample_count: usize, - input_sv_group_info: &AnnoSVGroup, -) -> SVGroup { - let mut multisample_sv_group = input_sv_group_info.sv_group.clone(); - if sample_count != 1 { - // The single sample sv_group input has all information set to a sample_index of 0. - // The following steps move the sample index to its new value, and extends - // sample_haplotype_list to length sample_count. - // - let haplotype_source_sample_index = input_sv_group_info.sample_index; - - // 1. 
Move sample_haplotypes to new sample_index value - { - let sample_haplotypes = multisample_sv_group.sample_haplotype_list.pop().unwrap(); - multisample_sv_group.sample_haplotype_list = vec![Vec::new(); sample_count]; - multisample_sv_group.sample_haplotype_list[haplotype_source_sample_index] = - sample_haplotypes; - } - - // 2. Move SV ids to new sample_index value - for rsv in multisample_sv_group.refined_svs.iter_mut() { - rsv.id.sample_index = haplotype_source_sample_index; - } - } - multisample_sv_group -} - /// Merge candidates from the duplicate candidate pool /// /// 1. Determine how many SVGroups to keep out of the pool members diff --git a/src/joint_call/merge_haplotypes/single_region.rs b/src/joint_call/merge_haplotypes/single_region.rs index 5976880..78d8a6e 100644 --- a/src/joint_call/merge_haplotypes/single_region.rs +++ b/src/joint_call/merge_haplotypes/single_region.rs @@ -1,5 +1,6 @@ use std::collections::{BTreeSet, BinaryHeap}; +use super::merge_sv_shared::{clone_sv_group_as_multisample, AnnoSVGroup}; use super::{get_duplicate_stats, CandidateSVGroupInfo}; use crate::bio_align_utils::{print_pairwise_alignment, PairwiseAligner}; use crate::breakpoint::{Breakend, Breakpoint}; @@ -47,12 +48,19 @@ fn merge_sv_group_into_base_group( y })); - // 2. Concat haplotypes + // 2. Copy input group haplotypes with updated sample_index: + let input_group_haplotype_iter = input_group.group_haplotypes.iter().map(|x| { + let mut y = x.clone(); + y.hap_id.sample_index = input_sample_index; + y + }); + + // 3. Concat haplotypes base_group .group_haplotypes - .extend(input_group.group_haplotypes.clone()); + .extend(input_group_haplotype_iter); - // 3. Update sample_haplotype_list + // 4. 
Update sample_haplotype_list // TODO bring this check back //if !base_group.sample_haplotype_list[input_sample_index].is_empty() { @@ -463,12 +471,6 @@ fn merge_duplicated_candidates( ); } - #[derive(Debug)] - struct AnnoSVGroup<'a> { - sv_group: &'a SVGroup, - sample_index: usize, - } - let sv_groups = duplicate_candidate_pool .iter() .map(|x| AnnoSVGroup { @@ -477,34 +479,12 @@ fn merge_duplicated_candidates( }) .collect::>(); - // Step 1: Arbitrarily pick the first SV group, and format it for multi-sample output. + // Step 1: Arbitrarily pick the first SV group and format it for multi-sample output. // // Use this SV group as the base onto which any other SV groups will be merged. // - let mut merged_sv_group = sv_groups[0].sv_group.clone(); let sample_count = all_sample_data.len(); - if sample_count != 1 { - // Reformat SV group into multi-sample format - // - // The single sample sv_group input is going to come in with all information set - // to a sample_index of 0. The following steps move the sample index to its new value, - // and also extends sample_haplotype_list to length sample_count. - // - let haplotype_source_sample_index = sv_groups[0].sample_index; - - // 1. Move sample_haplotypes to new sample_index value - { - let sample_haplotypes = merged_sv_group.sample_haplotype_list.pop().unwrap(); - merged_sv_group.sample_haplotype_list = vec![Vec::new(); sample_count]; - merged_sv_group.sample_haplotype_list[haplotype_source_sample_index] = - sample_haplotypes; - } - - // 2. 
Move SV ids to new sample_index value - for rsv in merged_sv_group.refined_svs.iter_mut() { - rsv.id.sample_index = haplotype_source_sample_index; - } - } + let mut merged_sv_group = clone_sv_group_as_multisample(sample_count, &sv_groups[0]); // Step 2: Handle the special N==1 case, where we don't have to worry about any merging: if sv_groups.len() == 1 { diff --git a/src/joint_call/mod.rs b/src/joint_call/mod.rs index ea102c0..6533296 100644 --- a/src/joint_call/mod.rs +++ b/src/joint_call/mod.rs @@ -4,6 +4,7 @@ mod joint_call_all_samples; mod merge_haplotypes; mod read_sample_data; mod supporting_read_names; +mod write_merged_contigs; use self::find_inversions::find_inversions; use self::joint_call_all_samples::joint_genotype_all_samples; @@ -21,6 +22,13 @@ pub fn run_joint_call(shared_settings: &SharedSettings, settings: &JointCallSett let (merged_sv_groups, merge_stats) = merge_haplotypes(shared_settings, &shared_data.chrom_list, &all_sample_data); + write_merged_contigs::write_merged_contig_alignments( + shared_settings, + settings, + &shared_data, + &merged_sv_groups, + ); + let enable_phasing = true; let (mut scored_svs, mut score_stats) = joint_genotype_all_samples( shared_settings, diff --git a/src/joint_call/write_merged_contigs.rs b/src/joint_call/write_merged_contigs.rs new file mode 100644 index 0000000..a15f4ba --- /dev/null +++ b/src/joint_call/write_merged_contigs.rs @@ -0,0 +1,121 @@ +use crate::bam_utils::clip_alignment_read_edges; +use crate::contig_output::{write_contig_alignments, ContigAlignmentInfo, ContigAlignmentSegment}; +use crate::joint_call::read_sample_data::SharedJointCallData; +use crate::joint_call::{JointCallSettings, SharedSettings}; +use crate::sv_group::SVGroup; + +/// Convert the multi-sample merged SVGroups haplotypes into the contig +/// alignment format used to write the alingments out to BAM +/// +/// If 'cleanup_mode' is true, this operation is arranged to simplify some of the internal +/// contig complexity for end-users 
to reason about their SV calls. Otherwise, the full +haplotype information is converted as directly as possible to facilitate debugging. +/// +fn convert_merged_sv_groups_to_contig_alignments( + merged_sv_groups: &[SVGroup], + cleanup_mode: bool, +) -> Vec { + let mut contig_alignments = Vec::new(); + + for sv_group in merged_sv_groups.iter() { + let haplotype_to_sv_map = sv_group.haplotype_to_sv_map(); + for (haplotype_index, sv_group_haplotype) in sv_group.group_haplotypes.iter().enumerate() { + // Check if this group haplotype is one that doesn't have an associated SV, and skip the + // haplotype if this is the case. + // + // Note that non-SV haplotypes are maintained in some cases to provide an accurate alignment + // target for the second haplotype in a region, which can improve genotype accuracy. For the + // user-facing contig alignment output we want to simplify this complexity out and only show + // the contigs corresponding to the SV call output. + // + if cleanup_mode && haplotype_to_sv_map[haplotype_index].is_empty() { + continue; + } + + let hap_id = &sv_group_haplotype.hap_id; + let sample_index = hap_id.sample_index; + let cluster_index = hap_id.cluster_index; + let assembly_index = hap_id.assembly_index; + let supporting_read_count = sv_group_haplotype.contig_info.supporting_read_count; + + let segment_count = sv_group_haplotype.contig_info.contig_alignments.len(); + + // Build all segments: + let segments = { + let mut x = Vec::new(); + for segment_id in 0..segment_count { + let segment_alignment = + &sv_group_haplotype.contig_info.contig_alignments[segment_id]; + + // Trim the first and last N bases of the alignment + // + // This is done to remove some of the alignment noise created by the way we're trying to use WFA for + // local contig to reference alignment, which tends to create small alignment segments at the ends of + // each read. 
Ideally we should clean up the artifact instead of trying to remove it here + // + let (trimmed_cigar, trimmed_ref_offset) = if cleanup_mode { + let contig_alignment_edge_trim_size = 20; + clip_alignment_read_edges( + &segment_alignment.contig_alignment.cigar, + contig_alignment_edge_trim_size, + contig_alignment_edge_trim_size, + ) + } else { + (segment_alignment.contig_alignment.cigar.clone(), 0) + }; + + let contig_alignment_segment = ContigAlignmentSegment { + tid: segment_alignment.chrom_index as i32, + pos: segment_alignment.contig_alignment.ref_offset + trimmed_ref_offset, + cigar: trimmed_cigar, + is_fwd_strand: segment_alignment.is_fwd_strand, + }; + x.push(contig_alignment_segment); + } + x + }; + + // Push contig_alignment for each segment: + for segment_id in 0..segment_count { + let segment_alignment = + &sv_group_haplotype.contig_info.contig_alignments[segment_id]; + + let segment_contig_alignment = ContigAlignmentInfo { + sample_index, + cluster_index, + assembly_index, + seq: segment_alignment.contig_seq.clone(), + segment_id, + segments: segments.clone(), + supporting_read_count, + high_quality_contig_range: segment_alignment.high_quality_contig_range.clone(), + }; + contig_alignments.push(segment_contig_alignment); + } + } + } + + contig_alignments +} + +pub fn write_merged_contig_alignments( + shared_settings: &SharedSettings, + settings: &JointCallSettings, + shared_data: &SharedJointCallData, + merged_sv_groups: &[SVGroup], +) { + let sample_count = settings.sample.len(); + assert!(sample_count > 0); + + let cleanup_mode = !settings.no_contig_cleanup; + + let contig_alignments = + convert_merged_sv_groups_to_contig_alignments(merged_sv_groups, cleanup_mode); + write_contig_alignments( + &settings.output_dir, + shared_settings.thread_count, + &shared_data.chrom_list, + "merged", + contig_alignments, + ); +} diff --git a/src/refine_sv/align/mod.rs b/src/refine_sv/align/mod.rs index 9ff449a..1323eb5 100644 --- a/src/refine_sv/align/mod.rs +++ 
b/src/refine_sv/align/mod.rs @@ -529,6 +529,7 @@ fn align_single_ref_region_assemblies( chrom_index: target_segment.chrom_index, contig_alignment: contig_alignment.clone(), high_quality_contig_range: assembly_contig.high_quality_range.clone(), + is_fwd_strand: true, }; let group_haplotype = SVGroupHaplotype { hap_id, @@ -1910,6 +1911,7 @@ fn get_group_haplotype( chrom_index: alt_hap_info.ref_segment1.chrom_index, contig_alignment, high_quality_contig_range: assembly_contig.high_quality_range.clone(), + is_fwd_strand: true, } }; let segment2_alignement = { @@ -1929,6 +1931,7 @@ fn get_group_haplotype( chrom_index: alt_hap_info.ref_segment2.chrom_index, contig_alignment, high_quality_contig_range, + is_fwd_strand: !alt_hap_info.ref_segment2_revcomp, } }; diff --git a/src/refine_sv/mod.rs b/src/refine_sv/mod.rs index cc119e3..fc60b01 100644 --- a/src/refine_sv/mod.rs +++ b/src/refine_sv/mod.rs @@ -15,7 +15,7 @@ use std::time::Instant; use itertools::Itertools; use log::info; -use rust_htslib::bam::{self, Read}; +use rust_htslib::bam; use rust_vc_utils::{downsample_vector, ChromList, GenomeRef, ProgressReporter}; use strum::EnumCount; @@ -879,15 +879,11 @@ pub fn refine_sv_candidates( } // Write out a debug bam file of contig alignments to the reference - let sample_index = 0; - let bam_header = worker_thread_dataset[0].lock().unwrap().bam_readers[sample_index] - .header() - .clone(); write_contig_alignments( &settings.output_dir, shared_settings.thread_count, - bam_header, chrom_list, + "all", contig_alignments, ); diff --git a/src/sv_group.rs b/src/sv_group.rs index 4ce038e..4f42eb3 100644 --- a/src/sv_group.rs +++ b/src/sv_group.rs @@ -43,6 +43,12 @@ pub struct ClusterAssemblyAlignment { pub contig_alignment: SimpleAlignment, pub high_quality_contig_range: IntRange, + + /// Used to indicate if the original segment was mapped to the reverse strand + /// + /// Note that this isn't used by the joint-call routine because contig_seq and high_quality_contig_range will 
already be reversed if is_fwd_seq is false, + /// but this will be used to write out the contig to bam again if needed. + pub is_fwd_strand: bool, } #[derive(Clone, Default)] @@ -50,7 +56,8 @@ pub struct ClusterAssembly { /// Contig alignment for each split alignment segment /// /// For single region SVs there will be only one aligment. For SVs with breakpoints, there should - /// be (1 + breakpoint count) alignments. + /// be (1 + breakpoint count) alignments. The primary alignment should always be listed first, followed + /// by alignments in the SA tag, which will be reordered into 'sequencing order'. /// pub contig_alignments: Vec, pub supporting_read_count: usize,