diff --git a/PUBLISHING.rst b/PUBLISHING.rst index 85a45384..a27ae5ed 100644 --- a/PUBLISHING.rst +++ b/PUBLISHING.rst @@ -42,7 +42,7 @@ Once changes have been made to develop that are ready to be published, first cho Then go through the steps of merging the changes into the master branch: -#. Run pytest and make sure all the tests pass. Then run :code:`./test/cmdline_tests.sh` and make sure those tests pass. +#. Run :code:`pytest` and make sure all the tests pass. Then run :code:`./test/cmdline_tests.sh` and make sure those tests pass. #. Change the 'Unreleased Changes' section of :code:`RELEASE_NOTES.rst` to the new version number. #. Check if any changes have been made that have not yet been documented in the release notes. If so, document them. #. Update the version number in setup.py diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 16e993f6..619e9179 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -1,3 +1,11 @@ +5.0.2 +----- + +Bug fixes: + +* MergeSTR now will no longer sometimes emit an alternate allele identical to the ref allele when + dealing with flanking base pairs. + 5.0.1 ----- diff --git a/setup.py b/setup.py index 41fb66e3..bfb4deb8 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ curdir = os.path.abspath(os.path.dirname(__file__)) MAJ = 5 MIN = 0 -REV = 1 +REV = 2 VERSION = '%d.%d.%d' % (MAJ, MIN, REV) with open(os.path.join(curdir, 'trtools/version.py'), 'w') as fout: fout.write( diff --git a/trtools/mergeSTR/README.rst b/trtools/mergeSTR/README.rst index 045fa91c..3ffc0883 100644 --- a/trtools/mergeSTR/README.rst +++ b/trtools/mergeSTR/README.rst @@ -9,7 +9,7 @@ MergeSTR If TR genotyping was performed separately on different samples or batches of samples, mergeSTR can be used to combine the resulting VCFs into one file. This is often necessary for downstream steps such as: computing per-locus statistics, performing per-locus filtering, and association testing. -While other VCF libraries have capabilities to merge VCF files, they do not always handle multi-allelic TRs properly, especially if the allele definitions are different across files. MergeSTR is TR-aware and currently handles VCF files obtained by: GangSTR, HipSTR, ExpansionHunter, popSTR, or adVNTR. See below for specific VCF fields supported for each genotyper. +While other VCF libraries have capabilities to merge VCF files, they do not always handle multi-allelic TRs properly, especially if the allele definitions are different across files. MergeSTR is TR-aware. See below for specific VCF fields supported for each genotyper. mergeSTR does not support merging VCFs produced by different TR genotypers - that is a more complex usecase, and we are designing a separate tool to do that. diff --git a/trtools/mergeSTR/mergeSTR.py b/trtools/mergeSTR/mergeSTR.py index 142e7e92..56b2d319 100644 --- a/trtools/mergeSTR/mergeSTR.py +++ b/trtools/mergeSTR/mergeSTR.py @@ -215,12 +215,14 @@ def GetAltsByKey(current_records: List[trh.TRRecord], mergelist: List[bool], key return result -def GetAltAlleles(current_records: List[trh.TRRecord], mergelist: List[bool], vcftype: Union[str, trh.VcfTypes]) \ +def GetAltAlleles(ref_allele: str, current_records: List[trh.TRRecord], mergelist: List[bool], vcftype: Union[str, trh.VcfTypes]) \ -> Tuple[List[str], List[np.ndarray]]: r"""Get list of alt alleles Parameters ---------- + ref_allele: + The (flank trimmed) reference allele, in upper case current_records : list of vcf.Record List of records being merged mergelist : list of bool @@ -260,6 +262,12 @@ def HipstrKey(record: trh.TRRecord): alt_picker = HipstrKey alts = GetAltsByKey(current_records, mergelist, alt_picker) + # normally the reference allele is distinct from all alt alleles + # however, if the record has flanking base pairs which are then removed, + # then the ref allele AAAAT and alt allele AAAAC would both be AAAA and + # would be duplicate, which shouldn't occur + if ref_allele in alts: + alts.remove(ref_allele) if vcftype == trh.VcfTypes.eh: # EH alleles look like where 42 is the @@ -272,12 +280,15 @@ def HipstrKey(record: trh.TRRecord): else: out_alts = sorted(alts, key=lambda x: (len(x), x)) + alleles = [ref_allele] + alleles.extend(out_alts) + mappings = [] for i in range(len(mergelist)): if mergelist[i]: ralts = alt_picker(current_records[i]) mappings.append( - np.array([0] + [out_alts.index(ralt.upper()) + 1 for ralt in ralts]).astype(str) + np.array([0] + [alleles.index(ralt.upper()) for ralt in ralts]).astype(str) ) return out_alts, mappings @@ -466,7 +477,7 @@ def MergeRecords(readers: cyvcf2.VCF, vcftype: Union[str, trh.VcfTypes], num_sam common.WARNING("Conflicting refs found at {}:{}. Skipping.".format(chrom, pos)) return - alt_alleles, mappings = GetAltAlleles(current_records, mergelist, vcftype) + alt_alleles, mappings = GetAltAlleles(ref_allele, current_records, mergelist, vcftype) # Set common fields vcfw.write(chrom) # CHROM diff --git a/trtools/mergeSTR/tests/test_mergeSTR.py b/trtools/mergeSTR/tests/test_mergeSTR.py index 618e4442..82d813d8 100644 --- a/trtools/mergeSTR/tests/test_mergeSTR.py +++ b/trtools/mergeSTR/tests/test_mergeSTR.py @@ -1,5 +1,7 @@ import argparse import os + +import cyvcf2 import numpy as np import pytest @@ -199,9 +201,20 @@ def test_MissingFieldWarnings(capsys, args, mrgvcfdir): captured = capsys.readouterr() assert "Expected format field DP not found" in captured.err +def test_alt_same_len_as_ref_different_flanking(args, mrgvcfdir): + fname1 = os.path.join(mrgvcfdir, "test_file_hipstr1.vcf.gz") + fname2 = os.path.join(mrgvcfdir, "test_file_hipstr2_alt_v_ref.vcf.gz") + args.vcfs = fname1 + "," + fname2 + main(args) + + vcf = cyvcf2.VCF(args.out + '.vcf') + var = next(vcf) + for alt in var.ALT: + assert alt != var.REF + def test_ConflictingRefs(): # Set up dummy records - dummy_records = [] + dummy_records = [] dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG')) dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAGCAG')) dummy_records.append(DummyHarmonizedRecord('chr1', 100, 'CAG')) @@ -337,3 +350,4 @@ def test_popstr_output(args, mrgvcfdir): # what if there are multiple records at the same location in the same VCF # if a genotype is no called but there is other format info, # we aren't emitting it. Is that intended? +# TODO write test where sample and allele orderings change diff --git a/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/test_file_hipstr2_alt_v_ref.vcf.gz b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/test_file_hipstr2_alt_v_ref.vcf.gz new file mode 100644 index 00000000..70877bad Binary files /dev/null and b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/test_file_hipstr2_alt_v_ref.vcf.gz differ diff --git a/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/test_file_hipstr2_alt_v_ref.vcf.gz.tbi b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/test_file_hipstr2_alt_v_ref.vcf.gz.tbi new file mode 100644 index 00000000..e34d3e3c Binary files /dev/null and b/trtools/testsupport/sample_vcfs/mergeSTR_vcfs/test_file_hipstr2_alt_v_ref.vcf.gz.tbi differ diff --git a/trtools/version.py b/trtools/version.py index adac3ff1..e4741ae5 100644 --- a/trtools/version.py +++ b/trtools/version.py @@ -1,4 +1,4 @@ # THIS FILE IS GENERATED FROM SETUP.PY -version = '5.0.1' +version = '5.0.2' __version__ = version \ No newline at end of file