Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add stop codon to table and other small fixes #138

Merged
merged 10 commits into from
Dec 1, 2021
20 changes: 20 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,23 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

### Added
- Support for including stop codons in the panel with the `*` character.
- Panel version is now included in JSON output [[#119][119]]
- Shorthand CLI versions for common/long options:
- `-D`: `--min_proportion_expected_depth`
- `-P`: `--custom_probe_set_path`
- `-R`: `--custom_variant_to_resistance_json`
- `-L`: `--custom_lineage_json`
- `-O`: `--format`

### Changed
- Default kmer size (21) made consistent across all subcommands [[#141][141]]

### Fixed
- Improved error messaging when an X amino acid resolves to a mutation already present
in the panel.
- Do not assume reference fasta file name is the same as the chromosome [[#140][140]]

## 0.10.0

Expand Down Expand Up @@ -43,8 +60,11 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

[113]: https://github.com/Mykrobe-tools/mykrobe/issues/113
[114]: https://github.com/Mykrobe-tools/mykrobe/issues/114
[119]: https://github.com/Mykrobe-tools/mykrobe/issues/119
[123]: https://github.com/Mykrobe-tools/mykrobe/issues/123
[127]: https://github.com/Mykrobe-tools/mykrobe/issues/127
[140]: https://github.com/Mykrobe-tools/mykrobe/issues/140
[141]: https://github.com/Mykrobe-tools/mykrobe/issues/141
[Unreleased]: https://github.com/Mykrobe-tools/mykrobe/compare/v0.10.0...HEAD
[mkdtemp]: https://docs.python.org/3.6/library/tempfile.html#tempfile.mkdtemp

1 change: 1 addition & 0 deletions src/mykrobe/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from mykrobe import version
from mykrobe.constants import *
110 changes: 66 additions & 44 deletions src/mykrobe/annotation/genes/models.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from __future__ import print_function

import itertools
import logging
from collections import defaultdict

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data import CodonTable
import itertools
from Bio.Seq import Seq

from mykrobe import STOP
from mykrobe.utils import split_var_name
import logging

logger = logging.getLogger(__name__)
logger.setLevel("INFO")

Expand All @@ -15,7 +20,6 @@ def flatten(l):


class Region(object):

def __init__(self, reference, start, end, forward=True):
self.reference = reference
self.start = start
Expand All @@ -32,9 +36,9 @@ def strand(self):
@property
def seq(self):
if self.forward:
return self.reference[self.start - 1:self.end]
return self.reference[self.start - 1 : self.end]
else:
return self.reference[self.start - 1:self.end].reverse_complement()
return self.reference[self.start - 1 : self.end].reverse_complement()

def get_reference_position(self, pos):
if pos < 0 and self.forward:
Expand All @@ -51,7 +55,6 @@ def get_reference_position(self, pos):


class Gene(Region):

def __init__(self, name, reference, start, end, forward=True):
super(self.__class__, self).__init__(reference, start, end, forward)
self.name = name
Expand All @@ -63,15 +66,15 @@ def prot(self):
return self.seq.translate(table=self.translation_table).rstrip("*")

def get_context(self, pos, N):
return self.seq[(3 * (pos - 1)) - N: pos * 3 + N]
return self.seq[(3 * (pos - 1)) - N : pos * 3 + N]

def get_codon(self, pos):
if pos > len(self.prot):
raise ValueError(
"There are only %s aminoacids in this gene" % len(
self.prot))
"There are only %s aminoacids in this gene" % len(self.prot)
)
else:
return self.seq[(3 * (pos - 1)): pos * 3]
return self.seq[(3 * (pos - 1)) : pos * 3]

def get_reference_codon(self, pos):
if self.forward:
Expand All @@ -89,8 +92,7 @@ def get_reference_codons(self, pos):
if self.forward:
return condons_in_reading_frame
else:
return [str(Seq(s).reverse_complement())
for s in condons_in_reading_frame]
return [str(Seq(s).reverse_complement()) for s in condons_in_reading_frame]

def __str__(self):
return "Gene:%s" % self.name
Expand All @@ -100,7 +102,7 @@ def __repr__(self):


def make_backward_codon_table():
table = {}
table = defaultdict(list)
standard_table = CodonTable.unambiguous_dna_by_id[11]
codons = generate_all_possible_codons()
for codon in codons:
Expand All @@ -109,16 +111,18 @@ def make_backward_codon_table():
table[standard_table.forward_table[codon]].append(codon)
except:
table[standard_table.forward_table[codon]] = [codon]
else:
table["*"].append(codon)
return table


def generate_all_possible_codons():
return ["".join(subset)
for subset in itertools.product(["A", "T", "C", "G"], repeat=3)]

return [
"".join(subset) for subset in itertools.product(["A", "T", "C", "G"], repeat=3)
]

class GeneAminoAcidChangeToDNAVariants():

class GeneAminoAcidChangeToDNAVariants:
def __init__(self, reference, genbank):
self.reference = self._parse_reference(reference)
self.genbank = self._parse_genbank(genbank)
Expand All @@ -130,13 +134,13 @@ def _parse_reference(self, reference):

def _parse_genbank(self, genbank):
d = {}
with open(genbank, 'r') as infile:
with open(genbank, "r") as infile:
for feat in SeqIO.read(infile, "genbank").features:
if feat.type == "gene":
try:
name = feat.qualifiers.get(
"gene",
feat.qualifiers.get("db_xref"))[0]
"gene", feat.qualifiers.get("db_xref")
)[0]
except TypeError:
pass
else:
Expand All @@ -153,27 +157,31 @@ def _parse_genbank(self, genbank):
self.reference,
start=feat.location.start + 1,
end=feat.location.end,
forward=forward)
forward=forward,
)
return d

def get_alts(self, amino_acid):
if amino_acid == "X":
return flatten(self.backward_codon_table.values())
# exclude stop codons
non_stop_codons = [
v for k, v in self.backward_codon_table.items() if k != STOP
]
return flatten(non_stop_codons)
else:
return self.backward_codon_table[amino_acid]

def get_reference_alts(self, gene, amino_acid):
if gene.forward:
return self.get_alts(amino_acid)
else:
return [str(Seq(s).reverse_complement())
for s in self.get_alts(amino_acid)]
return [str(Seq(s).reverse_complement()) for s in self.get_alts(amino_acid)]

def get_location(self, gene, pos):
if gene.forward:
dna_pos = (3 * (pos - 1)) + 1
else:
dna_pos = (3 * (pos))
dna_pos = 3 * (pos)
return gene.get_reference_position(dna_pos)

def get_variant_names(self, gene, mutation, protein_coding_var=True):
Expand All @@ -185,15 +193,16 @@ def get_variant_names(self, gene, mutation, protein_coding_var=True):
return self._process_coding_mutation(gene, ref, start, alt)
else:
raise ValueError(
"Variants are defined in 1-based coordinates. You can't have pos 0. ")
"Variants are defined in 1-based coordinates. You can't have pos 0. "
)

def _process_DNA_mutation(self, gene, ref, start, alt):
names = []
pos = gene.get_reference_position(start)
if not gene.forward:
pos -= len(ref) - 1
ref = str(Seq(ref).reverse_complement())
if alt != 'X':
if alt != "X":
alt = str(Seq(alt).reverse_complement())
if alt == "X":
for a in ["A", "T", "C", "G"]:
Expand All @@ -204,33 +213,46 @@ def _process_DNA_mutation(self, gene, ref, start, alt):
return names

def _process_coding_mutation(self, gene, ref, start, alt):
logger.debug("Processing gene:{} ref:{} start:{} alt:{}".format(
gene, ref, start, alt))
logger.debug(
"Processing gene:{} ref:{} start:{} alt:{}".format(gene, ref, start, alt)
)
if not gene.prot or start > len(gene.prot):
raise ValueError("Error translating %s_%s " %
(gene, "".join([ref, str(start), alt])))
raise ValueError(
"Error translating %s_%s " % (gene, "".join([ref, str(start), alt]))
)
if not gene.prot[start - 1] == ref:
raise ValueError(
"Error processing %s_%s. The reference at pos %i is not %s, it's %s. " %
(gene, "".join(
[
ref, str(start), alt]), start, ref, gene.prot[
start - 1]))
"Error processing %s_%s. The reference at pos %i is not %s, it's %s. "
% (
gene,
"".join([ref, str(start), alt]),
start,
ref,
gene.prot[start - 1],
)
)
ref_codons = gene.get_reference_codons(start)
alt_codons = self.get_reference_alts(gene, alt)
logger.debug("Reference codons (forward strand equivalent) {}".format(
"".join(ref_codons)))
logger.debug("Alternate codons (forward strand equivalent) {}".format(
"".join(alt_codons)))
logger.debug(
"Reference codons (forward strand equivalent) {}".format(
"".join(ref_codons)
)
)
logger.debug(
"Alternate codons (forward strand equivalent) {}".format(
"".join(alt_codons)
)
)
for ref_codon in ref_codons:
if ref_codon in alt_codons:
alt_codons.remove(ref_codon)
location = self.get_location(gene, start)
alternative = "/".join(alt_codons)
ref_codon = gene.get_reference_codon(start)
names = ["".join(["".join(ref_codon),
str(location),
"".join(alt_codon)]) for alt_codon in alt_codons]
names = [
"".join(["".join(ref_codon), str(location), "".join(alt_codon)])
for alt_codon in alt_codons
]
return names

def get_gene(self, gene):
Expand Down
8 changes: 4 additions & 4 deletions src/mykrobe/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import argparse
import os

DEFAULT_KMER_SIZE = os.environ.get("KMER_SIZE", 21)
DEFAULT_DB_NAME = os.environ.get("DB_NAME", "mykrobe")
from mykrobe import K

sequence_or_graph_parser_mixin = argparse.ArgumentParser(add_help=False)
sequence_or_graph_parser_mixin.add_argument(
Expand All @@ -16,7 +15,7 @@
metavar="kmer",
type=int,
help="K-mer length (default: %(default)d)",
default=DEFAULT_KMER_SIZE,
default=K,
)
sequence_or_graph_parser_mixin.add_argument(
"--tmp", help="Directory to write temporary files to", default=None
Expand Down Expand Up @@ -141,6 +140,7 @@
type=int,
)
genotyping_mixin.add_argument(
"-D",
"--min_proportion_expected_depth",
help="Minimum depth required on the sum of both alleles (default: %(default).2f)",
default=0.3,
Expand All @@ -156,7 +156,7 @@
"-o",
"--output",
type=str,
help="File path to save output json file as. Default is to stdout",
help="File path to save output file as. Default is to stdout",
default="",
)

Expand Down
3 changes: 2 additions & 1 deletion src/mykrobe/cmds/amr.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def run(parser, args):
logger.warning("Setting conf_percent_cutoff to 90 (was not specified, and --ont flag was used)")
else:
args.conf_percent_cutoff = 100
logger.warning("Setting conf_percent_cutoff to 90 (was not specified, and --ont flag was not used)")
logger.warning("Setting conf_percent_cutoff to 100 (was not specified, and --ont flag was not used)")


# conf_percent_cutoff == 100 means that we want to keep all variant calls,
Expand Down Expand Up @@ -446,6 +446,7 @@ def run(parser, args):
"version": {
"mykrobe-predictor": predictor_version,
"mykrobe-atlas": atlas_version,
"panel": ref_data["version"]
},
"genotype_model": args.model,
}
Expand Down
Loading