Skip to content

Commit

Permalink
Merge pull request #698 from nextstrain/count-indels-once
Browse files Browse the repository at this point in the history
Count insertion/deletion events once in pairwise distances
  • Loading branch information
huddlej authored Apr 13, 2021
2 parents c663c95 + 00fe015 commit 727b0e4
Showing 1 changed file with 138 additions and 7 deletions.
145 changes: 138 additions & 7 deletions augur/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@
import Bio.Phylo
from collections import defaultdict
import copy
from itertools import chain
import json
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -198,7 +199,7 @@ def read_distance_map(map_file):
return distance_map


def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map):
def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map, aggregate_function=max):
"""Calculate distance between the two given nodes using the given distance map.
In cases where the distance map between sequences is asymmetric, the first
Expand Down Expand Up @@ -246,6 +247,90 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
>>> get_distance_between_nodes(node_b_sequences, node_a_sequences, distance_map)
0.0
Treat a single indel as one event.
>>> node_a_sequences = {"gene": "ACTG"}
>>> node_b_sequences = {"gene": "A--G"}
>>> distance_map = {"default": 1, "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
1
Use the maximum weight of all sites affected by an indel with a site-specific distance map.
>>> distance_map = {"default": 0, "map": {"gene": {1: 1, 2: 2}}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
2
Use the maximum weight of all mutations at all sites affected by an indel with a mutation-specific distance map.
>>> distance_map = {
... "default": 0,
... "map": {
... "gene": {
... 1: {
... ('C', 'G'): 1,
... ('C', 'A'): 2
... },
... 2: {
... ('T', 'G'): 3,
... ('T', 'A'): 2
... }
... }
... }
... }
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
3
Use the maximum weight of gaps at all sites affected by an indel with a mutation-specific distance map.
>>> distance_map = {
... "default": 0,
... "map": {
... "gene": {
... 1: {
... ('C', '-'): 1,
... ('C', 'A'): 2
... },
... 2: {
... ('T', 'G'): 3,
... ('T', '-'): 2
... }
... }
... }
... }
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
2
If the default value is greater than any of the site-specific mismatches and
the specific mismatch does not have a weighted defined, use the default
weight.
>>> distance_map = {
... "default": 4,
... "map": {
... "gene": {
... 1: {
... ('C', 'G'): 1,
... ('C', 'A'): 2
... },
... 2: {
... ('T', 'G'): 3,
... ('T', 'A'): 2
... }
... }
... }
... }
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
4
Count mismatches adjacent to indel events.
>>> node_a_sequences = {"gene": "ACTGTA"}
>>> node_b_sequences = {"gene": "A--CCA"}
>>> distance_map = {"default": 1, "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
3
Ignore specific characters defined in the distance map.
>>> node_a_sequences = {"gene": "ACTGG"}
Expand All @@ -256,6 +341,7 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
>>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
0
"""
distance_type = type(distance_map["default"])
distance = distance_type(0)
Expand All @@ -264,10 +350,33 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
for gene in node_a_sequences:
gene_length = len(node_a_sequences[gene])

# In a first pass, find all mismatches between the given sequences. Use
# this pass to identify all sites in insertion/deletion (indel) events,
# so we can calculate an aggregate weight per event. Track each event by
# its start site.
mismatches_by_site = defaultdict(set)
in_gap = False
for site in range(gene_length):
if (node_a_sequences[gene][site] != node_b_sequences[gene][site]
and node_a_sequences[gene][site] not in ignored_characters
and node_b_sequences[gene][site] not in ignored_characters):
if node_a_sequences[gene][site] == "-" or node_b_sequences[gene][site] == "-":
if not in_gap:
gap_start = site
in_gap = True

mismatches_by_site[gap_start].add(site)
else:
in_gap = False
mismatches_by_site[site].add(site)
else:
in_gap = False

# Sum distances across mismatched sites, aggregating indel events by a
# summary function (e.g., max, mean, etc.).
for sites in mismatches_by_site.values():
mismatch_distances = []
for site in sites:
if gene in distance_map["map"] and site in distance_map["map"][gene]:
# Distances can be provided as either site- and
# sequence-specific dictionaries of sequence pairs to
Expand All @@ -276,14 +385,36 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
if isinstance(distance_map["map"][gene][site], dict):
seq_ancestral = node_a_sequences[gene][site]
seq_derived = node_b_sequences[gene][site]
distance += distance_map["map"][gene][site].get(
(seq_ancestral, seq_derived),
distance_map["default"]
)

# Check first for a user-defined weight for the
# mismatched bases. This supports mismatch weights
# between specific characters including gaps.
if (seq_ancestral, seq_derived) in distance_map["map"][gene][site]:
mismatch_distances.append(
distance_map["map"][gene][site][(seq_ancestral, seq_derived)]
)
# Next, check whether the mismatch is a gap. We want to
# take the aggregate of all weights at this site.
elif seq_ancestral == "-" or seq_derived == "-":
mismatch_distances.append(
aggregate_function(
chain(
(distance_map["default"],),
distance_map["map"][gene][site].values()
)
)
)
# Finally, use the default weight, if no
# sequence-specific weights are defined.
else:
mismatch_distances.append(distance_map["default"])
else:
distance += distance_map["map"][gene][site]
mismatch_distances.append(distance_map["map"][gene][site])
else:
distance += distance_map["default"]
mismatch_distances.append(distance_map["default"])

# Aggregate the distances for all sites in the current mismatch.
distance += aggregate_function(mismatch_distances)

return distance_type(np.round(distance, 2))

Expand Down

0 comments on commit 727b0e4

Please sign in to comment.