Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Count insertion/deletion events once in pairwise distances #698

Merged
merged 3 commits into from
Apr 13, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 138 additions & 7 deletions augur/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@
import Bio.Phylo
from collections import defaultdict
import copy
from itertools import chain
import json
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -198,7 +199,7 @@ def read_distance_map(map_file):
return distance_map


def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map):
def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map, aggregate_function=max):
"""Calculate distance between the two given nodes using the given distance map.

In cases where the distance map between sequences is asymmetric, the first
Expand Down Expand Up @@ -246,6 +247,90 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
>>> get_distance_between_nodes(node_b_sequences, node_a_sequences, distance_map)
0.0

Treat a single indel as one event.

>>> node_a_sequences = {"gene": "ACTG"}
>>> node_b_sequences = {"gene": "A--G"}
>>> distance_map = {"default": 1, "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
1

Use the maximum weight of all sites affected by an indel with a site-specific distance map.

>>> distance_map = {"default": 0, "map": {"gene": {1: 1, 2: 2}}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
2

Use the maximum weight of all mutations at all sites affected by an indel with a mutation-specific distance map.

>>> distance_map = {
... "default": 0,
... "map": {
... "gene": {
... 1: {
... ('C', 'G'): 1,
... ('C', 'A'): 2
... },
... 2: {
... ('T', 'G'): 3,
... ('T', 'A'): 2
... }
... }
... }
... }
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
3

Use the maximum weight of gaps at all sites affected by an indel with a mutation-specific distance map.

>>> distance_map = {
... "default": 0,
... "map": {
... "gene": {
... 1: {
... ('C', '-'): 1,
... ('C', 'A'): 2
... },
... 2: {
... ('T', 'G'): 3,
... ('T', '-'): 2
... }
... }
... }
... }
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
2

If the default value is greater than any of the site-specific mismatches and
the specific mismatch does not have a weighted defined, use the default
weight.

>>> distance_map = {
... "default": 4,
... "map": {
... "gene": {
... 1: {
... ('C', 'G'): 1,
... ('C', 'A'): 2
... },
... 2: {
... ('T', 'G'): 3,
... ('T', 'A'): 2
... }
... }
... }
... }
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
4

Count mismatches adjacent to indel events.

>>> node_a_sequences = {"gene": "ACTGTA"}
>>> node_b_sequences = {"gene": "A--CCA"}
>>> distance_map = {"default": 1, "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
3

Ignore specific characters defined in the distance map.

>>> node_a_sequences = {"gene": "ACTGG"}
Expand All @@ -256,6 +341,7 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
>>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}}
>>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
0

"""
distance_type = type(distance_map["default"])
distance = distance_type(0)
Expand All @@ -264,10 +350,33 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
for gene in node_a_sequences:
gene_length = len(node_a_sequences[gene])

# In a first pass, find all mismatches between the given sequences. Use
# this pass to identify all sites in insertion/deletion (indel) events,
# so we can calculate an aggregate weight per event. Track each event by
# its start site.
mismatches_by_site = defaultdict(set)
in_gap = False
for site in range(gene_length):
if (node_a_sequences[gene][site] != node_b_sequences[gene][site]
and node_a_sequences[gene][site] not in ignored_characters
and node_b_sequences[gene][site] not in ignored_characters):
if node_a_sequences[gene][site] == "-" or node_b_sequences[gene][site] == "-":
if not in_gap:
gap_start = site
in_gap = True

mismatches_by_site[gap_start].add(site)
else:
in_gap = False
mismatches_by_site[site].add(site)
else:
in_gap = False

# Sum distances across mismatched sites, aggregating indel events by a
# summary function (e.g., max, mean, etc.).
for sites in mismatches_by_site.values():
mismatch_distances = []
for site in sites:
if gene in distance_map["map"] and site in distance_map["map"][gene]:
# Distances can be provided as either site- and
# sequence-specific dictionaries of sequence pairs to
Expand All @@ -276,14 +385,36 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
if isinstance(distance_map["map"][gene][site], dict):
seq_ancestral = node_a_sequences[gene][site]
seq_derived = node_b_sequences[gene][site]
distance += distance_map["map"][gene][site].get(
(seq_ancestral, seq_derived),
distance_map["default"]
)

# Check first for a user-defined weight for the
# mismatched bases. This supports mismatch weights
# between specific characters including gaps.
if (seq_ancestral, seq_derived) in distance_map["map"][gene][site]:
mismatch_distances.append(
distance_map["map"][gene][site][(seq_ancestral, seq_derived)]
)
# Next, check whether the mismatch is a gap. We want to
# take the aggregate of all weights at this site.
elif seq_ancestral == "-" or seq_derived == "-":
mismatch_distances.append(
aggregate_function(
chain(
(distance_map["default"],),
distance_map["map"][gene][site].values()
)
)
)
# Finally, use the default weight, if no
# sequence-specific weights are defined.
else:
mismatch_distances.append(distance_map["default"])
else:
distance += distance_map["map"][gene][site]
mismatch_distances.append(distance_map["map"][gene][site])
else:
distance += distance_map["default"]
mismatch_distances.append(distance_map["default"])

# Aggregate the distances for all sites in the current mismatch.
distance += aggregate_function(mismatch_distances)

return distance_type(np.round(distance, 2))

Expand Down