Merge pull request #698 from nextstrain/count-indels-once

Count insertion/deletion events once in pairwise distances
nextstrain · Apr 13, 2021 · 727b0e4 · 727b0e4
2 parents c663c95 + 00fe015
commit 727b0e4
Showing 1 changed file with 138 additions and 7 deletions.
diff --git a/augur/distance.py b/augur/distance.py
@@ -135,6 +135,7 @@
 import Bio.Phylo
 from collections import defaultdict
 import copy
+from itertools import chain
 import json
 import numpy as np
 import pandas as pd
@@ -198,7 +199,7 @@ def read_distance_map(map_file):
     return distance_map
 
 
-def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map):
+def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map, aggregate_function=max):
     """Calculate distance between the two given nodes using the given distance map.
 
     In cases where the distance map between sequences is asymmetric, the first
@@ -246,6 +247,90 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
     >>> get_distance_between_nodes(node_b_sequences, node_a_sequences, distance_map)
     0.0
 
+    Treat a single indel as one event.
+
+    >>> node_a_sequences = {"gene": "ACTG"}
+    >>> node_b_sequences = {"gene": "A--G"}
+    >>> distance_map = {"default": 1, "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    1
+
+    Use the maximum weight of all sites affected by an indel with a site-specific distance map.
+
+    >>> distance_map = {"default": 0, "map": {"gene": {1: 1, 2: 2}}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    2
+
+    Use the maximum weight of all mutations at all sites affected by an indel with a mutation-specific distance map.
+
+    >>> distance_map = {
+    ...     "default": 0,
+    ...     "map": {
+    ...         "gene": {
+    ...             1: {
+    ...                 ('C', 'G'): 1,
+    ...                 ('C', 'A'): 2
+    ...             },
+    ...             2: {
+    ...                 ('T', 'G'): 3,
+    ...                 ('T', 'A'): 2
+    ...             }
+    ...         }
+    ...     }
+    ... }
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    3
+
+    Use the maximum weight of gaps at all sites affected by an indel with a mutation-specific distance map.
+
+    >>> distance_map = {
+    ...     "default": 0,
+    ...     "map": {
+    ...         "gene": {
+    ...             1: {
+    ...                 ('C', '-'): 1,
+    ...                 ('C', 'A'): 2
+    ...             },
+    ...             2: {
+    ...                 ('T', 'G'): 3,
+    ...                 ('T', '-'): 2
+    ...             }
+    ...         }
+    ...     }
+    ... }
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    2
+
+    If the default value is greater than any of the site-specific mismatches and
+    the specific mismatch does not have a weighted defined, use the default
+    weight.
+
+    >>> distance_map = {
+    ...     "default": 4,
+    ...     "map": {
+    ...         "gene": {
+    ...             1: {
+    ...                 ('C', 'G'): 1,
+    ...                 ('C', 'A'): 2
+    ...             },
+    ...             2: {
+    ...                 ('T', 'G'): 3,
+    ...                 ('T', 'A'): 2
+    ...             }
+    ...         }
+    ...     }
+    ... }
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    4
+
+    Count mismatches adjacent to indel events.
+
+    >>> node_a_sequences = {"gene": "ACTGTA"}
+    >>> node_b_sequences = {"gene": "A--CCA"}
+    >>> distance_map = {"default": 1, "map": {}}
+    >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
+    3
+
     Ignore specific characters defined in the distance map.
 
     >>> node_a_sequences = {"gene": "ACTGG"}
@@ -256,6 +341,7 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
     >>> distance_map = {"default": 1, "ignored_characters":["-", "N"], "map": {}}
     >>> get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
     0
+
     """
     distance_type = type(distance_map["default"])
     distance = distance_type(0)
@@ -264,10 +350,33 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
     for gene in node_a_sequences:
         gene_length = len(node_a_sequences[gene])
 
+        # In a first pass, find all mismatches between the given sequences. Use
+        # this pass to identify all sites in insertion/deletion (indel) events,
+        # so we can calculate an aggregate weight per event. Track each event by
+        # its start site.
+        mismatches_by_site = defaultdict(set)
+        in_gap = False
         for site in range(gene_length):
             if (node_a_sequences[gene][site] != node_b_sequences[gene][site]
                 and node_a_sequences[gene][site] not in ignored_characters
                 and node_b_sequences[gene][site] not in ignored_characters):
+                if node_a_sequences[gene][site] == "-" or node_b_sequences[gene][site] == "-":
+                    if not in_gap:
+                        gap_start = site
+                        in_gap = True
+
+                    mismatches_by_site[gap_start].add(site)
+                else:
+                    in_gap = False
+                    mismatches_by_site[site].add(site)
+            else:
+                in_gap = False
+
+        # Sum distances across mismatched sites, aggregating indel events by a
+        # summary function (e.g., max, mean, etc.).
+        for sites in mismatches_by_site.values():
+            mismatch_distances = []
+            for site in sites:
                 if gene in distance_map["map"] and site in distance_map["map"][gene]:
                     # Distances can be provided as either site- and
                     # sequence-specific dictionaries of sequence pairs to
@@ -276,14 +385,36 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map)
                     if isinstance(distance_map["map"][gene][site], dict):
                         seq_ancestral = node_a_sequences[gene][site]
                         seq_derived = node_b_sequences[gene][site]
-                        distance += distance_map["map"][gene][site].get(
-                            (seq_ancestral, seq_derived),
-                            distance_map["default"]
-                        )
+
+                        # Check first for a user-defined weight for the
+                        # mismatched bases. This supports mismatch weights
+                        # between specific characters including gaps.
+                        if (seq_ancestral, seq_derived) in distance_map["map"][gene][site]:
+                            mismatch_distances.append(
+                                distance_map["map"][gene][site][(seq_ancestral, seq_derived)]
+                            )
+                        # Next, check whether the mismatch is a gap. We want to
+                        # take the aggregate of all weights at this site.
+                        elif seq_ancestral == "-" or seq_derived == "-":
+                            mismatch_distances.append(
+                                aggregate_function(
+                                    chain(
+                                        (distance_map["default"],),
+                                        distance_map["map"][gene][site].values()
+                                    )
+                                )
+                            )
+                        # Finally, use the default weight, if no
+                        # sequence-specific weights are defined.
+                        else:
+                            mismatch_distances.append(distance_map["default"])
                     else:
-                        distance += distance_map["map"][gene][site]
+                        mismatch_distances.append(distance_map["map"][gene][site])
                 else:
-                    distance += distance_map["default"]
+                    mismatch_distances.append(distance_map["default"])
+
+            # Aggregate the distances for all sites in the current mismatch.
+            distance += aggregate_function(mismatch_distances)
 
     return distance_type(np.round(distance, 2))