
Commit

Refactoring to fix network-related bugs
ecerami committed Mar 24, 2023
1 parent 99a09a4 commit 59c0c1b
Showing 42 changed files with 885 additions and 764 deletions.
5 changes: 1 addition & 4 deletions Makefile
@@ -21,11 +21,8 @@ format:
 	$(FORMATTER) $(PROJ_SLUG)
 	$(FORMATTER) tests
 
-qtest: prepare
-	py.test -s tests/
-
 test: prepare
-	py.test -s --cov-report term --cov=$(PROJ_SLUG) tests/
+	py.test -v tests/
 
 coverage:
 	py.test --cov-report html --cov=$(PROJ_SLUG) tests/
78 changes: 45 additions & 33 deletions hdash/cli.py
@@ -1,5 +1,5 @@
 """Command Line Interface (CLI) for generating HTAN Dashboard."""
-from hdash.graph.graph_util import GraphUtil
+from hdash.graph.graph_flattener import GraphFlattener
 import logging
 import emoji
 import click
@@ -9,14 +9,21 @@
 import pandas as pd
 from datetime import datetime
 
-from hdash.stats import stats_summary
 from hdash.synapse.synapse_util import SynapseUtil
 from hdash.google.gsheet_util import GoogleSheetUtil
 from hdash.util.heatmap_util import HeatMapUtil
 from hdash.util.report_writer import ReportWriter
 from hdash.synapse.table_util import TableUtil
 from hdash.validator.htan_validator import HtanValidator
 from synapseclient.core.exceptions import SynapseHTTPError
+from hdash.synapse.meta_map import MetaMap
+from hdash.graph.graph_flattener import GraphFlattener
+from hdash.stats.completeness_summary import CompletenessSummary
+from hdash.graph.graph_creator import GraphCreator
+from hdash.graph.sif_writer import SifWriter
+from hdash.stats.meta_summary import MetaDataSummary
+
 
 
 # Local Project Table, Used when google option is not enabled.
 MASTER_PROJECT_TABLE = "config/htan_projects.csv"
@@ -32,7 +39,12 @@ def cli(verbose):
         log_level = logging.INFO
         log_file_name = "hdash.log"
         print(f"Logging to: {log_file_name}.")
-        logging.basicConfig(filename=log_file_name, filemode='w', level=log_level, format="%(levelname)s:%(message)s")
+        logging.basicConfig(
+            filename=log_file_name,
+            filemode="w",
+            level=log_level,
+            format="%(levelname)s:%(message)s",
+        )
     else:
         logging.basicConfig(level=log_level, format="%(levelname)s:%(message)s")
 
@@ -91,25 +103,36 @@ def _create_dashboard(use_cache, surge, google):
             output_message("Could not retrieve: %s" % meta_file.id)
 
     for project in p_list:
+        # Create the Meta Map
         for meta_file in project.meta_list:
             table_util.annotate_meta_file(meta_file)
-        validator = HtanValidator(project.atlas_id, project.meta_list)
+        meta_map = MetaMap()
+        for meta_file in project.meta_list:
+            meta_map.add_meta_file(meta_file)
+
+        # Create the Graph and Completeness Stats
+        graph_creator = GraphCreator(project.atlas_id, meta_map)
+        flat_graph = GraphFlattener(graph_creator.htan_graph)
+        completeness_stats = CompletenessSummary(project.atlas_id, meta_map, flat_graph)
+
+        # Validate
+        validator = HtanValidator(project.atlas_id, meta_map, graph_creator.htan_graph)
+
+        # Create the Heat Maps
+        heatmap_util = HeatMapUtil(project.atlas_id, completeness_stats)
+
+        # Create the Network SIF
+        sif_writer = SifWriter(graph_creator.htan_graph.directed_graph)
+        project.sif = sif_writer.sif
+
+        # Assess Metadata Completeness
+        MetaDataSummary(meta_map.meta_list_sorted)
+
+        # Store for later reference
+        project.meta_map = meta_map
+        project.flat_graph = flat_graph
+        project.completeness_stats = completeness_stats
         project.validation_list = validator.get_validation_list()
-        node_map = validator.get_node_map()
-        edge_list = validator.get_edge_list()
-        graph_util = GraphUtil(node_map, edge_list)
-        project.data_list = graph_util.data_list
-        project.node_map = validator.get_node_map()
-        project.sif_list = graph_util.sif_list
-        assays_2_biospecimens = graph_util.assays_2_biospecimens
-        stats = stats_summary.StatsSummary(
-            project.atlas_id, validator.meta_map, assays_2_biospecimens
-        )
-        project.participant_id_set = stats.participant_id_set
-        project.df_stats_map = stats.df_stats_map
-        project.participant_2_biopsecimens = graph_util.participant_2_biopsecimens
-        project.assays_2_biospecimens = graph_util.assays_2_biospecimens
-        heatmap_util = HeatMapUtil(project)
         project.heatmap_list = heatmap_util.heatmaps
 
     _write_html(p_list)
Expand All @@ -130,7 +153,7 @@ def _write_html(project_list):
_write_index_html(report_writer)
_write_atlas_html(report_writer)
_write_matrix_html(report_writer)
_write_atlas_cytoscape_json_sif(project_list)
_write_atlas_sif(project_list)


def _write_index_html(report_writer):
@@ -178,21 +201,10 @@ def _deploy_with_surge():
     subprocess.run(["surge", "deploy", "http://htan_dashboard.surge.sh/"])
 
 
-def _write_atlas_cytoscape_json_sif(project_list):
+def _write_atlas_sif(project_list):
     for project in project_list:
         out_name = "deploy/%s_network.sif" % project.atlas_id
         output_message("Writing to: %s." % out_name)
-        sif_list = project.sif_list
         fd = open(out_name, "w")
-        for edge in sif_list:
-            fd.write("%s\tconnect\t%s\n" % (edge[0], edge[1]))
-        fd.close()
-
-        out_name = "deploy/%s_nodes.txt" % project.atlas_id
-        output_message("Writing to: %s." % out_name)
-        fd = open(out_name, "w")
-        fd.write("ID\tCATEGORY\n")
-        for key in project.node_map:
-            node = project.node_map[key]
-            fd.write("%s\t%s\n" % (node.sif_id, node.category))
+        fd.write(project.sif)
         fd.close()
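Note: _write_atlas_sif now just dumps the project.sif string that _create_dashboard obtained from SifWriter. The new hdash/graph/sif_writer.py is among the 42 changed files but is not shown in this excerpt; the sketch below is only a guess at its shape, inferred from the two call sites (SifWriter(directed_graph) plus a .sif attribute) and the tab-separated "connect" rows the deleted code used to write by hand.

# Hypothetical sketch only -- not the committed sif_writer.py, which is
# not shown above. Assumed interface: the constructor takes a networkx
# DiGraph; .sif holds one "source<TAB>connect<TAB>target" row per edge.
import networkx as nx


class SifWriter:
    """Render a directed graph in Simple Interaction Format (SIF)."""

    def __init__(self, directed_graph: nx.DiGraph):
        rows = ["%s\tconnect\t%s" % (s, t) for s, t in directed_graph.edges]
        self.sif = "\n".join(rows) + "\n"


g = nx.DiGraph()
g.add_edge("HTA1_1", "HTA1_1_1")
print(SifWriter(g).sif)  # prints: HTA1_1	connect	HTA1_1_1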
25 changes: 0 additions & 25 deletions hdash/graph/graph.py

This file was deleted.

106 changes: 106 additions & 0 deletions hdash/graph/graph_creator.py
@@ -0,0 +1,106 @@
"""Graph Util Class."""

from hdash.synapse.meta_map import MetaMap
from hdash.util.id_util import IdUtil
from hdash.validator.categories import Categories
from hdash.graph.node_data import NodeData
from hdash.graph.htan_graph import HtanGraph


class GraphCreator:
"""
Creates a Graph of Atlas Nodes.
Given a set of MetaFiles, create an HTAN Graph.
This enables us to link patients --> biospecimens --> assays.
"""

def __init__(self, atlas_id, meta_map: MetaMap):
"""Default Constructor."""
self._atlas_id = atlas_id
self._graph = HtanGraph()
self._meta_map = meta_map
self._categories = Categories()
self._id_util = IdUtil()
self.__gather_nodes()
self.__gather_edges()

@property
def htan_graph(self):
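        """Get the constructed HTAN Graph."""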
return self._graph

def __gather_nodes(self):
"""Gather all nodes."""
self.__gather_nodes_by_category(self._categories.DEMOGRAPHICS)
self.__gather_nodes_by_category(self._categories.BIOSPECIMEN)
self.__gather_nodes_by_category(self._categories.SRRS_BIOSPECIMEN)
for category in self._categories.all_assays:
self.__gather_nodes_by_category(category)

def __gather_nodes_by_category(self, category):
"""Gather all Nodes in the Specified Category."""
meta_file_list = self._meta_map.get_meta_file_list(category)
for meta_file in meta_file_list:
df = meta_file.df
primary_id = self._id_util.get_primary_id_column(category)
id_list = df[primary_id].to_list()

# Each Primary ID Gets its Own Node
for current_id in id_list:
current_id = str(current_id)
node_data = NodeData(current_id, meta_file)
self._graph.add_node(node_data)

def __gather_edges(self):
"""Gather all the edges."""
for category in self._categories.all_categories:
self.__gather_edges_by_category(category)

def __gather_edges_by_category(self, category):
meta_file_list = self._meta_map.get_meta_file_list(category)
for meta_file in meta_file_list:
df = meta_file.df
primary_id_col = self._id_util.get_primary_id_column(category)
parent_id_col = self._id_util.get_parent_id_column(category)
adj_id_col = self._id_util.get_adjacent_id_column(category)
if parent_id_col is not None:
self.__gather_child_parent_edges(df, primary_id_col, parent_id_col)
if adj_id_col is not None and adj_id_col in df.columns:
self.__gather_adjacent_edges(df, primary_id_col, adj_id_col)

def __gather_child_parent_edges(self, df, primary_id_col, parent_id_col):
"""Gather Parent Child Edges."""
for index, row in df.iterrows():
primary_id = str(row[primary_id_col])
parent_id_chunk = str(row[parent_id_col])
parent_id_chunk = self.__handle_htapp_special_case(parent_id_chunk, row)

# We can have multiple parents
parent_id_chunk = parent_id_chunk.replace(";", " ").replace(",", " ")
parts = parent_id_chunk.split()
for part in parts:
parent_id = part.strip()
self._graph.add_edge(parent_id, primary_id)

def __handle_htapp_special_case(self, parent_id_chunk, row):
"""Special case handling for HTAPP/DFCI."""
if parent_id_chunk.startswith("Not Applicable"):
try:
parent_id_chunk = str(row[IdUtil.HTAN_PARENT_BIOSPECIMEN_ID])
except KeyError:
parent_id_chunk = "NOT_APPLICABLE"
return parent_id_chunk

def __gather_adjacent_edges(self, df, primary_id_col, adj_id_col):
"""Gather Adjacent Edges."""
for index, row in df.iterrows():
adj_id_chunk = str(row[adj_id_col])
primary_id = str(row[primary_id_col])

# We can have multiple adjacent nodes
if adj_id_chunk != "nan":
adj_id_chunk = adj_id_chunk.replace(";", " ").replace(",", " ")
parts = adj_id_chunk.split()
for part in parts:
adjacent_id = part.strip()
self._graph.add_adjacency_edge(primary_id, adjacent_id)
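A detail worth noting in __gather_child_parent_edges: a single parent cell can list several IDs separated by semicolons or commas, and both separators are normalized to whitespace before splitting, so every parent contributes its own edge. A standalone run of that splitting rule, using made-up IDs:

# The multi-parent splitting rule from __gather_child_parent_edges,
# applied to a made-up cell value:
parent_id_chunk = "HTA1_1_1; HTA1_1_2,HTA1_1_3"
parent_id_chunk = parent_id_chunk.replace(";", " ").replace(",", " ")
parents = [part.strip() for part in parent_id_chunk.split()]
assert parents == ["HTA1_1_1", "HTA1_1_2", "HTA1_1_3"]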
80 changes: 80 additions & 0 deletions hdash/graph/graph_flattener.py
@@ -0,0 +1,80 @@
"""Graph Flattener Class."""
import networkx as nx
from natsort import natsorted
from hdash.graph.htan_graph import HtanGraph
from hdash.validator.categories import Categories
from hdash.graph.key import KeyUtil


class GraphFlattener:
"""Graph Flattener Class.
Given an HTAN Graph, flatten it so that we can map:
1) Participant --> All Derived Biospecimens.
2) Biospecimen --> All Derived Assays.
"""

def __init__(self, htan_graph: HtanGraph):
"""Default Constructor."""
self.htan_graph = htan_graph
self.directed_graph = htan_graph.directed_graph
self.categories = Categories()
self.participant_2_biopsecimens = {}
self.biospecimen_2_assays = {}
self.assay_map = set()
self.__bin_nodes()
self.__gather_downstream_biospecimens()
self.__gather_downstream_assays()

def biospecimen_has_assay(self, biospecimen_id, category):
"""Determine if the specified biospecimen has the specified assay."""
key = KeyUtil.create_key(biospecimen_id, category)
return key in self.assay_map

def __bin_nodes(self):
"""Bin Participants and Biospecimens."""
self.participant_id_set = set()
self.biospecimen_id_set = set()
for node_id in self.directed_graph.nodes:
data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY]
category = data.meta_file.category
if category == self.categories.DEMOGRAPHICS:
self.participant_id_set.add(node_id)
elif category in self.categories.biospecimen_list:
self.biospecimen_id_set.add(node_id)

# Sort the Participants
self.participant_id_set = natsorted(self.participant_id_set)

def __gather_downstream_biospecimens(self):
"""Given a Participant, find *all* Downstream Biospecimens."""
for participant_id in self.participant_id_set:
nodes = nx.descendants(self.directed_graph, participant_id)

            # Filter Descendants for Biospecimens Only
filtered_list = self.__filter_nodes(nodes, self.categories.biospecimen_list)
self.participant_2_biopsecimens[participant_id] = filtered_list

def __gather_downstream_assays(self):
"""Given a Biospecimen, find *all* Downstream Assays."""
for biospecimen_id in self.biospecimen_id_set:
nodes = nx.descendants(self.directed_graph, biospecimen_id)

            # Filter Descendants for Assays Only
filtered_list = self.__filter_nodes(nodes, self.categories.all_assays)
self.biospecimen_2_assays[biospecimen_id] = filtered_list

# Add to assay map for easy look-up
for node_id in filtered_list:
data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY]
key = KeyUtil.create_key(biospecimen_id, data.meta_file.category)
self.assay_map.add(key)

def __filter_nodes(self, nodes, target_categories):
"""Filter Node List to Only those in the Target Categories."""
filtered_list = []
for node_id in nodes:
data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY]
if data.meta_file.category in target_categories:
filtered_list.append(node_id)
return filtered_list
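Both mappings lean on nx.descendants, which collects every node reachable downstream rather than just direct children; that is what lets an assay performed on a derived biospecimen still count toward the original participant. A minimal illustration with made-up node IDs (the real graph stores NodeData objects under HtanGraph.DATA_KEY):

# Minimal illustration of the flattening idea; node IDs are made up.
import networkx as nx

g = nx.DiGraph()
g.add_edge("participant_1", "biospecimen_1")  # demographics -> biospecimen
g.add_edge("biospecimen_1", "biospecimen_2")  # derived biospecimen
g.add_edge("biospecimen_2", "assay_1")        # assay on the derived sample

# Every node reachable from the participant, at any depth:
assert nx.descendants(g, "participant_1") == {
    "biospecimen_1",
    "biospecimen_2",
    "assay_1",
}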