
Commit

Refactoring to fix network-related bugs
ecerami committed Mar 24, 2023
1 parent 99a09a4 commit 59c0c1b
Showing 42 changed files with 885 additions and 764 deletions.
5 changes: 1 addition & 4 deletions Makefile
@@ -21,11 +21,8 @@ format:
 	$(FORMATTER) $(PROJ_SLUG)
 	$(FORMATTER) tests
 
-qtest: prepare
-	py.test -s tests/
-
 test: prepare
-	py.test -s --cov-report term --cov=$(PROJ_SLUG) tests/
+	py.test -v tests/
 
 coverage:
 	py.test --cov-report html --cov=$(PROJ_SLUG) tests/
78 changes: 45 additions & 33 deletions hdash/cli.py
@@ -1,5 +1,5 @@
 """Command Line Interface (CLI) for generating HTAN Dashboard."""
-from hdash.graph.graph_util import GraphUtil
+from hdash.graph.graph_flattener import GraphFlattener
 import logging
 import emoji
 import click
@@ -9,14 +9,21 @@
 import pandas as pd
 from datetime import datetime
 
-from hdash.stats import stats_summary
 from hdash.synapse.synapse_util import SynapseUtil
 from hdash.google.gsheet_util import GoogleSheetUtil
 from hdash.util.heatmap_util import HeatMapUtil
 from hdash.util.report_writer import ReportWriter
 from hdash.synapse.table_util import TableUtil
 from hdash.validator.htan_validator import HtanValidator
 from synapseclient.core.exceptions import SynapseHTTPError
+from hdash.synapse.meta_map import MetaMap
+from hdash.graph.graph_flattener import GraphFlattener
+from hdash.stats.completeness_summary import CompletenessSummary
+from hdash.graph.graph_creator import GraphCreator
+from hdash.graph.sif_writer import SifWriter
+from hdash.stats.meta_summary import MetaDataSummary
+
 
 
 # Local Project Table, Used when google option is not enabled.
 MASTER_PROJECT_TABLE = "config/htan_projects.csv"
@@ -32,7 +39,12 @@ def cli(verbose):
         log_level = logging.INFO
         log_file_name = "hdash.log"
         print(f"Logging to: {log_file_name}.")
-        logging.basicConfig(filename=log_file_name, filemode='w', level=log_level, format="%(levelname)s:%(message)s")
+        logging.basicConfig(
+            filename=log_file_name,
+            filemode="w",
+            level=log_level,
+            format="%(levelname)s:%(message)s",
+        )
     else:
         logging.basicConfig(level=log_level, format="%(levelname)s:%(message)s")
 
@@ -91,25 +103,36 @@ def _create_dashboard(use_cache, surge, google):
             output_message("Could not retrieve: %s" % meta_file.id)
 
     for project in p_list:
+        # Create the Meta Map
         for meta_file in project.meta_list:
             table_util.annotate_meta_file(meta_file)
-        validator = HtanValidator(project.atlas_id, project.meta_list)
+        meta_map = MetaMap()
+        for meta_file in project.meta_list:
+            meta_map.add_meta_file(meta_file)
+
+        # Create the Graph and Completeness Stats
+        graph_creator = GraphCreator(project.atlas_id, meta_map)
+        flat_graph = GraphFlattener(graph_creator.htan_graph)
+        completeness_stats = CompletenessSummary(project.atlas_id, meta_map, flat_graph)
+
+        # Validate
+        validator = HtanValidator(project.atlas_id, meta_map, graph_creator.htan_graph)
+
+        # Create the Heat Maps
+        heatmap_util = HeatMapUtil(project.atlas_id, completeness_stats)
+
+        # Create the Network SIF
+        sif_writer = SifWriter(graph_creator.htan_graph.directed_graph)
+        project.sif = sif_writer.sif
+
+        # Assess Metadata Completeness
+        MetaDataSummary(meta_map.meta_list_sorted)
+
+        # Store for later reference
+        project.meta_map = meta_map
+        project.flat_graph = flat_graph
+        project.completeness_stats = completeness_stats
         project.validation_list = validator.get_validation_list()
-        node_map = validator.get_node_map()
-        edge_list = validator.get_edge_list()
-        graph_util = GraphUtil(node_map, edge_list)
-        project.data_list = graph_util.data_list
-        project.node_map = validator.get_node_map()
-        project.sif_list = graph_util.sif_list
-        assays_2_biospecimens = graph_util.assays_2_biospecimens
-        stats = stats_summary.StatsSummary(
-            project.atlas_id, validator.meta_map, assays_2_biospecimens
-        )
-        project.participant_id_set = stats.participant_id_set
-        project.df_stats_map = stats.df_stats_map
-        project.participant_2_biopsecimens = graph_util.participant_2_biopsecimens
-        project.assays_2_biospecimens = graph_util.assays_2_biospecimens
-        heatmap_util = HeatMapUtil(project)
         project.heatmap_list = heatmap_util.heatmaps
 
     _write_html(p_list)
Expand All @@ -130,7 +153,7 @@ def _write_html(project_list):
_write_index_html(report_writer)
_write_atlas_html(report_writer)
_write_matrix_html(report_writer)
_write_atlas_cytoscape_json_sif(project_list)
_write_atlas_sif(project_list)


def _write_index_html(report_writer):
@@ -178,21 +201,10 @@ def _deploy_with_surge():
     subprocess.run(["surge", "deploy", "http://htan_dashboard.surge.sh/"])
 
 
-def _write_atlas_cytoscape_json_sif(project_list):
+def _write_atlas_sif(project_list):
     for project in project_list:
         out_name = "deploy/%s_network.sif" % project.atlas_id
         output_message("Writing to: %s." % out_name)
-        sif_list = project.sif_list
         fd = open(out_name, "w")
-        for edge in sif_list:
-            fd.write("%s\tconnect\t%s\n" % (edge[0], edge[1]))
-        fd.close()
-
-        out_name = "deploy/%s_nodes.txt" % project.atlas_id
-        output_message("Writing to: %s." % out_name)
-        fd = open(out_name, "w")
-        fd.write("ID\tCATEGORY\n")
-        for key in project.node_map:
-            node = project.node_map[key]
-            fd.write("%s\t%s\n" % (node.sif_id, node.category))
+        fd.write(project.sif)
         fd.close()
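Note: _write_atlas_sif now just dumps the project.sif string that _create_dashboard obtained from SifWriter. The new hdash/graph/sif_writer.py is among the 42 changed files but is not shown in this excerpt; the sketch below is only a guess at its shape, inferred from the two call sites (SifWriter(directed_graph) plus a .sif attribute) and the tab-separated "connect" rows the deleted code used to write by hand.

# Hypothetical sketch only -- not the committed sif_writer.py, which is
# not shown above. Assumed interface: the constructor takes a networkx
# DiGraph; .sif holds one "source<TAB>connect<TAB>target" row per edge.
import networkx as nx


class SifWriter:
    """Render a directed graph in Simple Interaction Format (SIF)."""

    def __init__(self, directed_graph: nx.DiGraph):
        rows = ["%s\tconnect\t%s" % (s, t) for s, t in directed_graph.edges]
        self.sif = "\n".join(rows) + "\n"


g = nx.DiGraph()
g.add_edge("HTA1_1", "HTA1_1_1")
print(SifWriter(g).sif)  # prints: HTA1_1	connect	HTA1_1_1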
25 changes: 0 additions & 25 deletions hdash/graph/graph.py

This file was deleted.

106 changes: 106 additions & 0 deletions hdash/graph/graph_creator.py
@@ -0,0 +1,106 @@
"""Graph Util Class."""

from hdash.synapse.meta_map import MetaMap
from hdash.util.id_util import IdUtil
from hdash.validator.categories import Categories
from hdash.graph.node_data import NodeData
from hdash.graph.htan_graph import HtanGraph


class GraphCreator:
"""
Creates a Graph of Atlas Nodes.
Given a set of MetaFiles, create an HTAN Graph.
This enables us to link patients --> biospecimens --> assays.
"""

def __init__(self, atlas_id, meta_map: MetaMap):
"""Default Constructor."""
self._atlas_id = atlas_id
self._graph = HtanGraph()
self._meta_map = meta_map
self._categories = Categories()
self._id_util = IdUtil()
self.__gather_nodes()
self.__gather_edges()

@property
def htan_graph(self):
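        """Get the constructed HTAN Graph."""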
return self._graph

def __gather_nodes(self):
"""Gather all nodes."""
self.__gather_nodes_by_category(self._categories.DEMOGRAPHICS)
self.__gather_nodes_by_category(self._categories.BIOSPECIMEN)
self.__gather_nodes_by_category(self._categories.SRRS_BIOSPECIMEN)
for category in self._categories.all_assays:
self.__gather_nodes_by_category(category)

def __gather_nodes_by_category(self, category):
"""Gather all Nodes in the Specified Category."""
meta_file_list = self._meta_map.get_meta_file_list(category)
for meta_file in meta_file_list:
df = meta_file.df
primary_id = self._id_util.get_primary_id_column(category)
id_list = df[primary_id].to_list()

# Each Primary ID Gets its Own Node
for current_id in id_list:
current_id = str(current_id)
node_data = NodeData(current_id, meta_file)
self._graph.add_node(node_data)

def __gather_edges(self):
"""Gather all the edges."""
for category in self._categories.all_categories:
self.__gather_edges_by_category(category)

def __gather_edges_by_category(self, category):
meta_file_list = self._meta_map.get_meta_file_list(category)
for meta_file in meta_file_list:
df = meta_file.df
primary_id_col = self._id_util.get_primary_id_column(category)
parent_id_col = self._id_util.get_parent_id_column(category)
adj_id_col = self._id_util.get_adjacent_id_column(category)
if parent_id_col is not None:
self.__gather_child_parent_edges(df, primary_id_col, parent_id_col)
if adj_id_col is not None and adj_id_col in df.columns:
self.__gather_adjacent_edges(df, primary_id_col, adj_id_col)

def __gather_child_parent_edges(self, df, primary_id_col, parent_id_col):
"""Gather Parent Child Edges."""
for index, row in df.iterrows():
primary_id = str(row[primary_id_col])
parent_id_chunk = str(row[parent_id_col])
parent_id_chunk = self.__handle_htapp_special_case(parent_id_chunk, row)

# We can have multiple parents
parent_id_chunk = parent_id_chunk.replace(";", " ").replace(",", " ")
parts = parent_id_chunk.split()
for part in parts:
parent_id = part.strip()
self._graph.add_edge(parent_id, primary_id)

def __handle_htapp_special_case(self, parent_id_chunk, row):
"""Special case handling for HTAPP/DFCI."""
if parent_id_chunk.startswith("Not Applicable"):
try:
parent_id_chunk = str(row[IdUtil.HTAN_PARENT_BIOSPECIMEN_ID])
except KeyError:
parent_id_chunk = "NOT_APPLICABLE"
return parent_id_chunk

def __gather_adjacent_edges(self, df, primary_id_col, adj_id_col):
"""Gather Adjacent Edges."""
for index, row in df.iterrows():
adj_id_chunk = str(row[adj_id_col])
primary_id = str(row[primary_id_col])

# We can have multiple adjacent nodes
if adj_id_chunk != "nan":
adj_id_chunk = adj_id_chunk.replace(";", " ").replace(",", " ")
parts = adj_id_chunk.split()
for part in parts:
adjacent_id = part.strip()
self._graph.add_adjacency_edge(primary_id, adjacent_id)
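A detail worth noting in __gather_child_parent_edges: a single parent cell can list several IDs separated by semicolons or commas, and both separators are normalized to whitespace before splitting, so every parent contributes its own edge. A standalone run of that splitting rule, using made-up IDs:

# The multi-parent splitting rule from __gather_child_parent_edges,
# applied to a made-up cell value:
parent_id_chunk = "HTA1_1_1; HTA1_1_2,HTA1_1_3"
parent_id_chunk = parent_id_chunk.replace(";", " ").replace(",", " ")
parents = [part.strip() for part in parent_id_chunk.split()]
assert parents == ["HTA1_1_1", "HTA1_1_2", "HTA1_1_3"]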
80 changes: 80 additions & 0 deletions hdash/graph/graph_flattener.py
@@ -0,0 +1,80 @@
"""Graph Flattener Class."""
import networkx as nx
from natsort import natsorted
from hdash.graph.htan_graph import HtanGraph
from hdash.validator.categories import Categories
from hdash.graph.key import KeyUtil


class GraphFlattener:
"""Graph Flattener Class.
Given an HTAN Graph, flatten it so that we can map:
1) Participant --> All Derived Biospecimens.
2) Biospecimen --> All Derived Assays.
"""

def __init__(self, htan_graph: HtanGraph):
"""Default Constructor."""
self.htan_graph = htan_graph
self.directed_graph = htan_graph.directed_graph
self.categories = Categories()
self.participant_2_biopsecimens = {}
self.biospecimen_2_assays = {}
self.assay_map = set()
self.__bin_nodes()
self.__gather_downstream_biospecimens()
self.__gather_downstream_assays()

def biospecimen_has_assay(self, biospecimen_id, category):
"""Determine if the specified biospecimen has the specified assay."""
key = KeyUtil.create_key(biospecimen_id, category)
return key in self.assay_map

def __bin_nodes(self):
"""Bin Participants and Biospecimens."""
self.participant_id_set = set()
self.biospecimen_id_set = set()
for node_id in self.directed_graph.nodes:
data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY]
category = data.meta_file.category
if category == self.categories.DEMOGRAPHICS:
self.participant_id_set.add(node_id)
elif category in self.categories.biospecimen_list:
self.biospecimen_id_set.add(node_id)

# Sort the Participants
self.participant_id_set = natsorted(self.participant_id_set)

def __gather_downstream_biospecimens(self):
"""Given a Participant, find *all* Downstream Biospecimens."""
for participant_id in self.participant_id_set:
nodes = nx.descendants(self.directed_graph, participant_id)

            # Filter Descendants for Biospecimens Only
filtered_list = self.__filter_nodes(nodes, self.categories.biospecimen_list)
self.participant_2_biopsecimens[participant_id] = filtered_list

def __gather_downstream_assays(self):
"""Given a Biospecimen, find *all* Downstream Assays."""
for biospecimen_id in self.biospecimen_id_set:
nodes = nx.descendants(self.directed_graph, biospecimen_id)

            # Filter Descendants for Assays Only
filtered_list = self.__filter_nodes(nodes, self.categories.all_assays)
self.biospecimen_2_assays[biospecimen_id] = filtered_list

# Add to assay map for easy look-up
for node_id in filtered_list:
data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY]
key = KeyUtil.create_key(biospecimen_id, data.meta_file.category)
self.assay_map.add(key)

def __filter_nodes(self, nodes, target_categories):
"""Filter Node List to Only those in the Target Categories."""
filtered_list = []
for node_id in nodes:
data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY]
if data.meta_file.category in target_categories:
filtered_list.append(node_id)
return filtered_list
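Both mappings lean on nx.descendants, which collects every node reachable downstream rather than just direct children; that is what lets an assay performed on a derived biospecimen still count toward the original participant. A minimal illustration with made-up node IDs (the real graph stores NodeData objects under HtanGraph.DATA_KEY):

# Minimal illustration of the flattening idea; node IDs are made up.
import networkx as nx

g = nx.DiGraph()
g.add_edge("participant_1", "biospecimen_1")  # demographics -> biospecimen
g.add_edge("biospecimen_1", "biospecimen_2")  # derived biospecimen
g.add_edge("biospecimen_2", "assay_1")        # assay on the derived sample

# Every node reachable from the participant, at any depth:
assert nx.descendants(g, "participant_1") == {
    "biospecimen_1",
    "biospecimen_2",
    "assay_1",
}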