diff --git a/Makefile b/Makefile index 65bc7a1..ebb7329 100644 --- a/Makefile +++ b/Makefile @@ -21,11 +21,8 @@ format: $(FORMATTER) $(PROJ_SLUG) $(FORMATTER) tests -qtest: prepare - py.test -s tests/ - test: prepare - py.test -s --cov-report term --cov=$(PROJ_SLUG) tests/ + py.test -v tests/ coverage: py.test --cov-report html --cov=$(PROJ_SLUG) tests/ diff --git a/hdash/cli.py b/hdash/cli.py index 2597af5..0ac1d53 100644 --- a/hdash/cli.py +++ b/hdash/cli.py @@ -1,5 +1,5 @@ """Command Line Interface (CLI) for generating HTAN Dashboard.""" -from hdash.graph.graph_util import GraphUtil +from hdash.graph.graph_flattener import GraphFlattener import logging import emoji import click @@ -9,7 +9,6 @@ import pandas as pd from datetime import datetime -from hdash.stats import stats_summary from hdash.synapse.synapse_util import SynapseUtil from hdash.google.gsheet_util import GoogleSheetUtil from hdash.util.heatmap_util import HeatMapUtil @@ -17,6 +16,14 @@ from hdash.synapse.table_util import TableUtil from hdash.validator.htan_validator import HtanValidator from synapseclient.core.exceptions import SynapseHTTPError +from hdash.synapse.meta_map import MetaMap +from hdash.graph.graph_flattener import GraphFlattener +from hdash.stats.completeness_summary import CompletenessSummary +from hdash.graph.graph_creator import GraphCreator +from hdash.graph.sif_writer import SifWriter +from hdash.stats.meta_summary import MetaDataSummary + + # Local Project Table, Used when google option is not enabled. MASTER_PROJECT_TABLE = "config/htan_projects.csv" @@ -32,7 +39,12 @@ def cli(verbose): log_level = logging.INFO log_file_name = "hdash.log" print(f"Logging to: {log_file_name}.") - logging.basicConfig(filename=log_file_name, filemode='w', level=log_level, format="%(levelname)s:%(message)s") + logging.basicConfig( + filename=log_file_name, + filemode="w", + level=log_level, + format="%(levelname)s:%(message)s", + ) else: logging.basicConfig(level=log_level, format="%(levelname)s:%(message)s") @@ -91,25 +103,36 @@ def _create_dashboard(use_cache, surge, google): output_message("Could not retrieve: %s" % meta_file.id) for project in p_list: + # Create the Meta Map for meta_file in project.meta_list: table_util.annotate_meta_file(meta_file) - validator = HtanValidator(project.atlas_id, project.meta_list) + meta_map = MetaMap() + for meta_file in project.meta_list: + meta_map.add_meta_file(meta_file) + + # Create the Graph and Completeness Stats + graph_creator = GraphCreator(project.atlas_id, meta_map) + flat_graph = GraphFlattener(graph_creator.htan_graph) + completeness_stats = CompletenessSummary(project.atlas_id, meta_map, flat_graph) + + # Validate + validator = HtanValidator(project.atlas_id, meta_map, graph_creator.htan_graph) + + # Create the Heat Maps + heatmap_util = HeatMapUtil(project.atlas_id, completeness_stats) + + # Create the Network SIF + sif_writer = SifWriter(graph_creator.htan_graph.directed_graph) + project.sif = sif_writer.sif + + # Assess Metadata Completeness + MetaDataSummary(meta_map.meta_list_sorted) + + # Store for later reference + project.meta_map = meta_map + project.flat_graph = flat_graph + project.completeness_stats = completeness_stats project.validation_list = validator.get_validation_list() - node_map = validator.get_node_map() - edge_list = validator.get_edge_list() - graph_util = GraphUtil(node_map, edge_list) - project.data_list = graph_util.data_list - project.node_map = validator.get_node_map() - project.sif_list = graph_util.sif_list - assays_2_biospecimens = graph_util.assays_2_biospecimens - stats = stats_summary.StatsSummary( - project.atlas_id, validator.meta_map, assays_2_biospecimens - ) - project.participant_id_set = stats.participant_id_set - project.df_stats_map = stats.df_stats_map - project.participant_2_biopsecimens = graph_util.participant_2_biopsecimens - project.assays_2_biospecimens = graph_util.assays_2_biospecimens - heatmap_util = HeatMapUtil(project) project.heatmap_list = heatmap_util.heatmaps _write_html(p_list) @@ -130,7 +153,7 @@ def _write_html(project_list): _write_index_html(report_writer) _write_atlas_html(report_writer) _write_matrix_html(report_writer) - _write_atlas_cytoscape_json_sif(project_list) + _write_atlas_sif(project_list) def _write_index_html(report_writer): @@ -178,21 +201,10 @@ def _deploy_with_surge(): subprocess.run(["surge", "deploy", "http://htan_dashboard.surge.sh/"]) -def _write_atlas_cytoscape_json_sif(project_list): +def _write_atlas_sif(project_list): for project in project_list: out_name = "deploy/%s_network.sif" % project.atlas_id output_message("Writing to: %s." % out_name) - sif_list = project.sif_list - fd = open(out_name, "w") - for edge in sif_list: - fd.write("%s\tconnect\t%s\n" % (edge[0], edge[1])) - fd.close() - - out_name = "deploy/%s_nodes.txt" % project.atlas_id - output_message("Writing to: %s." % out_name) fd = open(out_name, "w") - fd.write("ID\tCATEGORY\n") - for key in project.node_map: - node = project.node_map[key] - fd.write("%s\t%s\n" % (node.sif_id, node.category)) + fd.write(project.sif) fd.close() diff --git a/hdash/graph/graph.py b/hdash/graph/graph.py deleted file mode 100644 index 8870cf6..0000000 --- a/hdash/graph/graph.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Graph Objects.""" - - -class Node: - """Node in Graph.""" - - id: str - sif_id: str - label: str - category: str - - def __repr__(self): - """Get node summary.""" - return "Node: %s: %s: %s" % (self.id, self.label, self.category) - - -class Edge: - """Edge in Graph.""" - - source_id: str - target_id: str - - def __repr__(self): - """Get edge summary.""" - return "Edge: %s --> %s" % (self.source_id, self.target_id) diff --git a/hdash/graph/graph_creator.py b/hdash/graph/graph_creator.py new file mode 100644 index 0000000..ce7eec1 --- /dev/null +++ b/hdash/graph/graph_creator.py @@ -0,0 +1,106 @@ +"""Graph Util Class.""" + +from hdash.synapse.meta_map import MetaMap +from hdash.util.id_util import IdUtil +from hdash.validator.categories import Categories +from hdash.graph.node_data import NodeData +from hdash.graph.htan_graph import HtanGraph + + +class GraphCreator: + """ + Creates a Graph of Atlas Nodes. + + Given a set of MetaFiles, create an HTAN Graph. + This enables us to link patients --> biospecimens --> assays. + """ + + def __init__(self, atlas_id, meta_map: MetaMap): + """Default Constructor.""" + self._atlas_id = atlas_id + self._graph = HtanGraph() + self._meta_map = meta_map + self._categories = Categories() + self._id_util = IdUtil() + self.__gather_nodes() + self.__gather_edges() + + @property + def htan_graph(self): + return self._graph + + def __gather_nodes(self): + """Gather all nodes.""" + self.__gather_nodes_by_category(self._categories.DEMOGRAPHICS) + self.__gather_nodes_by_category(self._categories.BIOSPECIMEN) + self.__gather_nodes_by_category(self._categories.SRRS_BIOSPECIMEN) + for category in self._categories.all_assays: + self.__gather_nodes_by_category(category) + + def __gather_nodes_by_category(self, category): + """Gather all Nodes in the Specified Category.""" + meta_file_list = self._meta_map.get_meta_file_list(category) + for meta_file in meta_file_list: + df = meta_file.df + primary_id = self._id_util.get_primary_id_column(category) + id_list = df[primary_id].to_list() + + # Each Primary ID Gets its Own Node + for current_id in id_list: + current_id = str(current_id) + node_data = NodeData(current_id, meta_file) + self._graph.add_node(node_data) + + def __gather_edges(self): + """Gather all the edges.""" + for category in self._categories.all_categories: + self.__gather_edges_by_category(category) + + def __gather_edges_by_category(self, category): + meta_file_list = self._meta_map.get_meta_file_list(category) + for meta_file in meta_file_list: + df = meta_file.df + primary_id_col = self._id_util.get_primary_id_column(category) + parent_id_col = self._id_util.get_parent_id_column(category) + adj_id_col = self._id_util.get_adjacent_id_column(category) + if parent_id_col is not None: + self.__gather_child_parent_edges(df, primary_id_col, parent_id_col) + if adj_id_col is not None and adj_id_col in df.columns: + self.__gather_adjacent_edges(df, primary_id_col, adj_id_col) + + def __gather_child_parent_edges(self, df, primary_id_col, parent_id_col): + """Gather Parent Child Edges.""" + for index, row in df.iterrows(): + primary_id = str(row[primary_id_col]) + parent_id_chunk = str(row[parent_id_col]) + parent_id_chunk = self.__handle_htapp_special_case(parent_id_chunk, row) + + # We can have multiple parents + parent_id_chunk = parent_id_chunk.replace(";", " ").replace(",", " ") + parts = parent_id_chunk.split() + for part in parts: + parent_id = part.strip() + self._graph.add_edge(parent_id, primary_id) + + def __handle_htapp_special_case(self, parent_id_chunk, row): + """Special case handling for HTAPP/DFCI.""" + if parent_id_chunk.startswith("Not Applicable"): + try: + parent_id_chunk = str(row[IdUtil.HTAN_PARENT_BIOSPECIMEN_ID]) + except KeyError: + parent_id_chunk = "NOT_APPLICABLE" + return parent_id_chunk + + def __gather_adjacent_edges(self, df, primary_id_col, adj_id_col): + """Gather Adjacent Edges.""" + for index, row in df.iterrows(): + adj_id_chunk = str(row[adj_id_col]) + primary_id = str(row[primary_id_col]) + + # We can have multiple adjacent nodes + if adj_id_chunk != "nan": + adj_id_chunk = adj_id_chunk.replace(";", " ").replace(",", " ") + parts = adj_id_chunk.split() + for part in parts: + adjacent_id = part.strip() + self._graph.add_adjacency_edge(primary_id, adjacent_id) diff --git a/hdash/graph/graph_flattener.py b/hdash/graph/graph_flattener.py new file mode 100644 index 0000000..2c15609 --- /dev/null +++ b/hdash/graph/graph_flattener.py @@ -0,0 +1,80 @@ +"""Graph Flattener Class.""" +import networkx as nx +from natsort import natsorted +from hdash.graph.htan_graph import HtanGraph +from hdash.validator.categories import Categories +from hdash.graph.key import KeyUtil + + +class GraphFlattener: + """Graph Flattener Class. + + Given an HTAN Graph, flatten it so that we can map: + 1) Participant --> All Derived Biospecimens. + 2) Biospecimen --> All Derived Assays. + """ + + def __init__(self, htan_graph: HtanGraph): + """Default Constructor.""" + self.htan_graph = htan_graph + self.directed_graph = htan_graph.directed_graph + self.categories = Categories() + self.participant_2_biopsecimens = {} + self.biospecimen_2_assays = {} + self.assay_map = set() + self.__bin_nodes() + self.__gather_downstream_biospecimens() + self.__gather_downstream_assays() + + def biospecimen_has_assay(self, biospecimen_id, category): + """Determine if the specified biospecimen has the specified assay.""" + key = KeyUtil.create_key(biospecimen_id, category) + return key in self.assay_map + + def __bin_nodes(self): + """Bin Participants and Biospecimens.""" + self.participant_id_set = set() + self.biospecimen_id_set = set() + for node_id in self.directed_graph.nodes: + data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY] + category = data.meta_file.category + if category == self.categories.DEMOGRAPHICS: + self.participant_id_set.add(node_id) + elif category in self.categories.biospecimen_list: + self.biospecimen_id_set.add(node_id) + + # Sort the Participants + self.participant_id_set = natsorted(self.participant_id_set) + + def __gather_downstream_biospecimens(self): + """Given a Participant, find *all* Downstream Biospecimens.""" + for participant_id in self.participant_id_set: + nodes = nx.descendants(self.directed_graph, participant_id) + + # Filter Descendents for Biospecimens Only + filtered_list = self.__filter_nodes(nodes, self.categories.biospecimen_list) + self.participant_2_biopsecimens[participant_id] = filtered_list + + def __gather_downstream_assays(self): + """Given a Biospecimen, find *all* Downstream Assays.""" + for biospecimen_id in self.biospecimen_id_set: + nodes = nx.descendants(self.directed_graph, biospecimen_id) + + # Filter Descendents for Assays Only + filtered_list = self.__filter_nodes(nodes, self.categories.all_assays) + self.biospecimen_2_assays[biospecimen_id] = filtered_list + + # Add to assay map for easy look-up + for node_id in filtered_list: + data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY] + key = KeyUtil.create_key(biospecimen_id, data.meta_file.category) + self.assay_map.add(key) + + def __filter_nodes(self, nodes, target_categories): + """Filter Node List to Only those in the Target Categories.""" + filtered_list = [] + for node_id in nodes: + data = self.directed_graph.nodes[node_id][HtanGraph.DATA_KEY] + if data.meta_file.category in target_categories: + filtered_list.append(node_id) + return filtered_list diff --git a/hdash/graph/graph_util.py b/hdash/graph/graph_util.py deleted file mode 100644 index 3d71bec..0000000 --- a/hdash/graph/graph_util.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Graph Util Class.""" -import networkx as nx -from hdash.validator.categories import Categories - - -class GraphUtil: - """Graph Utilities Class.""" - - def __init__(self, node_map, edge_list): - """Construct a new Graph Utility Class.""" - self.node_map = node_map - self.edge_list = edge_list - self.sif_list = [] - self.categories = Categories() - self.participant_id_set = set() - self.biospecimen_id_set = set() - self.participant_2_biopsecimens = {} - self.assays_2_biospecimens = {} - abbrev_map = self.categories.abbrev_category_map - - self.data_list = [] - for node_id in node_map: - node = node_map[node_id] - node.sif_id = abbrev_map[node.category] + "_" + node.label - current_node = { - "id": node.id, - "label": node.sif_id, - "category": node.category, - } - node_dict = {"data": current_node} - self.data_list.append(node_dict) - - edge_id = 0 - for edge in edge_list: - current_edge = { - "id": "e" + str(edge_id), - "source": edge.source_id, - "target": edge.target_id, - } - node_dict = {"data": current_edge} - self.data_list.append(node_dict) - - s_node = self.node_map[edge.source_id] - t_node = self.node_map[edge.target_id] - self.sif_list.append([s_node.sif_id, t_node.sif_id]) - edge_id += 1 - self.__init_networkx() - self.__gather_participants_biospecimens() - self.__gather_downstream_assays() - - def __init_networkx(self): - self.graph = nx.DiGraph() - for node_id, node in self.node_map.items(): - self.graph.add_node(node_id) - - for edge in self.edge_list: - self.graph.add_edge(edge.source_id, edge.target_id) - - def __gather_participants_biospecimens(self): - """Gather Participant IDs and all Downstream Biospecimen IDs.""" - for node_id, node in self.node_map.items(): - if node.category == self.categories.DEMOGRAPHICS: - self.participant_id_set.add(node_id) - for participant_id in self.participant_id_set: - self.__downstream_nodes = [] - self.__walk_non_assay_node(self.graph, participant_id) - - downstream_biospecimens = [] - for downstream_node in self.__downstream_nodes: - downstream_biospecimens.append(downstream_node) - - for biospecimen_id in downstream_biospecimens: - self.biospecimen_id_set.add(biospecimen_id) - self.participant_2_biopsecimens[participant_id] = downstream_biospecimens - - def __gather_downstream_assays(self): - """For each biospecimen, gather all downstream assays.""" - for biospecimen_id in self.biospecimen_id_set: - self.__downstream_nodes = [] - self.__walk_node(self.graph, biospecimen_id) - for downstream_id in self.__downstream_nodes: - assay_id = self.node_map[downstream_id].id - self.assays_2_biospecimens[assay_id] = biospecimen_id - - def __walk_node(self, graph, node_id): - """Walk the graph and gather all downstream nodes.""" - successors = graph.successors(node_id) - for successor_id in successors: - node = self.node_map[successor_id] - # if this is not a biospecimen, keeping walking - if not node.sif_id.startswith("B"): - self.__downstream_nodes.append(successor_id) - self.__walk_node(graph, successor_id) - - def __walk_non_assay_node(self, graph, node_id): - """Walk the graph and gather all downstream biospecimens.""" - successors = graph.successors(node_id) - for successor_id in successors: - node = self.node_map[successor_id] - # if this is a biospecimen, keeping walking - if node.sif_id.startswith("B"): - self.__downstream_nodes.append(successor_id) - self.__walk_non_assay_node(graph, successor_id) diff --git a/hdash/graph/htan_graph.py b/hdash/graph/htan_graph.py new file mode 100644 index 0000000..845357f --- /dev/null +++ b/hdash/graph/htan_graph.py @@ -0,0 +1,48 @@ +"""HTAN Graph.""" +import networkx as nx +from hdash.graph.node_data import NodeData +from hdash.validator.categories import Categories + + +class HtanGraph: + """HTAN Graph. + + This class leverages the networkx Python package. + """ + + DATA_KEY = "data" + + def __init__(self): + """Default Constructor.""" + self.directed_graph = nx.DiGraph() + self.edge_list = [] + self.adjacent_list = [] + self.participant_list = set() + self.biospecimen_list = set() + self._categories = Categories() + + def add_node(self, node_data: NodeData): + """Add node to the graph.""" + self.directed_graph.add_node(node_data.id) + self.directed_graph.nodes[node_data.id][self.DATA_KEY] = node_data + + def add_edge(self, source_id, target_id): + """Add edge to the graph.""" + # Only Add the Edge if both nodes exist + if target_id in self.directed_graph.nodes \ + and source_id in self.directed_graph.nodes: + self.directed_graph.add_edge(source_id, target_id) + + # Store all the original edges for later validation + self.edge_list.append([source_id, target_id]) + + def add_adjacency_edge(self, source_id, target_id): + """Add adjacency edge to its own list.""" + self.adjacent_list.append([source_id, target_id]) + + def __repr__(self): + """Get edge summary.""" + return "HTAN Graph [%s nodes, %s edges]" % ( + len(self.directed_graph.nodes), + len(self.directed_graph.edges), + ) diff --git a/hdash/graph/key.py b/hdash/graph/key.py new file mode 100644 index 0000000..b9e4fe7 --- /dev/null +++ b/hdash/graph/key.py @@ -0,0 +1,10 @@ +"""Key Util.""" + + +class KeyUtil: + """Key Utilities Class.""" + + @staticmethod + def create_key(primary_id, category): + """Create key of primary_id + category.""" + return f"{primary_id}__{category}" diff --git a/hdash/graph/node_data.py b/hdash/graph/node_data.py new file mode 100644 index 0000000..0c94977 --- /dev/null +++ b/hdash/graph/node_data.py @@ -0,0 +1,25 @@ +"""Node Data.""" +from hdash.synapse.meta_file import MetaFile +from hdash.validator.categories import Categories + + +class NodeData: + """Node Data.""" + + id: str + sif_id: str + meta_file: MetaFile + + def __init__(self, id: str, meta_file: MetaFile): + """Default Constructor""" + self.id = id + self.meta_file = meta_file + self.abbrev_map = Categories().abbrev_category_map + + @property + def sif_id(self): + return self.abbrev_map[self.meta_file.category] + "_" + self.id + + def __repr__(self): + """Get node summary.""" + return "Node: %s: %s" % (self.id, self.meta_file.category) diff --git a/hdash/graph/sif_writer.py b/hdash/graph/sif_writer.py new file mode 100644 index 0000000..ba2f75e --- /dev/null +++ b/hdash/graph/sif_writer.py @@ -0,0 +1,23 @@ +"""SIF Writer.""" +import networkx as nx +from hdash.graph.htan_graph import HtanGraph + + +class SifWriter: + """SIF Network Writer for Cytoscape.""" + + def __init__(self, graph: nx.DiGraph): + self.graph = graph + self.sif = "" + self._create_sif() + + + def _create_sif(self): + """Create the SIF Network.""" + edge_list = self.graph.edges + for edge in edge_list: + node0_id = edge[0] + node1_id = edge[1] + sif_id0 = self.graph.nodes[node0_id][HtanGraph.DATA_KEY].sif_id + sif_id1 = self.graph.nodes[node1_id][HtanGraph.DATA_KEY].sif_id + self.sif += f"{sif_id0}\tconnect\t{sif_id1}\n" diff --git a/hdash/stats/completeness_summary.py b/hdash/stats/completeness_summary.py new file mode 100644 index 0000000..7875d86 --- /dev/null +++ b/hdash/stats/completeness_summary.py @@ -0,0 +1,51 @@ +from hdash.synapse.meta_map import MetaMap +from hdash.graph.graph_flattener import GraphFlattener +from hdash.graph.key import KeyUtil +from hdash.util.id_util import IdUtil +from hdash.validator.categories import Categories + + +class CompletenessSummary: + """ + Assesses how complete the data set is. + + For example: + - given patient 1, do we have metadata for diagnosis, therapy, etc? + - given biospecimen 1, do we have data for all levels of RNASeq? + """ + + def __init__(self, atlas_id, meta_map: MetaMap, graph_flat: GraphFlattener): + """Default Constructor.""" + self.atlas_id = atlas_id + self.meta_map = meta_map + self.graph_flat = graph_flat + self.completeness_map = set() + self.id_util = IdUtil() + self.categories = Categories() + + # Assess Clinical Categories + for clinical_category in self.categories.all_clinical: + self.__walk_clinical_category(clinical_category) + + def has_data(self, primary_id, category): + """Determine if the specified ID has data of the specified category.""" + key = KeyUtil.create_key(primary_id, category) + if category in self.categories.all_clinical: + return key in self.completeness_map + else: + return self.graph_flat.biospecimen_has_assay(primary_id, category) + + def __walk_clinical_category(self, category): + """Walk through specified clinical category.""" + meta_file_list = self.meta_map.get_meta_file_list(category) + for meta_file in meta_file_list: + self.__inspect_clinical_df(category, meta_file.df) + + def __inspect_clinical_df(self, category, df): + """Inspect Data Frame for Primary IDs.""" + for index, row in df.iterrows(): + primary_id_column = self.id_util.get_primary_id_column(category) + primary_id = row[primary_id_column] + + key = KeyUtil.create_key(primary_id, category) + self.completeness_map.add(key) diff --git a/hdash/stats/meta_summary.py b/hdash/stats/meta_summary.py new file mode 100644 index 0000000..b2f996f --- /dev/null +++ b/hdash/stats/meta_summary.py @@ -0,0 +1,38 @@ +"""Summary Stats across all metadata files.""" +from hdash.util.id_util import IdUtil +from hdash.validator.categories import Categories +from typing import List +from hdash.synapse.meta_file import MetaFile + + +class MetaDataSummary: + """Summary Stats across all metadata files.""" + + NA_VALUES = ["na", "nan", "unknown", "not applicable", "not reported"] + IGNORED_FIELDS = [ + IdUtil.HTAN_PARTICIPANT_ID, + IdUtil.HTAN_PARENT_ID, + IdUtil.HTAN_BIOSPECIMEN_ID, + Categories.COMPONENT_COL, + Categories.ENTITY_ID_COL, + ] + + def __init__(self, meta_list: List[MetaFile]): + """Default Constructor.""" + self.categories = Categories() + for meta_file in meta_list: + percent_complete = self._calculate_percent_complete(meta_file.df) + meta_file.percent_meta_data_complete = percent_complete + + def _calculate_percent_complete(self, df): + """Inspect Data Frame for completed/missing fields.""" + num_fields = 0 + num_completed_fields = 0 + for index, row in df.iterrows(): + for field_name, field_value in row.items(): + if field_name not in self.IGNORED_FIELDS: + num_fields += 1 + field_value = str(field_value).lower() + if field_value not in self.NA_VALUES: + num_completed_fields += 1 + return num_completed_fields / num_fields diff --git a/hdash/stats/stats_summary.py b/hdash/stats/stats_summary.py deleted file mode 100644 index 2a03710..0000000 --- a/hdash/stats/stats_summary.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Summary Stats Across all Data Categories.""" -from hdash.validator.id_util import IdUtil -from hdash.validator.categories import Categories -from hdash.synapse.meta_map import MetaMap -from natsort import natsorted - - -class StatsSummary: - """Summary Stats across all data categories.""" - - NA_VALUES = ["na", "nan", "unknown", "not applicable", "not reported"] - IGNORED_FIELDS = [ - IdUtil.HTAN_PARTICIPANT_ID, - IdUtil.HTAN_PARENT_ID, - IdUtil.HTAN_BIOSPECIMEN_ID, - Categories.COMPONENT_COL, - Categories.ENTITY_ID_COL, - ] - - def __init__(self, atlas_id, meta_map: MetaMap, assays_2_biospecimens): - """Init with Atlas ID and Map of all DataFrames.""" - self.atlas_id = atlas_id - self.meta_map = meta_map - self.assays_2_biospecimens = assays_2_biospecimens - self.participant_id_set = set() - self.df_stats_map = {} - self.categories = Categories() - self.num_fields = {} - self.num_completed_fields = {} - self.id_util = IdUtil() - self.__walk_clinical_data_categories() - self.participant_id_set = natsorted(self.participant_id_set) - self.__walk_category(self.categories.BIOSPECIMEN) - self.__walk_assay_categories() - - def __walk_clinical_data_categories(self): - """Walk through all the clinical categories.""" - for clinical_category in self.categories.all_clinical: - self.__walk_category(clinical_category) - - def __walk_assay_categories(self): - """Walk through all the assay categories.""" - for assay_category in self.categories.all_assays: - self.__walk_category(assay_category) - - def __walk_category(self, category): - """Walk through specified data category.""" - meta_file_list = self.meta_map.get_meta_file_list(category) - for meta_file in meta_file_list: - self.inspect_df(category, meta_file.df) - - def inspect_df(self, category, df): - """Inspect Data Frame for completed/missing fields.""" - for index, row in df.iterrows(): - primary_id_column = self.id_util.get_primary_id_column(category) - primary_id = row[primary_id_column] - - # Inspect Primary ID: If this is an assay type, we need - # to link to the root biopsecimen. - if primary_id_column == IdUtil.HTAN_PARTICIPANT_ID: - self.participant_id_set.add(primary_id) - elif primary_id_column == IdUtil.HTAN_DATA_FILE_ID: - primary_id = self.assays_2_biospecimens.get(primary_id, "NA") - - # Iterate through all the fields - # and count number of fields, and number of completed fields - key = primary_id + ":" + category - for field_name, field_value in row.items(): - if field_name not in StatsSummary.IGNORED_FIELDS: - self.__increment_num_fields(key) - field_value = str(field_value).lower() - if field_value not in StatsSummary.NA_VALUES: - self.__increment_num_completed_fields(key) - - percent_complete = self.__calculate_percent_complete_fields(key) - self.df_stats_map[key] = percent_complete - - def __increment_num_fields(self, key): - if key in self.num_fields: - self.num_fields[key] = self.num_fields[key] + 1 - else: - self.num_fields[key] = 1 - - def __increment_num_completed_fields(self, key): - if key in self.num_completed_fields: - self.num_completed_fields[key] = self.num_completed_fields[key] + 1 - else: - self.num_completed_fields[key] = 1 - - def __calculate_percent_complete_fields(self, key): - num_complete_fields = self.num_completed_fields.get(key, 0) - num_fields = self.num_fields.get(key, 1) - return num_complete_fields / num_fields diff --git a/hdash/synapse/file_counter.py b/hdash/synapse/file_counter.py index 84fbaec..569aab7 100644 --- a/hdash/synapse/file_counter.py +++ b/hdash/synapse/file_counter.py @@ -61,7 +61,9 @@ def _walk_files(self): try: file_type = self.file_type_map[file_extension] except KeyError: - logging.warning("Unrecognized File Extension: %s [%s]" % (file_extension, path)) + logging.warning( + "Unrecognized File Extension: %s [%s]" % (file_extension, path) + ) file_type = FileCounter.OTHER file_type_list.append(file_type) diff --git a/hdash/synapse/htan_project.py b/hdash/synapse/htan_project.py index 2c7a87f..41e902e 100644 --- a/hdash/synapse/htan_project.py +++ b/hdash/synapse/htan_project.py @@ -26,13 +26,11 @@ def __init__(self): self.meta_list = [] self.validation_list = [] self.num_errors = 0 - self.data_list = [] - self.node_map = [] - self.sif_list = [] - self.participant_id_set = [] - self.participant_2_biopsecimens = {} - self.assays_2_biospecimens = {} - self.df_stats_map = {} + self.meta_map = None + self.htan_graph = None + self.flat_graph = None + self.sif = None + self.completeness_stats = None self.heatmap_list = [] def get_total_file_size(self): @@ -53,6 +51,3 @@ def get_total_fize_size_human_readable(self): def __repr__(self): """Return summary of object.""" return "%s: %s" % (self.id, self.name) - - - diff --git a/hdash/synapse/meta_file.py b/hdash/synapse/meta_file.py index 7223a11..d964c17 100644 --- a/hdash/synapse/meta_file.py +++ b/hdash/synapse/meta_file.py @@ -11,7 +11,8 @@ def __init__(self): self.category = None self.num_items = 0 self.df = None + self.percent_meta_data_complete = 0 def __repr__(self): """Return summary of object.""" - return "%s: %s [%d items]" % (self.id, self.category, self.num_items) \ No newline at end of file + return "%s: %s [%d items]" % (self.id, self.category, self.num_items) diff --git a/hdash/synapse/meta_map.py b/hdash/synapse/meta_map.py index 5a0bb17..b8658cb 100644 --- a/hdash/synapse/meta_map.py +++ b/hdash/synapse/meta_map.py @@ -12,9 +12,11 @@ class MetaMap: def __init__(self): """Construct new MetaMap File.""" self.map = {} + self.meta_list_sorted: List[MetaMap] = [] - def add_meta_file(self, meta_file: MetaFile): + def add_meta_file(self, meta_file: MetaFile): """Add new Meta File to the Map.""" + self.meta_list_sorted.append(meta_file) category = meta_file.category if category in self.map: meta_list = self.map[category] diff --git a/hdash/templates/atlas.html b/hdash/templates/atlas.html index 637295c..6267584 100644 --- a/hdash/templates/atlas.html +++ b/hdash/templates/atlas.html @@ -33,14 +33,13 @@
- -