Merge pull request #27 from hubmapconsortium/icaoberg-metadata-fix

Metadata fixes
hubmapconsortium · Feb 3, 2025 · 4a4c9b8 · 4a4c9b8
2 parents 8bfae09 + 3f309e7
commit 4a4c9b8
Show file tree

Hide file tree

Showing 5 changed files with 167 additions and 40 deletions.
diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,9 @@ ENV/
 *.log
 *.swp
 .DS_Store
+notebooks/phs*
+notebooks/*zip
+notebooks/daily*
+notebooks/.datasets/
+notebooks/.entity/
+notebooks/.provenance/
diff --git a/README.md b/README.md
@@ -1,3 +1,10 @@
 <img src="images/logo.png" />
 
-Python package that creates a submission for dbGaP.
+This Python package automates the preparation and validation of data from the Human BioMolecular Atlas Program (HuBMAP) for submission to the database of Genotypes and Phenotypes (dbGaP). It formats metadata and data into the required structure to comply with dbGaP's submission guidelines for genomic and phenotype data.
+
+For more information, visit [HuBMAP](https://www.hubmapconsortium.org/) and [dbGaP](https://www.ncbi.nlm.nih.gov/gap).
+
+---
+Copyright © 2020-2025 Pittsburgh Supercomputing Center. All Rights Reserved.
+
+Developed by the [Biomedical Applications Group](https://www.psc.edu/biomedical-applications/) at the [Pittsburgh Supercomputing Center](http://www.psc.edu) in the [Mellon College of Science](https://www.cmu.edu/mcs/) at [Carnegie Mellon University](http://www.cmu.edu).
diff --git a/hubmapdbgap/create.py b/hubmapdbgap/create.py
@@ -176,9 +176,7 @@ def submission(
 
         # THE METADATA
         try:
-            library_id = (
-                f'{metadata["ingest_metadata"]["metadata"]["library_id"]}-{hubmap_id}'
-            )
+            library_id = f'{metadata["metadata"]["library_id"]}-{hubmap_id}'
         except BaseException:
             library_id = f"lib-{hubmap_id}"
 
@@ -213,52 +211,46 @@ def submission(
             "DNA + RNA": "OTHER",
         }
 
-        library_source = analyte_class[
-            metadata["ingest_metadata"]["metadata"]["analyte_class"]
-        ]
+        library_source = analyte_class[metadata["metadata"]["analyte_class"]]
 
         if (
             metadata["dataset_type"][0] == "SNAREseq"
-            and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
+            and metadata["metadata"]["analyte_class"] == "RNA"
         ):
             library_strategy = "RNA-Seq"
         elif (
             metadata["dataset_type"][0] == "SNAREseq"
-            and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "DNA"
+            and metadata["metadata"]["analyte_class"] == "DNA"
         ):
             library_strategy = "ATAC-seq"
         elif (
             metadata["dataset_type"][0] == "sciRNAseq"
-            and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
+            and metadata["metadata"]["analyte_class"] == "RNA"
         ):
             library_strategy = "RNA-Seq"
         elif (
             metadata["dataset_type"][0] == "sciATACseq"
-            and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
+            and metadata["metadata"]["analyte_class"] == "RNA"
         ):
             library_strategy = "RNA-Seq"
         elif (
             metadata["dataset_type"][0] == "sciATACseq"
-            and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "DNA"
+            and metadata["metadata"]["analyte_class"] == "DNA"
         ):
             library_strategy = "ATAC-seq"
         else:
             # library_strategy = library_strategy[metadata["dataset_type"][0]]
             library_strategy = library_strategy[metadata["dataset_type"]]
 
         library_layout = {"paired-end": "paired", "paired end": "paired"}
-        library_layout = library_layout[
-            metadata["ingest_metadata"]["metadata"]["library_layout"]
-        ]
+        library_layout = library_layout[metadata["metadata"]["library_layout"]]
 
         # library_selection
         library_selection = "other"
 
         # platform
         platform = {"Illumina": "ILLUMINA"}
-        platform = platform[
-            metadata["ingest_metadata"]["metadata"]["acquisition_instrument_vendor"]
-        ]
+        platform = platform[metadata["metadata"]["acquisition_instrument_vendor"]]
 
         # instrument_model
         instrument_model = {
@@ -270,37 +262,47 @@ def submission(
             "HiSeq": "Illumina HiSeq 4000",
             "HiSeq 4000": "Illumina HiSeq 4000",
             "Nextseq2000": "NextSeq 2000",
+            "Novaseq6021": "Illumina NovaSeq 6000",
+            "Novaseq6022": "Illumina NovaSeq 6000",
+            "Novaseq6007": "Illumina NovaSeq 6000",
+            "Novaseq6011": "Illumina NovaSeq 6000",
+            "Novaseq6012": "Illumina NovaSeq 6000",
             "Novaseq6020": "Illumina NovaSeq 6000",
             "Novaseq6019": "Illumina NovaSeq 6000",
+            "Novaseq6022": "Illumina NovaSeq 6000",
+            "Novaseq6019": "Illumina NovaSeq 6000",
             "Novaseq6018": "Illumina NovaSeq 6000",
+            "Novaseq6014": "Illumina NovaSeq 6000",
+            "Novaseq6015": "Illumina NovaSeq 6000",
+            "Novaseq6008": "Illumina NovaSeq 6000",
             "Novaseq6016": "Illumina NovaSeq 6000",
             "Novaseq6015": "Illumina NovaSeq 6000",
             "Novaseq6010": "Illumina NovaSeq 6000",
             "Novaseq6006": "Illumina NovaSeq 6000",
+            "Novaseq6018": "Illumina NovaSeq 6000",
+            "Novaseq6017": "Illumina NovaSeq 6000",
             "Novaseq6005": "Illumina NovaSeq 6000",
             "Novaseq6004": "Illumina NovaSeq 6000",
             "Novaseq6003": "Illumina NovaSeq 6000",
             "Novaseq6002": "Illumina NovaSeq 6000",
             "Novaseq6001": "Illumina NovaSeq 6000",
             "Nextseq500-NS500488": "NextSeq 550",
             "NextSeq2000": "NextSeq 2000",
-            "NextSeq550": "NextSeq 550"
+            "NextSeq550": "NextSeq 550",
         }
         instrument_model = instrument_model[
-            metadata["ingest_metadata"]["metadata"]["acquisition_instrument_model"]
+            metadata["metadata"]["acquisition_instrument_model"]
         ]
 
         assay_type = metadata["dataset_type"]
 
-        acquisition_instrument_vendor = metadata["ingest_metadata"]["metadata"][
+        acquisition_instrument_vendor = metadata["metadata"][
             "acquisition_instrument_vendor"
         ]
-        acquisition_instrument_model = metadata["ingest_metadata"]["metadata"][
+        acquisition_instrument_model = metadata["metadata"][
             "acquisition_instrument_model"
         ]
-        sequencing_reagent_kit_raw = metadata["ingest_metadata"]["metadata"][
-            "sequencing_reagent_kit"
-        ]
+        sequencing_reagent_kit_raw = metadata["metadata"]["sequencing_reagent_kit"]
         sequencing_reagent_kit = sequencing_reagent_kit_raw.replace(";", "")
 
         # @icaoberg link is needed to map to a protocol description
@@ -323,14 +325,10 @@ def submission(
             "10.17504/protocols.io.dm6gpb7p5lzp/v1": "Overview of scRNA-seq of Human Knee Meniscus",
         }
 
-        if "preparation_protocol_doi" in metadata["ingest_metadata"]["metadata"]:
-            protocols_io_doi = metadata["ingest_metadata"]["metadata"][
-                "preparation_protocol_doi"
-            ]
-        elif "protocols_io_doi" in metadata["ingest_metadata"]["metadata"]:
-            protocols_io_doi = metadata["ingest_metadata"]["metadata"][
-                "protocols_io_doi"
-            ]
+        if "preparation_protocol_doi" in metadata["metadata"]:
+            protocols_io_doi = metadata["metadata"]["preparation_protocol_doi"]
+        elif "protocols_io_doi" in metadata["metadata"]:
+            protocols_io_doi = metadata["metadata"]["protocols_io_doi"]
         else:
             protocols_io_doi = None
 
@@ -506,10 +504,8 @@ def __create_sample_attributes(df: pd.DataFrame, token: str, directory: str):
             analyte_class.append("DNA")
         elif datum["sample_id"] == "HBM773.WCXC.264":
             analyte_class.append("RNA")
-        elif "ingest_metadata" in metadata.keys():
-            analyte_class.append(
-                metadata["ingest_metadata"]["metadata"]["analyte_class"]
-            )
+        elif "metadata" in metadata.keys():
+            analyte_class.append(metadata["metadata"]["analyte_class"])
         else:
             print(datum["sample_id"])
 

diff --git a/notebooks/phs002267.v1.ipynb b/notebooks/phs002267.v1.ipynb
@@ -0,0 +1,108 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "40798f90-c6d4-418f-832b-390c711b306e",
+   "metadata": {},
+   "source": [
+    "# phs002267.v1 - Caltech"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65f08289-536a-4511-8190-b2aecb20a92d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hubmapdbgap\n",
+    "import hubmapbags\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "from pprint import pprint\n",
+    "\n",
+    "token = 'TOKEN'\n",
+    "\n",
+    "def get_hubmap_ids(group_name: str) -> dict:\n",
+    "    url = \"https://search.api.hubmapconsortium.org/v3/search\"\n",
+    "\n",
+    "    headers = {\"Accept\": \"application/json\"}\n",
+    "\n",
+    "    body = {\n",
+    "        \"size\": 500,\n",
+    "        \"_source\": {\n",
+    "            \"include\": [\"hubmap_id\", \"uuid\", \"group_name\", \"dataset_type\", \"status\", \"data_types\",\"contains_human_genetic_sequences\"]\n",
+    "        },\n",
+    "        \"query\": {\n",
+    "            \"bool\": {\n",
+    "                \"must\": [{\"match_phrase\": {\"group_name\": group_name}}],\n",
+    "                \"filter\": [{\"match\": {\"entity_type\": \"Dataset\"}}],\n",
+    "            }\n",
+    "        },\n",
+    "    }\n",
+    "\n",
+    "    answer = requests.post(url=url, headers=headers, json=body).json()\n",
+    "\n",
+    "    if \"error\" in answer.keys():\n",
+    "        warning(answer[\"error\"])\n",
+    "        return None\n",
+    "\n",
+    "    data = answer[\"hits\"][\"hits\"]\n",
+    "\n",
+    "    results = []\n",
+    "    for datum in data:\n",
+    "        results.append(\n",
+    "            {\n",
+    "                \"uuid\": datum[\"_source\"][\"uuid\"],\n",
+    "                \"hubmap_id\": datum[\"_source\"][\"hubmap_id\"],\n",
+    "                \"status\": datum[\"_source\"][\"status\"],\n",
+    "                \"dataset_type\": datum[\"_source\"][\"dataset_type\"],\n",
+    "                \"is_protected\": datum[\"_source\"][\"contains_human_genetic_sequences\"],\n",
+    "            }\n",
+    "        )\n",
+    "    \n",
+    "    return results\n",
+    "\n",
+    "group_name = 'California Institute of Technology TMC'\n",
+    "dbgap_study_id = 'phs002267'\n",
+    "data = get_hubmap_ids(group_name)\n",
+    "df = pd.DataFrame(data)\n",
+    "df = df[(df[\"is_protected\"]==True) & (df[\"status\"]==\"Published\")]\n",
+    "\n",
+    "hubmap_ids = list(df['hubmap_id'])\n",
+    "print(f'List of total datasets to include in study is {len(hubmap_ids)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5d85dd1-30df-4ac6-9d77-90629f97f273",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = hubmapdbgap.create.submission(hubmap_ids, dbgap_study_id=dbgap_study_id, token=token, prepend_sample_id=True )"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/setup.py b/setup.py
@@ -1,17 +1,27 @@
-from setuptools import setup
+from setuptools import setup, find_packages
 
 setup(
     name="hubmap-dbgap",
-    version="1.0",
+    version="2025.01",
     description="Generates submission for dbGaP",
     url="https://github.com/hubmapconsortium/py-hubmap-dbgap",
     author="Ivan Cao-Berg, Gesina Phillips",
     author_email="icaoberg@psc.edu",
+    license="MIT",  # Add a license (you can change this if needed)
     install_requires=[
         "pandas",
         "numpy",
         "tabulate",
         "tqdm",
     ],
-    packages=["hubmapdbgap"],
+    packages=find_packages(),  # This will automatically find your package directories
+    classifiers=[  # These help users find your package based on its usage
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",  # Make sure this matches your license
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.7",  # Adjust according to your code's Python version compatibility
+    long_description_content_type="text/markdown",  # If you use a README.md file
+    long_description=open('README.md').read(),  # This reads your README file (make sure you have one)
 )
+