Skip to content

Commit

Permalink
Merge pull request #27 from hubmapconsortium/icaoberg-metadata-fix
Browse files Browse the repository at this point in the history
Metadata fixes
  • Loading branch information
icaoberg authored Feb 3, 2025
2 parents 8bfae09 + 3f309e7 commit 4a4c9b8
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 40 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,9 @@ ENV/
*.log
*.swp
.DS_Store
notebooks/phs*
notebooks/*zip
notebooks/daily*
notebooks/.datasets/
notebooks/.entity/
notebooks/.provenance/
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
<img src="images/logo.png" />

Python package that creates a submission for dbGaP.
This Python package automates the process of preparing and validating data from the HuBMAP project for submission to dbGaP. It helps format metadata and data into the required structure, ensuring compliance with dbGaP's submission guidelines for genomic and phenotype data.

For more information, visit [HuBMAP](https://www.hubmapconsortium.org/) and [dbGaP](https://www.ncbi.nlm.nih.gov/gap).

---
Copyright © 2020-2025 Pittsburgh Supercomputing Center. All Rights Reserved.

Developed by the [Biomedical Applications Group](https://www.psc.edu/biomedical-applications/) at the [Pittsburgh Supercomputing Center](http://www.psc.edu) in the [Mellon College of Science](https://www.cmu.edu/mcs/) at [Carnegie Mellon University](http://www.cmu.edu).
68 changes: 32 additions & 36 deletions hubmapdbgap/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,7 @@ def submission(

# THE METADATA
try:
library_id = (
f'{metadata["ingest_metadata"]["metadata"]["library_id"]}-{hubmap_id}'
)
library_id = f'{metadata["metadata"]["library_id"]}-{hubmap_id}'
except BaseException:
library_id = f"lib-{hubmap_id}"

Expand Down Expand Up @@ -213,52 +211,46 @@ def submission(
"DNA + RNA": "OTHER",
}

library_source = analyte_class[
metadata["ingest_metadata"]["metadata"]["analyte_class"]
]
library_source = analyte_class[metadata["metadata"]["analyte_class"]]

if (
metadata["dataset_type"][0] == "SNAREseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
and metadata["metadata"]["analyte_class"] == "RNA"
):
library_strategy = "RNA-Seq"
elif (
metadata["dataset_type"][0] == "SNAREseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "DNA"
and metadata["metadata"]["analyte_class"] == "DNA"
):
library_strategy = "ATAC-seq"
elif (
metadata["dataset_type"][0] == "sciRNAseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
and metadata["metadata"]["analyte_class"] == "RNA"
):
library_strategy = "RNA-Seq"
elif (
metadata["dataset_type"][0] == "sciATACseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
and metadata["metadata"]["analyte_class"] == "RNA"
):
library_strategy = "RNA-Seq"
elif (
metadata["dataset_type"][0] == "sciATACseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "DNA"
and metadata["metadata"]["analyte_class"] == "DNA"
):
library_strategy = "ATAC-seq"
else:
# library_strategy = library_strategy[metadata["dataset_type"][0]]
library_strategy = library_strategy[metadata["dataset_type"]]

library_layout = {"paired-end": "paired", "paired end": "paired"}
library_layout = library_layout[
metadata["ingest_metadata"]["metadata"]["library_layout"]
]
library_layout = library_layout[metadata["metadata"]["library_layout"]]

# library_selection
library_selection = "other"

# platform
platform = {"Illumina": "ILLUMINA"}
platform = platform[
metadata["ingest_metadata"]["metadata"]["acquisition_instrument_vendor"]
]
platform = platform[metadata["metadata"]["acquisition_instrument_vendor"]]

# instrument_model
instrument_model = {
Expand All @@ -270,37 +262,47 @@ def submission(
"HiSeq": "Illumina HiSeq 4000",
"HiSeq 4000": "Illumina HiSeq 4000",
"Nextseq2000": "NextSeq 2000",
"Novaseq6021": "Illumina NovaSeq 6000",
"Novaseq6022": "Illumina NovaSeq 6000",
"Novaseq6007": "Illumina NovaSeq 6000",
"Novaseq6011": "Illumina NovaSeq 6000",
"Novaseq6012": "Illumina NovaSeq 6000",
"Novaseq6020": "Illumina NovaSeq 6000",
"Novaseq6019": "Illumina NovaSeq 6000",
"Novaseq6022": "Illumina NovaSeq 6000",
"Novaseq6019": "Illumina NovaSeq 6000",
"Novaseq6018": "Illumina NovaSeq 6000",
"Novaseq6014": "Illumina NovaSeq 6000",
"Novaseq6015": "Illumina NovaSeq 6000",
"Novaseq6008": "Illumina NovaSeq 6000",
"Novaseq6016": "Illumina NovaSeq 6000",
"Novaseq6015": "Illumina NovaSeq 6000",
"Novaseq6010": "Illumina NovaSeq 6000",
"Novaseq6006": "Illumina NovaSeq 6000",
"Novaseq6018": "Illumina NovaSeq 6000",
"Novaseq6017": "Illumina NovaSeq 6000",
"Novaseq6005": "Illumina NovaSeq 6000",
"Novaseq6004": "Illumina NovaSeq 6000",
"Novaseq6003": "Illumina NovaSeq 6000",
"Novaseq6002": "Illumina NovaSeq 6000",
"Novaseq6001": "Illumina NovaSeq 6000",
"Nextseq500-NS500488": "NextSeq 550",
"NextSeq2000": "NextSeq 2000",
"NextSeq550": "NextSeq 550"
"NextSeq550": "NextSeq 550",
}
instrument_model = instrument_model[
metadata["ingest_metadata"]["metadata"]["acquisition_instrument_model"]
metadata["metadata"]["acquisition_instrument_model"]
]

assay_type = metadata["dataset_type"]

acquisition_instrument_vendor = metadata["ingest_metadata"]["metadata"][
acquisition_instrument_vendor = metadata["metadata"][
"acquisition_instrument_vendor"
]
acquisition_instrument_model = metadata["ingest_metadata"]["metadata"][
acquisition_instrument_model = metadata["metadata"][
"acquisition_instrument_model"
]
sequencing_reagent_kit_raw = metadata["ingest_metadata"]["metadata"][
"sequencing_reagent_kit"
]
sequencing_reagent_kit_raw = metadata["metadata"]["sequencing_reagent_kit"]
sequencing_reagent_kit = sequencing_reagent_kit_raw.replace(";", "")

# @icaoberg link is needed to map to a protocol description
Expand All @@ -323,14 +325,10 @@ def submission(
"10.17504/protocols.io.dm6gpb7p5lzp/v1": "Overview of scRNA-seq of Human Knee Meniscus",
}

if "preparation_protocol_doi" in metadata["ingest_metadata"]["metadata"]:
protocols_io_doi = metadata["ingest_metadata"]["metadata"][
"preparation_protocol_doi"
]
elif "protocols_io_doi" in metadata["ingest_metadata"]["metadata"]:
protocols_io_doi = metadata["ingest_metadata"]["metadata"][
"protocols_io_doi"
]
if "preparation_protocol_doi" in metadata["metadata"]:
protocols_io_doi = metadata["metadata"]["preparation_protocol_doi"]
elif "protocols_io_doi" in metadata["metadata"]:
protocols_io_doi = metadata["metadata"]["protocols_io_doi"]
else:
protocols_io_doi = None

Expand Down Expand Up @@ -506,10 +504,8 @@ def __create_sample_attributes(df: pd.DataFrame, token: str, directory: str):
analyte_class.append("DNA")
elif datum["sample_id"] == "HBM773.WCXC.264":
analyte_class.append("RNA")
elif "ingest_metadata" in metadata.keys():
analyte_class.append(
metadata["ingest_metadata"]["metadata"]["analyte_class"]
)
elif "metadata" in metadata.keys():
analyte_class.append(metadata["metadata"]["analyte_class"])
else:
print(datum["sample_id"])

Expand Down
108 changes: 108 additions & 0 deletions notebooks/phs002267.v1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "40798f90-c6d4-418f-832b-390c711b306e",
"metadata": {},
"source": [
"# phs002267.v1 - CalTech"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65f08289-536a-4511-8190-b2aecb20a92d",
"metadata": {},
"outputs": [],
"source": [
"import hubmapdbgap\n",
"import hubmapbags\n",
"import requests\n",
"import pandas as pd\n",
"from pprint import pprint\n",
"\n",
"token = 'TOKEN'\n",
"\n",
"def get_hubmap_ids(group_name: str) -> list[dict] | None:\n",
"    \"\"\"Look up the datasets that belong to a HuBMAP group.\n",
"\n",
"    Sends one search request (at most 500 hits) for entities of type\n",
"    Dataset whose group_name matches the given phrase.\n",
"\n",
"    Args:\n",
"        group_name: Group name to match as a phrase.\n",
"\n",
"    Returns:\n",
"        One dict per hit with the keys uuid, hubmap_id, status,\n",
"        dataset_type and is_protected, or None when the response\n",
"        carries an \"error\" key.\n",
"    \"\"\"\n",
"    # Imported locally so the function does not depend on the cell's import list.\n",
"    import warnings\n",
"\n",
"    url = \"https://search.api.hubmapconsortium.org/v3/search\"\n",
"\n",
"    headers = {\"Accept\": \"application/json\"}\n",
"\n",
"    body = {\n",
"        \"size\": 500,\n",
"        \"_source\": {\n",
"            \"include\": [\"hubmap_id\", \"uuid\", \"group_name\", \"dataset_type\", \"status\", \"data_types\",\"contains_human_genetic_sequences\"]\n",
"        },\n",
"        \"query\": {\n",
"            \"bool\": {\n",
"                \"must\": [{\"match_phrase\": {\"group_name\": group_name}}],\n",
"                \"filter\": [{\"match\": {\"entity_type\": \"Dataset\"}}],\n",
"            }\n",
"        },\n",
"    }\n",
"\n",
"    answer = requests.post(url=url, headers=headers, json=body).json()\n",
"\n",
"    if \"error\" in answer:\n",
"        # Report the API error as a warning and signal failure with None.\n",
"        warnings.warn(str(answer[\"error\"]))\n",
"        return None\n",
"\n",
"    results = []\n",
"    for hit in answer[\"hits\"][\"hits\"]:\n",
"        source = hit[\"_source\"]\n",
"        results.append(\n",
"            {\n",
"                \"uuid\": source[\"uuid\"],\n",
"                \"hubmap_id\": source[\"hubmap_id\"],\n",
"                \"status\": source[\"status\"],\n",
"                \"dataset_type\": source[\"dataset_type\"],\n",
"                \"is_protected\": source[\"contains_human_genetic_sequences\"],\n",
"            }\n",
"        )\n",
"\n",
"    return results\n",
"\n",
"group_name = 'California Institute of Technology TMC'\n",
"dbgap_study_id = 'phs002267'\n",
"data = get_hubmap_ids(group_name)\n",
"df = pd.DataFrame(data)\n",
"df = df[(df[\"is_protected\"]==True) & (df[\"status\"]==\"Published\")]\n",
"\n",
"hubmap_ids = list(df['hubmap_id'])\n",
"print(f'List of total datasets to include in study is {len(hubmap_ids)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5d85dd1-30df-4ac6-9d77-90629f97f273",
"metadata": {},
"outputs": [],
"source": [
"data = hubmapdbgap.create.submission(hubmap_ids, dbgap_study_id=dbgap_study_id, token=token, prepend_sample_id=True )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
16 changes: 13 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
"""Packaging script for hubmap-dbgap."""

from pathlib import Path

from setuptools import find_packages, setup

# Resolve README.md relative to this file so the build does not depend on the
# current working directory; read_text() closes the file and pins the encoding.
long_description = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")

setup(
    name="hubmap-dbgap",
    version="2025.01",
    description="Generates submission for dbGaP",
    url="https://github.com/hubmapconsortium/py-hubmap-dbgap",
    author="Ivan Cao-Berg, Gesina Phillips",
    author_email="icaoberg@psc.edu",
    # NOTE(review): README.md states "All Rights Reserved"; confirm that MIT is
    # the intended license before publishing this metadata.
    license="MIT",
    install_requires=[
        "pandas",
        "numpy",
        "tabulate",
        "tqdm",
    ],
    # Discover every package directory instead of listing them by hand.
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.7",
    long_description_content_type="text/markdown",
    long_description=long_description,
)

0 comments on commit 4a4c9b8

Please sign in to comment.