Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metadata fixes #27

Merged
merged 7 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,9 @@ ENV/
*.log
*.swp
.DS_Store
notebooks/phs*
notebooks/*zip
notebooks/daily*
notebooks/.datasets/
notebooks/.entity/
notebooks/.provenance/
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
<img src="images/logo.png" />

Python package that creates a submission for dbGaP.
This Python package automates the process of preparing and validating data from the HuBMAP project for submission to dbGaP. It helps format metadata and data into the required structure, ensuring compliance with dbGaP's submission guidelines for genomic and phenotype data.

For more information, visit [HuBMAP](https://www.hubmapconsortium.org/) and [dbGaP](https://www.ncbi.nlm.nih.gov/gap).

---
Copyright © 2020-2025 Pittsburgh Supercomputing Center. All Rights Reserved.

The [Biomedical Applications Group](https://www.psc.edu/biomedical-applications/) at the [Pittsburgh Supercomputing Center](http://www.psc.edu) in the [Mellon College of Science](https://www.cmu.edu/mcs/) at [Carnegie Mellon University](http://www.cmu.edu).
68 changes: 32 additions & 36 deletions hubmapdbgap/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,7 @@ def submission(

# THE METADATA
try:
library_id = (
f'{metadata["ingest_metadata"]["metadata"]["library_id"]}-{hubmap_id}'
)
library_id = f'{metadata["metadata"]["library_id"]}-{hubmap_id}'
except BaseException:
library_id = f"lib-{hubmap_id}"

Expand Down Expand Up @@ -213,52 +211,46 @@ def submission(
"DNA + RNA": "OTHER",
}

library_source = analyte_class[
metadata["ingest_metadata"]["metadata"]["analyte_class"]
]
library_source = analyte_class[metadata["metadata"]["analyte_class"]]

if (
metadata["dataset_type"][0] == "SNAREseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
and metadata["metadata"]["analyte_class"] == "RNA"
):
library_strategy = "RNA-Seq"
elif (
metadata["dataset_type"][0] == "SNAREseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "DNA"
and metadata["metadata"]["analyte_class"] == "DNA"
):
library_strategy = "ATAC-seq"
elif (
metadata["dataset_type"][0] == "sciRNAseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
and metadata["metadata"]["analyte_class"] == "RNA"
):
library_strategy = "RNA-Seq"
elif (
metadata["dataset_type"][0] == "sciATACseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "RNA"
and metadata["metadata"]["analyte_class"] == "RNA"
):
library_strategy = "RNA-Seq"
elif (
metadata["dataset_type"][0] == "sciATACseq"
and metadata["ingest_metadata"]["metadata"]["analyte_class"] == "DNA"
and metadata["metadata"]["analyte_class"] == "DNA"
):
library_strategy = "ATAC-seq"
else:
# library_strategy = library_strategy[metadata["dataset_type"][0]]
library_strategy = library_strategy[metadata["dataset_type"]]

library_layout = {"paired-end": "paired", "paired end": "paired"}
library_layout = library_layout[
metadata["ingest_metadata"]["metadata"]["library_layout"]
]
library_layout = library_layout[metadata["metadata"]["library_layout"]]

# library_selection
library_selection = "other"

# platform
platform = {"Illumina": "ILLUMINA"}
platform = platform[
metadata["ingest_metadata"]["metadata"]["acquisition_instrument_vendor"]
]
platform = platform[metadata["metadata"]["acquisition_instrument_vendor"]]

# instrument_model
instrument_model = {
Expand All @@ -270,37 +262,47 @@ def submission(
"HiSeq": "Illumina HiSeq 4000",
"HiSeq 4000": "Illumina HiSeq 4000",
"Nextseq2000": "NextSeq 2000",
"Novaseq6021": "Illumina NovaSeq 6000",
"Novaseq6022": "Illumina NovaSeq 6000",
"Novaseq6007": "Illumina NovaSeq 6000",
"Novaseq6011": "Illumina NovaSeq 6000",
"Novaseq6012": "Illumina NovaSeq 6000",
"Novaseq6020": "Illumina NovaSeq 6000",
"Novaseq6019": "Illumina NovaSeq 6000",
"Novaseq6022": "Illumina NovaSeq 6000",
"Novaseq6019": "Illumina NovaSeq 6000",
"Novaseq6018": "Illumina NovaSeq 6000",
"Novaseq6014": "Illumina NovaSeq 6000",
"Novaseq6015": "Illumina NovaSeq 6000",
"Novaseq6008": "Illumina NovaSeq 6000",
"Novaseq6016": "Illumina NovaSeq 6000",
"Novaseq6015": "Illumina NovaSeq 6000",
"Novaseq6010": "Illumina NovaSeq 6000",
"Novaseq6006": "Illumina NovaSeq 6000",
"Novaseq6018": "Illumina NovaSeq 6000",
"Novaseq6017": "Illumina NovaSeq 6000",
"Novaseq6005": "Illumina NovaSeq 6000",
"Novaseq6004": "Illumina NovaSeq 6000",
"Novaseq6003": "Illumina NovaSeq 6000",
"Novaseq6002": "Illumina NovaSeq 6000",
"Novaseq6001": "Illumina NovaSeq 6000",
"Nextseq500-NS500488": "NextSeq 550",
"NextSeq2000": "NextSeq 2000",
"NextSeq550": "NextSeq 550"
"NextSeq550": "NextSeq 550",
}
instrument_model = instrument_model[
metadata["ingest_metadata"]["metadata"]["acquisition_instrument_model"]
metadata["metadata"]["acquisition_instrument_model"]
]

assay_type = metadata["dataset_type"]

acquisition_instrument_vendor = metadata["ingest_metadata"]["metadata"][
acquisition_instrument_vendor = metadata["metadata"][
"acquisition_instrument_vendor"
]
acquisition_instrument_model = metadata["ingest_metadata"]["metadata"][
acquisition_instrument_model = metadata["metadata"][
"acquisition_instrument_model"
]
sequencing_reagent_kit_raw = metadata["ingest_metadata"]["metadata"][
"sequencing_reagent_kit"
]
sequencing_reagent_kit_raw = metadata["metadata"]["sequencing_reagent_kit"]
sequencing_reagent_kit = sequencing_reagent_kit_raw.replace(";", "")

# @icaoberg link is needed to map to a protocol description
Expand All @@ -323,14 +325,10 @@ def submission(
"10.17504/protocols.io.dm6gpb7p5lzp/v1": "Overview of scRNA-seq of Human Knee Meniscus",
}

if "preparation_protocol_doi" in metadata["ingest_metadata"]["metadata"]:
protocols_io_doi = metadata["ingest_metadata"]["metadata"][
"preparation_protocol_doi"
]
elif "protocols_io_doi" in metadata["ingest_metadata"]["metadata"]:
protocols_io_doi = metadata["ingest_metadata"]["metadata"][
"protocols_io_doi"
]
if "preparation_protocol_doi" in metadata["metadata"]:
protocols_io_doi = metadata["metadata"]["preparation_protocol_doi"]
elif "protocols_io_doi" in metadata["metadata"]:
protocols_io_doi = metadata["metadata"]["protocols_io_doi"]
else:
protocols_io_doi = None

Expand Down Expand Up @@ -506,10 +504,8 @@ def __create_sample_attributes(df: pd.DataFrame, token: str, directory: str):
analyte_class.append("DNA")
elif datum["sample_id"] == "HBM773.WCXC.264":
analyte_class.append("RNA")
elif "ingest_metadata" in metadata.keys():
analyte_class.append(
metadata["ingest_metadata"]["metadata"]["analyte_class"]
)
elif "metadata" in metadata.keys():
analyte_class.append(metadata["metadata"]["analyte_class"])
else:
print(datum["sample_id"])

Expand Down
108 changes: 108 additions & 0 deletions notebooks/phs002267.v1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "40798f90-c6d4-418f-832b-390c711b306e",
"metadata": {},
"source": [
"# phs002267.v1 - CalTech"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65f08289-536a-4511-8190-b2aecb20a92d",
"metadata": {},
"outputs": [],
"source": [
"import hubmapdbgap\n",
"import hubmapbags\n",
"import requests\n",
"import pandas as pd\n",
"from pprint import pprint\n",
"\n",
"token = 'TOKEN'\n",
"\n",
"def get_hubmap_ids(group_name: str) -> list | None:\n",
" url = \"https://search.api.hubmapconsortium.org/v3/search\"\n",
"\n",
" headers = {\"Accept\": \"application/json\"}\n",
"\n",
" body = {\n",
" \"size\": 500,\n",
" \"_source\": {\n",
" \"include\": [\"hubmap_id\", \"uuid\", \"group_name\", \"dataset_type\", \"status\", \"data_types\",\"contains_human_genetic_sequences\"]\n",
" },\n",
" \"query\": {\n",
" \"bool\": {\n",
" \"must\": [{\"match_phrase\": {\"group_name\": group_name}}],\n",
" \"filter\": [{\"match\": {\"entity_type\": \"Dataset\"}}],\n",
" }\n",
" },\n",
" }\n",
"\n",
" answer = requests.post(url=url, headers=headers, json=body).json()\n",
"\n",
" if \"error\" in answer.keys():\n",
"        print(answer[\"error\"])\n",
" return None\n",
"\n",
" data = answer[\"hits\"][\"hits\"]\n",
"\n",
" results = []\n",
" for datum in data:\n",
" results.append(\n",
" {\n",
" \"uuid\": datum[\"_source\"][\"uuid\"],\n",
" \"hubmap_id\": datum[\"_source\"][\"hubmap_id\"],\n",
" \"status\": datum[\"_source\"][\"status\"],\n",
" \"dataset_type\": datum[\"_source\"][\"dataset_type\"],\n",
" \"is_protected\": datum[\"_source\"][\"contains_human_genetic_sequences\"],\n",
" }\n",
" )\n",
" \n",
" return results\n",
"\n",
"group_name = 'California Institute of Technology TMC'\n",
"dbgap_study_id = 'phs002267'\n",
"data = get_hubmap_ids(group_name)\n",
"df = pd.DataFrame(data)\n",
"df = df[(df[\"is_protected\"]==True) & (df[\"status\"]==\"Published\")]\n",
"\n",
"hubmap_ids = list(df['hubmap_id'])\n",
"print(f'List of total datasets to include in study is {len(hubmap_ids)}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5d85dd1-30df-4ac6-9d77-90629f97f273",
"metadata": {},
"outputs": [],
"source": [
"data = hubmapdbgap.create.submission(hubmap_ids, dbgap_study_id=dbgap_study_id, token=token, prepend_sample_id=True )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
16 changes: 13 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,27 @@
from setuptools import setup
from setuptools import setup, find_packages

setup(
name="hubmap-dbgap",
version="1.0",
version="2025.01",
description="Generates submission for dbGaP",
url="https://github.com/hubmapconsortium/py-hubmap-dbgap",
author="Ivan Cao-Berg, Gesina Phillips",
author_email="icaoberg@psc.edu",
license="MIT", # Add a license (you can change this if needed)
install_requires=[
"pandas",
"numpy",
"tabulate",
"tqdm",
],
packages=["hubmapdbgap"],
packages=find_packages(), # This will automatically find your package directories
classifiers=[ # These help users find your package based on its usage
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License", # Make sure this matches your license
"Operating System :: OS Independent",
],
python_requires=">=3.7", # Adjust according to your code's Python version compatibility
long_description_content_type="text/markdown", # If you use a README.md file
long_description=open('README.md').read(), # This reads your README file (make sure you have one)
)