Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: handle mixed input file types in a project #344

Merged
merged 3 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 177 additions & 20 deletions .tests/unit/gtdb_prep/expected/data/interim/gtdb/GCA_000056065.1.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,179 @@
{
"GCA_000056065.1":{
"assembly":"ASM5606v1",
"organism":"Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes)",
"genus":"Lactobacillus",
"species":"delbrueckii",
"strain":"ATCC 11842",
"tax_id":"390333",
"refseq_category":null,
"refseq":"GCF_000056065.1",
"genbank":"GCA_000056065.1",
"assembly_type":"na",
"release_type":"major",
"assembly_level":"Complete Genome",
"genome_representation":"full",
"refseq_genbank_identity":"yes",
"biosample":"SAMEA3138258",
"submitter":"Genoscope",
"date":"2006-05-26",
"BioProject":"PRJNA16871"
}
"genome_id": "GCA_000056065.1",
"gtdb_url": "https://gtdb-api.ecogenomic.org/genome/GCA_000056065.1/taxon-history",
"gtdb_release": "R214",
"gtdb_taxonomy": {
"domain": "d__Bacteria",
"phylum": "p__Bacillota",
"class": "c__Bacilli",
"order": "o__Lactobacillales",
"family": "f__Lactobacillaceae",
"genus": "g__Lactobacillus",
"species": "s__Lactobacillus delbrueckii"
},
"metadata_url": "https://gtdb-api.ecogenomic.org/genome/GCA_000056065.1/card",
"metadata": {
"genome": {
"accession": "GCA_000056065.1",
"name": "GCF_000056065.1"
},
"metadata_nucleotide": {
"trna_aa_count": 19,
"contig_count": 1,
"n50_contigs": 1864998,
"longest_contig": 1864998,
"scaffold_count": 1,
"n50_scaffolds": 1864998,
"longest_scaffold": 1864998,
"genome_size": 1864998,
"gc_percentage": 49.71903455124348,
"ambiguous_bases": 0
},
"metadata_gene": {
"checkm_completeness": "98.38",
"checkm_contamination": "0.0",
"checkm_strain_heterogeneity": "0.0",
"checkm2_completeness": "99.6",
"checkm2_contamination": "1.19",
"checkm2_model": "Specific",
"lsu_5s_count": "9",
"ssu_count": "9",
"lsu_23s_count": "9",
"protein_count": "1916",
"coding_density": "84.01837428243891"
},
"metadata_ncbi": {
"ncbi_genbank_assembly_accession": "GCA_000056065.1",
"ncbi_strain_identifiers": "ATCC 11842",
"ncbi_assembly_level": "Complete Genome",
"ncbi_assembly_name": "ASM5606v1",
"ncbi_assembly_type": "na",
"ncbi_bioproject": "PRJNA224116",
"ncbi_biosample": "SAMEA3138258",
"ncbi_country": null,
"ncbi_date": "2006-05-26",
"ncbi_genome_category": null,
"ncbi_genome_representation": "full",
"ncbi_isolate": "na",
"ncbi_isolation_source": null,
"ncbi_lat_lon": null,
"ncbi_molecule_count": "1",
"ncbi_cds_count": "1862",
"ncbi_refseq_category": "na",
"ncbi_seq_rel_date": "2006/05/26",
"ncbi_spanned_gaps": "0",
"ncbi_species_taxid": "1584",
"ncbi_ssu_count": "9",
"ncbi_submitter": "Genoscope",
"ncbi_taxid": "390333",
"ncbi_total_gap_length": "0",
"ncbi_translation_table": "11",
"ncbi_trna_count": "95",
"ncbi_unspanned_gaps": "0",
"ncbi_version_status": "latest",
"ncbi_wgs_master": "na"
},
"metadata_type_material": {
"gtdbTypeDesignation": "type strain of subspecies",
"gtdbTypeDesignationSources": "LPSN",
"lpsnTypeDesignation": "type strain of subspecies",
"dsmzTypeDesignation": "type strain of subspecies",
"lpsnPriorityYear": 1919,
"gtdbTypeSpeciesOfGenus": false
},
"metadataTaxonomy": {
"ncbi_taxonomy": "d__Bacteria;p__Bacillota;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus delbrueckii",
"ncbi_taxonomy_unfiltered": "d__Bacteria;x__Terrabacteria group;p__Bacillota;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus delbrueckii;sb__Lactobacillus delbrueckii subsp. bulgaricus;x__Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002",
"gtdb_representative": false,
"gtdb_genome_representative": "RS_GCF_001433875.1",
"ncbi_type_material_designation": "assembly from type material",
"gtdbDomain": "d__Bacteria",
"gtdbPhylum": "p__Bacillota",
"gtdbClass": "c__Bacilli",
"gtdbOrder": "o__Lactobacillales",
"gtdbFamily": "f__Lactobacillaceae",
"gtdbGenus": "g__Lactobacillus",
"gtdbSpecies": "s__Lactobacillus delbrueckii"
},
"gtdbTypeDesignation": "type strain of subspecies",
"subunit_summary": "5S/16S/23S",
"speciesRepName": "GCA_001433875.1",
"speciesClusterCount": 342,
"lpsnUrl": "https://lpsn.dsmz.de/species/lactobacillus-delbrueckii",
"link_ncbi_taxonomy": "<a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/2/\">d__Bacteria</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1239/\">p__Bacillota</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/91061/\">c__Bacilli</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/186826/\">o__Lactobacillales</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/33958/\">f__Lactobacillaceae</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1578/\">g__Lactobacillus</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1584/\">s__Lactobacillus delbrueckii</a>",
"link_ncbi_taxonomy_unfiltered": "<a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/2/\">d__Bacteria</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1783272/\">x__Terrabacteria group</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1239/\">p__Bacillota</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/91061/\">c__Bacilli</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/186826/\">o__Lactobacillales</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/33958/\">f__Lactobacillaceae</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1578/\">g__Lactobacillus</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1584/\">s__Lactobacillus delbrueckii</a>; sb__Lactobacillus delbrueckii subsp. bulgaricus; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/390333/\">x__Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002</a>",
"ncbiTaxonomyFiltered": [
{
"taxon": "d__Bacteria",
"taxonId": "2"
},
{
"taxon": "p__Bacillota",
"taxonId": "1239"
},
{
"taxon": "c__Bacilli",
"taxonId": "91061"
},
{
"taxon": "o__Lactobacillales",
"taxonId": "186826"
},
{
"taxon": "f__Lactobacillaceae",
"taxonId": "33958"
},
{
"taxon": "g__Lactobacillus",
"taxonId": "1578"
},
{
"taxon": "s__Lactobacillus delbrueckii",
"taxonId": "1584"
}
],
"ncbiTaxonomyUnfiltered": [
{
"taxon": "d__Bacteria",
"taxonId": "2"
},
{
"taxon": "x__Terrabacteria group",
"taxonId": "1783272"
},
{
"taxon": "p__Bacillota",
"taxonId": "1239"
},
{
"taxon": "c__Bacilli",
"taxonId": "91061"
},
{
"taxon": "o__Lactobacillales",
"taxonId": "186826"
},
{
"taxon": "f__Lactobacillaceae",
"taxonId": "33958"
},
{
"taxon": "g__Lactobacillus",
"taxonId": "1578"
},
{
"taxon": "s__Lactobacillus delbrueckii",
"taxonId": "1584"
},
{
"taxon": "sb__Lactobacillus delbrueckii subsp. bulgaricus",
"taxonId": null
},
{
"taxon": "x__Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002",
"taxonId": "390333"
}
],
"detail": "Genome found"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def generate_symlink(path, genome_id, output_dir, selected_bgcs=False):
# Assert that the symlink was correctly generated
assert link.is_symlink(), f"Failed to create symlink: {link}"
assert (
link.resolve() == target_path
link.resolve() == target_path.resolve()
), f"Symlink {link} does not point to the correct target: {target_path}"

record_log["record_id"] = record.id
Expand Down
14 changes: 10 additions & 4 deletions workflow/bgcflow/bgcflow/database/csv_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,24 @@ def csv_to_parquet(project_folder, output_folder="."):
pass
elif "assets" in str(i):
pass
elif "notebooks" in str(i):
pass
elif "ipynb_checkpoints" in str(i):
pass
else:
category = str(i).split("/")[3]
output_subfolder = output_folder / category
df = pd.read_csv(i)
# df = df.fillna("")
# df.columns = [c.replace(".", "_") for c in df.columns]
output_parquet = output_subfolder / f"{i.stem}.parquet"
logging.info(f"Converting {i} to {output_parquet}")
output_subfolder.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_parquet)
try:
df = pd.read_csv(i)
df.to_parquet(output_parquet)
except Exception as e:
logging.error(f"Error converting {i} to {output_parquet}: {e}")
logging.info("Retrying with all columns as strings")
df = pd.read_csv(i, dtype=str)
df.to_parquet(output_parquet)

return

Expand Down
98 changes: 98 additions & 0 deletions workflow/bgcflow/bgcflow/misc/summarize_gbk_txt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import logging
import sys
from pathlib import Path

from Bio import SeqIO

# Set up logging: configure the root logger when this module is loaded so that
# INFO-level (and higher) messages are emitted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def extract_genbank_info(gb_file):
    """
    Extract per-record information from a GenBank file and store it in a dictionary.

    Parameters:
    gb_file (str): The path to the GenBank file.

    Returns:
    dict: Maps each record's zero-based position in the file to a dictionary with
        the keys "organism", "bases", "CDS", "rRNA", "repeat_region", "tRNA" and
        "tmRNA". "organism" holds the first /strain qualifier of the record's
        source feature (falling back to /organism, then "unknown"); "bases" is the
        record length; the remaining keys are counts of features of that type.

    Raises:
    AssertionError: If a record does not contain exactly one "source" feature.
    """
    logging.info(f"Extracting information from {gb_file}")
    counted_types = ("CDS", "rRNA", "repeat_region", "tRNA", "tmRNA")
    info_dict_all = {}
    for num, record in enumerate(SeqIO.parse(gb_file, "genbank")):
        # Walk the feature table once, collecting source features and tallying
        # the feature types that are reported in the summary.
        source = []
        feature_counts = dict.fromkeys(counted_types, 0)
        for feat in record.features:
            if feat.type == "source":
                source.append(feat)
            elif feat.type in feature_counts:
                feature_counts[feat.type] += 1
        assert (
            len(source) == 1
        ), f"Expected 1 source feature in the record, found {len(source)}"

        # Not every GenBank file carries a /strain qualifier; fall back instead
        # of failing with a KeyError so such inputs can still be summarized.
        qualifiers = source[0].qualifiers
        strain_values = qualifiers.get("strain")
        if not strain_values:
            logging.warning(
                f"Record {num} in {gb_file} has no 'strain' qualifier, "
                "falling back to 'organism'"
            )
        name_values = strain_values or qualifiers.get("organism")

        info_dict = {
            "organism": name_values[0] if name_values else "unknown",
            "bases": len(record),
        }
        info_dict.update(feature_counts)
        info_dict_all[num] = info_dict
    return info_dict_all


def summarize_and_write(info_dict_all, output_file):
    """
    Aggregate per-record information into one summary and write it to a text file.

    The summary takes "organism" from the record stored under key 0, reports the
    number of records as "contigs", and sums every numeric field over all records.
    It is written as one "key: value" line per entry.

    Parameters:
    info_dict_all (dict): The dictionary containing the information to summarize.
    output_file (str): The path to the output text file.

    Returns:
    None
    """
    logging.info(f"Summarizing information and writing to {output_file}")
    summary = {
        "organism": info_dict_all[0]["organism"],
        "contigs": len(info_dict_all),
    }
    # Every remaining field is a plain total over all records.
    for field in ("bases", "CDS", "rRNA", "repeat_region", "tRNA", "tmRNA"):
        summary[field] = sum(entry[field] for entry in info_dict_all.values())

    lines = [f"{key}: {value}\n" for key, value in summary.items()]
    with open(output_file, "w") as handle:
        handle.writelines(lines)


def summarize_genbank(gb_file, output_file):
    """
    Summarize a GenBank file into a text file.

    Creates the output file's parent directory if needed, extracts the
    per-record information from the GenBank file, and writes the summary.

    Parameters:
    gb_file (str): The path to the GenBank file.
    output_file (str): The path to the output text file.

    Returns:
    None
    """
    logging.info(f"Starting to summarize GenBank file {gb_file}")
    destination = Path(output_file)
    # Make sure the target directory exists before any work is done.
    destination.parent.mkdir(parents=True, exist_ok=True)
    records_info = extract_genbank_info(gb_file)
    logging.info(f"Writing summary to {destination}")
    summarize_and_write(records_info, destination)


if __name__ == "__main__":
    # Usage: summarize_gbk_txt.py <genbank file> <output text file>
    genbank_path, summary_path = sys.argv[1], sys.argv[2]
    summarize_genbank(genbank_path, summary_path)
4 changes: 2 additions & 2 deletions workflow/envs/convert_genbank.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ channels:
- defaults
dependencies:
- any2fasta==0.4.2
- biopython
- biopython>=1.80
- pip
- pip:
- bcbio-gff==0.7.0
- bcbio-gff==0.7.1
2 changes: 1 addition & 1 deletion workflow/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ gtdbtk:
- Parks DH, et al. 2018. A standardized bacterial taxonomy based on genome phylogeny
substantially revises the tree of life. Nature Biotechnology, [http://dx.doi.org/10.1038/nbt.4229](http://dx.doi.org/10.1038/nbt.4229).
prokka-gbk:
final_output: data/processed/{name}/genbank/{strains}.gbk
final_output: "data/processed/{name}/genbank/{strains}.txt"
description: Copy annotated genbank results.
category: Functional Annotation
link:
Expand Down
3 changes: 3 additions & 0 deletions workflow/rules/automlst_wrapper.smk
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ rule automlst_wrapper_out:
organism_info=lambda wildcards: expand("data/interim/prokka/{strains}/organism_info.txt",
strains=[s for s in list(PEP_PROJECTS[wildcards.name].sample_table.index)],
),
gtdb=lambda wildcards: expand("data/interim/gtdb/{strains}.json",
name=wildcards.name,
strains=[s for s in PEP_PROJECTS[wildcards.name].sample_table.genome_id.unique()])
output:
automlst_processed=directory("data/processed/{name}/automlst_wrapper/"),
final_tree="data/processed/{name}/automlst_wrapper/final.newick",
Expand Down
6 changes: 2 additions & 4 deletions workflow/rules/bgc_selection.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,13 @@ def get_bgc_inputs(pep_object, antismash_version):
df.rename(columns={'gbk_path': 'input_file'}, inplace=True)
assert 'input_file' in df.columns
custom_path = df.loc[i, "input_file"]
#print(custom_path, type(custom_path), custom_path != None, file=sys.stderr)

if custom_path != None:
gbk_path = input_path / custom_path
gbk_path = Path(input_path) / custom_path
elif 'input_folder' in pep_object.config.keys():
gbk_path = Path(input_path / f"{bgc_id}.gbk")
else:
gbk_path = antismash_path / genome_id / f"{bgc_id}.gbk"
#print(bgc_id, gbk_path, file=sys.stderr)
gbk_list.append(gbk_path)
return gbk_list

Expand Down Expand Up @@ -64,7 +62,7 @@ rule downstream_bgc_prep_selection:
echo "Previous dataset detected, appending dataset information for {wildcards.name}..."
sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
else
echo "No previous dataset detected, generating dataset information for {wildcards.name}..."
echo "No previous dataset detected, generating dataset information for {wildcards.name}..." 2>> {log}
echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} 2>> {log}
sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
fi
Expand Down
Loading
Loading