NBChub · matinnuhamunada · Apr 29, 2024 · Apr 28, 2024 · Apr 29, 2024 · Apr 29, 2024
diff --git a/.tests/unit/gtdb_prep/expected/data/interim/gtdb/GCA_000056065.1.json b/.tests/unit/gtdb_prep/expected/data/interim/gtdb/GCA_000056065.1.json
@@ -1,22 +1,179 @@
 {
-    "GCA_000056065.1":{
-        "assembly":"ASM5606v1",
-        "organism":"Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002 (firmicutes)",
-        "genus":"Lactobacillus",
-        "species":"delbrueckii",
-        "strain":"ATCC 11842",
-        "tax_id":"390333",
-        "refseq_category":null,
-        "refseq":"GCF_000056065.1",
-        "genbank":"GCA_000056065.1",
-        "assembly_type":"na",
-        "release_type":"major",
-        "assembly_level":"Complete Genome",
-        "genome_representation":"full",
-        "refseq_genbank_identity":"yes",
-        "biosample":"SAMEA3138258",
-        "submitter":"Genoscope",
-        "date":"2006-05-26",
-        "BioProject":"PRJNA16871"
-    }
+  "genome_id": "GCA_000056065.1",
+  "gtdb_url": "https://gtdb-api.ecogenomic.org/genome/GCA_000056065.1/taxon-history",
+  "gtdb_release": "R214",
+  "gtdb_taxonomy": {
+    "domain": "d__Bacteria",
+    "phylum": "p__Bacillota",
+    "class": "c__Bacilli",
+    "order": "o__Lactobacillales",
+    "family": "f__Lactobacillaceae",
+    "genus": "g__Lactobacillus",
+    "species": "s__Lactobacillus delbrueckii"
+  },
+  "metadata_url": "https://gtdb-api.ecogenomic.org/genome/GCA_000056065.1/card",
+  "metadata": {
+    "genome": {
+      "accession": "GCA_000056065.1",
+      "name": "GCF_000056065.1"
+    },
+    "metadata_nucleotide": {
+      "trna_aa_count": 19,
+      "contig_count": 1,
+      "n50_contigs": 1864998,
+      "longest_contig": 1864998,
+      "scaffold_count": 1,
+      "n50_scaffolds": 1864998,
+      "longest_scaffold": 1864998,
+      "genome_size": 1864998,
+      "gc_percentage": 49.71903455124348,
+      "ambiguous_bases": 0
+    },
+    "metadata_gene": {
+      "checkm_completeness": "98.38",
+      "checkm_contamination": "0.0",
+      "checkm_strain_heterogeneity": "0.0",
+      "checkm2_completeness": "99.6",
+      "checkm2_contamination": "1.19",
+      "checkm2_model": "Specific",
+      "lsu_5s_count": "9",
+      "ssu_count": "9",
+      "lsu_23s_count": "9",
+      "protein_count": "1916",
+      "coding_density": "84.01837428243891"
+    },
+    "metadata_ncbi": {
+      "ncbi_genbank_assembly_accession": "GCA_000056065.1",
+      "ncbi_strain_identifiers": "ATCC 11842",
+      "ncbi_assembly_level": "Complete Genome",
+      "ncbi_assembly_name": "ASM5606v1",
+      "ncbi_assembly_type": "na",
+      "ncbi_bioproject": "PRJNA224116",
+      "ncbi_biosample": "SAMEA3138258",
+      "ncbi_country": null,
+      "ncbi_date": "2006-05-26",
+      "ncbi_genome_category": null,
+      "ncbi_genome_representation": "full",
+      "ncbi_isolate": "na",
+      "ncbi_isolation_source": null,
+      "ncbi_lat_lon": null,
+      "ncbi_molecule_count": "1",
+      "ncbi_cds_count": "1862",
+      "ncbi_refseq_category": "na",
+      "ncbi_seq_rel_date": "2006/05/26",
+      "ncbi_spanned_gaps": "0",
+      "ncbi_species_taxid": "1584",
+      "ncbi_ssu_count": "9",
+      "ncbi_submitter": "Genoscope",
+      "ncbi_taxid": "390333",
+      "ncbi_total_gap_length": "0",
+      "ncbi_translation_table": "11",
+      "ncbi_trna_count": "95",
+      "ncbi_unspanned_gaps": "0",
+      "ncbi_version_status": "latest",
+      "ncbi_wgs_master": "na"
+    },
+    "metadata_type_material": {
+      "gtdbTypeDesignation": "type strain of subspecies",
+      "gtdbTypeDesignationSources": "LPSN",
+      "lpsnTypeDesignation": "type strain of subspecies",
+      "dsmzTypeDesignation": "type strain of subspecies",
+      "lpsnPriorityYear": 1919,
+      "gtdbTypeSpeciesOfGenus": false
+    },
+    "metadataTaxonomy": {
+      "ncbi_taxonomy": "d__Bacteria;p__Bacillota;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus delbrueckii",
+      "ncbi_taxonomy_unfiltered": "d__Bacteria;x__Terrabacteria group;p__Bacillota;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__Lactobacillus delbrueckii;sb__Lactobacillus delbrueckii subsp. bulgaricus;x__Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002",
+      "gtdb_representative": false,
+      "gtdb_genome_representative": "RS_GCF_001433875.1",
+      "ncbi_type_material_designation": "assembly from type material",
+      "gtdbDomain": "d__Bacteria",
+      "gtdbPhylum": "p__Bacillota",
+      "gtdbClass": "c__Bacilli",
+      "gtdbOrder": "o__Lactobacillales",
+      "gtdbFamily": "f__Lactobacillaceae",
+      "gtdbGenus": "g__Lactobacillus",
+      "gtdbSpecies": "s__Lactobacillus delbrueckii"
+    },
+    "gtdbTypeDesignation": "type strain of subspecies",
+    "subunit_summary": "5S/16S/23S",
+    "speciesRepName": "GCA_001433875.1",
+    "speciesClusterCount": 342,
+    "lpsnUrl": "https://lpsn.dsmz.de/species/lactobacillus-delbrueckii",
+    "link_ncbi_taxonomy": "<a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/2/\">d__Bacteria</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1239/\">p__Bacillota</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/91061/\">c__Bacilli</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/186826/\">o__Lactobacillales</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/33958/\">f__Lactobacillaceae</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1578/\">g__Lactobacillus</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1584/\">s__Lactobacillus delbrueckii</a>",
+    "link_ncbi_taxonomy_unfiltered": "<a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/2/\">d__Bacteria</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1783272/\">x__Terrabacteria group</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1239/\">p__Bacillota</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/91061/\">c__Bacilli</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/186826/\">o__Lactobacillales</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/33958/\">f__Lactobacillaceae</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1578/\">g__Lactobacillus</a>; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/1584/\">s__Lactobacillus delbrueckii</a>; sb__Lactobacillus delbrueckii subsp. bulgaricus; <a target=\"_blank\" href=\"https://www.ncbi.nlm.nih.gov/data-hub/taxonomy/390333/\">x__Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002</a>",
+    "ncbiTaxonomyFiltered": [
+      {
+        "taxon": "d__Bacteria",
+        "taxonId": "2"
+      },
+      {
+        "taxon": "p__Bacillota",
+        "taxonId": "1239"
+      },
+      {
+        "taxon": "c__Bacilli",
+        "taxonId": "91061"
+      },
+      {
+        "taxon": "o__Lactobacillales",
+        "taxonId": "186826"
+      },
+      {
+        "taxon": "f__Lactobacillaceae",
+        "taxonId": "33958"
+      },
+      {
+        "taxon": "g__Lactobacillus",
+        "taxonId": "1578"
+      },
+      {
+        "taxon": "s__Lactobacillus delbrueckii",
+        "taxonId": "1584"
+      }
+    ],
+    "ncbiTaxonomyUnfiltered": [
+      {
+        "taxon": "d__Bacteria",
+        "taxonId": "2"
+      },
+      {
+        "taxon": "x__Terrabacteria group",
+        "taxonId": "1783272"
+      },
+      {
+        "taxon": "p__Bacillota",
+        "taxonId": "1239"
+      },
+      {
+        "taxon": "c__Bacilli",
+        "taxonId": "91061"
+      },
+      {
+        "taxon": "o__Lactobacillales",
+        "taxonId": "186826"
+      },
+      {
+        "taxon": "f__Lactobacillaceae",
+        "taxonId": "33958"
+      },
+      {
+        "taxon": "g__Lactobacillus",
+        "taxonId": "1578"
+      },
+      {
+        "taxon": "s__Lactobacillus delbrueckii",
+        "taxonId": "1584"
+      },
+      {
+        "taxon": "sb__Lactobacillus delbrueckii subsp. bulgaricus",
+        "taxonId": null
+      },
+      {
+        "taxon": "x__Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842 = JCM 1002",
+        "taxonId": "390333"
+      }
+    ],
+    "detail": "Genome found"
+  }
 }
diff --git a/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py b/workflow/bgcflow/bgcflow/data/bgc_downstream_prep_selection.py
@@ -87,7 +87,7 @@ def generate_symlink(path, genome_id, output_dir, selected_bgcs=False):
                     # Assert that the symlink was correctly generated
                     assert link.is_symlink(), f"Failed to create symlink: {link}"
                     assert (
-                        link.resolve() == target_path
+                        link.resolve() == target_path.resolve()
                     ), f"Symlink {link} does not point to the correct target: {target_path}"
 
                 record_log["record_id"] = record.id

diff --git a/workflow/bgcflow/bgcflow/database/csv_to_parquet.py b/workflow/bgcflow/bgcflow/database/csv_to_parquet.py
@@ -29,18 +29,24 @@ def csv_to_parquet(project_folder, output_folder="."):
             pass
         elif "assets" in str(i):
             pass
+        elif "notebooks" in str(i):
+            pass
         elif "ipynb_checkpoints" in str(i):
             pass
         else:
             category = str(i).split("/")[3]
             output_subfolder = output_folder / category
-            df = pd.read_csv(i)
-            # df = df.fillna("")
-            # df.columns = [c.replace(".", "_") for c in df.columns]
             output_parquet = output_subfolder / f"{i.stem}.parquet"
             logging.info(f"Converting {i} to {output_parquet}")
             output_subfolder.mkdir(parents=True, exist_ok=True)
-            df.to_parquet(output_parquet)
+            try:
+                df = pd.read_csv(i)
+                df.to_parquet(output_parquet)
+            except Exception as e:
+                logging.error(f"Error converting {i} to {output_parquet}: {e}")
+                logging.info("Retrying with all columns as strings")
+                df = pd.read_csv(i, dtype=str)
+                df.to_parquet(output_parquet)
 
     return
 

diff --git a/workflow/bgcflow/bgcflow/misc/summarize_gbk_txt.py b/workflow/bgcflow/bgcflow/misc/summarize_gbk_txt.py
@@ -0,0 +1,98 @@
+import logging
+import sys
+from pathlib import Path
+
+from Bio import SeqIO
+
+# Configure the root logger: INFO and above, timestamped and level-tagged
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
+)
+
+
+def extract_genbank_info(gb_file):
+    """
+    Extract per-record feature counts from a GenBank file.
+
+    Parameters:
+    gb_file (str): The path to the GenBank file.
+
+    Returns:
+    dict: Maps each record's 0-based index to a dict with "organism" (the source
+        feature's strain qualifier, else its organism qualifier, else
+        "unknown"), "bases" (record length) and the number of CDS, rRNA,
+        repeat_region, tRNA and tmRNA features.
+
+    Raises:
+    AssertionError: If a record does not have exactly one source feature.
+    """
+    from collections import Counter
+
+    logging.info(f"Extracting information from {gb_file}")
+    counted_types = ("CDS", "rRNA", "repeat_region", "tRNA", "tmRNA")
+    info_dict_all = {}
+    for num, record in enumerate(SeqIO.parse(gb_file, "genbank")):
+        # One pass over the features instead of one list scan per feature type
+        type_counts = Counter(feat.type for feat in record.features)
+        source = [feat for feat in record.features if feat.type == "source"]
+        assert (
+            len(source) == 1
+        ), f"Expected 1 source feature in the record, found {len(source)}"
+        qualifiers = source[0].qualifiers
+        # Not every GenBank source carries /strain; do not crash with KeyError
+        label = qualifiers.get("strain") or qualifiers.get("organism") or ["unknown"]
+        info_dict = {"organism": label[0], "bases": len(record)}
+        info_dict.update({kind: type_counts[kind] for kind in counted_types})
+        info_dict_all[num] = info_dict
+    return info_dict_all
+
+
+def summarize_and_write(info_dict_all, output_file):
+    """
+    Sum the per-record counts and write them as "key: value" lines.
+
+    Parameters:
+    info_dict_all (dict): Per-record info dicts keyed by 0-based record index.
+    output_file (str): The path to the output text file.
+
+    Returns:
+    None
+    """
+    logging.info(f"Summarizing information and writing to {output_file}")
+    if not info_dict_all:
+        # An empty GenBank file yields no records; report zeros, not KeyError
+        logging.warning("No records found, writing an empty summary")
+    records = list(info_dict_all.values())
+    summary = {
+        # The organism label is taken from the first record only
+        "organism": info_dict_all[0]["organism"] if info_dict_all else "unknown",
+        "contigs": len(records),
+    }
+    for key in ("bases", "CDS", "rRNA", "repeat_region", "tRNA", "tmRNA"):
+        summary[key] = sum(info[key] for info in records)
+
+    with open(output_file, "w") as f:
+        f.writelines(f"{key}: {value}\n" for key, value in summary.items())
+
+
+def summarize_genbank(gb_file, output_file):
+    """
+    Parse a GenBank file and write its aggregated feature summary as text.
+
+    Parameters:
+    gb_file (str): The path to the GenBank file.
+    output_file (str): Output text file path; parent directories are created.
+
+    Returns:
+    None
+    """
+    logging.info(f"Starting to summarize GenBank file {gb_file}")
+    summary_path = Path(output_file)
+    summary_path.parent.mkdir(parents=True, exist_ok=True)
+    records_info = extract_genbank_info(gb_file)
+    logging.info(f"Writing summary to {summary_path}")
+    summarize_and_write(records_info, summary_path)
+
+
+if __name__ == "__main__":  # usage: summarize_gbk_txt.py <genbank> <summary.txt>
+    summarize_genbank(gb_file=sys.argv[1], output_file=sys.argv[2])
diff --git a/workflow/envs/convert_genbank.yaml b/workflow/envs/convert_genbank.yaml
@@ -5,7 +5,7 @@ channels:
   - defaults
 dependencies:
   - any2fasta==0.4.2
-  - biopython
+  - biopython>=1.80
   - pip
   - pip:
-    - bcbio-gff==0.7.0
+    - bcbio-gff==0.7.1
diff --git a/workflow/rules.yaml b/workflow/rules.yaml
@@ -137,7 +137,7 @@ gtdbtk:
   - Parks DH, et al. 2018. A standardized bacterial taxonomy based on genome phylogeny
     substantially revises the tree of life. Nature Biotechnology, [http://dx.doi.org/10.1038/nbt.4229](http://dx.doi.org/10.1038/nbt.4229).
 prokka-gbk:
-  final_output: data/processed/{name}/genbank/{strains}.gbk
+  final_output: "data/processed/{name}/genbank/{strains}.txt"
   description: Copy annotated genbank results.
   category: Functional Annotation
   link:

diff --git a/workflow/rules/automlst_wrapper.smk b/workflow/rules/automlst_wrapper.smk
@@ -62,6 +62,9 @@ rule automlst_wrapper_out:
         organism_info=lambda wildcards: expand("data/interim/prokka/{strains}/organism_info.txt",
                     strains=[s for s in list(PEP_PROJECTS[wildcards.name].sample_table.index)],
         ),
+        gtdb=lambda wildcards: expand("data/interim/gtdb/{strains}.json",
+            name=wildcards.name,
+            strains=[s for s in PEP_PROJECTS[wildcards.name].sample_table.genome_id.unique()])
     output:
         automlst_processed=directory("data/processed/{name}/automlst_wrapper/"),
         final_tree="data/processed/{name}/automlst_wrapper/final.newick",

diff --git a/workflow/rules/bgc_selection.smk b/workflow/rules/bgc_selection.smk
@@ -20,15 +20,13 @@ def get_bgc_inputs(pep_object, antismash_version):
             df.rename(columns={'gbk_path': 'input_file'}, inplace=True)
         assert 'input_file' in df.columns
         custom_path = df.loc[i, "input_file"]
-        #print(custom_path, type(custom_path), custom_path != None, file=sys.stderr)
 
         if custom_path != None:
-            gbk_path = input_path / custom_path
+            gbk_path = Path(input_path) / custom_path
         elif 'input_folder' in pep_object.config.keys():
             gbk_path = Path(input_path / f"{bgc_id}.gbk")
         else:
             gbk_path = antismash_path / genome_id / f"{bgc_id}.gbk"
-        #print(bgc_id, gbk_path, file=sys.stderr)
         gbk_list.append(gbk_path)
     return gbk_list
 
@@ -64,7 +62,7 @@ rule downstream_bgc_prep_selection:
             echo "Previous dataset detected, appending dataset information for {wildcards.name}..."
             sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
         else
-            echo "No previous dataset detected, generating dataset information for {wildcards.name}..."
+            echo "No previous dataset detected, generating dataset information for {wildcards.name}..." 2>> {log}
             echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} 2>> {log}
             sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log}
         fi