diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index c3716c92..81c5536b 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -46,6 +46,42 @@ jobs:
stagein: "conda config --set channel_priority strict"
args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache -n"
+ dry-run-report:
+ runs-on: ubuntu-latest
+ needs:
+ - linting
+ - formatting
+ steps:
+ - name: Checkout repository and submodules
+ uses: actions/checkout@v4
+ with:
+ submodules: recursive
+ - name: Dry-run workflow
+ uses: snakemake/snakemake-github-action@v1.24.0
+ with:
+ directory: .tests
+ snakefile: workflow/Report
+ stagein: "conda config --set channel_priority strict"
+ args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache -n"
+
+ dry-run-metabase:
+ runs-on: ubuntu-latest
+ needs:
+ - linting
+ - formatting
+ steps:
+ - name: Checkout repository and submodules
+ uses: actions/checkout@v4
+ with:
+ submodules: recursive
+ - name: Dry-run workflow
+ uses: snakemake/snakemake-github-action@v1.24.0
+ with:
+ directory: .tests
+ snakefile: workflow/Metabase
+ stagein: "conda config --set channel_priority strict"
+ args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache -n"
+
unit-test:
runs-on: ubuntu-latest
needs:
diff --git a/README.md b/README.md
index a47ad4c8..894a5f13 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
[![Snakemake](https://img.shields.io/badge/snakemake-≥7.14.0-brightgreen.svg)](https://snakemake.bitbucket.io)
[![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](https://pep.databio.org)
[![wiki](https://img.shields.io/badge/wiki-documentation-forestgreen)](https://github.com/NBChub/bgcflow/wiki)
+[![bgcflow-wrapper](https://img.shields.io/badge/CLI-BGCFlow_wrapper-orange)](https://github.com/NBChub/bgcflow_wrapper)
+[![example report](https://img.shields.io/badge/demo-report-blue)](https://nbchub.github.io/bgcflow_report_demo/)
`BGCFlow` is a systematic workflow for the analysis of biosynthetic gene clusters across large collections of genomes (pangenomes) from internal & public datasets.
@@ -15,13 +17,14 @@ At present, `BGCFlow` is only tested and confirmed to work on **Linux** systems
Please use the latest version of `BGCFlow` available.
## Quick Start
-A quick and easy way to use `BGCFlow` using [`bgcflow_wrapper`](https://github.com/NBChub/bgcflow_wrapper).
+A quick and easy way to use `BGCFlow` using the command line interface wrapper:
+[![bgcflow-wrapper](https://img.shields.io/badge/CLI-BGCFlow_wrapper-orange)](https://github.com/NBChub/bgcflow_wrapper)
1. Create a conda environment and install the [`BGCFlow` python wrapper](https://github.com/NBChub/bgcflow_wrapper) :
```bash
# create and activate a new conda environment
-conda create -n bgcflow -c conda-forge python=3.11 pip openjdk -y
+conda create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase
conda activate bgcflow
# install `BGCFlow` wrapper
@@ -37,10 +40,6 @@ With the environment activated, install or setup this configurations:
```bash
conda config --set channel_priority disabled
conda config --describe channel_priority
-```
- - Java (required to run `metabase`)
-```bash
-conda install openjdk
```
3. Deploy and run BGCFlow, change `your_bgcflow_directory` variable accordingly:
@@ -52,7 +51,10 @@ bgcflow init # initiate `BGCFlow` config and examples from template
bgcflow run -n # do a dry run, remove the flag "-n" to run the example dataset
```
-4. Build and serve interactive report (after `bgcflow run` finished). The report will be served in [http://localhost:8001/](http://localhost:8001/):
+4. Build and serve interactive report (after `bgcflow run` finished). The report will be served in [http://localhost:8001/](http://localhost:8001/). A demo of the report is available here:
+
+ [![example report](https://img.shields.io/badge/demo-report-blue)](https://nbchub.github.io/bgcflow_report_demo/)
+
```bash
# build a report
bgcflow build report
@@ -65,7 +67,8 @@ bgcflow serve --project Lactobacillus_delbrueckii
```
-- For detailed usage and configurations, have a look at the [`BGCFlow` WIKI](https://github.com/NBChub/bgcflow/wiki/) (`under development`) :warning:
+- For detailed usage and configurations, have a look at the WIKI:
+[![wiki](https://img.shields.io/badge/wiki-documentation-forestgreen)](https://github.com/NBChub/bgcflow/wiki)
- Read more about [`bgcflow_wrapper`](https://github.com/NBChub/bgcflow_wrapper) for a detailed overview of the command line interface.
[![asciicast](https://asciinema.org/a/595149.svg)](https://asciinema.org/a/595149)
diff --git a/workflow/bgcflow/bgcflow/data/arts_extract_all.py b/workflow/bgcflow/bgcflow/data/arts_extract_all.py
index 46cf5ad6..ecbb442e 100644
--- a/workflow/bgcflow/bgcflow/data/arts_extract_all.py
+++ b/workflow/bgcflow/bgcflow/data/arts_extract_all.py
@@ -430,6 +430,7 @@ def extract_arts_knownhits(arts_knownhits_tsv, outfile_hits, genome_id=None):
pandas.DataFrame
A DataFrame representation of the extracted Known Resistance Hits information,
with each row representing a hit and columns representing hit attributes.
+ If the input DataFrame is empty, an empty DataFrame is returned.
Example
-------
@@ -465,7 +466,6 @@ def extract_arts_knownhits(arts_knownhits_tsv, outfile_hits, genome_id=None):
] = f"{str(genome_id)}__{str(model)}__{str(scaffold)}__{str(sequence_id)}"
arts_knownhits = (
arts_knownhits.drop(columns=["Sequence description"])
- .set_index("index")
.rename(
columns={
"#Model": "model",
@@ -474,6 +474,10 @@ def extract_arts_knownhits(arts_knownhits_tsv, outfile_hits, genome_id=None):
}
)
)
+ if arts_knownhits.empty:
+ logging.warning("ARTS Known Resistance Hits table is empty.")
+ else:
+ arts_knownhits = arts_knownhits.set_index("index")
arts_knownhits_dict = arts_knownhits.T.to_dict()
outfile_hits.parent.mkdir(parents=True, exist_ok=True)
logging.info(f"Writing Known Resistance Hits JSON to: {outfile_hits}")
diff --git a/workflow/bgcflow/bgcflow/data/get_dependencies.py b/workflow/bgcflow/bgcflow/data/get_dependencies.py
index 8648ec09..97f4e6ca 100644
--- a/workflow/bgcflow/bgcflow/data/get_dependencies.py
+++ b/workflow/bgcflow/bgcflow/data/get_dependencies.py
@@ -1,24 +1,36 @@
import json
+import logging
import sys
+log_format = "%(levelname)-8s %(asctime)s %(message)s"
+date_format = "%d/%m %H:%M:%S"
+logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG)
+
import yaml
# list of the main dependecies used in the workflow
dependencies = {
"antismash": r"workflow/envs/antismash.yaml",
+ "bigslice": r"workflow/envs/bigslice.yaml",
+ "cblaster": r"workflow/envs/cblaster.yaml",
"prokka": r"workflow/envs/prokka.yaml",
- "mlst": r"workflow/envs/mlst.yaml",
"eggnog-mapper": r"workflow/envs/eggnog.yaml",
"roary": r"workflow/envs/roary.yaml",
- "refseq_masher": r"workflow/envs/refseq_masher.yaml",
"seqfu": r"workflow/envs/seqfu.yaml",
+ "checkm": r"workflow/envs/checkm.yaml",
+ "gtdbtk": r"workflow/envs/gtdbtk.yaml",
}
-def get_dependency_version(dep, dep_key):
+def get_dependency_version(dep, dep_key, antismash_version="7"):
"""
return dependency version tags given a dictionary (dep) and its key (dep_key)
"""
+ if dep_key == "antismash":
+ logging.info(f"AntiSMASH version is: {antismash_version}")
+ if antismash_version == "6":
+ dep[dep_key] = "workflow/envs/antismash_v6.yaml"
+ logging.info(f"Getting software version for: {dep_key}")
with open(dep[dep_key]) as file:
result = []
documents = yaml.full_load(file)
@@ -38,14 +50,14 @@ def get_dependency_version(dep, dep_key):
return str(result)
-def write_dependecies_to_json(outfile, dep=dependencies):
+def write_dependecies_to_json(outfile, antismash_version, dep=dependencies):
"""
write dependency version to a json file
"""
with open(outfile, "w") as file:
dv = {}
for ky in dep.keys():
- vr = get_dependency_version(dep, ky)
+ vr = get_dependency_version(dep, ky, antismash_version=antismash_version)
dv[ky] = vr
json.dump(
dv,
@@ -57,4 +69,4 @@ def write_dependecies_to_json(outfile, dep=dependencies):
if __name__ == "__main__":
- write_dependecies_to_json(sys.argv[1])
+ write_dependecies_to_json(sys.argv[1], sys.argv[2])
diff --git a/workflow/envs/bgcflow_notes.yaml b/workflow/envs/bgcflow_notes.yaml
index d4c19a3f..b63b10b5 100644
--- a/workflow/envs/bgcflow_notes.yaml
+++ b/workflow/envs/bgcflow_notes.yaml
@@ -4,6 +4,7 @@ channels:
- bioconda
- defaults
dependencies:
+ - python>=3.9,<3.12
- jupyterlab>3
- ipywidgets>=7.6
- jupyter-dash
@@ -13,13 +14,17 @@ dependencies:
- python-kaleido
- bokeh
- openpyxl
- - nbconvert
+ - nbconvert==6.4.2
- ipykernel
- - ete3
- xlrd >= 1.0.0
+ - altair
+ - itables
+ - scikit-learn
+ - pyarrow
+ - pygraphviz
- pip
- pip:
- biopython
+ - jupyterlab-dash
- networkx
- alive_progress
- - ../bgcflow
diff --git a/workflow/envs/bigslice.yaml b/workflow/envs/bigslice.yaml
index b0b04c5a..99579f65 100644
--- a/workflow/envs/bigslice.yaml
+++ b/workflow/envs/bigslice.yaml
@@ -41,4 +41,4 @@ dependencies:
- scipy==1.9.3
- six==1.16.0
- threadpoolctl==3.1.0
- - git+https://github.com/medema-group/bigslice.git@103d8f2
+ - git+https://github.com/NBChub/bigslice.git@c0085de
diff --git a/workflow/envs/dbt-duckdb.yaml b/workflow/envs/dbt-duckdb.yaml
index f5c1340e..4544edc3 100644
--- a/workflow/envs/dbt-duckdb.yaml
+++ b/workflow/envs/dbt-duckdb.yaml
@@ -1,9 +1,13 @@
name: dbt-duckdb
channels:
- conda-forge
+ - bioconda
- defaults
dependencies:
+ - python==3.11
+ - python-duckdb==0.8.1
+ - unzip
- pip
- pip:
- dbt-duckdb==1.6.0
- - dbt-metabase==0.9.15
+ - dbt-metabase==0.9.15
\ No newline at end of file
diff --git a/workflow/notebook/bigslice.py.ipynb b/workflow/notebook/bigslice.py.ipynb
index 62e9b44d..e3c17e61 100644
--- a/workflow/notebook/bigslice.py.ipynb
+++ b/workflow/notebook/bigslice.py.ipynb
@@ -21,6 +21,8 @@
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
+ "import shutil, json\n",
+ "from IPython.display import display, Markdown\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
@@ -33,7 +35,57 @@
"metadata": {},
"outputs": [],
"source": [
- "report_dir = Path(\"../\")"
+ "report_dir = Path(\"../\")\n",
+ "bgcflow_dir = report_dir / (\"../../../\")\n",
+ "envs = bgcflow_dir / \"workflow/envs/bigslice.yaml\"\n",
+ "\n",
+ "metadata = report_dir / \"metadata/dependency_versions.json\"\n",
+ "with open(metadata, \"r\") as f:\n",
+ " dependency_version = json.load(f)\n",
+ "\n",
+ "# Define the destination path\n",
+ "destination_path = Path(\"assets/envs/bigslice.yaml\")\n",
+ "\n",
+ "# Ensure the destination directory exists\n",
+ "destination_path.parent.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "# Copy the file\n",
+ "shutil.copy(envs, destination_path);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43ca282f",
+ "metadata": {},
+ "source": [
+ "## Usage\n",
+ "\n",
+ "You can start the BiG-SLICE flask app to view the clustering result.\n",
+ "\n",
+ "Steps:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed378d7c-7d1a-4715-a06d-8850c1344387",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "run_bigslice=f\"\"\"- Install the conda environment:\n",
+ "\n",
+ "```bash\n",
+ "  conda env create -f {report_dir.resolve()}/docs/assets/envs/bigslice.yaml\n",
+ "```\n",
+ "\n",
+ "- Run the app\n",
+ "\n",
+ "```bash\n",
+ " port='5001'\n",
+ " conda run -n bigslice bash {report_dir.resolve()}/cluster_as_{dependency_version[\"antismash\"]}/start_server.sh $port\n",
+ "```\n",
+ "\"\"\"\n",
+ "display(Markdown(run_bigslice))"
]
},
{
@@ -69,7 +121,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/workflow/notebook/deeptfactor.py.ipynb b/workflow/notebook/deeptfactor.py.ipynb
index d3030026..99bbe250 100644
--- a/workflow/notebook/deeptfactor.py.ipynb
+++ b/workflow/notebook/deeptfactor.py.ipynb
@@ -26,6 +26,7 @@
"from IPython.display import display, Markdown, HTML\n",
"import itables.options as opt\n",
"from itables import to_html_datatable as DT\n",
+ "import plotly.graph_objects as go\n",
"opt.css = \"\"\"\n",
".itables table td { font-style: italic; font-size: .8em;}\n",
".itables table th { font-style: oblique; font-size: .8em; }\n",
@@ -88,7 +89,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df_deeptf = pd.merge(df.reset_index().drop(columns='genome_id'), df_aa.reset_index(), on=\"locus_tag\", how=\"outer\")"
+ "df_deeptf = pd.merge(df.reset_index().drop(columns='genome_id'), df_aa.reset_index(), on=\"locus_tag\", how=\"outer\")\n",
+ "df_deeptf.deeptfactor_prediction = df_deeptf.deeptfactor_prediction.fillna(False)\n",
+ "df_deeptf = df_deeptf.fillna(0)"
]
},
{
@@ -101,6 +104,92 @@
"display(HTML(DT(df_deeptf, columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\", \"searchable\": True}], maxColumns=df_deeptf.shape[1], maxBytes=0)))"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "864665c9-ea6c-4e24-914f-889d7e5563ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_sunburst_plot(data, outfile_sunburst, outfile_barh):\n",
+ " # Calculate the necessary information\n",
+ " total_annotated = data[\"deeptfactor_prediction\"].sum()\n",
+ " hypothetical_count = data[data[\"deeptfactor_prediction\"] & data[\"annotation\"].str.contains(\"hypothetical\", case=False)].shape[0]\n",
+ " non_annotated_hypothetical_count = data[~data[\"deeptfactor_prediction\"] & data[\"annotation\"].str.contains(\"hypothetical\", case=False)].shape[0]\n",
+ " non_hypothetical_annotated = data[data[\"deeptfactor_prediction\"] & ~data[\"annotation\"].str.contains(\"hypothetical\", case=False)]\n",
+ " annotation_counts = non_hypothetical_annotated[\"annotation\"].value_counts()\n",
+ "\n",
+ " # Sunburst plot data\n",
+ " labels = [\"Locus Tags\", \"Annotated by deepTF\", \"Not Annotated\", \n",
+ " \"Hypothetical Proteins\", \"Other Proteins\", \n",
+ " \"Not Annotated - Hypothetical\", \"Not Annotated - Other\"]\n",
+ " parents = [\"\", \"Locus Tags\", \"Locus Tags\", \n",
+ " \"Annotated by deepTF\", \"Annotated by deepTF\", \n",
+ " \"Not Annotated\", \"Not Annotated\"]\n",
+ " values = [len(data), total_annotated, len(data) - total_annotated, \n",
+ " hypothetical_count, total_annotated - hypothetical_count, \n",
+ " non_annotated_hypothetical_count, len(data) - total_annotated - non_annotated_hypothetical_count]\n",
+ "\n",
+ " # Create the sunburst plot\n",
+ " fig1 = go.Figure(go.Sunburst(\n",
+ " labels=labels,\n",
+ " parents=parents,\n",
+ " values=values,\n",
+ " maxdepth=2,\n",
+ " marker=dict(colors=['#f5f5f5', '#66b2ff', '#ff9999', '#ffcccc', '#99ccff', '#ffdddd', '#aaddff'])\n",
+ " ))\n",
+ " fig1.update_layout(height=800, title=\"Sunburst Plot of Locus Tags and Protein Types\")\n",
+ " fig1.write_html(outfile_sunburst)\n",
+ "\n",
+ " # Create the horizontal bar chart for annotations\n",
+ " fig2 = go.Figure(go.Bar(\n",
+ " y=annotation_counts.index,\n",
+ " x=annotation_counts.values,\n",
+ " marker_color='#66b2ff',\n",
+ " orientation='h'\n",
+ " ))\n",
+ " fig2.update_layout(title=\"Distribution of Annotations for Non-Hypothetical, deepTF-Annotated Entries\", \n",
+ " yaxis_title=\"Annotation\", \n",
+ " xaxis_title=\"Count\",\n",
+ " height=800)\n",
+ " fig2.write_html(outfile_barh)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bb8b4144-7336-49fc-8f30-cbe837b47de1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "outfile1 = Path(f\"assets/figures/deeptf_sunburst.html\")\n",
+ "outfile1.parent.mkdir(parents=True, exist_ok=True)\n",
+ "\n",
+ "outfile2 = Path(f\"assets/figures/deeptf_barh.html\")\n",
+ "outfile2.parent.mkdir(parents=True, exist_ok=True)\n",
+ "generate_sunburst_plot(df_deeptf, outfile1, outfile2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "392407dc-77bf-4e07-8e93-ae3114d04af9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(HTML(filename=str(outfile1)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f7658b2-639e-4553-98ee-c873fc116e30",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(HTML(filename=str(outfile2)))"
+ ]
+ },
{
"cell_type": "markdown",
"id": "4d8fe1ca-30f9-472a-9e01-4e9db61825b8",
diff --git a/workflow/notebook/fastani.py.ipynb b/workflow/notebook/fastani.py.ipynb
index 980ec241..0705abd6 100644
--- a/workflow/notebook/fastani.py.ipynb
+++ b/workflow/notebook/fastani.py.ipynb
@@ -92,7 +92,11 @@
"outputs": [],
"source": [
"df_fastani = pd.read_csv(report_dir / 'fastani/df_fastani.csv', index_col=0)\n",
- "df_gtdb = pd.read_csv(report_dir / 'tables' / 'df_gtdb_meta.csv', index_col='genome_id')"
+ "df_gtdb = pd.read_csv(report_dir / 'tables' / 'df_gtdb_meta.csv', index_col='genome_id')\n",
+ "\n",
+ "# correct table\n",
+ "for i in df_fastani.index:\n",
+ " df_fastani.loc[i, i] = 100"
]
},
{
@@ -112,15 +116,18 @@
"source": [
"df_fastani_corr = df_fastani.fillna(0).corr()\n",
"\n",
- "plt.figure(figsize=(30, 7))\n",
- "plt.title(\"FastANI Similarity\")\n",
+ "plt.figure(figsize=(30, 10))\n",
+ "#plt.title(\"FastANI Similarity\")\n",
"\n",
"selected_data = df_fastani_corr.copy()\n",
"clusters = shc.linkage(selected_data, \n",
" method='ward', \n",
" metric=\"euclidean\",\n",
" optimal_ordering=True,)\n",
- "shc.dendrogram(Z=clusters, labels=df_fastani_corr.index, leaf_rotation=90)\n",
+ "shc.dendrogram(Z=clusters, labels=df_fastani_corr.index, orientation='left') # Set orientation to 'left'\n",
+ "\n",
+ "plt.yticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n",
+ "plt.xticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n",
"plt.show()"
]
},
@@ -213,12 +220,20 @@
" \n",
"comm_colors = df_hclusts['color_code']\n",
"plt.figure()\n",
+ "\n",
"# sns.set_theme(color_codes=True)\n",
"g = sns.clustermap(df_fastani,\n",
- " figsize=(50,50), row_linkage=clusters, col_linkage=clusters,\n",
- " row_colors=comm_colors, col_colors=comm_colors)\n",
- "g.ax_heatmap.set_xlabel('Genomes')\n",
- "g.ax_heatmap.set_ylabel('Genomes')\n",
+ " figsize=(20,20), row_linkage=clusters, col_linkage=clusters,\n",
+ " row_colors=comm_colors, col_colors=comm_colors, cmap=\"rocket_r\")\n",
+ "g.ax_heatmap.set_xlabel('Genomes', fontsize=18)\n",
+ "g.ax_heatmap.set_ylabel('Genomes', fontsize=18)\n",
+ "plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=12) # set rotation and font size for x-axis labels\n",
+ "plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize=12) # set font size for y-axis labels\n",
+ "\n",
+ "# Adjust font size of the colorbar's tick labels\n",
+ "cbar = g.cax\n",
+ "cbar.set_yticklabels(cbar.get_yticklabels(), fontsize=16)\n",
+ "\n",
"plt.show()"
]
},
@@ -255,7 +270,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/workflow/notebook/mash.py.ipynb b/workflow/notebook/mash.py.ipynb
index 5807d138..914211b5 100644
--- a/workflow/notebook/mash.py.ipynb
+++ b/workflow/notebook/mash.py.ipynb
@@ -116,15 +116,18 @@
"source": [
"df_mash_corr = df_mash.fillna(0).corr()\n",
"\n",
- "plt.figure(figsize=(30, 7))\n",
- "plt.title(\"MASH Distances\")\n",
+ "plt.figure(figsize=(30, 10))\n",
+ "#plt.title(\"MASH Distances\", fontsize=20) # You can adjust the title font size here\n",
"\n",
"selected_data = df_mash_corr.copy()\n",
"clusters = shc.linkage(selected_data, \n",
" method='ward', \n",
" metric=\"euclidean\",\n",
- " optimal_ordering=True,)\n",
- "shc.dendrogram(Z=clusters, labels=df_mash_corr.index, leaf_rotation=90)\n",
+ " optimal_ordering=True)\n",
+ "shc.dendrogram(Z=clusters, labels=df_mash_corr.index, orientation='left') # Set orientation to 'left'\n",
+ "\n",
+ "plt.yticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n",
+ "plt.xticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n",
"plt.show()"
]
},
@@ -217,12 +220,21 @@
" \n",
"comm_colors = df_hclusts['color_code']\n",
"plt.figure()\n",
+ "\n",
+ "\n",
"# sns.set_theme(color_codes=True)\n",
- "g = sns.clustermap(df_mash,\n",
- " figsize=(50,50), row_linkage=clusters, col_linkage=clusters,\n",
- " row_colors=comm_colors, col_colors=comm_colors)\n",
- "g.ax_heatmap.set_xlabel('Genomes')\n",
- "g.ax_heatmap.set_ylabel('Genomes')\n",
+ "g = sns.clustermap((1 - df_mash)*100,\n",
+ " figsize=(20,20), row_linkage=clusters, col_linkage=clusters,\n",
+ " row_colors=comm_colors, col_colors=comm_colors, cmap=\"rocket_r\")\n",
+ "g.ax_heatmap.set_xlabel('Genomes', fontsize=18)\n",
+ "g.ax_heatmap.set_ylabel('Genomes', fontsize=18)\n",
+ "plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=12) # set rotation and font size for x-axis labels\n",
+ "plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize=12) # set font size for y-axis labels\n",
+ "\n",
+ "# Adjust font size of the colorbar's tick labels\n",
+ "cbar = g.cax\n",
+ "cbar.set_yticklabels(cbar.get_yticklabels(), fontsize=16)\n",
+ "\n",
"plt.show()"
]
},
@@ -256,7 +268,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.0"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/workflow/notebook/prokka-gbk.py.ipynb b/workflow/notebook/prokka-gbk.py.ipynb
index 94c28d8d..5301343f 100644
--- a/workflow/notebook/prokka-gbk.py.ipynb
+++ b/workflow/notebook/prokka-gbk.py.ipynb
@@ -23,7 +23,8 @@
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
- "\n",
+ "import altair as alt\n",
+ "import json\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
@@ -47,7 +48,11 @@
},
"outputs": [],
"source": [
- "report_dir = Path(\"../\")"
+ "report_dir = Path(\"../\")\n",
+ "metadata = report_dir / \"metadata/project_metadata.json\"\n",
+ "with open(metadata, \"r\") as f:\n",
+ " metadata = json.load(f)\n",
+ "project_name = [i for i in metadata.keys()][0]"
]
},
{
@@ -81,6 +86,46 @@
"display(HTML(DT(df, columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\"}],)))"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "7645aa83",
+ "metadata": {},
+ "source": [
+ "## Summary Statistics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "68ba254c-ab05-45ac-aa81-8c0c05e49f1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "source = df.copy()\n",
+ "source[\"dataset\"] = project_name\n",
+ "\n",
+ "# Create a list of charts, one for each column to plot\n",
+ "colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854', '#ffd92f', '#e5c494']\n",
+ "\n",
+ "charts = []\n",
+ "\n",
+ "for idx, col in enumerate(['contigs', 'bases', 'CDS', 'rRNA', 'repeat_region', 'tRNA', 'tmRNA']):\n",
+ " data_range = source[col].max() - source[col].min()\n",
+ " buffer = 0.10 * data_range # 10% buffer\n",
+ " ymin = source[col].min() - buffer\n",
+ " ymax = source[col].max() + buffer\n",
+ " chart = alt.Chart(source).mark_boxplot(size=50, median=dict(color='black')).encode(\n",
+ " y=alt.Y(f'{col}:Q', title=None, scale=alt.Scale(domain=(ymin, ymax))),\n",
+ " x=alt.X(\"dataset:N\", axis=None), # This is used to align the boxplots vertically\n",
+ " color=alt.value(colors[idx]), # Color of the boxplot\n",
+ " opacity=alt.value(0.7) # Opacity of the boxplot\n",
+ " ).properties(title=f'{col}', width=100, height=150)\n",
+ " \n",
+ " charts.append(chart)\n",
+ "\n",
+ "alt.hconcat(*charts)"
+ ]
+ },
{
"cell_type": "markdown",
"id": "4d8fe1ca-30f9-472a-9e01-4e9db61825b8",
@@ -114,7 +159,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.16"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/workflow/notebook/query-bigslice.py.ipynb b/workflow/notebook/query-bigslice.py.ipynb
index 9bf016ee..f9c450ba 100644
--- a/workflow/notebook/query-bigslice.py.ipynb
+++ b/workflow/notebook/query-bigslice.py.ipynb
@@ -21,9 +21,133 @@
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
+ "import sqlite3\n",
+ "from IPython.display import display, Markdown, HTML\n",
+ "\n",
+ "import json, yaml\n",
+ "import altair as alt\n",
+ "import ast\n",
"\n",
"import warnings\n",
- "warnings.filterwarnings('ignore')"
+ "warnings.filterwarnings('ignore')\n",
+ "\n",
+ "import networkx as nx\n",
+ "import plotly.graph_objects as go\n",
+ "import altair as alt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5dd35e5b-bca8-4c8a-9187-ddfd61e1d957",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_bigfam_network(df, rank=0):\n",
+ " \"\"\"\n",
+ " Build a networkx graph from bigscape df network\n",
+ " \"\"\"\n",
+ " df[df['rank'] == rank]\n",
+ " G = nx.from_pandas_edgelist(df, source='bgc_id', target='gcf_id', edge_attr=['membership_value',\n",
+ " 'rank'], edge_key='bigfam_edge_id')\n",
+ " return G"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b1a957a4-f659-436d-9497-bf5933572124",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def annotate_bigfam_models(G, df, columns = ['bgc_member', 'chemical_class_hits', 'top_chemical_class',\n",
+ " 'top_chemical_class_proportion', 'top_chemical_subclass',\n",
+ " 'top_chemical_subclass_proportion', 'taxonomic_hits', \n",
+ " 'taxonomic_level', 'H-index', 'richness', 'top_taxa', \n",
+ " 'top_taxa_proportion']):\n",
+ " for bgc in G.nodes:\n",
+ " if bgc in df.index:\n",
+ " G.nodes[bgc]['node_type'] = \"BiG-FAM GCFs\"\n",
+ "            G.nodes[bgc]['text'] = f\"GCF {bgc}<br> size: {df.loc[bgc, 'bgc_member']}<br> top_chemical_class: {df.loc[bgc, 'top_chemical_class']} ({df.loc[bgc, 'top_chemical_class_proportion']:.0%})\\\n",
+ "<br> top_chemical_subclass: {df.loc[bgc, 'top_chemical_subclass']} ({df.loc[bgc, 'top_chemical_subclass_proportion']:.0%})\\\n",
+ "<br> top_taxa: {df.loc[bgc, 'top_taxa']} ({df.loc[bgc, 'top_taxa_proportion']:.0%})\"\n",
+ " for c in columns:\n",
+ " G.nodes[bgc][c] = df.loc[bgc, c]\n",
+ " return G"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dd034c15-182e-4d65-8266-3392a2a0ec60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def annotate_bigfam_antismash(G, antismash_region_path, columns = [\"genome_id\", \"product\", \"contig_edge\", \"region_length\", \"most_similar_known_cluster_id\", \"most_similar_known_cluster_description\", \"most_similar_known_cluster_type\", \"similarity\"]):\n",
+ " df_antismash = pd.read_csv(antismash_region_path, index_col=0)\n",
+ " for bgc in G.nodes:\n",
+ " if bgc in df_antismash.index:\n",
+ " G.nodes[bgc]['node_type'] = \"BGC\"\n",
+ " G.nodes[bgc]['text'] = f\"{bgc}<br> {df_antismash.loc[bgc, 'product']}\"\n",
+ " if 'most_similar_known_cluster_description' in df_antismash.columns:\n",
+ " G.nodes[bgc]['text'] = G.nodes[bgc]['text'] + f\"<br> {df_antismash.loc[bgc, 'most_similar_known_cluster_description']}\"\n",
+ " for c in df_antismash.columns:\n",
+ " if c in columns:\n",
+ " G.nodes[bgc][c] = df_antismash.loc[bgc, c]\n",
+ " return G"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c8923e90-4526-4146-ba33-6cff84cf2f7c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_edge_trace(G, name):\n",
+ " edge_trace = go.Scatter(\n",
+ " x=[],\n",
+ " y=[],\n",
+ " name=name,\n",
+ " line=dict(width=0.5,color='#888'),\n",
+ " hoverinfo='none',\n",
+ " mode='lines')\n",
+ "\n",
+ " for edge in G.edges():\n",
+ " x0, y0 = G.nodes[edge[0]]['pos']\n",
+ " x1, y1 = G.nodes[edge[1]]['pos']\n",
+ " edge_trace['x'] += tuple([x0, x1, None])\n",
+ " edge_trace['y'] += tuple([y0, y1, None])\n",
+ " return edge_trace\n",
+ "\n",
+ "def create_node_trace(G, node_trace_category, color, showtextlabel=False, nodesize=10, nodeopacity=0.8, nodesymbol=\"circle\", linewidth=1, linecolor=\"black\", textposition=\"top center\"):\n",
+ " if showtextlabel:\n",
+ " markermode = \"markers+text\"\n",
+ " else:\n",
+ " markermode = \"markers\"\n",
+ " node_trace = go.Scatter(\n",
+ " x=[],\n",
+ " y=[],\n",
+ " text=[],\n",
+ " textposition=textposition,\n",
+ " mode=markermode,\n",
+ " hoverinfo='text',\n",
+ " name=node_trace_category,\n",
+ " marker=dict(\n",
+ " symbol=nodesymbol,\n",
+ " opacity=nodeopacity,\n",
+ " showscale=False,\n",
+ " color=color,\n",
+ " size=nodesize,\n",
+ " line=dict(width=linewidth, color=linecolor)))\n",
+ "\n",
+ " for node in G.nodes():\n",
+ " if G.nodes[node][\"node_trace\"] == node_trace_category:\n",
+ " x, y = G.nodes[node]['pos']\n",
+ " node_trace['x'] += tuple([x])\n",
+ " node_trace['y'] += tuple([y])\n",
+ " node_trace['text'] +=tuple([G.nodes[node]['text']])\n",
+ " return node_trace"
]
},
{
@@ -33,7 +157,325 @@
"metadata": {},
"outputs": [],
"source": [
- "report_dir = Path(\"../\")"
+ "report_dir = Path(\"../\")\n",
+ "\n",
+ "with open(report_dir / \"metadata/project_metadata.json\", \"r\") as f:\n",
+ " project_configuration = json.load(f)\n",
+ "with open(report_dir / \"metadata/dependency_versions.json\", \"r\") as f:\n",
+ " dependency_version = json.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a457b21e-12a4-4465-bab1-0a9cf2eb309b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "project_name = [i for i in project_configuration.keys()][0]\n",
+ "antismash_version = dependency_version[\"antismash\"]\n",
+ "\n",
+ "query_dir = report_dir / f\"bigslice/query_as_{antismash_version}/\"\n",
+ "df_bigfam_model = pd.read_csv(query_dir / \"gcf_summary.csv\")\n",
+ "df_bigfam_hits = pd.read_csv(report_dir / f\"bigslice/query_as_{antismash_version}/query_network.csv\")\n",
+ "antismash_region_path = report_dir / f\"tables/df_regions_antismash_{antismash_version}.csv\"\n",
+ "df_annotated_bigfam_model = pd.read_csv(query_dir / \"gcf_annotation.csv\", index_col=0)\n",
+ "df_network = pd.read_csv(query_dir / \"query_network.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0d2b11e1-1cd9-4443-bc77-6d4bbb7da529",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# filtering\n",
+ "#df_network = df_network[df_network[\"rank\"] < 3] # get first degree only\n",
+ "df_annotated_bigfam_model = df_annotated_bigfam_model[df_annotated_bigfam_model[\"bgc_member\"] < 15000] # remove model with really large size\n",
+ "df_network = df_network[df_network[\"gcf_id\"].isin(df_annotated_bigfam_model.index)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45a7732c-6a11-43a9-b2bd-4406db0be290",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_cut = df_annotated_bigfam_model[df_annotated_bigfam_model.top_taxa_proportion <= 0.3]\n",
+ "\n",
+ "G = generate_bigfam_network(df_network)\n",
+ "G = annotate_bigfam_antismash(G, antismash_region_path)\n",
+ "G = annotate_bigfam_models(G, df_annotated_bigfam_model)\n",
+ "G_raw = G.copy()\n",
+ "\n",
+ "# position nodes\n",
+ "pos = nx.nx_agraph.graphviz_layout(G)\n",
+ "for n, p in pos.items():\n",
+ " G.nodes[n]['pos'] = p"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "92ad9896-2043-49b8-9e39-aab12f57f5fa",
+ "metadata": {},
+ "source": [
+ "## Summary Result\n",
+ "### Distribution of the assigned genus from BiG-FAM Model hits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26c9b54f-c1ed-4180-ae32-f9635f958509",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Calculate the number of GCFs with representation across different genera\n",
+ "df_genus_dist = pd.DataFrame(index=df_annotated_bigfam_model.index)\n",
+ "for gcf_id in df_annotated_bigfam_model.index:\n",
+ " taxa = str(df_annotated_bigfam_model.loc[gcf_id, \"taxa_distribution\"])\n",
+ " taxa_dict = ast.literal_eval(taxa)\n",
+ " for genus in taxa_dict.keys():\n",
+ " df_genus_dist.loc[gcf_id, genus] = taxa_dict[genus]\n",
+ "df_genus_dist.fillna(0, inplace=True)\n",
+ "\n",
+ "# Genera represented by most number of GCFs\n",
+ "df_genus_dist_binary = df_genus_dist > 0 \n",
+ "df_genus_dist_binary = df_genus_dist_binary * 1\n",
+ "\n",
+ "# Assuming df_genus_dist_binary.sum().sort_values(ascending=False)[:10] is stored in a variable named 'data'\n",
+ "data = df_genus_dist_binary.sum().sort_values(ascending=False)[:10]\n",
+ "\n",
+ "# Convert Series to DataFrame for Altair plotting\n",
+ "data_df = data.reset_index()\n",
+ "data_df.columns = ['Genus', 'Counts']\n",
+ "\n",
+ "# Create Altair Chart\n",
+ "chart = alt.Chart(data_df).mark_bar().encode(\n",
+ " y=alt.Y('Genus:N', sort='-x', title='Genera'),\n",
+ " x=alt.X('Counts:Q', title='Counts'),\n",
+ " color=alt.Color('Genus:N', legend=None) # Color by Genera, but we don't need a legend here\n",
+ ").properties(\n",
+ " title='Top 10 Genus Distribution',\n",
+ " width=700,\n",
+ " height=300\n",
+ ")\n",
+ "\n",
+ "# Display the chart\n",
+ "chart"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "087dfb47-9c8f-4c61-84d4-2d72b7ffcab4",
+ "metadata": {},
+ "source": [
+ "### Diversity and Member Size of BiG-FAM model hits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "76e25859-6a93-4e5e-897d-f7fe60ba259d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "bgc_hits = len([n for n in G_raw.nodes if G_raw.nodes[n]['node_type'] == 'BGC'])\n",
+ "GCF_hits = len([n for n in G_raw.nodes if G_raw.nodes[n]['node_type'] != 'BGC'])\n",
+ "shannon_non_zero = df_annotated_bigfam_model[df_annotated_bigfam_model['H-index'] != 0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1c50933f-688d-49ca-84c0-18ef7220815d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "shannon_min_non_zero_tax = ast.literal_eval(str(df_annotated_bigfam_model.loc[shannon_non_zero['H-index'].astype(float).idxmin(), \"taxa_distribution\"]))\n",
+ "shannon_min_non_zero_tax = {k : v/sum(shannon_min_non_zero_tax.values()) for k,v in shannon_min_non_zero_tax.items()}\n",
+ "shannon_min_non_zero_tax_clean = [f\"{k} ({v:.1%})\" for k,v in sorted(shannon_min_non_zero_tax.items(), key=lambda x:x[1], reverse=True)]\n",
+ "\n",
+ "def get_top_taxa(df, gcf):\n",
+ " result = f\"{df.loc[gcf, 'top_taxa']} ({df.loc[gcf, 'top_taxa_proportion'] * df.loc[gcf, 'bgc_member']:.0f} genomes)\"\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a5251f9c-5dc9-4f61-a43d-d5e2558353c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mapping = df_bigfam_hits.gcf_id.value_counts()\n",
+ "for gcf in df_annotated_bigfam_model.index:\n",
+ " df_annotated_bigfam_model.loc[gcf, \"dataset_hits\"] = mapping[gcf]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d0d36af-3a0c-4949-9f02-482e2595fa0a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "domain = list(df_annotated_bigfam_model[df_annotated_bigfam_model[\"top_taxa_proportion\"] > 0.1][\"top_taxa\"].value_counts().to_dict().keys())\n",
+ "domain.append(\"Other\")\n",
+ "\n",
+ "r = []\n",
+ "range_ = [\"#264653\", \"#287271\", \"#2a9d8f\", \"#8ab17d\", \"#e9c46a\", \"#f4a261\", \"#ee8959\", \"#e76f51\", \"#edede9\"]\n",
+ "for num, d in enumerate(domain):\n",
+ " if num < len(range_):\n",
+ " r.append(range_[num])\n",
+ " else:\n",
+ " r.append(\"white\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5882dd62-7c22-432d-ac99-6fe2203c3217",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "source = df_annotated_bigfam_model.copy()\n",
+ "source = source.reset_index().rename(columns={\"gcf_id\":\"BiG-FAM_id\"})\n",
+ "for i in source.index:\n",
+ " if source.loc[i, \"top_taxa_proportion\"] <= 0.5:\n",
+ " source.loc[i, \"top_taxa\"] = \"Other\"\n",
+ " \n",
+ "chart_one = alt.Chart(source).mark_point().encode(\n",
+ " alt.Y('H-index:Q',\n",
+ " scale=alt.Scale(domain=(-0.5, 5)),\n",
+ " axis=alt.Axis(title=\"Shannon Index (H')\")\n",
+ " ),\n",
+ " alt.X('bgc_member:Q',\n",
+ " scale=alt.Scale(type=\"log\"),\n",
+ " axis=alt.Axis(title=\"Member Size\")\n",
+ " ),\n",
+ " alt.Size('dataset_hits',\n",
+ " scale=alt.Scale(type='pow'),# domain=(1, 30)), \n",
+ " title=\"Number of hits in dataset\"\n",
+ " ),\n",
+ " alt.Color(\"top_taxa:N\", scale=alt.Scale(domain=domain, range=r), title=\"Top Genus (>=50%)\"),\n",
+ " tooltip=['BiG-FAM_id', 'bgc_member', 'chemical_class_hits', 'top_chemical_class', alt.Tooltip('top_chemical_class_proportion', format='.1%'), \n",
+ " 'top_chemical_subclass', alt.Tooltip('top_chemical_subclass_proportion', format='.1%'),\n",
+ " 'taxonomic_level', 'richness', 'top_taxa', alt.Tooltip('top_taxa_proportion', format='.1%'), \n",
+ " alt.Tooltip('H-index:Q', format='.2f')],\n",
+ ").mark_point(\n",
+ " filled=True,\n",
+ " stroke='black',\n",
+ " strokeWidth=0.5,\n",
+ " opacity=0.8,\n",
+ " size=1000\n",
+ ").configure_header(\n",
+ " title=None,\n",
+ " labels=False\n",
+ ").configure_axis(\n",
+ " labelFontSize=10,\n",
+ " titleFontSize=12\n",
+ ").configure_legend(\n",
+ " labelFontSize=10,\n",
+ " titleFontSize=12,\n",
+ ").configure_view(\n",
+ " continuousHeight=500,\n",
+ " continuousWidth=500,\n",
+ ").configure_legend(\n",
+ " orient='right'\n",
+ ")\n",
+ "\n",
+ "chart_one"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eb27456a-066f-476a-83d1-08bd307f28f8",
+ "metadata": {},
+ "source": [
+ "### Network of BiG-FAM model hits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5a760c64-c7e2-442c-92e3-ef1f1eccda63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "color_category = 'node_type'\n",
+ "colormap = {\"BiG-FAM GCFs\" : \"red\",\n",
+ " \"BGC\" : \"blue\"}\n",
+ "node_trace_category = {}\n",
+ "\n",
+ "for node in G.nodes:\n",
+ " nodeshape = \"circle\"\n",
+ " if node in df_annotated_bigfam_model.index:\n",
+ " G.nodes[node]['node_type'] = \"BiG-FAM GCFs\"\n",
+ " if G.nodes[node]['node_type'] == \"BiG-FAM GCFs\":\n",
+ " nodeshape = \"diamond\"\n",
+ " color = colormap[\"BiG-FAM GCFs\"]\n",
+ " cat = \"BiG-FAM GCFs\"\n",
+ " else: \n",
+ " cat = G.nodes[node][color_category]\n",
+ " color = colormap[cat]\n",
+ " if 'similarity' in G.nodes[node].keys():\n",
+ " if G.nodes[node]['similarity'] > 0.8:\n",
+ " cat = cat + \" \" + \"Known (antiSMASH) > 80% similarity\"\n",
+ " nodeshape = \"circle\"\n",
+ " elif G.nodes[node]['similarity'] > 0.4:\n",
+ " cat = cat + \" \" + \"Known (antiSMASH) > 40% similarity\"\n",
+ " nodeshape = \"triangle-up\"\n",
+ " elif G.nodes[node]['similarity'] < 0.4:\n",
+ " cat = cat + \" \" + \"Unknown (antiSMASH) < 40% similarity\"\n",
+ " nodeshape = \"triangle-down\"\n",
+ " else:\n",
+ " cat = cat + \" \" + \"Unknown\"\n",
+ " nodeshape = \"circle\"\n",
+ " \n",
+ " linecolor = \"black\"\n",
+ " \n",
+ " G.nodes[node]['node_trace'] = cat\n",
+ " node_trace_category[cat] = {\"nodecolor\" : color,\n",
+ " \"nodeshape\" : nodeshape,\n",
+ " \"linecolor\" : linecolor}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bb871352-8602-438d-8404-82e36f2c1afd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "edge_trace = create_edge_trace(G, \"bigslice_similarity\")\n",
+ "traces = [edge_trace]\n",
+ "for cat in node_trace_category.keys():\n",
+ " nodeopacity = 0.8\n",
+ " color = node_trace_category[cat][\"nodecolor\"]\n",
+ " nodeshape = node_trace_category[cat][\"nodeshape\"]\n",
+ " linecolor = node_trace_category[cat][\"linecolor\"]\n",
+ " node_trace = create_node_trace(G, cat, color, nodesymbol=nodeshape, nodeopacity=nodeopacity, nodesize=10, linewidth=1, linecolor=linecolor)\n",
+ " traces.append(node_trace)\n",
+ " \n",
+ "fig2 = go.Figure(data=traces,\n",
+ " layout=go.Layout(\n",
+ " paper_bgcolor='rgba(0,0,0,0)',\n",
+ " plot_bgcolor='rgba(0,0,0,0)',\n",
+ " showlegend=True,\n",
+ " hovermode='closest',\n",
+ " margin=dict(b=20,l=5,r=5,t=40),\n",
+ " xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
+ " yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n",
+ " width=1200, height=800))\n",
+ "\n",
+ "outfile = Path(\"assets/figures/bigfam-query.html\")\n",
+ "outfile.parent.mkdir(parents=True, exist_ok=True)\n",
+ "fig2.write_html(outfile)\n",
+ "\n",
+ "display(HTML(filename=str(outfile)))"
]
},
{
@@ -66,7 +508,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/workflow/notebook/seqfu.py.ipynb b/workflow/notebook/seqfu.py.ipynb
index 4c59a893..79a1b65c 100644
--- a/workflow/notebook/seqfu.py.ipynb
+++ b/workflow/notebook/seqfu.py.ipynb
@@ -53,17 +53,14 @@
"source": [
"source = df\n",
"\n",
- "chart = alt.Chart(source).mark_circle().encode(\n",
- " alt.X(alt.repeat(\"column\"), type='quantitative', scale=alt.Scale(zero=False)),\n",
- " alt.Y(alt.repeat(\"row\"), type='quantitative', scale=alt.Scale(zero=False)),\n",
- " color='Total',\n",
- " tooltip=['genome_id', 'Count', 'Total', 'gc', 'N50', 'N75', 'N90', 'AuN', 'Min', 'Max']\n",
+ "chart = alt.Chart(source).mark_circle(size=150).encode(\n",
+ " alt.X(\"gc\", type='quantitative', title='GC (%)', scale=alt.Scale(zero=False)).axis(format='.2%'),\n",
+ " alt.Y(\"N50\", type='quantitative', scale=alt.Scale(zero=False)),\n",
+ " color=alt.Color('Total', scale=alt.Scale(domain=[df['Total'].min(), df['Total'].max()], range=['blue', 'red'])),\n",
+ " tooltip=['genome_id', 'Count', 'Total', 'gc', 'N50', 'N75', 'N90', 'AuN', 'Min', 'Max', 'Organism']\n",
").properties(\n",
- " width=150,\n",
- " height=150,\n",
- ").repeat(\n",
- " row=['Count', 'Total', 'N50'],\n",
- " column=['Count', 'Total', 'N50'],\n",
+ " width=500,\n",
+ " height=500,\n",
").properties(\n",
" title = \"Genome QC Statistics\",\n",
").interactive()\n",
@@ -117,7 +114,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.17"
+ "version": "3.9.18"
}
},
"nbformat": 4,
diff --git a/workflow/rules.yaml b/workflow/rules.yaml
index 442a1e66..7be65a5a 100644
--- a/workflow/rules.yaml
+++ b/workflow/rules.yaml
@@ -179,7 +179,7 @@ deeptfactor:
- 'Kim G.B., Gao Y., Palsson B.O., Lee S.Y. 2020. DeepTFactor: A deep learning-based
tool for the prediction of transcription factors. [PNAS. doi: 10.1073/pnas.2021171118](https://www.pnas.org/doi/10.1073/pnas.2021171118)'
deeptfactor-roary:
- final_output: data/interim/deeptfactor_roary/{name}/
+ final_output: data/processed/{name}/tables/df_deeptfactor_roary.tsv
description: Use DeepTFactor on Roary outputs.
category: Comparative Genomics
link:
diff --git a/workflow/rules/bgc_analytics.smk b/workflow/rules/bgc_analytics.smk
index 0d28a8fd..4c208190 100644
--- a/workflow/rules/bgc_analytics.smk
+++ b/workflow/rules/bgc_analytics.smk
@@ -112,9 +112,11 @@ rule write_dependency_versions:
"../envs/bgc_analytics.yaml"
log:
"logs/bgc_analytics/{name}/write_dependency_versions.log",
+ params:
+ antismash_version=antismash_major_version
shell:
"""
- python workflow/bgcflow/bgcflow/data/get_dependencies.py {output} 2> {log}
+ python workflow/bgcflow/bgcflow/data/get_dependencies.py {output} {params.antismash_version} 2> {log}
"""
rule get_project_metadata:
diff --git a/workflow/rules/bigslice.smk b/workflow/rules/bigslice.smk
index abd81201..1052effc 100644
--- a/workflow/rules/bigslice.smk
+++ b/workflow/rules/bigslice.smk
@@ -25,7 +25,7 @@ rule bigslice:
"../envs/bigslice.yaml"
threads: 16
params:
- threshold=900,
+ threshold="0.4",
resources:
tmpdir="data/interim/tempdir",
log:
@@ -65,12 +65,14 @@ rule query_bigslice:
n_ranks=10,
query_name="{name}",
run_id=6,
+ threshold=300,
+ normalize="--no_normalize_feature ",
resources:
tmpdir="data/interim/tempdir",
shell:
"""
TIMESTAMP=$(date --iso-8601=hours)
- bigslice --query {input.tmp_dir} --n_ranks {params.n_ranks} {input.bigslice_dir} -t {threads} --query_name {params.query_name}_$TIMESTAMP --run_id {params.run_id} &>> {log}
+ bigslice --query {input.tmp_dir} --n_ranks {params.n_ranks} {input.bigslice_dir} -t {threads} --query_name {params.query_name}_$TIMESTAMP --run_id {params.run_id} --threshold {params.threshold} {params.normalize} &>> {log}
python workflow/bgcflow/bgcflow/data/get_bigslice_query_result.py {params.query_name} {output.folder} {input.bigslice_dir} &>> {log}
"""
@@ -97,6 +99,7 @@ rule summarize_bigslice_query:
rule annotate_bigfam_hits:
input:
+ summary_table="data/processed/{name}/tables/df_antismash_{version}_summary.csv",
gcf_summary_csv="data/processed/{name}/bigslice/query_as_{version}/gcf_summary.csv",
output:
models="data/processed/{name}/bigslice/query_as_{version}/gcf_annotation.csv",
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 35eccef9..41b95aa5 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -8,7 +8,7 @@ from pathlib import Path
import peppy
min_version("7.14.0")
-__version__ = "0.7.6"
+__version__ = "0.7.7"
container: "docker://matinnu/bgcflow:latest"
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index 70948dba..f8597943 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -14,7 +14,7 @@ if len(py_wildcards) > 0:
output:
notebook="data/processed/{name}/docs/{bgcflow_rules_py}.ipynb",
conda:
- "../envs/bgc_analytics.yaml"
+ "../envs/bgcflow_notes.yaml"
log:
"logs/report/{bgcflow_rules_py}-report-copy-{name}.log",
wildcard_constraints:
@@ -33,7 +33,7 @@ if len(py_wildcards) > 0:
output:
markdown="data/processed/{name}/docs/{bgcflow_rules_py}.md",
conda:
- "../envs/bgc_analytics.yaml"
+ "../envs/bgcflow_notes.yaml"
log:
"logs/report/{bgcflow_rules_py}-report-{name}.log",
wildcard_constraints:
@@ -50,7 +50,7 @@ if len(rpy_wildcards) > 0:
output:
notebook="data/processed/{name}/docs/{bgcflow_rules_rpy}.ipynb",
conda:
- "../envs/bgc_analytics.yaml"
+ "../envs/bgcflow_notes.yaml"
log:
"logs/report/{bgcflow_rules_rpy}-report-copy-{name}.log",
wildcard_constraints:
diff --git a/workflow/rules/roary.smk b/workflow/rules/roary.smk
index f060689d..6b5befbf 100644
--- a/workflow/rules/roary.smk
+++ b/workflow/rules/roary.smk
@@ -43,23 +43,24 @@ rule deeptfactor_roary:
roary_dir="data/interim/roary/{name}/",
resource="resources/deeptfactor/",
output:
- deeptfactor_dir=directory("data/interim/deeptfactor_roary/{name}/"),
+ df_deeptfactor="data/processed/{name}/tables/df_deeptfactor_roary.tsv",
conda:
"../envs/deeptfactor.yaml"
threads: 8
log:
"logs/deeptfactor-roary/deeptfactor-roary-{name}.log",
params:
- faa="../../data/interim/roary/{name}/pan_genome_reference.fa",
- outdir="../../data/interim/deeptfactor_roary/{name}/",
+ outdir="data/interim/deeptfactor_roary/{name}",
shell:
"""
+ workdir=$PWD
+ mkdir -p data/processed/{wildcards.name} 2>> {log}
(cd {input.resource} && python tf_running.py \
- -i {params.faa} -o {params.outdir} \
+ -i $workdir/{input.roary_dir}/pan_genome_reference.fa -o $workdir/{params.outdir} \
-g cpu -cpu {threads}) 2>> {log}
+ cp {params.outdir}/prediction_result.txt {output.df_deeptfactor} &>> {log}
"""
-
rule diamond_roary:
input:
roary_dir="data/interim/roary/{name}/",