From 5c4941d231ae7e531ebf0035cc9d53c1772798e2 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Thu, 12 Oct 2023 16:26:05 +0200 Subject: [PATCH] feat: update to v0.7.7 - BiG-FAM fixes & notebooks (#291) * fix: correct bigslice annotation rule * chore: correct log file * fix: remove gggenomes from main r_notebook env * feat: experimental env for extracting mmseqs2 with gggenomes * fix: handle numpy and matplotlib via conda * notebook: add bigfam query visualization * fix: correct html display for bigfam query * fix: auto detect antismash major version and its env * feat: use custom bigslice with no normalize option * feat: change bigslice threshold * fix: handle empty known resistance table * notebook: add instruction to run bigslice clustering result * notebook: improve display for mash and seqfu * notebook: better instruction to start bigslice server * notebook: correct fastani distances * notebook: add barplot * notebook: add sunburst * notebook: tidy sections * notebook: switch table and figure location * fix: move deeptf roary output to processed folder * notebooks: changen conda env for notebooks * fix: deeptf roary input * notebook: adjust label size and rotation * test: add dry run for other subworkflows * test: omit build database for now * docs: add example report demo * docs: add wrapper icon * chore: bump version to v0.7.7 * fix: pin python version for duckdb * fix: pin python 3.11 for duckdb * fix: add unzip for dbt * docs: simplify install guide * fix: pin duckdb to 0.8.1 from conda --- .github/workflows/push.yml | 36 ++ README.md | 19 +- .../bgcflow/bgcflow/data/arts_extract_all.py | 6 +- .../bgcflow/bgcflow/data/get_dependencies.py | 24 +- workflow/envs/bgcflow_notes.yaml | 11 +- workflow/envs/bigslice.yaml | 2 +- workflow/envs/dbt-duckdb.yaml | 6 +- workflow/notebook/bigslice.py.ipynb | 56 ++- workflow/notebook/deeptfactor.py.ipynb | 91 +++- workflow/notebook/fastani.py.ipynb | 33 +- workflow/notebook/mash.py.ipynb | 32 +- workflow/notebook/prokka-gbk.py.ipynb | 51 +- workflow/notebook/query-bigslice.py.ipynb | 448 +++++++++++++++++- workflow/notebook/seqfu.py.ipynb | 19 +- workflow/rules.yaml | 2 +- workflow/rules/bgc_analytics.smk | 4 +- workflow/rules/bigslice.smk | 7 +- workflow/rules/common.smk | 2 +- workflow/rules/report.smk | 6 +- workflow/rules/roary.smk | 11 +- 20 files changed, 794 insertions(+), 72 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index c3716c92..81c5536b 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -46,6 +46,42 @@ jobs: stagein: "conda config --set channel_priority strict" args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache -n" + dry-run-report: + runs-on: ubuntu-latest + needs: + - linting + - formatting + steps: + - name: Checkout repository and submodules + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Dry-run workflow + uses: snakemake/snakemake-github-action@v1.24.0 + with: + directory: .tests + snakefile: workflow/Report + stagein: "conda config --set channel_priority strict" + args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache -n" + + dry-run-metabase: + runs-on: ubuntu-latest + needs: + - linting + - formatting + steps: + - name: Checkout repository and submodules + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Dry-run workflow + uses: snakemake/snakemake-github-action@v1.24.0 + with: + directory: .tests + snakefile: workflow/Metabase + stagein: "conda config --set channel_priority strict" + args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache -n" + unit-test: runs-on: ubuntu-latest needs: diff --git a/README.md b/README.md index a47ad4c8..894a5f13 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ [![Snakemake](https://img.shields.io/badge/snakemake-≥7.14.0-brightgreen.svg)](https://snakemake.bitbucket.io) [![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](https://pep.databio.org) [![wiki](https://img.shields.io/badge/wiki-documentation-forestgreen)](https://github.com/NBChub/bgcflow/wiki) +[![bgcflow-wrapper](https://img.shields.io/badge/CLI-BGCFlow_wrapper-orange)](https://github.com/NBChub/bgcflow_wrapper) +[![example report](https://img.shields.io/badge/demo-report-blue)](https://nbchub.github.io/bgcflow_report_demo/) `BGCFlow` is a systematic workflow for the analysis of biosynthetic gene clusters across large collections of genomes (pangenomes) from internal & public datasets. @@ -15,13 +17,14 @@ At present, `BGCFlow` is only tested and confirmed to work on **Linux** systems Please use the latest version of `BGCFlow` available. ## Quick Start -A quick and easy way to use `BGCFlow` using [`bgcflow_wrapper`](https://github.com/NBChub/bgcflow_wrapper). +A quick and easy way to use `BGCFlow` using the command line interface wrapper: +[![bgcflow-wrapper](https://img.shields.io/badge/CLI-BGCFlow_wrapper-orange)](https://github.com/NBChub/bgcflow_wrapper) 1. Create a conda environment and install the [`BGCFlow` python wrapper](https://github.com/NBChub/bgcflow_wrapper) : ```bash # create and activate a new conda environment -conda create -n bgcflow -c conda-forge python=3.11 pip openjdk -y +conda create -n bgcflow -c conda-forge python=3.11 pip openjdk -y # also install java for metabase conda activate bgcflow # install `BGCFlow` wrapper @@ -37,10 +40,6 @@ With the environment activated, install or setup this configurations: ```bash conda config --set channel_priority disabled conda config --describe channel_priority -``` - - Java (required to run `metabase`) -```bash -conda install openjdk ``` 3. Deploy and run BGCFlow, change `your_bgcflow_directory` variable accordingly: @@ -52,7 +51,10 @@ bgcflow init # initiate `BGCFlow` config and examples from template bgcflow run -n # do a dry run, remove the flag "-n" to run the example dataset ``` -4. Build and serve interactive report (after `bgcflow run` finished). The report will be served in [http://localhost:8001/](http://localhost:8001/): +4. Build and serve interactive report (after `bgcflow run` finished). The report will be served in [http://localhost:8001/](http://localhost:8001/). A demo of the report is available here: + + [![example report](https://img.shields.io/badge/demo-report-blue)](https://nbchub.github.io/bgcflow_report_demo/) + ```bash # build a report bgcflow build report @@ -65,7 +67,8 @@ bgcflow serve --project Lactobacillus_delbrueckii ``` -- For detailed usage and configurations, have a look at the [`BGCFlow` WIKI](https://github.com/NBChub/bgcflow/wiki/) (`under development`) :warning: +- For detailed usage and configurations, have a look at the WIKI: +[![wiki](https://img.shields.io/badge/wiki-documentation-forestgreen)](https://github.com/NBChub/bgcflow/wiki) - Read more about [`bgcflow_wrapper`](https://github.com/NBChub/bgcflow_wrapper) for a detailed overview of the command line interface. [![asciicast](https://asciinema.org/a/595149.svg)](https://asciinema.org/a/595149) diff --git a/workflow/bgcflow/bgcflow/data/arts_extract_all.py b/workflow/bgcflow/bgcflow/data/arts_extract_all.py index 46cf5ad6..ecbb442e 100644 --- a/workflow/bgcflow/bgcflow/data/arts_extract_all.py +++ b/workflow/bgcflow/bgcflow/data/arts_extract_all.py @@ -430,6 +430,7 @@ def extract_arts_knownhits(arts_knownhits_tsv, outfile_hits, genome_id=None): pandas.DataFrame A DataFrame representation of the extracted Known Resistance Hits information, with each row representing a hit and columns representing hit attributes. + If the input DataFrame is empty, an empty DataFrame is returned. Example ------- @@ -465,7 +466,6 @@ def extract_arts_knownhits(arts_knownhits_tsv, outfile_hits, genome_id=None): ] = f"{str(genome_id)}__{str(model)}__{str(scaffold)}__{str(sequence_id)}" arts_knownhits = ( arts_knownhits.drop(columns=["Sequence description"]) - .set_index("index") .rename( columns={ "#Model": "model", @@ -474,6 +474,10 @@ def extract_arts_knownhits(arts_knownhits_tsv, outfile_hits, genome_id=None): } ) ) + if arts_knownhits.empty: + logging.warning("ARTS Known Resistance Hits table is empty.") + else: + arts_knownhits = arts_knownhits.set_index("index") arts_knownhits_dict = arts_knownhits.T.to_dict() outfile_hits.parent.mkdir(parents=True, exist_ok=True) logging.info(f"Writing Known Resistance Hits JSON to: {outfile_hits}") diff --git a/workflow/bgcflow/bgcflow/data/get_dependencies.py b/workflow/bgcflow/bgcflow/data/get_dependencies.py index 8648ec09..97f4e6ca 100644 --- a/workflow/bgcflow/bgcflow/data/get_dependencies.py +++ b/workflow/bgcflow/bgcflow/data/get_dependencies.py @@ -1,24 +1,36 @@ import json +import logging import sys +log_format = "%(levelname)-8s %(asctime)s %(message)s" +date_format = "%d/%m %H:%M:%S" +logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG) + import yaml # list of the main dependecies used in the workflow dependencies = { "antismash": r"workflow/envs/antismash.yaml", + "bigslice": r"workflow/envs/bigslice.yaml", + "cblaster": r"workflow/envs/cblaster.yaml", "prokka": r"workflow/envs/prokka.yaml", - "mlst": r"workflow/envs/mlst.yaml", "eggnog-mapper": r"workflow/envs/eggnog.yaml", "roary": r"workflow/envs/roary.yaml", - "refseq_masher": r"workflow/envs/refseq_masher.yaml", "seqfu": r"workflow/envs/seqfu.yaml", + "checkm": r"workflow/envs/checkm.yaml", + "gtdbtk": r"workflow/envs/gtdbtk.yaml", } -def get_dependency_version(dep, dep_key): +def get_dependency_version(dep, dep_key, antismash_version="7"): """ return dependency version tags given a dictionary (dep) and its key (dep_key) """ + if dep_key == "antismash": + logging.info(f"AntiSMASH version is: {antismash_version}") + if antismash_version == "6": + dep[dep_key] = "workflow/envs/antismash_v6.yaml" + logging.info(f"Getting software version for: {dep_key}") with open(dep[dep_key]) as file: result = [] documents = yaml.full_load(file) @@ -38,14 +50,14 @@ def get_dependency_version(dep, dep_key): return str(result) -def write_dependecies_to_json(outfile, dep=dependencies): +def write_dependecies_to_json(outfile, antismash_version, dep=dependencies): """ write dependency version to a json file """ with open(outfile, "w") as file: dv = {} for ky in dep.keys(): - vr = get_dependency_version(dep, ky) + vr = get_dependency_version(dep, ky, antismash_version=antismash_version) dv[ky] = vr json.dump( dv, @@ -57,4 +69,4 @@ def write_dependecies_to_json(outfile, dep=dependencies): if __name__ == "__main__": - write_dependecies_to_json(sys.argv[1]) + write_dependecies_to_json(sys.argv[1], sys.argv[2]) diff --git a/workflow/envs/bgcflow_notes.yaml b/workflow/envs/bgcflow_notes.yaml index d4c19a3f..b63b10b5 100644 --- a/workflow/envs/bgcflow_notes.yaml +++ b/workflow/envs/bgcflow_notes.yaml @@ -4,6 +4,7 @@ channels: - bioconda - defaults dependencies: + - python>=3.9,<3.12 - jupyterlab>3 - ipywidgets>=7.6 - jupyter-dash @@ -13,13 +14,17 @@ dependencies: - python-kaleido - bokeh - openpyxl - - nbconvert + - nbconvert==6.4.2 - ipykernel - - ete3 - xlrd >= 1.0.0 + - altair + - itables + - scikit-learn + - pyarrow + - pygraphviz - pip - pip: - biopython + - jupyterlab-dash - networkx - alive_progress - - ../bgcflow diff --git a/workflow/envs/bigslice.yaml b/workflow/envs/bigslice.yaml index b0b04c5a..99579f65 100644 --- a/workflow/envs/bigslice.yaml +++ b/workflow/envs/bigslice.yaml @@ -41,4 +41,4 @@ dependencies: - scipy==1.9.3 - six==1.16.0 - threadpoolctl==3.1.0 - - git+https://github.com/medema-group/bigslice.git@103d8f2 + - git+https://github.com/NBChub/bigslice.git@c0085de diff --git a/workflow/envs/dbt-duckdb.yaml b/workflow/envs/dbt-duckdb.yaml index f5c1340e..4544edc3 100644 --- a/workflow/envs/dbt-duckdb.yaml +++ b/workflow/envs/dbt-duckdb.yaml @@ -1,9 +1,13 @@ name: dbt-duckdb channels: - conda-forge + - bioconda - defaults dependencies: + - python==3.11 + - python-duckdb==0.8.1 + - unzip - pip - pip: - dbt-duckdb==1.6.0 - - dbt-metabase==0.9.15 + - dbt-metabase==0.9.15 \ No newline at end of file diff --git a/workflow/notebook/bigslice.py.ipynb b/workflow/notebook/bigslice.py.ipynb index 62e9b44d..e3c17e61 100644 --- a/workflow/notebook/bigslice.py.ipynb +++ b/workflow/notebook/bigslice.py.ipynb @@ -21,6 +21,8 @@ "source": [ "import pandas as pd\n", "from pathlib import Path\n", + "import shutil, json\n", + "from IPython.display import display, Markdown\n", "\n", "import warnings\n", "warnings.filterwarnings('ignore')" @@ -33,7 +35,57 @@ "metadata": {}, "outputs": [], "source": [ - "report_dir = Path(\"../\")" + "report_dir = Path(\"../\")\n", + "bgcflow_dir = report_dir / (\"../../../\")\n", + "envs = bgcflow_dir / \"workflow/envs/bigslice.yaml\"\n", + "\n", + "metadata = report_dir / \"metadata/dependency_versions.json\"\n", + "with open(metadata, \"r\") as f:\n", + " dependency_version = json.load(f)\n", + "\n", + "# Define the destination path\n", + "destination_path = Path(\"assets/envs/bigslice.yaml\")\n", + "\n", + "# Ensure the destination directory exists\n", + "destination_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Copy the file\n", + "shutil.copy(envs, destination_path);" + ] + }, + { + "cell_type": "markdown", + "id": "43ca282f", + "metadata": {}, + "source": [ + "## Usage\n", + "\n", + "You can start the BiG-SLICE flask app to view the clustering result.\n", + "\n", + "Steps:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed378d7c-7d1a-4715-a06d-8850c1344387", + "metadata": {}, + "outputs": [], + "source": [ + "run_bigslice=f\"\"\"- Install the conda environment:\n", + "\n", + "```bash\n", + " conda install -f {report_dir.resolve()}/docs/assets/envs/bigslice.yaml\n", + "```\n", + "\n", + "- Run the app\n", + "\n", + "```bash\n", + " port='5001'\n", + " conda run -n bigslice bash {report_dir.resolve()}/cluster_as_{dependency_version[\"antismash\"]}/start_server.sh $port\n", + "```\n", + "\"\"\"\n", + "display(Markdown(run_bigslice))" ] }, { @@ -69,7 +121,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/workflow/notebook/deeptfactor.py.ipynb b/workflow/notebook/deeptfactor.py.ipynb index d3030026..99bbe250 100644 --- a/workflow/notebook/deeptfactor.py.ipynb +++ b/workflow/notebook/deeptfactor.py.ipynb @@ -26,6 +26,7 @@ "from IPython.display import display, Markdown, HTML\n", "import itables.options as opt\n", "from itables import to_html_datatable as DT\n", + "import plotly.graph_objects as go\n", "opt.css = \"\"\"\n", ".itables table td { font-style: italic; font-size: .8em;}\n", ".itables table th { font-style: oblique; font-size: .8em; }\n", @@ -88,7 +89,9 @@ "metadata": {}, "outputs": [], "source": [ - "df_deeptf = pd.merge(df.reset_index().drop(columns='genome_id'), df_aa.reset_index(), on=\"locus_tag\", how=\"outer\")" + "df_deeptf = pd.merge(df.reset_index().drop(columns='genome_id'), df_aa.reset_index(), on=\"locus_tag\", how=\"outer\")\n", + "df_deeptf.deeptfactor_prediction = df_deeptf.deeptfactor_prediction.fillna(False)\n", + "df_deeptf = df_deeptf.fillna(0)" ] }, { @@ -101,6 +104,92 @@ "display(HTML(DT(df_deeptf, columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\", \"searchable\": True}], maxColumns=df_deeptf.shape[1], maxBytes=0)))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "864665c9-ea6c-4e24-914f-889d7e5563ff", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_sunburst_plot(data, outfile_sunburst, outfile_barh):\n", + " # Calculate the necessary information\n", + " total_annotated = data[\"deeptfactor_prediction\"].sum()\n", + " hypothetical_count = data[data[\"deeptfactor_prediction\"] & data[\"annotation\"].str.contains(\"hypothetical\", case=False)].shape[0]\n", + " non_annotated_hypothetical_count = data[~data[\"deeptfactor_prediction\"] & data[\"annotation\"].str.contains(\"hypothetical\", case=False)].shape[0]\n", + " non_hypothetical_annotated = data[data[\"deeptfactor_prediction\"] & ~data[\"annotation\"].str.contains(\"hypothetical\", case=False)]\n", + " annotation_counts = non_hypothetical_annotated[\"annotation\"].value_counts()\n", + "\n", + " # Sunburst plot data\n", + " labels = [\"Locus Tags\", \"Annotated by deepTF\", \"Not Annotated\", \n", + " \"Hypothetical Proteins\", \"Other Proteins\", \n", + " \"Not Annotated - Hypothetical\", \"Not Annotated - Other\"]\n", + " parents = [\"\", \"Locus Tags\", \"Locus Tags\", \n", + " \"Annotated by deepTF\", \"Annotated by deepTF\", \n", + " \"Not Annotated\", \"Not Annotated\"]\n", + " values = [len(data), total_annotated, len(data) - total_annotated, \n", + " hypothetical_count, total_annotated - hypothetical_count, \n", + " non_annotated_hypothetical_count, len(data) - total_annotated - non_annotated_hypothetical_count]\n", + "\n", + " # Create the sunburst plot\n", + " fig1 = go.Figure(go.Sunburst(\n", + " labels=labels,\n", + " parents=parents,\n", + " values=values,\n", + " maxdepth=2,\n", + " marker=dict(colors=['#f5f5f5', '#66b2ff', '#ff9999', '#ffcccc', '#99ccff', '#ffdddd', '#aaddff'])\n", + " ))\n", + " fig1.update_layout(height=800, title=\"Sunburst Plot of Locus Tags and Protein Types\")\n", + " fig1.write_html(outfile_sunburst)\n", + "\n", + " # Create the horizontal bar chart for annotations\n", + " fig2 = go.Figure(go.Bar(\n", + " y=annotation_counts.index,\n", + " x=annotation_counts.values,\n", + " marker_color='#66b2ff',\n", + " orientation='h'\n", + " ))\n", + " fig2.update_layout(title=\"Distribution of Annotations for Non-Hypothetical, deepTF-Annotated Entries\", \n", + " yaxis_title=\"Annotation\", \n", + " xaxis_title=\"Count\",\n", + " height=800)\n", + " fig2.write_html(outfile_barh)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb8b4144-7336-49fc-8f30-cbe837b47de1", + "metadata": {}, + "outputs": [], + "source": [ + "outfile1 = Path(f\"assets/figures/deeptf_sunburst.html\")\n", + "outfile1.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", + "outfile2 = Path(f\"assets/figures/deeptf_barh.html\")\n", + "outfile2.parent.mkdir(parents=True, exist_ok=True)\n", + "generate_sunburst_plot(df_deeptf, outfile1, outfile2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "392407dc-77bf-4e07-8e93-ae3114d04af9", + "metadata": {}, + "outputs": [], + "source": [ + "display(HTML(filename=str(outfile1)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f7658b2-639e-4553-98ee-c873fc116e30", + "metadata": {}, + "outputs": [], + "source": [ + "display(HTML(filename=str(outfile2)))" + ] + }, { "cell_type": "markdown", "id": "4d8fe1ca-30f9-472a-9e01-4e9db61825b8", diff --git a/workflow/notebook/fastani.py.ipynb b/workflow/notebook/fastani.py.ipynb index 980ec241..0705abd6 100644 --- a/workflow/notebook/fastani.py.ipynb +++ b/workflow/notebook/fastani.py.ipynb @@ -92,7 +92,11 @@ "outputs": [], "source": [ "df_fastani = pd.read_csv(report_dir / 'fastani/df_fastani.csv', index_col=0)\n", - "df_gtdb = pd.read_csv(report_dir / 'tables' / 'df_gtdb_meta.csv', index_col='genome_id')" + "df_gtdb = pd.read_csv(report_dir / 'tables' / 'df_gtdb_meta.csv', index_col='genome_id')\n", + "\n", + "# correct table\n", + "for i in df_fastani.index:\n", + " df_fastani.loc[i, i] = 100" ] }, { @@ -112,15 +116,18 @@ "source": [ "df_fastani_corr = df_fastani.fillna(0).corr()\n", "\n", - "plt.figure(figsize=(30, 7))\n", - "plt.title(\"FastANI Similarity\")\n", + "plt.figure(figsize=(30, 10))\n", + "#plt.title(\"FastANI Similarity\")\n", "\n", "selected_data = df_fastani_corr.copy()\n", "clusters = shc.linkage(selected_data, \n", " method='ward', \n", " metric=\"euclidean\",\n", " optimal_ordering=True,)\n", - "shc.dendrogram(Z=clusters, labels=df_fastani_corr.index, leaf_rotation=90)\n", + "shc.dendrogram(Z=clusters, labels=df_fastani_corr.index, orientation='left') # Set orientation to 'left'\n", + "\n", + "plt.yticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n", + "plt.xticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n", "plt.show()" ] }, @@ -213,12 +220,20 @@ " \n", "comm_colors = df_hclusts['color_code']\n", "plt.figure()\n", + "\n", "# sns.set_theme(color_codes=True)\n", "g = sns.clustermap(df_fastani,\n", - " figsize=(50,50), row_linkage=clusters, col_linkage=clusters,\n", - " row_colors=comm_colors, col_colors=comm_colors)\n", - "g.ax_heatmap.set_xlabel('Genomes')\n", - "g.ax_heatmap.set_ylabel('Genomes')\n", + " figsize=(20,20), row_linkage=clusters, col_linkage=clusters,\n", + " row_colors=comm_colors, col_colors=comm_colors, cmap=\"rocket_r\")\n", + "g.ax_heatmap.set_xlabel('Genomes', fontsize=18)\n", + "g.ax_heatmap.set_ylabel('Genomes', fontsize=18)\n", + "plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=12) # set rotation and font size for x-axis labels\n", + "plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize=12) # set font size for y-axis labels\n", + "\n", + "# Adjust font size of the colorbar's tick labels\n", + "cbar = g.cax\n", + "cbar.set_yticklabels(cbar.get_yticklabels(), fontsize=16)\n", + "\n", "plt.show()" ] }, @@ -255,7 +270,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/workflow/notebook/mash.py.ipynb b/workflow/notebook/mash.py.ipynb index 5807d138..914211b5 100644 --- a/workflow/notebook/mash.py.ipynb +++ b/workflow/notebook/mash.py.ipynb @@ -116,15 +116,18 @@ "source": [ "df_mash_corr = df_mash.fillna(0).corr()\n", "\n", - "plt.figure(figsize=(30, 7))\n", - "plt.title(\"MASH Distances\")\n", + "plt.figure(figsize=(30, 10))\n", + "#plt.title(\"MASH Distances\", fontsize=20) # You can adjust the title font size here\n", "\n", "selected_data = df_mash_corr.copy()\n", "clusters = shc.linkage(selected_data, \n", " method='ward', \n", " metric=\"euclidean\",\n", - " optimal_ordering=True,)\n", - "shc.dendrogram(Z=clusters, labels=df_mash_corr.index, leaf_rotation=90)\n", + " optimal_ordering=True)\n", + "shc.dendrogram(Z=clusters, labels=df_mash_corr.index, orientation='left') # Set orientation to 'left'\n", + "\n", + "plt.yticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n", + "plt.xticks(fontsize=14) # Adjust the font size for y-axis ticks (labels) for horizontal dendrogram\n", "plt.show()" ] }, @@ -217,12 +220,21 @@ " \n", "comm_colors = df_hclusts['color_code']\n", "plt.figure()\n", + "\n", + "\n", "# sns.set_theme(color_codes=True)\n", - "g = sns.clustermap(df_mash,\n", - " figsize=(50,50), row_linkage=clusters, col_linkage=clusters,\n", - " row_colors=comm_colors, col_colors=comm_colors)\n", - "g.ax_heatmap.set_xlabel('Genomes')\n", - "g.ax_heatmap.set_ylabel('Genomes')\n", + "g = sns.clustermap((1 - df_mash)*100,\n", + " figsize=(20,20), row_linkage=clusters, col_linkage=clusters,\n", + " row_colors=comm_colors, col_colors=comm_colors, cmap=\"rocket_r\")\n", + "g.ax_heatmap.set_xlabel('Genomes', fontsize=18)\n", + "g.ax_heatmap.set_ylabel('Genomes', fontsize=18)\n", + "plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90, fontsize=12) # set rotation and font size for x-axis labels\n", + "plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize=12) # set font size for y-axis labels\n", + "\n", + "# Adjust font size of the colorbar's tick labels\n", + "cbar = g.cax\n", + "cbar.set_yticklabels(cbar.get_yticklabels(), fontsize=16)\n", + "\n", "plt.show()" ] }, @@ -256,7 +268,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/workflow/notebook/prokka-gbk.py.ipynb b/workflow/notebook/prokka-gbk.py.ipynb index 94c28d8d..5301343f 100644 --- a/workflow/notebook/prokka-gbk.py.ipynb +++ b/workflow/notebook/prokka-gbk.py.ipynb @@ -23,7 +23,8 @@ "source": [ "import pandas as pd\n", "from pathlib import Path\n", - "\n", + "import altair as alt\n", + "import json\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", @@ -47,7 +48,11 @@ }, "outputs": [], "source": [ - "report_dir = Path(\"../\")" + "report_dir = Path(\"../\")\n", + "metadata = report_dir / \"metadata/project_metadata.json\"\n", + "with open(metadata, \"r\") as f:\n", + " metadata = json.load(f)\n", + "project_name = [i for i in metadata.keys()][0]" ] }, { @@ -81,6 +86,46 @@ "display(HTML(DT(df, columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\"}],)))" ] }, + { + "cell_type": "markdown", + "id": "7645aa83", + "metadata": {}, + "source": [ + "## Summary Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68ba254c-ab05-45ac-aa81-8c0c05e49f1c", + "metadata": {}, + "outputs": [], + "source": [ + "source = df.copy()\n", + "source[\"dataset\"] = project_name\n", + "\n", + "# Create a list of charts, one for each column to plot\n", + "colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854', '#ffd92f', '#e5c494']\n", + "\n", + "charts = []\n", + "\n", + "for idx, col in enumerate(['contigs', 'bases', 'CDS', 'rRNA', 'repeat_region', 'tRNA', 'tmRNA']):\n", + " data_range = source[col].max() - source[col].min()\n", + " buffer = 0.10 * data_range # 10% buffer\n", + " ymin = source[col].min() - buffer\n", + " ymax = source[col].max() + buffer\n", + " chart = alt.Chart(source).mark_boxplot(size=50, median=dict(color='black')).encode(\n", + " y=alt.Y(f'{col}:Q', title=None, scale=alt.Scale(domain=(ymin, ymax))),\n", + " x=alt.X(\"dataset:N\", axis=None), # This is used to align the boxplots vertically\n", + " color=alt.value(colors[idx]), # Color of the boxplot\n", + " opacity=alt.value(0.7) # Opacity of the boxplot\n", + " ).properties(title=f'{col}', width=100, height=150)\n", + " \n", + " charts.append(chart)\n", + "\n", + "alt.hconcat(*charts)" + ] + }, { "cell_type": "markdown", "id": "4d8fe1ca-30f9-472a-9e01-4e9db61825b8", @@ -114,7 +159,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/workflow/notebook/query-bigslice.py.ipynb b/workflow/notebook/query-bigslice.py.ipynb index 9bf016ee..f9c450ba 100644 --- a/workflow/notebook/query-bigslice.py.ipynb +++ b/workflow/notebook/query-bigslice.py.ipynb @@ -21,9 +21,133 @@ "source": [ "import pandas as pd\n", "from pathlib import Path\n", + "import sqlite3\n", + "from IPython.display import display, Markdown, HTML\n", + "\n", + "import json, yaml\n", + "import altair as alt\n", + "import ast\n", "\n", "import warnings\n", - "warnings.filterwarnings('ignore')" + "warnings.filterwarnings('ignore')\n", + "\n", + "import networkx as nx\n", + "import plotly.graph_objects as go\n", + "import altair as alt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dd35e5b-bca8-4c8a-9187-ddfd61e1d957", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_bigfam_network(df, rank=0):\n", + " \"\"\"\n", + " Build a networkx graph from bigscape df network\n", + " \"\"\"\n", + " df[df['rank'] == rank]\n", + " G = nx.from_pandas_edgelist(df, source='bgc_id', target='gcf_id', edge_attr=['membership_value',\n", + " 'rank'], edge_key='bigfam_edge_id')\n", + " return G" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1a957a4-f659-436d-9497-bf5933572124", + "metadata": {}, + "outputs": [], + "source": [ + "def annotate_bigfam_models(G, df, columns = ['bgc_member', 'chemical_class_hits', 'top_chemical_class',\n", + " 'top_chemical_class_proportion', 'top_chemical_subclass',\n", + " 'top_chemical_subclass_proportion', 'taxonomic_hits', \n", + " 'taxonomic_level', 'H-index', 'richness', 'top_taxa', \n", + " 'top_taxa_proportion']):\n", + " for bgc in G.nodes:\n", + " if bgc in df.index:\n", + " G.nodes[bgc]['node_type'] = \"BiG-FAM GCFs\"\n", + " G.nodes[bgc]['text'] = f\"GCF {bgc}
size: {df.loc[bgc, 'bgc_member']}
top_chemical_class: {df.loc[bgc, 'top_chemical_class']} ({df.loc[bgc, 'top_chemical_class_proportion']:.0%})\\\n", + "
top_chemical_subclass: {df.loc[bgc, 'top_chemical_subclass']} ({df.loc[bgc, 'top_chemical_subclass_proportion']:.0%})\\\n", + "
top_taxa: {df.loc[bgc, 'top_taxa']} ({df.loc[bgc, 'top_taxa_proportion']:.0%})\"\n", + " for c in columns:\n", + " G.nodes[bgc][c] = df.loc[bgc, c]\n", + " return G" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd034c15-182e-4d65-8266-3392a2a0ec60", + "metadata": {}, + "outputs": [], + "source": [ + "def annotate_bigfam_antismash(G, antismash_region_path, columns = [\"genome_id\", \"product\", \"contig_edge\", \"region_length\", \"most_similar_known_cluster_id\", \"most_similar_known_cluster_description\", \"most_similar_known_cluster_type\", \"similarity\"]):\n", + " df_antismash = pd.read_csv(antismash_region_path, index_col=0)\n", + " for bgc in G.nodes:\n", + " if bgc in df_antismash.index:\n", + " G.nodes[bgc]['node_type'] = \"BGC\"\n", + " G.nodes[bgc]['text'] = f\"{bgc}
{df_antismash.loc[bgc, 'product']}\"\n", + " if 'most_similar_known_cluster_description' in df_antismash.columns:\n", + " G.nodes[bgc]['text'] = G.nodes[bgc]['text'] + f\"
{df_antismash.loc[bgc, 'most_similar_known_cluster_description']}\"\n", + " for c in df_antismash.columns:\n", + " if c in columns:\n", + " G.nodes[bgc][c] = df_antismash.loc[bgc, c]\n", + " return G" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8923e90-4526-4146-ba33-6cff84cf2f7c", + "metadata": {}, + "outputs": [], + "source": [ + "def create_edge_trace(G, name):\n", + " edge_trace = go.Scatter(\n", + " x=[],\n", + " y=[],\n", + " name=name,\n", + " line=dict(width=0.5,color='#888'),\n", + " hoverinfo='none',\n", + " mode='lines')\n", + "\n", + " for edge in G.edges():\n", + " x0, y0 = G.nodes[edge[0]]['pos']\n", + " x1, y1 = G.nodes[edge[1]]['pos']\n", + " edge_trace['x'] += tuple([x0, x1, None])\n", + " edge_trace['y'] += tuple([y0, y1, None])\n", + " return edge_trace\n", + "\n", + "def create_node_trace(G, node_trace_category, color, showtextlabel=False, nodesize=10, nodeopacity=0.8, nodesymbol=\"circle\", linewidth=1, linecolor=\"black\", textposition=\"top center\"):\n", + " if showtextlabel:\n", + " markermode = \"markers+text\"\n", + " else:\n", + " markermode = \"markers\"\n", + " node_trace = go.Scatter(\n", + " x=[],\n", + " y=[],\n", + " text=[],\n", + " textposition=textposition,\n", + " mode=markermode,\n", + " hoverinfo='text',\n", + " name=node_trace_category,\n", + " marker=dict(\n", + " symbol=nodesymbol,\n", + " opacity=nodeopacity,\n", + " showscale=False,\n", + " color=color,\n", + " size=nodesize,\n", + " line=dict(width=linewidth, color=linecolor)))\n", + "\n", + " for node in G.nodes():\n", + " if G.nodes[node][\"node_trace\"] == node_trace_category:\n", + " x, y = G.nodes[node]['pos']\n", + " node_trace['x'] += tuple([x])\n", + " node_trace['y'] += tuple([y])\n", + " node_trace['text'] +=tuple([G.nodes[node]['text']])\n", + " return node_trace" ] }, { @@ -33,7 +157,325 @@ "metadata": {}, "outputs": [], "source": [ - "report_dir = Path(\"../\")" + "report_dir = Path(\"../\")\n", + "\n", + "with open(report_dir / \"metadata/project_metadata.json\", \"r\") as f:\n", + " project_configuration = json.load(f)\n", + "with open(report_dir / \"metadata/dependency_versions.json\", \"r\") as f:\n", + " dependency_version = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a457b21e-12a4-4465-bab1-0a9cf2eb309b", + "metadata": {}, + "outputs": [], + "source": [ + "project_name = [i for i in project_configuration.keys()][0]\n", + "antismash_version = dependency_version[\"antismash\"]\n", + "\n", + "query_dir = report_dir / f\"bigslice/query_as_{antismash_version}/\"\n", + "df_bigfam_model = pd.read_csv(query_dir / \"gcf_summary.csv\")\n", + "df_bigfam_hits = pd.read_csv(report_dir / f\"bigslice/query_as_{antismash_version}/query_network.csv\")\n", + "antismash_region_path = report_dir / f\"tables/df_regions_antismash_{antismash_version}.csv\"\n", + "df_annotated_bigfam_model = pd.read_csv(query_dir / \"gcf_annotation.csv\", index_col=0)\n", + "df_network = pd.read_csv(query_dir / \"query_network.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d2b11e1-1cd9-4443-bc77-6d4bbb7da529", + "metadata": {}, + "outputs": [], + "source": [ + "# filtering\n", + "#df_network = df_network[df_network[\"rank\"] < 3] # get first degree only\n", + "df_annotated_bigfam_model = df_annotated_bigfam_model[df_annotated_bigfam_model[\"bgc_member\"] < 15000] # remove model with really large size\n", + "df_network = df_network[df_network[\"gcf_id\"].isin(df_annotated_bigfam_model.index)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45a7732c-6a11-43a9-b2bd-4406db0be290", + "metadata": {}, + "outputs": [], + "source": [ + "df_cut = df_annotated_bigfam_model[df_annotated_bigfam_model.top_taxa_proportion <= 0.3]\n", + "\n", + "G = generate_bigfam_network(df_network)\n", + "G = annotate_bigfam_antismash(G, antismash_region_path)\n", + "G = annotate_bigfam_models(G, df_annotated_bigfam_model)\n", + "G_raw = G.copy()\n", + "\n", + "# position nodes\n", + "pos = nx.nx_agraph.graphviz_layout(G)\n", + "for n, p in pos.items():\n", + " G.nodes[n]['pos'] = p" + ] + }, + { + "cell_type": "markdown", + "id": "92ad9896-2043-49b8-9e39-aab12f57f5fa", + "metadata": {}, + "source": [ + "## Summary Result\n", + "### Distribution of the assigned genus from BiG-FAM Model hits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26c9b54f-c1ed-4180-ae32-f9635f958509", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate the number of GCFs with representation across different genera\n", + "df_genus_dist = pd.DataFrame(index=df_annotated_bigfam_model.index)\n", + "for gcf_id in df_annotated_bigfam_model.index:\n", + " taxa = str(df_annotated_bigfam_model.loc[gcf_id, \"taxa_distribution\"])\n", + " taxa_dict = ast.literal_eval(taxa)\n", + " for genus in taxa_dict.keys():\n", + " df_genus_dist.loc[gcf_id, genus] = taxa_dict[genus]\n", + "df_genus_dist.fillna(0, inplace=True)\n", + "\n", + "# Genera represented by most number of GCFs\n", + "df_genus_dist_binary = df_genus_dist > 0 \n", + "df_genus_dist_binary = df_genus_dist_binary * 1\n", + "\n", + "# Assuming df_genus_dist_binary.sum().sort_values(ascending=False)[:10] is stored in a variable named 'data'\n", + "data = df_genus_dist_binary.sum().sort_values(ascending=False)[:10]\n", + "\n", + "# Convert Series to DataFrame for Altair plotting\n", + "data_df = data.reset_index()\n", + "data_df.columns = ['Genus', 'Counts']\n", + "\n", + "# Create Altair Chart\n", + "chart = alt.Chart(data_df).mark_bar().encode(\n", + " y=alt.Y('Genus:N', sort='-x', title='Genera'),\n", + " x=alt.X('Counts:Q', title='Counts'),\n", + " color=alt.Color('Genus:N', legend=None) # Color by Genera, but we don't need a legend here\n", + ").properties(\n", + " title='Top 10 Genus Distribution',\n", + " width=700,\n", + " height=300\n", + ")\n", + "\n", + "# Display the chart\n", + "chart" + ] + }, + { + "cell_type": "markdown", + "id": "087dfb47-9c8f-4c61-84d4-2d72b7ffcab4", + "metadata": {}, + "source": [ + "### Diversity and Member Size of BiG-FAM model hits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76e25859-6a93-4e5e-897d-f7fe60ba259d", + "metadata": {}, + "outputs": [], + "source": [ + "bgc_hits = len([n for n in G_raw.nodes if G_raw.nodes[n]['node_type'] == 'BGC'])\n", + "GCF_hits = len([n for n in G_raw.nodes if G_raw.nodes[n]['node_type'] != 'BGC'])\n", + "shannon_non_zero = df_annotated_bigfam_model[df_annotated_bigfam_model['H-index'] != 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c50933f-688d-49ca-84c0-18ef7220815d", + "metadata": {}, + "outputs": [], + "source": [ + "shannon_min_non_zero_tax = ast.literal_eval(str(df_annotated_bigfam_model.loc[shannon_non_zero['H-index'].astype(float).idxmin(), \"taxa_distribution\"]))\n", + "shannon_min_non_zero_tax = {k : v/sum(shannon_min_non_zero_tax.values()) for k,v in shannon_min_non_zero_tax.items()}\n", + "shannon_min_non_zero_tax_clean = [f\"{k} ({v:.1%})\" for k,v in sorted(shannon_min_non_zero_tax.items(), key=lambda x:x[1], reverse=True)]\n", + "\n", + "def get_top_taxa(df, gcf):\n", + " result = f\"{df.loc[gcf, 'top_taxa']} ({df.loc[gcf, 'top_taxa_proportion'] * df.loc[gcf, 'bgc_member']:.0f} genomes)\"\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5251f9c-5dc9-4f61-a43d-d5e2558353c4", + "metadata": {}, + "outputs": [], + "source": [ + "mapping = df_bigfam_hits.gcf_id.value_counts()\n", + "for gcf in df_annotated_bigfam_model.index:\n", + " df_annotated_bigfam_model.loc[gcf, \"dataset_hits\"] = mapping[gcf]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d0d36af-3a0c-4949-9f02-482e2595fa0a", + "metadata": {}, + "outputs": [], + "source": [ + "domain = list(df_annotated_bigfam_model[df_annotated_bigfam_model[\"top_taxa_proportion\"] > 0.1][\"top_taxa\"].value_counts().to_dict().keys())\n", + "domain.append(\"Other\")\n", + "\n", + "r = []\n", + "range_ = [\"#264653\", \"#287271\", \"#2a9d8f\", \"#8ab17d\", \"#e9c46a\", \"#f4a261\", \"#ee8959\", \"#e76f51\", \"#edede9\"]\n", + "for num, d in enumerate(domain):\n", + " if num < len(range_):\n", + " r.append(range_[num])\n", + " else:\n", + " r.append(\"white\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5882dd62-7c22-432d-ac99-6fe2203c3217", + "metadata": {}, + "outputs": [], + "source": [ + "source = df_annotated_bigfam_model.copy()\n", + "source = source.reset_index().rename(columns={\"gcf_id\":\"BiG-FAM_id\"})\n", + "for i in source.index:\n", + " if source.loc[i, \"top_taxa_proportion\"] <= 0.5:\n", + " source.loc[i, \"top_taxa\"] = \"Other\"\n", + " \n", + "chart_one = alt.Chart(source).mark_point().encode(\n", + " alt.Y('H-index:Q',\n", + " scale=alt.Scale(domain=(-0.5, 5)),\n", + " axis=alt.Axis(title=\"Shannon Index (H')\")\n", + " ),\n", + " alt.X('bgc_member:Q',\n", + " scale=alt.Scale(type=\"log\"),\n", + " axis=alt.Axis(title=\"Member Size\")\n", + " ),\n", + " alt.Size('dataset_hits',\n", + " scale=alt.Scale(type='pow'),# domain=(1, 30)), \n", + " title=\"Number of hits in dataset\"\n", + " ),\n", + " alt.Color(\"top_taxa:N\", scale=alt.Scale(domain=domain, range=r), title=\"Top Genus (>=50%)\"),\n", + " tooltip=['BiG-FAM_id', 'bgc_member', 'chemical_class_hits', 'top_chemical_class', alt.Tooltip('top_chemical_class_proportion', format='.1%'), \n", + " 'top_chemical_subclass', alt.Tooltip('top_chemical_subclass_proportion', format='.1%'),\n", + " 'taxonomic_level', 'richness', 'top_taxa', alt.Tooltip('top_taxa_proportion', format='.1%'), \n", + " alt.Tooltip('H-index:Q', format='.2f')],\n", + ").mark_point(\n", + " filled=True,\n", + " stroke='black',\n", + " strokeWidth=0.5,\n", + " opacity=0.8,\n", + " size=1000\n", + ").configure_header(\n", + " title=None,\n", + " labels=False\n", + ").configure_axis(\n", + " labelFontSize=10,\n", + " titleFontSize=12\n", + ").configure_legend(\n", + " labelFontSize=10,\n", + " titleFontSize=12,\n", + ").configure_view(\n", + " continuousHeight=500,\n", + " continuousWidth=500,\n", + ").configure_legend(\n", + " orient='right'\n", + ")\n", + "\n", + "chart_one" + ] + }, + { + "cell_type": "markdown", + "id": "eb27456a-066f-476a-83d1-08bd307f28f8", + "metadata": {}, + "source": [ + "### Network of BiG-FAM model hits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a760c64-c7e2-442c-92e3-ef1f1eccda63", + "metadata": {}, + "outputs": [], + "source": [ + "color_category = 'node_type'\n", + "colormap = {\"BiG-FAM GCFs\" : \"red\",\n", + " \"BGC\" : \"blue\"}\n", + "node_trace_category = {}\n", + "\n", + "for node in G.nodes:\n", + " nodeshape = \"circle\"\n", + " if node in df_annotated_bigfam_model.index:\n", + " G.nodes[node]['node_type'] = \"BiG-FAM GCFs\"\n", + " if G.nodes[node]['node_type'] == \"BiG-FAM GCFs\":\n", + " nodeshape = \"diamond\"\n", + " color = colormap[\"BiG-FAM GCFs\"]\n", + " cat = \"BiG-FAM GCFs\"\n", + " else: \n", + " cat = G.nodes[node][color_category]\n", + " color = colormap[cat]\n", + " if 'similarity' in G.nodes[node].keys():\n", + " if G.nodes[node]['similarity'] > 0.8:\n", + " cat = cat + \" \" + \"Known (antiSMASH) > 80% similarity\"\n", + " nodeshape = \"circle\"\n", + " elif G.nodes[node]['similarity'] > 0.4:\n", + " cat = cat + \" \" + \"Known (antiSMASH) > 40% similarity\"\n", + " nodeshape = \"triangle-up\"\n", + " elif G.nodes[node]['similarity'] < 0.4:\n", + " cat = cat + \" \" + \"Unknown (antiSMASH) < 40% similarity\"\n", + " nodeshape = \"triangle-down\"\n", + " else:\n", + " cat = cat + \" \" + \"Unknown\"\n", + " nodeshape = \"circle\"\n", + " \n", + " linecolor = \"black\"\n", + " \n", + " G.nodes[node]['node_trace'] = cat\n", + " node_trace_category[cat] = {\"nodecolor\" : color,\n", + " \"nodeshape\" : nodeshape,\n", + " \"linecolor\" : linecolor}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb871352-8602-438d-8404-82e36f2c1afd", + "metadata": {}, + "outputs": [], + "source": [ + "edge_trace = create_edge_trace(G, \"bigslice_similarity\")\n", + "traces = [edge_trace]\n", + "for cat in node_trace_category.keys():\n", + " nodeopacity = 0.8\n", + " color = node_trace_category[cat][\"nodecolor\"]\n", + " nodeshape = node_trace_category[cat][\"nodeshape\"]\n", + " linecolor = node_trace_category[cat][\"linecolor\"]\n", + " node_trace = create_node_trace(G, cat, color, nodesymbol=nodeshape, nodeopacity=nodeopacity, nodesize=10, linewidth=1, linecolor=linecolor)\n", + " traces.append(node_trace)\n", + " \n", + "fig2 = go.Figure(data=traces,\n", + " layout=go.Layout(\n", + " paper_bgcolor='rgba(0,0,0,0)',\n", + " plot_bgcolor='rgba(0,0,0,0)',\n", + " showlegend=True,\n", + " hovermode='closest',\n", + " margin=dict(b=20,l=5,r=5,t=40),\n", + " xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n", + " yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),\n", + " width=1200, height=800))\n", + "\n", + "outfile = Path(\"assets/figures/bigfam-query.html\")\n", + "outfile.parent.mkdir(parents=True, exist_ok=True)\n", + "fig2.write_html(outfile)\n", + "\n", + "display(HTML(filename=str(outfile)))" ] }, { @@ -66,7 +508,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/workflow/notebook/seqfu.py.ipynb b/workflow/notebook/seqfu.py.ipynb index 4c59a893..79a1b65c 100644 --- a/workflow/notebook/seqfu.py.ipynb +++ b/workflow/notebook/seqfu.py.ipynb @@ -53,17 +53,14 @@ "source": [ "source = df\n", "\n", - "chart = alt.Chart(source).mark_circle().encode(\n", - " alt.X(alt.repeat(\"column\"), type='quantitative', scale=alt.Scale(zero=False)),\n", - " alt.Y(alt.repeat(\"row\"), type='quantitative', scale=alt.Scale(zero=False)),\n", - " color='Total',\n", - " tooltip=['genome_id', 'Count', 'Total', 'gc', 'N50', 'N75', 'N90', 'AuN', 'Min', 'Max']\n", + "chart = alt.Chart(source).mark_circle(size=150).encode(\n", + " alt.X(\"gc\", type='quantitative', title='GC (%)', scale=alt.Scale(zero=False)).axis(format='.2%'),\n", + " alt.Y(\"N50\", type='quantitative', scale=alt.Scale(zero=False)),\n", + " color=alt.Color('Total', scale=alt.Scale(domain=[df['Total'].min(), df['Total'].max()], range=['blue', 'red'])),\n", + " tooltip=['genome_id', 'Count', 'Total', 'gc', 'N50', 'N75', 'N90', 'AuN', 'Min', 'Max', 'Organism']\n", ").properties(\n", - " width=150,\n", - " height=150,\n", - ").repeat(\n", - " row=['Count', 'Total', 'N50'],\n", - " column=['Count', 'Total', 'N50'],\n", + " width=500,\n", + " height=500,\n", ").properties(\n", " title = \"Genome QC Statistics\",\n", ").interactive()\n", @@ -117,7 +114,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.17" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/workflow/rules.yaml b/workflow/rules.yaml index 442a1e66..7be65a5a 100644 --- a/workflow/rules.yaml +++ b/workflow/rules.yaml @@ -179,7 +179,7 @@ deeptfactor: - 'Kim G.B., Gao Y., Palsson B.O., Lee S.Y. 2020. DeepTFactor: A deep learning-based tool for the prediction of transcription factors. [PNAS. doi: 10.1073/pnas.2021171118](https://www.pnas.org/doi/10.1073/pnas.2021171118)' deeptfactor-roary: - final_output: data/interim/deeptfactor_roary/{name}/ + final_output: data/processed/{name}/tables/df_deeptfactor_roary.tsv description: Use DeepTFactor on Roary outputs. category: Comparative Genomics link: diff --git a/workflow/rules/bgc_analytics.smk b/workflow/rules/bgc_analytics.smk index 0d28a8fd..4c208190 100644 --- a/workflow/rules/bgc_analytics.smk +++ b/workflow/rules/bgc_analytics.smk @@ -112,9 +112,11 @@ rule write_dependency_versions: "../envs/bgc_analytics.yaml" log: "logs/bgc_analytics/{name}/write_dependency_versions.log", + params: + antismash_version=antismash_major_version shell: """ - python workflow/bgcflow/bgcflow/data/get_dependencies.py {output} 2> {log} + python workflow/bgcflow/bgcflow/data/get_dependencies.py {output} {params.antismash_version} 2> {log} """ rule get_project_metadata: diff --git a/workflow/rules/bigslice.smk b/workflow/rules/bigslice.smk index abd81201..1052effc 100644 --- a/workflow/rules/bigslice.smk +++ b/workflow/rules/bigslice.smk @@ -25,7 +25,7 @@ rule bigslice: "../envs/bigslice.yaml" threads: 16 params: - threshold=900, + threshold="0.4", resources: tmpdir="data/interim/tempdir", log: @@ -65,12 +65,14 @@ rule query_bigslice: n_ranks=10, query_name="{name}", run_id=6, + threshold=300, + normalize="--no_normalize_feature ", resources: tmpdir="data/interim/tempdir", shell: """ TIMESTAMP=$(date --iso-8601=hours) - bigslice --query {input.tmp_dir} --n_ranks {params.n_ranks} {input.bigslice_dir} -t {threads} --query_name {params.query_name}_$TIMESTAMP --run_id {params.run_id} &>> {log} + bigslice --query {input.tmp_dir} --n_ranks {params.n_ranks} {input.bigslice_dir} -t {threads} --query_name {params.query_name}_$TIMESTAMP --run_id {params.run_id} --threshold {params.threshold} {params.normalize} &>> {log} python workflow/bgcflow/bgcflow/data/get_bigslice_query_result.py {params.query_name} {output.folder} {input.bigslice_dir} &>> {log} """ @@ -97,6 +99,7 @@ rule summarize_bigslice_query: rule annotate_bigfam_hits: input: + summary_table="data/processed/{name}/tables/df_antismash_{version}_summary.csv", gcf_summary_csv="data/processed/{name}/bigslice/query_as_{version}/gcf_summary.csv", output: models="data/processed/{name}/bigslice/query_as_{version}/gcf_annotation.csv", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 35eccef9..41b95aa5 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -8,7 +8,7 @@ from pathlib import Path import peppy min_version("7.14.0") -__version__ = "0.7.6" +__version__ = "0.7.7" container: "docker://matinnu/bgcflow:latest" diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 70948dba..f8597943 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -14,7 +14,7 @@ if len(py_wildcards) > 0: output: notebook="data/processed/{name}/docs/{bgcflow_rules_py}.ipynb", conda: - "../envs/bgc_analytics.yaml" + "../envs/bgcflow_notes.yaml" log: "logs/report/{bgcflow_rules_py}-report-copy-{name}.log", wildcard_constraints: @@ -33,7 +33,7 @@ if len(py_wildcards) > 0: output: markdown="data/processed/{name}/docs/{bgcflow_rules_py}.md", conda: - "../envs/bgc_analytics.yaml" + "../envs/bgcflow_notes.yaml" log: "logs/report/{bgcflow_rules_py}-report-{name}.log", wildcard_constraints: @@ -50,7 +50,7 @@ if len(rpy_wildcards) > 0: output: notebook="data/processed/{name}/docs/{bgcflow_rules_rpy}.ipynb", conda: - "../envs/bgc_analytics.yaml" + "../envs/bgcflow_notes.yaml" log: "logs/report/{bgcflow_rules_rpy}-report-copy-{name}.log", wildcard_constraints: diff --git a/workflow/rules/roary.smk b/workflow/rules/roary.smk index f060689d..6b5befbf 100644 --- a/workflow/rules/roary.smk +++ b/workflow/rules/roary.smk @@ -43,23 +43,24 @@ rule deeptfactor_roary: roary_dir="data/interim/roary/{name}/", resource="resources/deeptfactor/", output: - deeptfactor_dir=directory("data/interim/deeptfactor_roary/{name}/"), + df_deeptfactor="data/processed/{name}/tables/df_deeptfactor_roary.tsv", conda: "../envs/deeptfactor.yaml" threads: 8 log: "logs/deeptfactor-roary/deeptfactor-roary-{name}.log", params: - faa="../../data/interim/roary/{name}/pan_genome_reference.fa", - outdir="../../data/interim/deeptfactor_roary/{name}/", + outdir="data/interim/deeptfactor_roary/{name}", shell: """ + workdir=$PWD + mkdir -p data/processed/{wildcards.name} 2>> {log} (cd {input.resource} && python tf_running.py \ - -i {params.faa} -o {params.outdir} \ + -i $workdir/{input.roary_dir}/pan_genome_reference.fa -o $workdir/{params.outdir} \ -g cpu -cpu {threads}) 2>> {log} + cp {params.outdir}/prediction_result.txt {output.df_deeptfactor} &>> {log} """ - rule diamond_roary: input: roary_dir="data/interim/roary/{name}/",