From 47526367ec4aaf18a4038cc2584642e9b3d0145f Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Mon, 15 Jul 2024 00:02:55 +0200 Subject: [PATCH 1/3] feat: add AMRFinderPlus pipeline --- .../project_config.yaml | 1 + workflow/Snakefile | 1 + .../bgcflow/bgcflow/data/gather_amrfinder.py | 64 +++++++++++++ workflow/envs/amrfinderplus.post-deploy.sh | 1 + workflow/envs/amrfinderplus.yaml | 7 ++ workflow/notebook/amrfinderplus.py.ipynb | 92 +++++++++++++++++++ workflow/rules.yaml | 8 ++ workflow/rules/amrfinderplus.smk | 52 +++++++++++ 8 files changed, 226 insertions(+) create mode 100644 workflow/bgcflow/bgcflow/data/gather_amrfinder.py create mode 100644 workflow/envs/amrfinderplus.post-deploy.sh create mode 100644 workflow/envs/amrfinderplus.yaml create mode 100644 workflow/notebook/amrfinderplus.py.ipynb create mode 100644 workflow/rules/amrfinderplus.smk diff --git a/.examples/Lactobacillus_delbrueckii/project_config.yaml b/.examples/Lactobacillus_delbrueckii/project_config.yaml index c51e6c02..d811dda7 100644 --- a/.examples/Lactobacillus_delbrueckii/project_config.yaml +++ b/.examples/Lactobacillus_delbrueckii/project_config.yaml @@ -29,3 +29,4 @@ rules: cblaster-genome: TRUE cblaster-bgc: FALSE gecco: TRUE + amrfinderplus: TRUE diff --git a/workflow/Snakefile b/workflow/Snakefile index ea23261c..81e33db6 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -92,6 +92,7 @@ include: "rules/cblaster.smk" include: "rules/data_warehouse.smk" include: "rules/convert_genbank.smk" include: "rules/gecco.smk" +include: "rules/amrfinderplus.smk" ##### 4. Generate user-defined local resources custom_resource_dir(config["resources_path"], resource_mapping) diff --git a/workflow/bgcflow/bgcflow/data/gather_amrfinder.py b/workflow/bgcflow/bgcflow/data/gather_amrfinder.py new file mode 100644 index 00000000..e296f03d --- /dev/null +++ b/workflow/bgcflow/bgcflow/data/gather_amrfinder.py @@ -0,0 +1,64 @@ +import logging +import sys +from pathlib import Path + +import pandas as pd + +log_format = "%(levelname)-8s %(asctime)s %(message)s" +date_format = "%d/%m %H:%M:%S" +logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG) + + +def cleanup_amrfinder(input_file, genome_id=None): + """ + Cleans up the AMRFinder output file by filling null values in the 'Protein identifier' column + with a combination of 'Contig id', 'Start', and 'Stop' coordinates. Adds a 'genome_id' column. + + Parameters: + input_file (str): Path to the input file. + genome_id (str, optional): Genome identifier. If not provided, it will be derived from the input file name. + + Returns: + pd.DataFrame: Cleaned DataFrame. + """ + input_path = Path(input_file) + if genome_id is None: + genome_id = input_path.stem + df = pd.read_csv(input_path, sep="\t") + + # Fill null values in 'Protein identifier' with a combination of 'Contig id', 'Start', and 'Stop' + df["Protein identifier"] = df.apply( + lambda row: row["Protein identifier"] + if pd.notnull(row["Protein identifier"]) + else f"{row['Contig id']}_{row['Start']}_{row['Stop']}", + axis=1, + ) + df["genome_id"] = genome_id + + return df + + +def gather_amrfinder(input_list, output_file): + """ + Gathers and cleans up multiple AMRFinder output files listed in an input file, + concatenates them into a single DataFrame, and writes the result to an output file. + + Parameters: + input_list (str): Path to the file containing a list of input file paths. + output_file (str): Path to the output file where the concatenated DataFrame will be saved. + + Returns: + None + """ + dataframes = [] + + with open(input_list, "r") as f: + data = f.readlines() + + dataframes = pd.concat([cleanup_amrfinder(i.strip("\n")) for i in data]) + dataframes.to_csv(output_file, index=False) + return + + +if __name__ == "__main__": + gather_amrfinder(sys.argv[1], sys.argv[2]) diff --git a/workflow/envs/amrfinderplus.post-deploy.sh b/workflow/envs/amrfinderplus.post-deploy.sh new file mode 100644 index 00000000..0ab98e3e --- /dev/null +++ b/workflow/envs/amrfinderplus.post-deploy.sh @@ -0,0 +1 @@ +amrfinder -u diff --git a/workflow/envs/amrfinderplus.yaml b/workflow/envs/amrfinderplus.yaml new file mode 100644 index 00000000..666a7da8 --- /dev/null +++ b/workflow/envs/amrfinderplus.yaml @@ -0,0 +1,7 @@ +name: amrfinder +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - ncbi-amrfinderplus diff --git a/workflow/notebook/amrfinderplus.py.ipynb b/workflow/notebook/amrfinderplus.py.ipynb new file mode 100644 index 00000000..792f3520 --- /dev/null +++ b/workflow/notebook/amrfinderplus.py.ipynb @@ -0,0 +1,92 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "aa0f0d95-5e6b-436f-a971-9486d0841329", + "metadata": {}, + "source": [ + "# AMRFinderPlus\n", + "Summary of [AMRFinderPlus](https://github.com/ncbi/amr) results from project: `[{{ project().name }}]` \n", + "\n", + "## Description\n", + "Identify AMR genes and point mutations, and virulence and stress resistance genes in assembled bacterial nucleotide and protein sequence.\n", + "\n", + "## AMR Hits Overview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d21448bb-afe9-4fdb-b964-2143a0ccd1e4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from IPython.display import display, Markdown, HTML\n", + "from pathlib import Path\n", + "import altair as alt\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "from itables import to_html_datatable as DT\n", + "import itables.options as opt\n", + "opt.css = \"\"\"\n", + ".itables table td { font-style: italic; font-size: .8em;}\n", + ".itables table th { font-style: oblique; font-size: .8em; }\n", + "\"\"\"\n", + "opt.classes = [\"display\", \"compact\"]\n", + "opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53fb8b15-379c-4be1-827e-243602f89d95", + "metadata": {}, + "outputs": [], + "source": [ + "report_dir = Path(\"../\")\n", + "amr_table = report_dir / \"tables/df_amrfinderplus.csv\"\n", + "df = pd.read_csv(amr_table)\n", + "\n", + "display(HTML(DT(df, columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\"}],)))" + ] + }, + { + "cell_type": "markdown", + "id": "4d8fe1ca-30f9-472a-9e01-4e9db61825b8", + "metadata": {}, + "source": [ + "[Download Table]({{ project().file_server() }}/tables/df_amrfinderplus.csv){:target=\"_blank\" .md-button}\n", + "\n", + "## References\n", + "\n", + "{% for i in project().rule_used['amrfinderplus']['references'] %}\n", + "- *{{ i }}*\n", + "{% endfor %}\n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflow/rules.yaml b/workflow/rules.yaml index 05ad061c..e6b9f350 100644 --- a/workflow/rules.yaml +++ b/workflow/rules.yaml @@ -241,3 +241,11 @@ gecco: - "Accurate de novo identification of biosynthetic gene clusters with GECCO. Laura M Carroll, Martin Larralde, Jonas Simon Fleck, Ruby Ponnudurai, Alessio Milanese, Elisa Cappio Barazzone, Georg Zeller. bioRxiv 2021.05.03.442509; doi:10.1101/2021.05.03.442509" +amrfinderplus: + final_output: "data/processed/{name}/tables/df_amrfinderplus.csv" + description: Identify AMR genes and point mutations, and virulence and stress resistance genes in assembled bacterial nucleotide and protein sequence. + category: Genome Mining + link: + - https://github.com/ncbi/amr + references: + - "Feldgarden M, Brover V, Gonzalez-Escalona N, Frye JG, Haendiges J, Haft DH, Hoffmann M, Pettengill JB, Prasad AB, Tillman GE, Tyson GH, Klimke W. AMRFinderPlus and the Reference Gene Catalog facilitate examination of the genomic links among antimicrobial resistance, stress response, and virulence. Sci Rep. 2021 Jun 16;11(1):12728. doi: 10.1038/s41598-021-91456-0. PMID: 34135355; PMCID: PMC8208984." diff --git a/workflow/rules/amrfinderplus.smk b/workflow/rules/amrfinderplus.smk new file mode 100644 index 00000000..b7333670 --- /dev/null +++ b/workflow/rules/amrfinderplus.smk @@ -0,0 +1,52 @@ +rule amrfinderplus: + input: + fna="data/interim/fasta/{strains}.fna", + faa="data/interim/prokka/{strains}/{strains}.faa", + gff="data/interim/prokka/{strains}/{strains}.gff", + output: + table="data/interim/amrfinderplus/{strains}.tsv", + log: + "logs/amrfinderplus/amrfinderplus/{strains}.log" + conda: + "../envs/amrfinderplus.yaml" + params: + annotation="prokka", + ident_min="-1", + coverage_min="0.5", + translation_table="11", + threads: 4 + shell: + """ + amrfinder -p {input.faa} \ + -n {input.fna} \ + -g {input.gff} \ + -a {params.annotation} \ + --plus \ + --ident_min {params.ident_min} \ + --coverage_min {params.coverage_min} \ + --translation_table {params.translation_table} \ + --threads {threads} \ + --log {log} > {output.table} + """ + +rule amrfinder_gather: + input: + tables=lambda wildcards: expand( + "data/interim/amrfinderplus/{strains}.tsv", + strains=[s for s in PEP_PROJECTS[wildcards.name].sample_table.genome_id.unique()], + ), + output: + table="data/processed/{name}/tables/df_amrfinderplus.csv" + conda: + "../envs/bgc_analytics.yaml" + log: + "logs/amrfinderplus/gather/amrfinderplus_gather_{name}.log" + shell: + """ + TMPDIR="data/interim/tmp/{wildcards.name}" + mkdir -p $TMPDIR + INPUT_TSV="$TMPDIR/df_amrfinderplus.txt" + echo '{input.tables}' > $INPUT_TSV + python workflow/bgcflow/bgcflow/data/gather_amrfinder.py $INPUT_TSV {output.table} 2>> {log} + rm $INPUT_TSV + """ From f8248d45a74ba33924e240e1adfe2376df02f2c8 Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Mon, 15 Jul 2024 08:21:13 +0200 Subject: [PATCH 2/3] fix: correct reading multiple lines for amrfinder --- .examples/Lactobacillus_delbrueckii/project_config.yaml | 2 +- workflow/bgcflow/bgcflow/data/gather_amrfinder.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.examples/Lactobacillus_delbrueckii/project_config.yaml b/.examples/Lactobacillus_delbrueckii/project_config.yaml index d811dda7..a116871b 100644 --- a/.examples/Lactobacillus_delbrueckii/project_config.yaml +++ b/.examples/Lactobacillus_delbrueckii/project_config.yaml @@ -29,4 +29,4 @@ rules: cblaster-genome: TRUE cblaster-bgc: FALSE gecco: TRUE - amrfinderplus: TRUE + amrfinderplus: FALSE diff --git a/workflow/bgcflow/bgcflow/data/gather_amrfinder.py b/workflow/bgcflow/bgcflow/data/gather_amrfinder.py index e296f03d..5befd5e0 100644 --- a/workflow/bgcflow/bgcflow/data/gather_amrfinder.py +++ b/workflow/bgcflow/bgcflow/data/gather_amrfinder.py @@ -53,9 +53,9 @@ def gather_amrfinder(input_list, output_file): dataframes = [] with open(input_list, "r") as f: - data = f.readlines() + data = f.read() - dataframes = pd.concat([cleanup_amrfinder(i.strip("\n")) for i in data]) + dataframes = pd.concat([cleanup_amrfinder(i) for i in data.split()]) dataframes.to_csv(output_file, index=False) return From 1caf8f65f68e9892f4f6f0192bee377d7f14503d Mon Sep 17 00:00:00 2001 From: Matin Nuhamunada Date: Mon, 15 Jul 2024 08:24:47 +0200 Subject: [PATCH 3/3] tests: update test with latest config options --- .tests/config/test1/project_config.yaml | 2 +- .tests/config/test2/project_config.yaml | 3 ++- .tests/config/test3/project_config.yaml | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.tests/config/test1/project_config.yaml b/.tests/config/test1/project_config.yaml index d5cf75f8..4dac2d3c 100644 --- a/.tests/config/test1/project_config.yaml +++ b/.tests/config/test1/project_config.yaml @@ -20,5 +20,5 @@ rules: deeptfactor-roary: FALSE cblaster-genome: TRUE cblaster-bgc: TRUE - ppanggolin: TRUE gecco: TRUE + amrfinderplus: TRUE diff --git a/.tests/config/test2/project_config.yaml b/.tests/config/test2/project_config.yaml index 2b2976cd..faea834d 100644 --- a/.tests/config/test2/project_config.yaml +++ b/.tests/config/test2/project_config.yaml @@ -20,4 +20,5 @@ rules: deeptfactor-roary: FALSE cblaster-genome: TRUE cblaster-bgc: TRUE - ppanggolin: TRUE + gecco: TRUE + amrfinderplus: TRUE diff --git a/.tests/config/test3/project_config.yaml b/.tests/config/test3/project_config.yaml index e71663cb..4dac2d3c 100644 --- a/.tests/config/test3/project_config.yaml +++ b/.tests/config/test3/project_config.yaml @@ -20,4 +20,5 @@ rules: deeptfactor-roary: FALSE cblaster-genome: TRUE cblaster-bgc: TRUE - ppanggolin: TRUE + gecco: TRUE + amrfinderplus: TRUE