Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Amrfinderplus #353

Merged
merged 3 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .examples/Lactobacillus_delbrueckii/project_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ rules:
cblaster-genome: TRUE
cblaster-bgc: FALSE
gecco: TRUE
amrfinderplus: FALSE
2 changes: 1 addition & 1 deletion .tests/config/test1/project_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ rules:
deeptfactor-roary: FALSE
cblaster-genome: TRUE
cblaster-bgc: TRUE
ppanggolin: TRUE
gecco: TRUE
amrfinderplus: TRUE
3 changes: 2 additions & 1 deletion .tests/config/test2/project_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ rules:
deeptfactor-roary: FALSE
cblaster-genome: TRUE
cblaster-bgc: TRUE
ppanggolin: TRUE
gecco: TRUE
amrfinderplus: TRUE
3 changes: 2 additions & 1 deletion .tests/config/test3/project_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ rules:
deeptfactor-roary: FALSE
cblaster-genome: TRUE
cblaster-bgc: TRUE
ppanggolin: TRUE
gecco: TRUE
amrfinderplus: TRUE
1 change: 1 addition & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ include: "rules/cblaster.smk"
include: "rules/data_warehouse.smk"
include: "rules/convert_genbank.smk"
include: "rules/gecco.smk"
include: "rules/amrfinderplus.smk"

##### 4. Generate user-defined local resources
custom_resource_dir(config["resources_path"], resource_mapping)
Expand Down
64 changes: 64 additions & 0 deletions workflow/bgcflow/bgcflow/data/gather_amrfinder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import logging
import sys
from pathlib import Path

import pandas as pd

log_format = "%(levelname)-8s %(asctime)s %(message)s"
date_format = "%d/%m %H:%M:%S"
logging.basicConfig(format=log_format, datefmt=date_format, level=logging.DEBUG)


def cleanup_amrfinder(input_file, genome_id=None):
    """
    Load one AMRFinder output table and make its protein identifiers complete.

    Rows whose 'Protein identifier' is null receive a synthetic identifier of the
    form '<Contig id>_<Start>_<Stop>'. A constant 'genome_id' column is appended.

    Parameters:
    input_file (str): Path to a tab-separated AMRFinder output file.
    genome_id (str, optional): Genome identifier. If not provided, it is the
        input file name without its final extension.

    Returns:
    pd.DataFrame: The table with 'Protein identifier' filled in and a
        'genome_id' column added.

    Raises:
    KeyError: If 'Protein identifier' is absent, or if 'Contig id', 'Start' or
        'Stop' is absent while at least one identifier has to be filled in.
    """
    input_path = Path(input_file)
    if genome_id is None:
        genome_id = input_path.stem
    df = pd.read_csv(input_path, sep="\t")

    missing = df["Protein identifier"].isna()
    if missing.any():
        # Build the fallback identifiers column-wise, and only for the rows that
        # need one, instead of calling a Python lambda on every row of the table.
        coords = df.loc[missing, ["Contig id", "Start", "Stop"]]
        fallback = (
            coords["Contig id"].astype(str)
            + "_"
            + coords["Start"].astype(str)
            + "_"
            + coords["Stop"].astype(str)
        )
        # An all-null column is read as float; cast to object so it can hold the
        # string fallbacks without a dtype clash.
        protein_id = df["Protein identifier"].astype(object)
        protein_id.loc[missing] = fallback
        df["Protein identifier"] = protein_id

    df["genome_id"] = genome_id

    return df


def gather_amrfinder(input_list, output_file):
    """
    Merge the AMRFinder tables named in a list file into a single CSV.

    Every listed table is passed through `cleanup_amrfinder`, the results are
    concatenated in listing order, and the combined table is written without
    an index column.

    Parameters:
    input_list (str): Path to a text file containing whitespace-separated paths
        to AMRFinder output tables (the paths themselves must not contain
        whitespace, because the file content is split on it).
    output_file (str): Path to the CSV file that receives the combined table.

    Returns:
    None

    Raises:
    ValueError: If the list file does not name any table.
    """
    with open(input_list, "r") as f:
        table_paths = f.read().split()

    # Fail early with a message that names the culprit; pd.concat would raise
    # the same exception type, but only with "No objects to concatenate".
    if not table_paths:
        raise ValueError(f"No input tables listed in {input_list}")

    logging.info("Gathering %d AMRFinder table(s)", len(table_paths))
    combined = pd.concat([cleanup_amrfinder(path) for path in table_paths])
    combined.to_csv(output_file, index=False)
    logging.info("Combined table written to %s", output_file)


if __name__ == "__main__":
    # Command line: gather_amrfinder.py <input_list> <output_csv>
    list_path = sys.argv[1]
    csv_path = sys.argv[2]
    gather_amrfinder(list_path, csv_path)
1 change: 1 addition & 0 deletions workflow/envs/amrfinderplus.post-deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Post-deploy hook for the amrfinderplus conda environment.
# NOTE(review): `-u` presumably downloads/updates the AMRFinderPlus database so the
# freshly created environment is usable — confirm against `amrfinder --help`.
amrfinder -u
7 changes: 7 additions & 0 deletions workflow/envs/amrfinderplus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: amrfinder
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- ncbi-amrfinderplus
92 changes: 92 additions & 0 deletions workflow/notebook/amrfinderplus.py.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "aa0f0d95-5e6b-436f-a971-9486d0841329",
"metadata": {},
"source": [
"# AMRFinderPlus\n",
"Summary of [AMRFinderPlus](https://github.com/ncbi/amr) results from project: `[{{ project().name }}]` \n",
"\n",
"## Description\n",
"Identify AMR genes and point mutations, and virulence and stress resistance genes in assembled bacterial nucleotide and protein sequence.\n",
"\n",
"## AMR Hits Overview"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d21448bb-afe9-4fdb-b964-2143a0ccd1e4",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from IPython.display import display, Markdown, HTML\n",
"from pathlib import Path\n",
"import altair as alt\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"from itables import to_html_datatable as DT\n",
"import itables.options as opt\n",
"opt.css = \"\"\"\n",
".itables table td { font-style: italic; font-size: .8em;}\n",
".itables table th { font-style: oblique; font-size: .8em; }\n",
"\"\"\"\n",
"opt.classes = [\"display\", \"compact\"]\n",
"opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53fb8b15-379c-4be1-827e-243602f89d95",
"metadata": {},
"outputs": [],
"source": [
"report_dir = Path(\"../\")\n",
"amr_table = report_dir / \"tables/df_amrfinderplus.csv\"\n",
"df = pd.read_csv(amr_table)\n",
"\n",
"display(HTML(DT(df, columnDefs=[{\"className\": \"dt-center\", \"targets\": \"_all\"}],)))"
]
},
{
"cell_type": "markdown",
"id": "4d8fe1ca-30f9-472a-9e01-4e9db61825b8",
"metadata": {},
"source": [
"[Download Table]({{ project().file_server() }}/tables/df_amrfinderplus.csv){:target=\"_blank\" .md-button}\n",
"\n",
"## References\n",
"<font size=\"2\">\n",
"{% for i in project().rule_used['amrfinderplus']['references'] %}\n",
"- *{{ i }}*\n",
"{% endfor %}\n",
"</font>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
8 changes: 8 additions & 0 deletions workflow/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,11 @@ gecco:
- "Accurate de novo identification of biosynthetic gene clusters with GECCO. Laura M Carroll,
Martin Larralde, Jonas Simon Fleck, Ruby Ponnudurai, Alessio Milanese, Elisa Cappio Barazzone,
Georg Zeller. bioRxiv 2021.05.03.442509; doi:10.1101/2021.05.03.442509"
amrfinderplus:
final_output: "data/processed/{name}/tables/df_amrfinderplus.csv"
description: Identify AMR genes and point mutations, and virulence and stress resistance genes in assembled bacterial nucleotide and protein sequence.
category: Genome Mining
link:
- https://github.com/ncbi/amr
references:
- "Feldgarden M, Brover V, Gonzalez-Escalona N, Frye JG, Haendiges J, Haft DH, Hoffmann M, Pettengill JB, Prasad AB, Tillman GE, Tyson GH, Klimke W. AMRFinderPlus and the Reference Gene Catalog facilitate examination of the genomic links among antimicrobial resistance, stress response, and virulence. Sci Rep. 2021 Jun 16;11(1):12728. doi: 10.1038/s41598-021-91456-0. PMID: 34135355; PMCID: PMC8208984."
52 changes: 52 additions & 0 deletions workflow/rules/amrfinderplus.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Run `amrfinder` once per strain on the nucleotide FASTA together with the
# Prokka protein FASTA and GFF; the command's stdout becomes the per-strain TSV.
rule amrfinderplus:
input:
fna="data/interim/fasta/{strains}.fna",
faa="data/interim/prokka/{strains}/{strains}.faa",
gff="data/interim/prokka/{strains}/{strains}.gff",
output:
# Per-strain table; rule amrfinder_gather expands this same path pattern.
table="data/interim/amrfinderplus/{strains}.tsv",
log:
# Handed to amrfinder through its own --log option (stderr is not redirected here).
"logs/amrfinderplus/amrfinderplus/{strains}.log"
conda:
"../envs/amrfinderplus.yaml"
params:
# Substituted into the command line below as -a, --ident_min, --coverage_min
# and --translation_table respectively.
annotation="prokka",
# NOTE(review): -1 presumably selects amrfinder's built-in identity cutoffs — confirm in the AMRFinderPlus docs.
ident_min="-1",
coverage_min="0.5",
translation_table="11",
# Forwarded to amrfinder via --threads.
threads: 4
shell:
"""
amrfinder -p {input.faa} \
-n {input.fna} \
-g {input.gff} \
-a {params.annotation} \
--plus \
--ident_min {params.ident_min} \
--coverage_min {params.coverage_min} \
--translation_table {params.translation_table} \
--threads {threads} \
--log {log} > {output.table}
"""

# Combine the per-strain AMRFinderPlus tables of one project into a single CSV.
# The input paths are written to a temporary list file, which is handed to
# gather_amrfinder.py and removed afterwards.
rule amrfinder_gather:
    input:
        # One table per unique genome_id in the project's sample table.
        tables=lambda wildcards: expand(
            "data/interim/amrfinderplus/{strains}.tsv",
            strains=list(PEP_PROJECTS[wildcards.name].sample_table.genome_id.unique()),
        ),
    output:
        table="data/processed/{name}/tables/df_amrfinderplus.csv"
    conda:
        "../envs/bgc_analytics.yaml"
    log:
        "logs/amrfinderplus/gather/amrfinderplus_gather_{name}.log"
    shell:
        # LIST_DIR is used instead of TMPDIR so the conventional temp-dir
        # environment variable seen by child processes is left untouched;
        # all expansions are quoted so unusual project names cannot split words.
        """
        LIST_DIR="data/interim/tmp/{wildcards.name}"
        mkdir -p "$LIST_DIR"
        INPUT_TSV="$LIST_DIR/df_amrfinderplus.txt"
        echo '{input.tables}' > "$INPUT_TSV"
        python workflow/bgcflow/bgcflow/data/gather_amrfinder.py "$INPUT_TSV" {output.table} 2>> {log}
        rm "$INPUT_TSV"
        """
Loading