Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add fetch-busco-db action and update evaluate-busco accordingly #143

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions q2_moshpit/busco/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .busco import evaluate_busco
from .busco import evaluate_busco, fetch_busco_db

__all__ = ["evaluate_busco"]
__all__ = ["evaluate_busco", "fetch_busco_db"]
81 changes: 76 additions & 5 deletions q2_moshpit/busco/busco.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,27 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------


import os
import tempfile
import warnings
import q2_moshpit.busco.utils
from q2_moshpit.busco.utils import (
_parse_busco_params,
_render_html,
)
from q2_moshpit._utils import _process_common_input_params
from q2_moshpit._utils import (
_process_common_input_params,
colorify,
run_command
)
from q2_types.reference_db._format import BuscoDatabaseDirFmt
from q2_types.per_sample_sequences._format import MultiMAGSequencesDirFmt


def evaluate_busco(
output_dir: str,
bins: MultiMAGSequencesDirFmt,
busco_db: BuscoDatabaseDirFmt = None,
mode: str = "genome",
lineage_dataset: str = None,
augustus: bool = False,
Expand Down Expand Up @@ -53,14 +58,46 @@ def evaluate_busco(
busco_output: all busco output files
qiime_html: html for rendering the output plots
"""

# Create dictionary with local variables
# (kwargs passed to the function or their defaults) excluding
# "output_dir", "bins" and "busco_db"
kwargs = {
k: v for k, v in locals().items() if k not in ["output_dir", "bins"]
k: v for k, v in locals().items() if k not in [
"output_dir", "bins", "busco_db"
]
}

# Add busco_db to kwargs
if busco_db is not None:
kwargs["offline"] = True
kwargs["download_path"] = f"{str(busco_db)}/busco_downloads"

# Validate lineage_dataset input if provided.
if lineage_dataset is not None:
if any([auto_lineage, auto_lineage_euk, auto_lineage_prok]):
warnings.warn(
f"`--p-lineage-dataset` was specified as {lineage_dataset}. "
"--p-auto-lineage flags will be ignored."
)
kwargs["auto_lineage"] = False
kwargs["auto_lineage_euk"] = False
kwargs["auto_lineage_prok"] = False

# Check that the lineage indeed exists inside the Busco DB (if provided)
if busco_db is not None:
if not os.path.exists(
f"{str(busco_db)}/busco_downloads/lineages/{lineage_dataset}"
):
present_lineages = os.listdir(
os.path.join(str(busco_db), "busco_downloads/lineages/")
)
raise ValueError(
f"The specified --p-lineage-dataset {lineage_dataset} "
"is not present in input database (--i-busco-db). \n"
"Printing lineage datasets present in input database: \n"
f"{present_lineages}"
)

# Filter out all kwargs that are None, False or 0.0
common_args = _process_common_input_params(
processing_func=_parse_busco_params, params=kwargs
Expand Down Expand Up @@ -106,3 +143,37 @@ def evaluate_busco(
# Render qiime html report
# Result included in final output
_render_html(output_dir, all_summaries_df)


def fetch_busco_db(
    virus: bool, prok: bool, euk: bool
) -> BuscoDatabaseDirFmt:
    """
    Download BUSCO lineage datasets into a new BUSCO database directory.

    The download is performed by running `busco --download <datasets>`
    with the newly created database directory as working directory.

    Args:
        virus (bool): Download the "virus" datasets.
        prok (bool): Download the "prokaryota" datasets.
        euk (bool): Download the "eukaryota" datasets.

    Returns:
        BuscoDatabaseDirFmt: the directory in which the download command
            was executed.

    Raises:
        ValueError: if none of `virus`, `prok` or `euk` is set, since
            there would be nothing to download.
    """
    # Collect the BUSCO dataset names of every group that was requested
    selected = [
        dataset_name
        for dataset_name, flag in [
            ('virus', virus),
            ('prokaryota', prok),
            ('eukaryota', euk)
        ]
        if flag
    ]

    # Fail early with a clear message instead of launching
    # `busco --download` without any dataset to download
    if not selected:
        raise ValueError(
            "No BUSCO datasets were selected for download. At least one "
            "of 'virus', 'prok' or 'euk' must be set to True."
        )

    # When every group is requested a single "all" argument is passed
    args = ["all"] if all([virus, prok, euk]) else selected

    # Init output object
    busco_db = BuscoDatabaseDirFmt(path=None, mode='w')

    # Download
    print(colorify("Downloading BUSCO database..."))
    run_command(cmd=["busco", "--download", *args], cwd=str(busco_db))

    # Let the user know that the download is complete but that some more
    # time is still needed
    print(colorify(
        "Download completed. \n"
        "Copying files from temporary directory to final location..."
    ))

    return busco_db
187 changes: 177 additions & 10 deletions q2_moshpit/busco/tests/test_busco.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
import tempfile
import pandas as pd
from q2_moshpit.busco.busco import evaluate_busco
from q2_moshpit.busco.busco import evaluate_busco, fetch_busco_db
from unittest.mock import patch, ANY
from qiime2.plugin.testing import TestPluginBase
from q2_types.reference_db._format import BuscoDatabaseDirFmt
from q2_types.per_sample_sequences._format import MultiMAGSequencesDirFmt


class TestBUSCO(TestPluginBase):
class TestEvaluateBUSCO(TestPluginBase):
package = "q2_moshpit.busco.tests"

@classmethod
Expand All @@ -30,7 +30,6 @@ def setUpClass(self):
mode="r",
)

# Integration test busco.
@patch('q2_moshpit.busco.utils._run_busco')
@patch('q2_moshpit.busco.utils._zip_busco_plots')
@patch('q2_moshpit.busco.utils._draw_busco_plots')
Expand All @@ -56,9 +55,6 @@ def test_integration_busco(
run_busco (unittest.mock): mock object for function
`_run_busco`.
"""
# import shutil
# path_to_look_at_html = "/Users/santiago/Downloads/busco_debug_bench"

with tempfile.TemporaryDirectory() as tmp_path:
# This side effect will return the all_summaries_dfs
p = self.get_data_path("all_batch_summaries.csv")
Expand All @@ -67,9 +63,6 @@ def test_integration_busco(
# Run busco
evaluate_busco(output_dir=str(tmp_path), bins=self.mags)

# For render debugging
# shutil.copytree(str(tmp_path), path_to_look_at_html)

# Check for the existence of the html file
self.assertTrue(os.path.exists(f"{tmp_path}/index.html"))

Expand Down Expand Up @@ -102,3 +95,177 @@ def test_integration_busco(
paths_to_plots=ANY,
zip_path=os.path.join(tmp_path, "busco_plots.zip")
)

@patch('q2_moshpit.busco.utils._run_busco')
@patch('q2_moshpit.busco.utils._zip_busco_plots')
@patch('q2_moshpit.busco.utils._draw_busco_plots')
@patch('q2_moshpit.busco.utils._collect_summaries_and_save')
def test_integration_busco_offline(
    self,
    collect_summaries,
    draw_busco_plots,
    zip_busco_plots,
    run_busco
):
    """
    Checks that, when a `busco_db` is given, `_run_busco` receives the
    `--offline` flag and a `--download_path` inside that database.

    Args:
        collect_summaries (unittest.mock): mock object for function
            `_collect_summaries_and_save`.
        draw_busco_plots (unittest.mock): mock object for function
            `_draw_busco_plots`.
        zip_busco_plots (unittest.mock): mock object for function
            `_zip_busco_plots`.
        run_busco (unittest.mock): mock object for function
            `_run_busco`.
    """
    with tempfile.TemporaryDirectory() as tmp_path:
        # The mocked summary collection returns the all_summaries_dfs
        p = self.get_data_path("all_batch_summaries.csv")
        collect_summaries.return_value = pd.read_csv(p)

        busco_db = BuscoDatabaseDirFmt(path=None, mode="w")
        # Run busco
        evaluate_busco(
            output_dir=str(tmp_path),
            bins=self.mags,
            busco_db=busco_db
        )

        # Assert that the calls were done properly.
        # `output_dir` is matched with ANY: reading it back from
        # `run_busco.call_args` would raise an AttributeError (instead
        # of a readable assertion failure) if the mock was never called.
        run_busco.assert_called_once_with(
            output_dir=ANY,
            mags=self.mags,
            params=[
                '--mode', 'genome',
                '--cpu', '1',
                '--contig_break', '10',
                '--evalue', '0.001',
                '--limit', '3',
                '--offline',
                '--download_path', f"{str(busco_db)}/busco_downloads"
            ]
        )

@patch('q2_moshpit.busco.utils._run_busco')
@patch('q2_moshpit.busco.utils._zip_busco_plots')
@patch('q2_moshpit.busco.utils._draw_busco_plots')
@patch('q2_moshpit.busco.utils._collect_summaries_and_save')
def test_integration_busco_w_lineage(
    self,
    collect_summaries,
    draw_busco_plots,
    zip_busco_plots,
    run_busco
):
    """
    Checks that specifying `lineage_dataset` together with the
    auto-lineage flags emits a warning and that the auto-lineage flags
    are not forwarded to `_run_busco`.

    Args:
        collect_summaries (unittest.mock): mock object for function
            `_collect_summaries_and_save`.
        draw_busco_plots (unittest.mock): mock object for function
            `_draw_busco_plots`.
        zip_busco_plots (unittest.mock): mock object for function
            `_zip_busco_plots`.
        run_busco (unittest.mock): mock object for function
            `_run_busco`.
    """
    with tempfile.TemporaryDirectory() as tmp_path:
        # The mocked summary collection returns the all_summaries_dfs
        p = self.get_data_path("all_batch_summaries.csv")
        collect_summaries.return_value = pd.read_csv(p)

        with self.assertWarnsRegex(
            Warning,
            "`--p-lineage-dataset` was specified as lineage_1. "
            "--p-auto-lineage flags will be ignored."
        ):
            # Run busco
            evaluate_busco(
                output_dir=str(tmp_path),
                bins=self.mags,
                lineage_dataset="lineage_1",
                auto_lineage=True,
                auto_lineage_euk=True,
                auto_lineage_prok=True,
            )

        # Assert that the calls were done properly.
        # `output_dir` is matched with ANY: reading it back from
        # `run_busco.call_args` would raise an AttributeError (instead
        # of a readable assertion failure) if the mock was never called.
        run_busco.assert_called_once_with(
            output_dir=ANY,
            mags=self.mags,
            params=[
                '--mode', 'genome',
                '--lineage_dataset', "lineage_1",
                '--cpu', '1',
                '--contig_break', '10',
                '--evalue', '0.001',
                '--limit', '3',
            ]
        )

@patch('q2_moshpit.busco.utils._run_busco')
@patch('q2_moshpit.busco.utils._zip_busco_plots')
@patch('q2_moshpit.busco.utils._draw_busco_plots')
@patch('q2_moshpit.busco.utils._collect_summaries_and_save')
def test_integration_busco_offline_w_lineage(
    self,
    collect_summaries,
    draw_busco_plots,
    zip_busco_plots,
    run_busco
):
    """
    Checks that a `lineage_dataset` found in the given `busco_db` is
    forwarded to `_run_busco` together with the offline parameters.

    Args:
        collect_summaries (unittest.mock): mock object for function
            `_collect_summaries_and_save`.
        draw_busco_plots (unittest.mock): mock object for function
            `_draw_busco_plots`.
        zip_busco_plots (unittest.mock): mock object for function
            `_zip_busco_plots`.
        run_busco (unittest.mock): mock object for function
            `_run_busco`.
    """
    with tempfile.TemporaryDirectory() as tmp_path:
        # The mocked summary collection returns the all_summaries_dfs
        p = self.get_data_path("all_batch_summaries.csv")
        collect_summaries.return_value = pd.read_csv(p)

        # Give path to valid database
        p = self.get_data_path("busco_db")
        busco_db = BuscoDatabaseDirFmt(path=p, mode="r")

        # Run busco
        evaluate_busco(
            output_dir=str(tmp_path),
            bins=self.mags,
            busco_db=busco_db,
            lineage_dataset="lineage_1",
        )

        # Assert that the calls were done properly.
        # `output_dir` is matched with ANY: reading it back from
        # `run_busco.call_args` would raise an AttributeError (instead
        # of a readable assertion failure) if the mock was never called.
        run_busco.assert_called_once_with(
            output_dir=ANY,
            mags=self.mags,
            params=[
                '--mode', 'genome',
                '--lineage_dataset', "lineage_1",
                '--cpu', '1',
                '--contig_break', '10',
                '--evalue', '0.001',
                '--limit', '3',
                '--offline',
                '--download_path', f"{str(busco_db)}/busco_downloads"
            ]
        )

def test_integration_busco_offline_w_lineage_invalid(self):
    """
    Checks that `evaluate_busco` raises a ValueError when the requested
    `lineage_dataset` is not present in the given `busco_db`.
    """
    # Database shipped with the test data
    # (presumably it does not contain "lineage_2" - verify in test data)
    db_path = self.get_data_path("busco_db")
    expected_error = "lineage_2 is not present in input database"

    with tempfile.TemporaryDirectory() as tmp_path:
        busco_db = BuscoDatabaseDirFmt(path=db_path, mode="r")

        # Running busco with the unknown lineage has to fail
        with self.assertRaisesRegex(ValueError, expected_error):
            evaluate_busco(
                output_dir=str(tmp_path),
                bins=self.mags,
                busco_db=busco_db,
                lineage_dataset="lineage_2",
            )


class TestFetchBUSCO(TestPluginBase):
    """Tests for `fetch_busco_db`; `subprocess.run` is always mocked."""
    package = "q2_moshpit.busco.tests"

    def _assert_download_call(self, subp_run, busco_db, datasets):
        """
        Asserts that `subprocess.run` was called exactly once with the
        `busco --download` command for `datasets`, using the returned
        database directory as working directory.
        """
        subp_run.assert_called_once_with(
            ["busco", "--download", *datasets],
            check=True,
            cwd=str(busco_db)
        )

    @patch("subprocess.run")
    def test_fetch_busco_db_virus(self, subp_run):
        # Only the virus datasets are requested
        db = fetch_busco_db(virus=True, prok=False, euk=False)
        self._assert_download_call(subp_run, db, ["virus"])

    @patch("subprocess.run")
    def test_fetch_busco_db_prok_euk(self, subp_run):
        # Two of the three groups are requested: both are listed
        db = fetch_busco_db(virus=False, prok=True, euk=True)
        self._assert_download_call(
            subp_run, db, ["prokaryota", "eukaryota"]
        )

    @patch("subprocess.run")
    def test_fetch_busco_db_all(self, subp_run):
        # All groups are requested: a single "all" argument is expected
        db = fetch_busco_db(virus=True, prok=True, euk=True)
        self._assert_download_call(subp_run, db, ["all"])
9 changes: 7 additions & 2 deletions q2_moshpit/busco/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,8 @@ def test_run_busco(self, subp_run):
"--out_path", output_dir,
"-o", sample_id
],
check=True
check=True,
cwd=os.path.dirname(output_dir)
))

# Run busco and save paths to run summaries
Expand Down Expand Up @@ -322,7 +323,11 @@ def test_run_busco_exception(self, subp_run):
"--out_path", output_dir,
"-o", "sample1"
]
subp_run.assert_called_once_with(cmd, check=True)
subp_run.assert_called_once_with(
cmd,
check=True,
cwd=tmp_path
)

def test_parse_df_columns(self):
# This side effect will return the all_summaries_dfs
Expand Down
2 changes: 1 addition & 1 deletion q2_moshpit/busco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def _run_busco(
"-o",
sample
])
run_command(cmd)
run_command(cmd, cwd=os.path.dirname(output_dir))

# Check for output
path_to_run_summary = os.path.join(
Expand Down
Loading
Loading