diff --git a/q2_moshpit/busco/__init__.py b/q2_moshpit/busco/__init__.py index 080a3f3b..43a1f40a 100644 --- a/q2_moshpit/busco/__init__.py +++ b/q2_moshpit/busco/__init__.py @@ -6,6 +6,6 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -from .busco import evaluate_busco +from .busco import evaluate_busco, fetch_busco_db -__all__ = ["evaluate_busco"] +__all__ = ["evaluate_busco", "fetch_busco_db"] diff --git a/q2_moshpit/busco/busco.py b/q2_moshpit/busco/busco.py index fbcbae99..60047616 100644 --- a/q2_moshpit/busco/busco.py +++ b/q2_moshpit/busco/busco.py @@ -5,22 +5,27 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - - import os import tempfile +import warnings import q2_moshpit.busco.utils from q2_moshpit.busco.utils import ( _parse_busco_params, _render_html, ) -from q2_moshpit._utils import _process_common_input_params +from q2_moshpit._utils import ( + _process_common_input_params, + colorify, + run_command +) +from q2_types.reference_db._format import BuscoDatabaseDirFmt from q2_types.per_sample_sequences._format import MultiMAGSequencesDirFmt def evaluate_busco( output_dir: str, bins: MultiMAGSequencesDirFmt, + busco_db: BuscoDatabaseDirFmt = None, mode: str = "genome", lineage_dataset: str = None, augustus: bool = False, @@ -53,14 +58,46 @@ def evaluate_busco( busco_output: all busco output files qiime_html: html for rendering the output plots """ - # Create dictionary with local variables # (kwargs passed to the function or their defaults) excluding # "output_dir" and "bins" kwargs = { - k: v for k, v in locals().items() if k not in ["output_dir", "bins"] + k: v for k, v in locals().items() if k not in [ + "output_dir", "bins", "busco_db" + ] } + # Add busco_db to kwargs + if busco_db is not None: + kwargs["offline"] = True + 
def fetch_busco_db(
        virus: bool, prok: bool, euk: bool
) -> BuscoDatabaseDirFmt:
    """Download BUSCO lineage datasets into a new database directory.

    Runs ``busco --download <targets>`` with a freshly created
    ``BuscoDatabaseDirFmt`` directory as the working directory. When all
    three flags are set the single target ``all`` is requested; otherwise
    one target per selected group is passed, in the order ``virus``,
    ``prokaryota``, ``eukaryota``.

    Args:
        virus: Download the virus datasets.
        prok: Download the prokaryota datasets.
        euk: Download the eukaryota datasets.

    Returns:
        The directory format object in which the download command was run.

    Raises:
        ValueError: If none of the three flags is set, since there would
            be nothing to download.
    """
    # Collect the BUSCO download targets that were requested.
    selected = [
        target
        for target, flag in (
            ('virus', virus),
            ('prokaryota', prok),
            ('eukaryota', euk),
        )
        if flag
    ]

    # Fail early with a clear message instead of invoking
    # `busco --download` without any target.
    if not selected:
        raise ValueError(
            "At least one of 'virus', 'prok' or 'euk' must be set to True "
            "to download a BUSCO database."
        )

    # Selecting every group is expressed to BUSCO as the single target "all".
    args = ["all"] if len(selected) == 3 else selected

    # Init output object
    busco_db = BuscoDatabaseDirFmt(path=None, mode='w')

    # Download
    # NOTE(review): presumably BUSCO writes into `busco_downloads/` under
    # the working directory (evaluate_busco reads from that sub-directory)
    # - confirm against the BUSCO documentation.
    print(colorify("Downloading BUSCO database..."))
    run_command(cmd=["busco", "--download", *args], cwd=str(busco_db))

    # Let the user know that the download is complete but that finalizing
    # the output still needs some time.
    print(colorify(
        "Download completed. \n"
        "Copying files from temporary directory to final location..."
    ))

    return busco_db
@patch('q2_moshpit.busco.utils._run_busco')
@patch('q2_moshpit.busco.utils._zip_busco_plots')
@patch('q2_moshpit.busco.utils._draw_busco_plots')
@patch('q2_moshpit.busco.utils._collect_summaries_and_save')
def test_integration_busco_offline(
    self,
    collect_summaries,
    draw_busco_plots,
    zip_busco_plots,
    run_busco
):
    """Check the BUSCO parameters built when a database is supplied.

    With `busco_db` given and no lineage dataset, `_run_busco` is expected
    to receive the default parameters followed by `--offline` and a
    `--download_path` pointing at `busco_downloads` inside the database.
    """
    # The mocked summary collector hands back the stored batch summaries.
    summaries_fp = self.get_data_path("all_batch_summaries.csv")
    collect_summaries.return_value = pd.read_csv(summaries_fp)

    # Empty, writable database directory; only its path is inspected.
    database = BuscoDatabaseDirFmt(path=None, mode="w")
    expected_params = [
        '--mode', 'genome',
        '--cpu', '1',
        '--contig_break', '10',
        '--evalue', '0.001',
        '--limit', '3',
        '--offline',
        '--download_path', f"{str(database)}/busco_downloads"
    ]

    with tempfile.TemporaryDirectory() as out_dir:
        # Run busco
        evaluate_busco(
            output_dir=str(out_dir),
            bins=self.mags,
            busco_db=database
        )

        # The output directory is created internally, so it is taken
        # from the recorded call rather than predicted.
        run_busco.assert_called_once_with(
            output_dir=run_busco.call_args.kwargs['output_dir'],
            mags=self.mags,
            params=expected_params
        )
collect_summaries, + draw_busco_plots, + zip_busco_plots, + run_busco + ): + with tempfile.TemporaryDirectory() as tmp_path: + # This side effect will return the all_summaries_dfs + p = self.get_data_path("all_batch_summaries.csv") + collect_summaries.return_value = pd.read_csv(p) + + with self.assertWarnsRegex( + Warning, + "`--p-lineage-dataset` was specified as lineage_1. " + "--p-auto-lineage flags will be ignored." + ): + # Run busco + evaluate_busco( + output_dir=str(tmp_path), + bins=self.mags, + lineage_dataset="lineage_1", + auto_lineage=True, + auto_lineage_euk=True, + auto_lineage_prok=True, + ) + + # Assert that the calls where done properly + run_busco.assert_called_once_with( + output_dir=run_busco.call_args.kwargs['output_dir'], + mags=self.mags, + params=[ + '--mode', 'genome', + '--lineage_dataset', "lineage_1", + '--cpu', '1', + '--contig_break', '10', + '--evalue', '0.001', + '--limit', '3', + ] + ) + + @patch('q2_moshpit.busco.utils._run_busco') + @patch('q2_moshpit.busco.utils._zip_busco_plots') + @patch('q2_moshpit.busco.utils._draw_busco_plots') + @patch('q2_moshpit.busco.utils._collect_summaries_and_save') + def test_integration_busco_offline_w_lineage( + self, + collect_summaries, + draw_busco_plots, + zip_busco_plots, + run_busco + ): + with tempfile.TemporaryDirectory() as tmp_path: + # This side effect will return the all_summaries_dfs + p = self.get_data_path("all_batch_summaries.csv") + collect_summaries.return_value = pd.read_csv(p) + + # Give path to valid database + p = self.get_data_path("busco_db") + busco_db = BuscoDatabaseDirFmt(path=p, mode="r") + + # Run busco + evaluate_busco( + output_dir=str(tmp_path), + bins=self.mags, + busco_db=busco_db, + lineage_dataset="lineage_1", + ) + + # Assert that the calls where done properly + run_busco.assert_called_once_with( + output_dir=run_busco.call_args.kwargs['output_dir'], + mags=self.mags, + params=[ + '--mode', 'genome', + '--lineage_dataset', "lineage_1", + '--cpu', '1', + 
class TestFetchBUSCO(TestPluginBase):
    """Tests for `fetch_busco_db` with the subprocess call mocked out."""
    package = "q2_moshpit.busco.tests"

    def _assert_download(self, subp_run, targets, **flags):
        """Run `fetch_busco_db` with `flags` and verify the BUSCO command.

        `targets` are the arguments expected after `busco --download`.
        The command must have been issued exactly once, with the returned
        database directory as working directory.
        """
        database = fetch_busco_db(**flags)
        subp_run.assert_called_once_with(
            ["busco", "--download", *targets],
            check=True,
            cwd=str(database)
        )

    @patch("subprocess.run")
    def test_fetch_busco_db_virus(self, subp_run):
        # Only the virus group is requested.
        self._assert_download(
            subp_run, ["virus"], virus=True, prok=False, euk=False
        )

    @patch("subprocess.run")
    def test_fetch_busco_db_prok_euk(self, subp_run):
        # Two groups requested: each one is passed as its own target.
        self._assert_download(
            subp_run,
            ["prokaryota", "eukaryota"],
            virus=False, prok=True, euk=True
        )

    @patch("subprocess.run")
    def test_fetch_busco_db_all(self, subp_run):
        # All groups requested: collapsed into the single target "all".
        self._assert_download(
            subp_run, ["all"], virus=True, prok=True, euk=True
        )
cwd=os.path.dirname(output_dir) )) # Run busco and save paths to run summaries @@ -322,7 +323,11 @@ def test_run_busco_exception(self, subp_run): "--out_path", output_dir, "-o", "sample1" ] - subp_run.assert_called_once_with(cmd, check=True) + subp_run.assert_called_once_with( + cmd, + check=True, + cwd=tmp_path + ) def test_parse_df_columns(self): # This side effect will return the all_summaries_dfs diff --git a/q2_moshpit/busco/utils.py b/q2_moshpit/busco/utils.py index d3a1161e..73a82c75 100644 --- a/q2_moshpit/busco/utils.py +++ b/q2_moshpit/busco/utils.py @@ -255,7 +255,7 @@ def _run_busco( "-o", sample ]) - run_command(cmd) + run_command(cmd, cwd=os.path.dirname(output_dir)) # Check for output path_to_run_summary = os.path.join( diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index c743dddb..12edbde7 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -6,7 +6,6 @@ # The full license is in the file LICENSE, distributed with this software. 
# Type map tying the kind of BUSCO database passed to `evaluate-busco` to the
# auto-lineage flags that may be enabled together with it. Every key is
# (database type, auto_lineage, auto_lineage_euk, auto_lineage_prok):
#   - First bool flag (auto_lineage) is only allowed to be True when the DB
#     contains all lineages ('virus', 'prokaryota' and 'eukaryota').
#   - Second bool flag (auto_lineage_euk) is only allowed to be True when the
#     DB has property "eukaryota".
#   - Third bool flag (auto_lineage_prok) is only allowed to be True when the
#     DB has property "prokaryota".
#   - Triple False = setting where the user specifies the lineage dataset;
#     it is accepted for every database type listed here.
# No branch enables more than one flag at a time.
# The fifth unpacked value is the (unused) output variable of the map.
# NOTE(review): branches for larger property sets are listed before those
# for smaller ones; presumably the order matters to TypeMap matching -
# confirm before reordering.
(
    i_busco_db,
    p_auto_lineage, p_auto_lineage_euk, p_auto_lineage_prok,
    _
) = TypeMap({
    (
        ReferenceDB[
            BuscoDB % Properties(['virus', 'prokaryota', 'eukaryota'])
        ],
        Bool % Choices(True),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,  # Placeholder type because visualizations have no output
    (
        ReferenceDB[
            BuscoDB % Properties(['virus', 'prokaryota', 'eukaryota'])
        ],
        Bool % Choices(False),
        Bool % Choices(True),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[
            BuscoDB % Properties(['virus', 'prokaryota', 'eukaryota'])
        ],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(True),
    ): Int,
    (
        ReferenceDB[
            BuscoDB % Properties(['virus', 'prokaryota', 'eukaryota'])
        ],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties(['prokaryota', 'eukaryota'])],
        Bool % Choices(False),
        Bool % Choices(True),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties(['prokaryota', 'eukaryota'])],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(True),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties(['prokaryota', 'eukaryota'])],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties(['virus', 'eukaryota'])],
        Bool % Choices(False),
        Bool % Choices(True),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties(['virus', 'eukaryota'])],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties(['virus', 'prokaryota'])],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(True),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties(['virus', 'prokaryota'])],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties('prokaryota')],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(True),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties('prokaryota')],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties('eukaryota')],
        Bool % Choices(False),
        Bool % Choices(True),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties('eukaryota')],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,
    (
        ReferenceDB[BuscoDB % Properties('virus')],
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(False),
    ): Int,
})
# Type map from the (virus, prok, euk) download flags of `fetch-busco-db` to
# the properties carried by the resulting database type: one property per
# flag that is True. There is deliberately no branch for all three flags
# being False, so at least one lineage group has to be requested.
p_virus, p_prok, p_euk, o_busco_db = TypeMap({
    (
        Bool % Choices(True),
        Bool % Choices(True),
        Bool % Choices(True)
    ): ReferenceDB[BuscoDB % Properties(['virus', 'prokaryota', 'eukaryota'])],
    (
        Bool % Choices(False),
        Bool % Choices(True),
        Bool % Choices(True)
    ): ReferenceDB[BuscoDB % Properties(['prokaryota', 'eukaryota'])],
    (
        Bool % Choices(True),
        Bool % Choices(False),
        Bool % Choices(True)
    ): ReferenceDB[BuscoDB % Properties(['virus', 'eukaryota'])],
    (
        Bool % Choices(True),
        Bool % Choices(True),
        Bool % Choices(False)
    ): ReferenceDB[BuscoDB % Properties(['virus', 'prokaryota'])],
    (
        Bool % Choices(True),
        Bool % Choices(False),
        Bool % Choices(False)
    ): ReferenceDB[BuscoDB % Properties('virus')],
    (
        Bool % Choices(False),
        Bool % Choices(True),
        Bool % Choices(False)
    ): ReferenceDB[BuscoDB % Properties('prokaryota')],
    (
        Bool % Choices(False),
        Bool % Choices(False),
        Bool % Choices(True)
    ): ReferenceDB[BuscoDB % Properties('eukaryota')],
})


# Register the download action. It takes no input artifacts; the output type
# is resolved from the three flags through the type map above.
plugin.methods.register_function(
    function=q2_moshpit.busco.fetch_busco_db,
    inputs={},
    outputs=[('busco_db', o_busco_db)],
    output_descriptions={
        'busco_db': "BUSCO database for the specified lineages"
    },
    parameters={
        "virus": p_virus,
        "prok": p_prok,
        "euk": p_euk,
    },
    parameter_descriptions={
        "virus": "Download the virus dataset",
        "prok": "Download the prokaryote dataset",
        "euk": "Download the eukaryote dataset",
    },
    name="Download BUSCO database.",
    description="Downloads BUSCO database for the specified lineage. "
                "Output can be used to run BUSCO with the evaluate-busco "
                "action.",
    citations=[citations["manni_busco_2021"]],
)