Skip to content

Commit

Permalink
ENH: Using sample data as input/outputs for checkV analysis action
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristosMatzoros authored Aug 5, 2024
1 parent 4e290c5 commit ef8b9f0
Show file tree
Hide file tree
Showing 10 changed files with 178 additions and 103 deletions.
87 changes: 55 additions & 32 deletions q2_viromics/checkv_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,10 @@
import warnings

import pandas as pd
import qiime2
from q2_types.feature_data import DNAFASTAFormat
from q2_types.per_sample_sequences import ContigSequencesDirFmt

from q2_viromics._utils import run_command
from q2_viromics.types._format import CheckVDBDirFmt
from q2_viromics.types._format import CheckVDBDirFmt, CheckVMetadataDirFmt

warnings.simplefilter(action="ignore", category=FutureWarning)

Expand All @@ -28,7 +27,7 @@ def checkv_end_to_end(tmp, sequences, database, num_threads):
cmd = [
"checkv",
"end_to_end",
str(sequences.path),
str(sequences),
str(tmp),
"-d",
str(internal_db_name),
Expand All @@ -55,42 +54,66 @@ def read_tsv_file(file_name, tmp):


def checkv_analysis(
sequences: DNAFASTAFormat,
sequences: ContigSequencesDirFmt,
database: CheckVDBDirFmt,
num_threads: int = 1,
) -> (
DNAFASTAFormat,
DNAFASTAFormat,
qiime2.Metadata,
qiime2.Metadata,
qiime2.Metadata,
qiime2.Metadata,
ContigSequencesDirFmt,
ContigSequencesDirFmt,
CheckVMetadataDirFmt,
CheckVMetadataDirFmt,
CheckVMetadataDirFmt,
CheckVMetadataDirFmt,
):

viral_sequences = DNAFASTAFormat()
proviral_sequences = DNAFASTAFormat()

with tempfile.TemporaryDirectory() as tmp:
# Execute the "checkv end_to_end" command
checkv_end_to_end(tmp, sequences, database, num_threads)

# Copy the viral sequences file
shutil.copy(os.path.join(tmp, "viruses.fna"), str(viral_sequences))

# Copy the proviral sequences file
shutil.copy(os.path.join(tmp, "proviruses.fna"), str(proviral_sequences))
viral_sequences = ContigSequencesDirFmt()
proviral_sequences = ContigSequencesDirFmt()
quality_summary = CheckVMetadataDirFmt()
contamination = CheckVMetadataDirFmt()
completeness = CheckVMetadataDirFmt()
complete_genomes = CheckVMetadataDirFmt()

for sample_id, contigs_fp in sequences.sample_dict().items():
viral_path = os.path.join(str(viral_sequences), f"{sample_id}_contigs.fa")
proviral_path = os.path.join(str(proviral_sequences), f"{sample_id}_contigs.fa")
quality_summary_path = os.path.join(
str(quality_summary), f"{sample_id}_quality_summary.tsv"
)
contamination_path = os.path.join(
str(contamination), f"{sample_id}_contamination.tsv"
)
completeness_path = os.path.join(
str(completeness), f"{sample_id}_completeness.tsv"
)
complete_genomes_path = os.path.join(
str(complete_genomes), f"{sample_id}_complete_genomes.tsv"
)

# Read the TSV files into a DataFrame
quality_summary_df = read_tsv_file("quality_summary.tsv", tmp)
contamination_df = read_tsv_file("contamination.tsv", tmp)
completeness_df = read_tsv_file("completeness.tsv", tmp)
complete_genomes_df = read_tsv_file("complete_genomes.tsv", tmp)
with tempfile.TemporaryDirectory() as tmp:
# Execute the "checkv end_to_end" command
checkv_end_to_end(tmp, contigs_fp, database, num_threads)

# Define the filenames and destination paths in a list of tuples
files_and_destinations = [
("viruses.fna", viral_path),
("proviruses.fna", proviral_path),
("quality_summary.tsv", quality_summary_path),
("contamination.tsv", contamination_path),
("completeness.tsv", completeness_path),
("complete_genomes.tsv", complete_genomes_path),
]

# Ensure the destination directories exist and move files
for filename, dst in files_and_destinations:
src = os.path.join(tmp, filename)
os.makedirs(os.path.dirname(dst), exist_ok=True)
shutil.move(src, dst)

return (
viral_sequences,
proviral_sequences,
qiime2.Metadata(quality_summary_df),
qiime2.Metadata(contamination_df),
qiime2.Metadata(completeness_df),
qiime2.Metadata(complete_genomes_df),
quality_summary,
contamination,
completeness,
complete_genomes,
)
39 changes: 24 additions & 15 deletions q2_viromics/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,20 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from q2_types.feature_data import FeatureData, Sequence
from q2_types.metadata import ImmutableMetadata
from q2_types.per_sample_sequences import Contigs
from q2_types.sample_data import SampleData
from qiime2.plugin import Citations, Int, Plugin, Range

from q2_viromics import __version__
from q2_viromics.checkv_analysis import checkv_analysis
from q2_viromics.checkv_fetch_db import checkv_fetch_db
from q2_viromics.genomad_fetch_db import genomad_fetch_db
from q2_viromics.types._format import CheckVDBDirFmt, GenomadDBDirFmt
from q2_viromics.types._type import CheckVDB, GenomadDB
from q2_viromics.types._format import (
CheckVDBDirFmt,
CheckVMetadataDirFmt,
GenomadDBDirFmt,
)
from q2_viromics.types._type import CheckVDB, CheckVMetadata, GenomadDB

citations = Citations.load("citations.bib", package="q2_viromics")

Expand All @@ -23,9 +27,8 @@
version=__version__,
website="https://github.com/bokulich-lab/q2-viromics",
package="q2_viromics",
description="A QIIME 2 plugin for evaluating viral genome quality "
"and completeness from metagenomes and removing "
"host contamination.",
description="A QIIME 2 plugin for detecting viral genomes and assessing "
"their quality.",
short_description="A QIIME 2 plugin for detecting viral genomes and assessing "
"their quality.",
citations=[citations["Caporaso-Bolyen-2024"]],
Expand All @@ -34,9 +37,10 @@
plugin.register_formats(
CheckVDBDirFmt,
GenomadDBDirFmt,
CheckVMetadataDirFmt,
)

plugin.register_semantic_types(CheckVDB, GenomadDB)
plugin.register_semantic_types(CheckVDB, GenomadDB, CheckVMetadata)

plugin.register_artifact_class(
CheckVDB,
Expand All @@ -50,6 +54,11 @@
description=("Genomad database."),
)

plugin.register_semantic_type_to_format(
SampleData[CheckVMetadata],
directory_format=CheckVMetadataDirFmt,
)

plugin.methods.register_function(
function=genomad_fetch_db,
inputs={},
Expand Down Expand Up @@ -85,7 +94,7 @@
plugin.methods.register_function(
function=checkv_analysis,
inputs={
"sequences": FeatureData[Sequence],
"sequences": SampleData[Contigs],
"database": CheckVDB,
},
parameters={
Expand All @@ -99,12 +108,12 @@
"num_threads": "Number of threads to use for prodigal-gv and DIAMOND.",
},
outputs=[
("viruses", FeatureData[Sequence]),
("proviruses", FeatureData[Sequence]),
("quality_summary", ImmutableMetadata),
("contamination", ImmutableMetadata),
("completeness", ImmutableMetadata),
("complete_genomes", ImmutableMetadata),
("viruses", SampleData[Contigs]),
("proviruses", SampleData[Contigs]),
("quality_summary", SampleData[CheckVMetadata]),
("contamination", SampleData[CheckVMetadata]),
("completeness", SampleData[CheckVMetadata]),
("complete_genomes", SampleData[CheckVMetadata]),
],
output_descriptions={
"viruses": "Viral sequences.",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
contig_id contig_length provirus proviral_length gene_count viral_genes host_genes checkv_quality miuvig_quality completeness completeness_method contamination kmer_freq warnings
Caudo-circular 31801 No NA 46 9 1 Complete High-quality 100.0 DTR (high-confidence) 0.0 1.0
Caudo-linear 13842 No NA 14 11 1 Low-quality Genome-fragment 43.2 AAI-based (high-confidence) 0.0 1.0
Caudo-provirus 213842 No NA 228 11 152 High-quality High-quality 100.0 HMM-based (lower-bound) 0.0 1.02
NCLDV-linear 1181549 No NA 1008 799 1 High-quality High-quality 99.21 AAI-based (high-confidence) 0.0 1.01
ssDNA-linear 4761 No NA 2 2 0 High-quality High-quality 92.88 AAI-based (high-confidence) 0.0 1.02
RNA-linear 5542 No NA 5 4 0 Medium-quality Genome-fragment 66.46 AAI-based (high-confidence) 0.0 1.0
lavidaviridae-linear 8598 No NA 9 1 0 Low-quality Genome-fragment 43.81 AAI-based (high-confidence) 0.0 1.0
nonviral-linear 200000 No NA 214 0 151 Not-determined Genome-fragment NA NA 0.0 1.02 no viral genes detected
Caudo-1hallmarkgene 606 No NA 1 0 0 Not-determined Genome-fragment NA NA 0.0 1.0 no viral genes detected
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
contig_id contig_length provirus proviral_length gene_count viral_genes host_genes checkv_quality miuvig_quality completeness completeness_method contamination kmer_freq warnings
Caudo-circular 31801 No NA 46 9 1 Complete High-quality 100.0 DTR (high-confidence) 0.0 1.0
Caudo-linear 13842 No NA 14 11 1 Low-quality Genome-fragment 43.2 AAI-based (high-confidence) 0.0 1.0
Caudo-provirus 213842 No NA 228 11 152 High-quality High-quality 100.0 HMM-based (lower-bound) 0.0 1.02
NCLDV-linear 1181549 No NA 1008 799 1 High-quality High-quality 99.21 AAI-based (high-confidence) 0.0 1.01
ssDNA-linear 4761 No NA 2 2 0 High-quality High-quality 92.88 AAI-based (high-confidence) 0.0 1.02
RNA-linear 5542 No NA 5 4 0 Medium-quality Genome-fragment 66.46 AAI-based (high-confidence) 0.0 1.0
lavidaviridae-linear 8598 No NA 9 1 0 Low-quality Genome-fragment 43.81 AAI-based (high-confidence) 0.0 1.0
nonviral-linear 200000 No NA 214 0 151 Not-determined Genome-fragment NA NA 0.0 1.02 no viral genes detected
Caudo-1hallmarkgene 606 No NA 1 0 0 Not-determined Genome-fragment NA NA 0.0 1.0 no viral genes detected
84 changes: 32 additions & 52 deletions q2_viromics/tests/test_checkv_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from unittest.mock import MagicMock, mock_open, patch

import pandas as pd
import qiime2
from q2_types.feature_data import DNAFASTAFormat

from q2_viromics.checkv_analysis import (
Expand All @@ -26,8 +25,8 @@ class TestCheckvAnalysis(unittest.TestCase):
def test_checkv_end_to_end_success(self, mock_run_command):
# Mock the paths
mock_tmp = "/fake/tmp"
mock_sequences = MagicMock(spec=DNAFASTAFormat)
mock_sequences.path = "/fake/sequences"
mock_sequences = MagicMock()
mock_sequences.__str__.return_value = "/fake/sequences"
mock_database = MagicMock()
mock_database.path = "/fake/database"
mock_database_listdir = ["internal_db"]
Expand Down Expand Up @@ -85,85 +84,66 @@ def test_checkv_end_to_end_failure(self, mock_run_command):
)

@patch("q2_viromics.checkv_analysis.checkv_end_to_end")
@patch("q2_viromics.checkv_analysis.DNAFASTAFormat")
@patch("q2_viromics.checkv_analysis.pd.read_csv")
@patch("shutil.copy")
@patch("shutil.move")
@patch("tempfile.TemporaryDirectory")
def test_checkv_analysis_success(
self,
mock_tempdir,
mock_shutil_copy,
mock_read_csv,
mock_DNAFASTAFormat,
mock_shutil_move,
mock_checkv_end_to_end,
):
# Mock the context managers
# Mock the temporary directory context manager
mock_tempdir.return_value.__enter__.return_value = "/fake/tmp"

# Mock the data frames with string indices
mock_quality_summary_df = pd.DataFrame({"mock": ["data"]}, index=["sample_1"])
mock_contamination_df = pd.DataFrame({"mock": ["data"]}, index=["sample_2"])
mock_completeness_df = pd.DataFrame({"mock": ["data"]}, index=["sample_3"])
mock_complete_genomes_df = pd.DataFrame({"mock": ["data"]}, index=["sample_4"])
mock_read_csv.side_effect = [
mock_quality_summary_df,
mock_contamination_df,
mock_completeness_df,
mock_complete_genomes_df,
]

# Set valid index names for Qiime2 Metadata
mock_quality_summary_df.index.name = "sample-id"
mock_contamination_df.index.name = "sample-id"
mock_completeness_df.index.name = "sample-id"
mock_complete_genomes_df.index.name = "sample-id"

# Mock the sequences and database
mock_sequences = MagicMock(spec=DNAFASTAFormat)
mock_sequences.path = "/fake/sequences"
mock_sequences = MagicMock()
mock_sequences.sample_dict.return_value = {"sample_1": "/fake/sequences"}

mock_database = MagicMock()
mock_database.path = "/fake/database"

# Call the function
result = checkv_analysis(mock_sequences, mock_database, num_threads=1)

# Assertions
# Assertions for checkv_end_to_end call
mock_checkv_end_to_end.assert_called_once_with(
"/fake/tmp", mock_sequences, mock_database, 1
"/fake/tmp", "/fake/sequences", mock_database, 1
)

# Assertions for file movements
mock_shutil_move.assert_any_call(
"/fake/tmp/viruses.fna", str(result[0]) + "/sample_1_contigs.fa"
)
mock_shutil_move.assert_any_call(
"/fake/tmp/proviruses.fna", str(result[1]) + "/sample_1_contigs.fa"
)
mock_shutil_copy.assert_any_call("/fake/tmp/viruses.fna", str(result[0]))
mock_shutil_copy.assert_any_call("/fake/tmp/proviruses.fna", str(result[1]))
mock_read_csv.assert_any_call(
mock_shutil_move.assert_any_call(
"/fake/tmp/quality_summary.tsv",
sep="\t",
na_values=["NA", "", "NaN"],
index_col=0,
str(result[2]) + "/sample_1_quality_summary.tsv",
)
mock_read_csv.assert_any_call(
mock_shutil_move.assert_any_call(
"/fake/tmp/contamination.tsv",
sep="\t",
na_values=["NA", "", "NaN"],
index_col=0,
str(result[3]) + "/sample_1_contamination.tsv",
)
mock_read_csv.assert_any_call(
"/fake/tmp/completeness.tsv",
sep="\t",
na_values=["NA", "", "NaN"],
index_col=0,
mock_shutil_move.assert_any_call(
"/fake/tmp/completeness.tsv", str(result[4]) + "/sample_1_completeness.tsv"
)
mock_read_csv.assert_any_call(
mock_shutil_move.assert_any_call(
"/fake/tmp/complete_genomes.tsv",
sep="\t",
na_values=["NA", "", "NaN"],
index_col=0,
str(result[5]) + "/sample_1_complete_genomes.tsv",
)

# Convert the DataFrame to Metadata
expected_quality_summary_metadata = qiime2.Metadata(mock_quality_summary_df)
expected_contamination_metadata = qiime2.Metadata(mock_contamination_df)
expected_completeness_metadata = qiime2.Metadata(mock_completeness_df)
expected_complete_genomes_metadata = qiime2.Metadata(mock_complete_genomes_df)

# Verify the Metadata
self.assertEqual(result[2], expected_quality_summary_metadata)
self.assertEqual(result[3], expected_contamination_metadata)
self.assertEqual(result[4], expected_completeness_metadata)
self.assertEqual(result[5], expected_complete_genomes_metadata)


class TestReadTSVFile(unittest.TestCase):
@patch(
Expand Down
16 changes: 16 additions & 0 deletions q2_viromics/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from q2_viromics.types._format import (
CheckVDBDirFmt,
CheckVMetadataDirFmt,
GeneralBinaryFileFormat,
GeneralTSVFormat,
GenomadDBDirFmt,
Expand Down Expand Up @@ -129,3 +130,18 @@ def test_CheckVDB_GeneralTSVFormat(self):
filepath = self.get_data_path("type/genomad_database_dir/")
format = GenomadDBDirFmt(filepath, mode="r")
format.validate()


class TestCheckVMetadataDirFmt(TestPluginBase):
package = "q2_viromics.tests"

def test_CheckVMetadataDirFmt(self):
filepath = self.get_data_path("type/checkVMetadata/")
format = CheckVMetadataDirFmt(filepath, mode="r")
format.validate()

def test_CheckVMetadataDirFmt_path_maker(self):
obj = CheckVMetadataDirFmt("type/checkVMetadata/", mode="r")
result_path = obj.metadata_files_path_maker(name="sample1_quality_summary")
expected_path = "type/checkVMetadata/sample1_quality_summary.tsv"
self.assertEqual(str(result_path), expected_path)
Loading

0 comments on commit ef8b9f0

Please sign in to comment.