diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f5f14b3..ac0f2ae3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ Initial release of nf-core/multiplesequencealign, created with the [nf-core](htt - [[#146](https://github.com/nf-core/multiplesequencealign/pull/146)] - Only show additional process tags when they exists and use the same ubuntu version in all modules. - [[#145](https://github.com/nf-core/multiplesequencealign/pull/145)] - Add consensus MSA. - [[#147](https://github.com/nf-core/multiplesequencealign/pull/147)] - Add small testing profile + some fixes of the shiny app. +- [[#148](https://github.com/nf-core/multiplesequencealign/pull/148)] - Add UPP module. ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 76048d4c..f2f33cbf 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -56,6 +56,10 @@ > Notredame C, Higgins DG, Heringa J. T-Coffee: A novel method for fast and accurate multiple sequence alignment. J Mol Biol. 2000 Sep 8;302(1):205-17. doi: 10.1006/jmbi.2000.4042. PMID: 10964570. +- [UPP](https://academic.oup.com/bioinformatics/article/39/1/btad007/6982552) + + > Park M, Ivanovic S, Chu G, Shen C, Warnow T. UPP2: fast and accurate alignment of datasets with fragmentary sequences. Bioinformatics. 2023 Jan 1;39(1):btad007. doi: 10.1093/bioinformatics/btad007. PMID: 36625535; PMCID: PMC9846425. 
+ ## Python packages - [Biopython](https://pubmed.ncbi.nlm.nih.gov/19304878/) diff --git a/conf/modules.config b/conf/modules.config index 541db20c..0fdc9259 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -122,7 +122,7 @@ withName: "CREATE_TCOFFEETEMPLATE" { ext.prefix = { "${meta.id}" } } - withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|FOLDMASON_EASYMSA|KALIGN_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN" { + withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|FOLDMASON_EASYMSA|KALIGN_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN|UPP_ALIGN" { tag = { [ "${meta.id}", diff --git a/docs/usage.md b/docs/usage.md index 323b99c0..37f35ccc 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -50,6 +50,7 @@ The available ALIGN methods are listed below (those that accept guide trees are - [MAGUS](https://github.com/vlasmirnov/MAGUS) (accepts guide tree) - [MUSCLE5](https://drive5.com/muscle5/manual/) - [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) (accepts guide tree) +- [UPP](https://github.com/smirarab/sepp) (accepts guide tree) **SEQUENCE- and STRUCTURE-BASED** (require both fasta and structures as input): diff --git a/modules.json b/modules.json index 80488bc0..46f8b5c3 100644 --- a/modules.json +++ b/modules.json @@ -125,6 +125,11 @@ "branch": "master", "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48", "installed_by": ["modules"] + }, + "upp/align": { + "branch": "master", + "git_sha": "3be751e610b332efd94c2e82ddab5b5c65cfe852", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/upp/align/environment.yml b/modules/nf-core/upp/align/environment.yml new file mode 100644 index 00000000..da0eaa9a --- /dev/null +++ b/modules/nf-core/upp/align/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::sepp=4.5.5 + - conda-forge::pigz=2.8 diff --git a/modules/nf-core/upp/align/main.nf 
b/modules/nf-core/upp/align/main.nf
new file mode 100644
index 00000000..1b254ea4
--- /dev/null
+++ b/modules/nf-core/upp/align/main.nf
@@ -0,0 +1,90 @@
+// Aligns the sequences in `fasta` with UPP (`run_upp.py`) and emits the result as
+// `<prefix>.aln`, or `<prefix>.aln.gz` when `compress` is true.
+//
+// Inputs:
+//   meta, fasta - sample map and the sequence file handed to `run_upp.py -s`
+//   meta2, tree - tree map and tree file; staged only, see NOTE(review) in `script:`
+//   compress    - boolean; true compresses the alignment with pigz
+// Outputs:
+//   alignment   - tuple of `meta` and the (optionally gzipped) alignment
+//   versions    - versions.yml listing the upp and pigz versions
+process UPP_ALIGN {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/sepp_pigz:d72591720d0277b1':
+        'community.wave.seqera.io/library/sepp_pigz:ea6dbc7704a2e251' }"
+
+    input:
+    tuple val(meta) , path(fasta)
+    tuple val(meta2), path(tree)
+    val(compress)
+
+    output:
+    tuple val(meta), path("*.aln{.gz,}"), emit: alignment
+    path "versions.yml"                 , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // NOTE(review): `tree` is staged but never passed to run_upp.py. Presumably
+    // run_upp.py only takes a backbone tree (-t) together with a backbone
+    // alignment (-a), which this module does not provide - confirm against
+    // README.UPP before wiring the tree in.
+    //
+    // The singularity branch exports CONDA_PREFIX and PASTA_TOOLS_DEVDIR before
+    // the run (presumably so UPP can find its bundled PASTA tools - TODO confirm).
+    // The `<prefix>_alignment.fasta` file is then renamed to `<prefix>.aln`.
+    """
+
+    if [ "$workflow.containerEngine" = 'singularity' ]; then
+        export CONDA_PREFIX="/opt/conda/"
+        export PASTA_TOOLS_DEVDIR="/opt/conda/bin/"
+    fi
+
+    run_upp.py \\
+        $args \\
+        -x $task.cpus \\
+        -s ${fasta} \\
+        -d . \\
+        -o ${prefix} \\
+        -p ./upp-temporary
+
+    mv ${prefix}_alignment.fasta ${prefix}.aln
+
+    if ${compress}; then
+        pigz -p ${task.cpus} ${prefix}.aln
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        upp: \$(run_upp.py -v | grep "run_upp" | cut -f2 -d" ")
+        pigz: \$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Creates a placeholder alignment file (gzipped when `compress` is true)
+    // and writes the same versions.yml entries as the real run.
+    """
+
+
+    if [ "$compress" = true ]; then
+        echo | gzip > "${prefix}.aln.gz"
+    else
+        touch "${prefix}.aln"
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        upp: \$(run_upp.py -v | grep "run_upp" | cut -f2 -d" ")
+        pigz: \$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/upp/align/meta.yml b/modules/nf-core/upp/align/meta.yml
new file mode 100644
index 00000000..a5470c2c
--- /dev/null
+++ b/modules/nf-core/upp/align/meta.yml
@@ -0,0 +1,57 @@
+name: "upp_align"
+description: Aligns sequences using UPP
+keywords:
+  - alignment
+  - MSA
+  - genomics
+  - structure
+tools:
+  - "upp":
+      description: "SATe-enabled phylogenetic placement"
+      homepage: "https://github.com/smirarab/sepp/tree/master"
+      documentation: "https://github.com/smirarab/sepp/blob/master/README.UPP.md"
+      tool_dev_url: "https://github.com/smirarab/sepp/tree/master"
+      doi: "10.1093/bioinformatics/btad007"
+      licence: ["GPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test']`
+  - fasta:
+      type: file
+      description: Input sequences in FASTA format
+      pattern: "*.{fa,fasta}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing tree information
+        e.g. `[ id:'test_tree']`
+  - tree:
+      type: file
+      description: Input guide tree in Newick format
+      pattern: "*.{dnd}"
+  - compress:
+      type: boolean
+      description: Flag representing whether the output MSA should be compressed. Set to true to enable/false to disable compression.
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test']`
+  - alignment:
+      type: file
+      description: Alignment file, in FASTA format. May be gzipped or uncompressed, depending on if compress is set to true or false
+      pattern: "*.aln{.gz,}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@luisas"
+maintainers:
+  - "@luisas"
diff --git a/modules/nf-core/upp/align/tests/main.nf.test b/modules/nf-core/upp/align/tests/main.nf.test
new file mode 100644
index 00000000..7425d984
--- /dev/null
+++ b/modules/nf-core/upp/align/tests/main.nf.test
@@ -0,0 +1,101 @@
+// nf-test suite for UPP_ALIGN: two real runs and one stub run, each snapshotting process.out.
+nextflow_process {
+
+    name "Test Process UPP_ALIGN"
+    script "../main.nf"
+    process "UPP_ALIGN"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "upp"
+    tag "upp/align"
+    tag "famsa/guidetree"
+
+    // No tree (empty [[:],[]] placeholder) and compress = false.
+    test("fasta - align_sequence - uncompressed") {
+        config "./nextflow.config"
+
+        when {
+            process {
+                """
+                input[0] = [ [ id:'test' ],
+                    file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true)
+                    ]
+                input[1] = [[:],[]]
+                input[2] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    // Tree comes from FAMSA_GUIDETREE (run in setup on the same fasta); input[1] drops
+    // its meta and re-keys the tree with id 'test_tree'. compress = true.
+    test("fasta - with_tree - compressed") {
+        config "./nextflow.config"
+
+        setup {
+
+            run("FAMSA_GUIDETREE") {
+                script "../../../famsa/guidetree/main.nf"
+                process {
+                    """
+                    input[0] = [ [ id:'tree' ],
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true)
+                        ]
+
+                    """
+                }
+            }
+        }
+        when {
+            process {
+                """
+                input[0] = [ [ id:'test_tree' ],
+                    file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true)
+                    ]
+                input[1] = FAMSA_GUIDETREE.out.tree.collect{ meta, tree -> tree }.map{ tree -> [[ id: 'test_tree'], tree]}
+                input[2] = true
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match()}
+            )
+        }
+    }
+
+    // Stub run (-stub) with the same inputs as the first test.
+    test("stub") {
+        config "./nextflow.config"
+
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = [ [ id:'test' ],
+                    file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true)
+                    ]
+                input[1] = [[:],[]]
+                input[2] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match()}
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/upp/align/tests/main.nf.test.snap b/modules/nf-core/upp/align/tests/main.nf.test.snap
new file mode 100644
index 00000000..d34254a1
--- /dev/null
+++ b/modules/nf-core/upp/align/tests/main.nf.test.snap
@@ -0,0 +1,101 @@
+{
+    "fasta - align_sequence - uncompressed": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.aln:md5,e6b5291e9cdb40e9b7c72688e4da533b"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,b431bb15ae86dcd4485d921df1752a98"
+                ],
+                "alignment": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.aln:md5,e6b5291e9cdb40e9b7c72688e4da533b"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,b431bb15ae86dcd4485d921df1752a98"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-09-17T07:51:30.876772941"
+    },
+    "stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.aln:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    "versions.yml:md5,b431bb15ae86dcd4485d921df1752a98"
+                ],
+                "alignment": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.aln:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,b431bb15ae86dcd4485d921df1752a98"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-09-17T07:36:19.135281969"
+    },
+    "fasta - with_tree - compressed": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test_tree"
+                        },
"test_tree.aln.gz:md5,e6b5291e9cdb40e9b7c72688e4da533b" + ] + ], + "1": [ + "versions.yml:md5,b431bb15ae86dcd4485d921df1752a98" + ], + "alignment": [ + [ + { + "id": "test_tree" + }, + "test_tree.aln.gz:md5,e6b5291e9cdb40e9b7c72688e4da533b" + ] + ], + "versions": [ + "versions.yml:md5,b431bb15ae86dcd4485d921df1752a98" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-17T07:52:05.47226891" + } +} \ No newline at end of file diff --git a/modules/nf-core/upp/align/tests/nextflow.config b/modules/nf-core/upp/align/tests/nextflow.config new file mode 100644 index 00000000..30ae1f46 --- /dev/null +++ b/modules/nf-core/upp/align/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: "UPP_ALIGN" { + ext.args = { "-m amino" } + } + +} diff --git a/modules/nf-core/upp/align/tests/tags.yml b/modules/nf-core/upp/align/tests/tags.yml new file mode 100644 index 00000000..adb26a51 --- /dev/null +++ b/modules/nf-core/upp/align/tests/tags.yml @@ -0,0 +1,2 @@ +upp/align: + - "modules/nf-core/upp/align/**" diff --git a/subworkflows/local/align.nf b/subworkflows/local/align.nf index 4b4b9aed..2a67f62b 100644 --- a/subworkflows/local/align.nf +++ b/subworkflows/local/align.nf @@ -15,12 +15,13 @@ include { KALIGN_ALIGN } from '../../modules/nf-core/kalign include { LEARNMSA_ALIGN } from '../../modules/nf-core/learnmsa/align/main' include { MAFFT } from '../../modules/nf-core/mafft/main' include { MAGUS_ALIGN } from '../../modules/nf-core/magus/align/main' +include { MTMALIGN_ALIGN } from '../../modules/nf-core/mtmalign/align/main' include { MUSCLE5_SUPER5 } from '../../modules/nf-core/muscle5/super5/main' include { TCOFFEE_ALIGN } from '../../modules/nf-core/tcoffee/align/main' include { TCOFFEE_ALIGN as TCOFFEE3D_ALIGN } from '../../modules/nf-core/tcoffee/align/main' include { TCOFFEE_ALIGN as REGRESSIVE_ALIGN } from '../../modules/nf-core/tcoffee/align/main' include { TCOFFEE_CONSENSUS as CONSENSUS } from 
'../../modules/nf-core/tcoffee/consensus/main' -include { MTMALIGN_ALIGN } from '../../modules/nf-core/mtmalign/align/main' +include { UPP_ALIGN } from '../../modules/nf-core/upp/align/main' workflow ALIGN { take: @@ -90,6 +91,7 @@ workflow ALIGN { regressive: it[0]["aligner"] == "REGRESSIVE" tcoffee: it[0]["aligner"] == "TCOFFEE" tcoffee3d: it[0]["aligner"] == "3DCOFFEE" + upp: it[0]["aligner"] == "UPP" } .set { ch_fasta_trees } @@ -261,6 +263,23 @@ workflow ALIGN { ch_msa = ch_msa.mix(REGRESSIVE_ALIGN.out.alignment) ch_versions = ch_versions.mix(REGRESSIVE_ALIGN.out.versions.first()) + // ----------------- UPP ------------------- + ch_fasta_trees.upp + .multiMap{ + meta, fastafile, treefile -> + fasta: [ meta, fastafile ] + tree: [ meta, treefile ] + } + .set { ch_fasta_trees_upp } + + UPP_ALIGN ( + ch_fasta_trees_upp.fasta, + ch_fasta_trees_upp.tree, + compress + ) + ch_msa = ch_msa.mix(UPP_ALIGN.out.alignment) + ch_versions = ch_versions.mix(UPP_ALIGN.out.versions.first()) + // 2. SEQUENCE + STRUCTURE BASED if(params.templates_suffix == ".pdb"){ diff --git a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf index c9cb1811..4a23f982 100644 --- a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf @@ -187,13 +187,15 @@ def toolCitationText() { "Clustal Omega (Sievers et al., 2011)", "FAMSA (Deorowicz et al., 2016)", "FastQC (Andrews 2010),", + "Foldmason (Gilchrist et al., 2024)", "Kalign 3 (Lassmann, 2019)", + "learnMSA (Becker & Stanke, 2022)", "MAFFT (Katoh et al., 2002)", + "mTM-align (Dong et al., 2018)", "MultiQC (Ewels et al., 2016)", "Muscle5 (Edgar, 2022)", "T-Coffee (Notredame et al., 2000)", - "learnMSA (Becker & Stanke, 2022)", - "mTM-align (Dong et al., 2018)" + "UPP (Park et al., 2023)" ].join(' ').trim() return citation_text @@ -209,10 +211,12 @@ def 
toolBibliographyText() { "
  • Dong R, Peng Z, Zhang Y, Yang J. mTM-align: an algorithm for fast and accurate multiple protein structure alignment. Bioinformatics. 2018 May 15;34(10):1719-1725. doi: 10.1093/bioinformatics/btx828. PMID: 29281009; PMCID: PMC5946935.
  • ", "
  • Edgar RC. Muscle5: High-accuracy alignment ensembles enable unbiased assessments of sequence homology and phylogeny. Nat Commun. 2022 Nov 15;13(1):6968. doi: 10.1038/s41467-022-34630-w. PMID: 36379955; PMCID: PMC9664440.
  • ", "
  • Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.
  • ", + "
  • Cameron L.M. Gilchrist, Milot Mirdita, Martin Steinegger. Multiple Protein Structure Alignment at Scale with FoldMason. bioRxiv 2024.08.01.606130; doi: https://doi.org/10.1101/2024.08.01.606130.
  • ", "
  • Katoh K, Misawa K, Kuma K, Miyata T. MAFFT: a novel method for rapid multiple sequence alignment based on fast Fourier transform. Nucleic Acids Res. 2002 Jul 15;30(14):3059-66. doi: 10.1093/nar/gkf436. PMID: 12136088; PMCID: PMC135756.
  • ", "
  • Lassmann T. Kalign 3: multiple sequence alignment of large data sets. Bioinformatics. 2019 Oct 26;36(6):1928–9. doi: 10.1093/bioinformatics/btz795. Epub ahead of print. PMID: 31665271; PMCID: PMC7703769.
  • ", "
  • Notredame C, Higgins DG, Heringa J. T-Coffee: A novel method for fast and accurate multiple sequence alignment. J Mol Biol. 2000 Sep 8;302(1):205-17. doi: 10.1006/jmbi.2000.4042. PMID: 10964570.
  • ", "
  • O'Sullivan O, Suhre K, Abergel C, Higgins DG, Notredame C. 3DCoffee: combining protein sequences and structures within multiple sequence alignments. J Mol Biol. 2004 Jul 2;340(2):385-95. doi: 10.1016/j.jmb.2004.04.058. PMID: 15201059.
  • ", + "
  • Park M, Ivanovic S, Chu G, Shen C, Warnow T. UPP2: fast and accurate alignment of datasets with fragmentary sequences. Bioinformatics. 2023 Jan 1;39(1):btad007. doi: 10.1093/bioinformatics/btad007. PMID: 36625535; PMCID: PMC9846425.
  • ", "
  • Sievers F, Wilm A, Dineen D, Gibson TJ, Karplus K, Li W, Lopez R, McWilliam H, Remmert M, Söding J, Thompson JD, Higgins DG. Fast, scalable generation of high-quality protein multiple sequence alignments using Clustal Omega. Mol Syst Biol. 2011 Oct 11;7:539. doi: 10.1038/msb.2011.75. PMID: 21988835; PMCID: PMC3261699.
  • " ].join(' ').trim() @@ -395,9 +399,13 @@ class Utils { args = fix_args(tool,args,"REGRESSIVE", "-reg_method", "famsa_msa") args = fix_args(tool,args,"REGRESSIVE", "-reg_nseq", "1000") args = fix_args(tool,args,"REGRESSIVE", "-output", "fasta_aln") + // TCOFFEE args = fix_args(tool,args,"TCOFFEE", "-output", "fasta_aln") + // UPP + args = fix_args(tool,args,"UPP", "-m", "amino") + return args }