Merge pull request #176 from luisas/chaining_proteinfold

Chaining proteinfold
nf-core · Nov 27, 2024 · 2da7cf0 · 2da7cf0
2 parents 783b1d2 + ccb04b0
commit 2da7cf0
Show file tree

Hide file tree

Showing 28 changed files with 614 additions and 166 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,7 @@ Initial release of nf-core/multiplesequencealign, created with the [nf-core](htt
 - [[#147](https://github.com/nf-core/multiplesequencealign/pull/147)] - Add small testing profile + some fixes of the shiny app.
 - [[#148](https://github.com/nf-core/multiplesequencealign/pull/148)] - Add UPP module.
 - [[#150](https://github.com/nf-core/multiplesequencealign/pull/150)] - Update modules and readme for pre-release.
+- [[#174](https://github.com/nf-core/multiplesequencealign/issues/174)] - Add the chaining of proteinfold output to MSA input.
 
 ### `Fixed`
 

diff --git a/README.md b/README.md
@@ -38,9 +38,8 @@ The pipeline performs the following steps:
 
 ## Usage
 
-:::note
-If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
-:::
+> [!NOTE]
+> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
 
 #### 1. SAMPLESHEET
 
@@ -50,16 +49,15 @@ It should look like this:
 `samplesheet.csv`:
 
 ```csv
-id,fasta,reference,dependencies,template
+id,fasta,reference,optional_data,template
 seatoxin,seatoxin.fa,seatoxin-ref.fa,seatoxin_structures,seatoxin_template.txt
 toxin,toxin.fa,toxin-ref.fa,toxin_structures,toxin_template.txt
 ```
 
 Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be aligned and the associated (if available) reference alignments and dependency files (this can be anything from protein structure or any other information you would want to use in your favourite MSA tool).
 
-:::note
-The only required input is the id column and either fasta or dependencies.
-:::
+> [!NOTE]
+> The only required input is the id column and either fasta or optional_data.
 
 #### 2. TOOLSHEET
 
@@ -78,9 +76,8 @@ FAMSA, -gt upgma -medoidtree, FAMSA,
 FAMSA,,REGRESSIVE,
 ```
 
-:::note
-The only required input is aligner.
-:::
+> [!NOTE]
+> The only required input is `aligner`.
 
 #### 3. RUN THE PIPELINE
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -23,7 +23,7 @@
                 "type": "string",
                 "default": ""
             },
-            "dependencies": {
+            "optional_data": {
                 "type": "string",
                 "default": ""
             },
@@ -33,6 +33,6 @@
             }
         },
         "required": ["id"],
-        "anyOf": [{ "required": ["fasta"] }, { "required": ["dependencies"] }]
+        "anyOf": [{ "required": ["fasta"] }, { "required": ["optional_data"] }]
     }
 }
diff --git a/docs/images/nf-core-msa_metro_map.png b/docs/images/nf-core-msa_metro_map.png
diff --git a/docs/usage.md b/docs/usage.md
@@ -100,23 +100,23 @@ The sample sheet defines the **input data** that the pipeline will process.
 It should look like this:
 
 ```csv title="samplesheet.csv"
-id,fasta,reference,dependencies,template
+id,fasta,reference,optional_data,template
 seatoxin,seatoxin.fa,seatoxin-ref.fa,seatoxin_structures,seatoxin_template.txt
 toxin,toxin.fa,toxin-ref.fa,toxin_structures,toxin_template.txt
 ```
 
 Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be processed.
 
-| Column         | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `id`           | Required. Name of the set of sequences. It can correspond to the protein family name or to an internal id. It must be unique.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| `fasta`        | Required (At least one of fasta or dependencies must be provided). Full path to the fasta file that contains the sequence to be aligned.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-| `reference`    | Optional. Full path to the reference alignment. It is used for the reference-based evaluation steps. It can be left empty.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-| `dependencies` | Required (At least one of fasta or dependencies must be provided). Full path to the folder that contains the dependency files (e.g. protein structures) for the sequences to be aligned. Currently, it is used for structural aligners and structure-based evaluation steps. It can be left empty.                                                                                                                                                                                                                                                                                                                                                                                                 |
-| `template`     | Optional. Files that define the mapping between the input sequence and the dependency files (e.g. protein structures) to be used. Used by 3D-Coffee. If not specified, they will be automatically generated assuming that the sequence name provided in the fasta is the same as the file name of the corresponding PDB file. E.g. if you set (default) the parameter templates_suffix to .pdb, then: ">MyProteinName" in the fasta file and "MyProteinName.pdb" for the corresponding protein structure. For more information on how to generate a template file manually, please look at the T-Coffee [documentation](https://tcoffee.readthedocs.io/en/latest/tcoffee_main_documentation.html). |
+| Column          | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `id`            | Required. Name of the set of sequences. It can correspond to the protein family name or to an internal id. It must be unique.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+| `fasta`         | Required (At least one of fasta or optional_data must be provided). Full path to the fasta file that contains the sequence to be aligned.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| `reference`     | Optional. Full path to the reference alignment. It is used for the reference-based evaluation steps. It can be left empty.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+| `optional_data` | Required (At least one of fasta or optional_data must be provided). Full path to the folder that contains the dependency files (e.g. protein structures) for the sequences to be aligned. Currently, it is used for structural aligners and structure-based evaluation steps. It can be left empty.                                                                                                                                                                                                                                                                                                                                                                                                |
+| `template`      | Optional. Files that define the mapping between the input sequence and the dependency files (e.g. protein structures) to be used. Used by 3D-Coffee. If not specified, they will be automatically generated assuming that the sequence name provided in the fasta is the same as the file name of the corresponding PDB file. E.g. if you set (default) the parameter templates_suffix to .pdb, then: ">MyProteinName" in the fasta file and "MyProteinName.pdb" for the corresponding protein structure. For more information on how to generate a template file manually, please look at the T-Coffee [documentation](https://tcoffee.readthedocs.io/en/latest/tcoffee_main_documentation.html). |
 
 :::note
-You can have some samples with dependencies and/or references and some without. The pipeline will run the modules requiring dependencies/references only on the samples for which you have provided the required information and the others will be just skipped.
+You can have some samples with optional_data and/or references and some without. The pipeline will run the modules requiring optional_data/references only on the samples for which you have provided the required information and the others will be just skipped.
 :::
 
 ## Toolsheet input

diff --git a/docs/usage/chaining_with_proteinfold.md b/docs/usage/chaining_with_proteinfold.md
@@ -0,0 +1,49 @@
+# Using nf-core/proteinfold to generate the input protein structures
+
+Structural aligners leverage protein structural information to render the MSA.
+
+You can provide your PDB structures via the samplesheet, as outlined in the primary usage documentation. However, if you do not already have protein structures available, you may opt to use protein structure prediction tools to create these models.
+
+To facilitate this, we offer seamless integration with the nf-core/proteinfold pipeline, enabling you to generate the protein structures required for this workflow.
+
+To do so, you only need to build one samplesheet file, in the exact format required by the nf-core/multiplesequencealign pipeline.
+This samplesheet is also compatible with nf-core/proteinfold, which will predict and output the structures in the format required by the nf-core/multiplesequencealign pipeline.
+
+Now, to run both pipelines, you can simply use the following commands.
+
+> [!NOTE]
+> Please refer to the [proteinfold documentation](https://nf-co.re/proteinfold/1.1.1/) for picking your favourite params.
+
+Here we showcase how to run proteinfold in its colabfold local flavour - but it works for all the proteinfold modes.
+
+```bash
+nextflow run nf-core/proteinfold \
+   --input ./samplesheet.csv \
+   --outdir ./proteinfold_results \
+   --split_fasta \
+   -r dev \
+   --mode colabfold \
+   --colabfold_server local \
+   --colabfold_db <null (default) | PATH> \
+   --num_recycle 3 \
+   --use_amber <true/false> \
+   --colabfold_model_preset "AlphaFold2-ptm" \
+   --use_gpu <true/false> \
+   --db_load_mode 0 \
+   -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+
+
+nextflow run nf-core/multiplesequencealign \
+   --input ./samplesheet.csv \
+   --tools ./toolsheet.csv \
+   --optional_data_dir ./proteinfold_results/*/*/top_ranked_structures \
+   --outdir ./results \
+   -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
+
+```
+
+> [!NOTE]
+> The one important parameter NOT to forget in proteinfold for the chaining is `--split_fasta`. This allows a multi-FASTA file to be used as input for monomer predictions, as needed by the MSA pipeline. The rest of the proteinfold parameters can and should be tuned according to your preferences for your proteinfold run. Please refer to the proteinfold documentation for this.
+
+> [!WARNING]
+> This is currently an experimental feature and only available in the dev branch of proteinfold, so do not forget `-r dev` either. This feature will soon be available with the next release of nf-core/proteinfold.
diff --git a/docs/extending.md → docs/usage/extending.md b/docs/extending.md → docs/usage/extending.md
diff --git a/modules.json b/modules.json
@@ -7,7 +7,7 @@
                 "nf-core": {
                     "clustalo/align": {
                         "branch": "master",
-                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+                        "git_sha": "2a8530b890878747f5063a894bad9fb2abd5c071",
                         "installed_by": ["modules"]
                     },
                     "clustalo/guidetree": {
@@ -99,12 +99,12 @@
                     },
                     "tcoffee/alncompare": {
                         "branch": "master",
-                        "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+                        "git_sha": "ffa000ab3c33df25a165b5f9a039c4cbb665a77b",
                         "installed_by": ["modules"]
                     },
                     "tcoffee/consensus": {
                         "branch": "master",
-                        "git_sha": "66b22564bc1bc0db7292f2073cdef954ead773e7",
+                        "git_sha": "023e51187884ea6cc7290767486f551565f1b77a",
                         "installed_by": ["modules"]
                     },
                     "tcoffee/irmsd": {
@@ -143,17 +143,17 @@
                 "nf-core": {
                     "utils_nextflow_pipeline": {
                         "branch": "master",
-                        "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082",
+                        "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b",
                         "installed_by": ["subworkflows"]
                     },
                     "utils_nfcore_pipeline": {
                         "branch": "master",
-                        "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba",
+                        "git_sha": "1b89f75f1aa2021ec3360d0deccd0f6e97240551",
                         "installed_by": ["subworkflows"]
                     },
                     "utils_nfschema_plugin": {
                         "branch": "master",
-                        "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c",
+                        "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e",
                         "installed_by": ["subworkflows"]
                     }
                 }

diff --git a/modules/nf-core/clustalo/align/main.nf b/modules/nf-core/clustalo/align/main.nf