From e87572cea26295eb256f60232c540dd765844600 Mon Sep 17 00:00:00 2001 From: luisas Date: Wed, 23 Oct 2024 18:19:11 +0200 Subject: [PATCH 01/28] first commit --- .github/workflows/ci.yml | 1 + assets/samplesheet.csv | 6 +-- assets/schema_input.json | 11 +++++- conf/modules_colabfold.config | 16 ++++++-- conf/test_split_fasta.config | 39 +++++++++++++++++++ docs/usage.md | 2 + nextflow.config | 2 + nextflow_schema.json | 5 +++ .../utils_nfcore_proteinfold_pipeline/main.nf | 12 ++++++ 9 files changed, 86 insertions(+), 8 deletions(-) create mode 100644 conf/test_split_fasta.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 161ca5e8..196a9393 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,6 +43,7 @@ jobs: - "test_colabfold_webserver" - "test_colabfold_download" - "test_esmfold" + - "test_split_fasta" isMaster: - ${{ github.base_ref == 'master' }} # Exclude conda and singularity on dev diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 467fdcf0..5e7df047 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sequence,fasta -T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta -T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta +sequence,fasta,reference,dependencies +seatoxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin.ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/af2_structures/seatoxin-ref.tar.gz +toxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin.ref, \ No newline at end of file diff --git a/assets/schema_input.json b/assets/schema_input.json index c261ae58..2bbdf919 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,6 +13,12 @@ "errorMessage": "Sequence name must be provided and cannot contain spaces", "meta": ["id"] }, + "id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sequence name must be provided and cannot contain spaces", + "meta": ["id"] + }, "fasta": { "type": "string", "format": "file-path", @@ -21,6 +27,9 @@ "errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'" } }, - "required": ["sequence", "fasta"] + "oneOf": [ + { "required": ["sequence", "fasta"] }, + { "required": ["id", "fasta"] } + ] } } diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index 2efcfa01..00da59e7 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -67,10 +67,18 @@ if (params.colabfold_server == 'local') { params.use_templates ? '--templates' : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_relaxed_rank_01.pdb' + ], ] } } diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config new file mode 100644 index 00000000..c3feb113 --- /dev/null +++ b/conf/test_split_fasta.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + Use as follows: + nextflow run nf-core/proteinfold -profile test_colabfold_local, --outdir +---------------------------------------------------------------------------------------- +*/ + +stubRun = true + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data to test colabfold with the colabfold webserver analysis + mode = 'colabfold' + colabfold_server = 'local' + split_fasta = true + colabfold_db = "${projectDir}/assets/dummy_db_dir" + //input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' + input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' +} + +process { + withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { + container = 'biocontainers/gawk:5.1.0' + } +} diff --git a/docs/usage.md b/docs/usage.md index 43b0d86a..d4502203 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -35,6 +35,8 @@ The samplesheet can have as many columns as you desire, however, there is a stri An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +Each FASTA file is assumed to contain a single protein sequence unless you are using multimer mode. If you want to provide a FASTA file with multiple protein sequences, each to be folded individually, you can supply one or more FASTA files containing one or more sequences and use the --split_fasta parameter. In this case, each sequence in the FASTA file will be folded individually and in parallel, as if you had listed each sequence separately in the samplesheet. + ## Running the pipeline The typical commands for running the pipeline on AlphaFold2, Colabfold and ESMFold modes are shown below. diff --git a/nextflow.config b/nextflow.config index d8fc2623..ed874e2c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null mode = 'alphafold2' // {alphafold2, colabfold, esmfold} use_gpu = false + split_fasta = false // Alphafold2 parameters alphafold2_mode = "standard" @@ -240,6 +241,7 @@ profiles { test_colabfold_webserver { includeConfig 'conf/test_colabfold_webserver.config' } test_colabfold_download { includeConfig 'conf/test_colabfold_download.config' } test_esmfold { includeConfig 'conf/test_esmfold.config' } + test_split_fasta { includeConfig 'conf/test_split_fasta.config' } test_full { includeConfig 'conf/test_full.config' } test_full_alphafold2_standard { includeConfig 'conf/test_full.config' } test_full_alphafold2_split { includeConfig 'conf/test_full_alphafold_split.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 313997a8..8df979ce 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -40,6 +40,11 @@ "description": "Run on CPUs (default) or GPUs", "fa_icon": "fas fa-microchip" }, + "split_fasta": { + "type": "boolean", + "description": "Split input fasta file in multiple fasta files each of them containing one sequence to be folded", + "fa_icon": "fas fa-microchip" + }, "email": { "type": "string", "description": "Email address for completion summary.", diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index fa0545a6..9c3ebe1c 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -67,6 +67,18 @@ workflow PIPELINE_INITIALISATION { // ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) + if (params.split_fasta) { + + ch_samplesheet.splitFasta(record: [id:true]) + .map{ record -> record.id.toString() } + .set{ ID }.view() + ch_samplesheet = ch_samplesheet.map{meta, fasta -> fasta} + .splitFasta( by:1, file: true ) + .map{fasta -> [[id:record.id], fasta ]}.view() + } + + ch_samplesheet.view() + emit: samplesheet = ch_samplesheet versions = ch_versions From d78bf35f140e51ad575f2319e56212ecab656eec Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 16:04:39 +0200 Subject: [PATCH 02/28] update --- assets/samplesheet.csv | 3 -- assets/schema_input.json | 11 +------ conf/modules.config | 8 +++++ conf/modules_alphafold2.config | 30 +++++++++++++++---- conf/modules_colabfold.config | 20 +++++++++---- conf/modules_esmfold.config | 11 +++++-- conf/test_split_fasta.config | 6 ++-- docs/output.md | 14 +++++---- main.nf | 6 +++- .../utils_nfcore_proteinfold_pipeline/main.nf | 27 +++++++++++------ 10 files changed, 90 insertions(+), 46 deletions(-) delete mode 100644 assets/samplesheet.csv diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5e7df047..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sequence,fasta,reference,dependencies -seatoxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin.ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/af2_structures/seatoxin-ref.tar.gz -toxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin.ref, \ No newline at end of file diff --git a/assets/schema_input.json b/assets/schema_input.json index 2bbdf919..c261ae58 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,12 +13,6 @@ "errorMessage": "Sequence name must be provided and cannot contain spaces", "meta": ["id"] }, - "id": { - "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sequence name must be provided and cannot contain spaces", - "meta": ["id"] - }, "fasta": { "type": "string", "format": "file-path", @@ -27,9 +21,6 @@ "errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'" } }, - "oneOf": [ - { "required": ["sequence", "fasta"] }, - { "required": ["id", "fasta"] } - ] + "required": ["sequence", "fasta"] } } diff --git a/conf/modules.config b/conf/modules.config index c56b11eb..5f6fbd9f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,6 +50,14 @@ process { ] } + withName: 'GENERATE_REPORT'{ + publishDir = [ + path: { "${params.outdir}/report" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'FOLDSEEK_EASYSEARCH' { ext.args = { params.foldseek_easysearch_arg ? "$params.foldseek_easysearch_arg" : "--format-mode 3" } publishDir = [ diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index 33b04c38..c8b4fab3 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -40,9 +40,18 @@ if (params.alphafold2_mode == 'standard') { params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*.1.alphafold.pdb' + ] ] } } @@ -64,9 +73,18 @@ if (params.alphafold2_mode == 'split_msa_prediction') { if(params.use_gpu) { accelerator = 1 } ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: 'ranked_0.pdb' + ] ] } } diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index 00da59e7..ecf87d75 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -30,10 +30,18 @@ if (params.colabfold_server == 'webserver') { params.host_url ? "--host-url ${params.host_url}" : '' ].join(' ').trim() publishDir = [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, - mode: 'copy', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + [ + path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, + mode: 'copy', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*.*' + ], + [ + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + mode: 'copy', + saveAs: { "${meta.id}.pdb" }, + pattern: '*_relaxed_rank_01.pdb' + ] ] } } @@ -68,13 +76,13 @@ if (params.colabfold_server == 'local') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}/complete_results" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index d8356924..ba523450 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -14,11 +14,18 @@ process { withName: 'RUN_ESMFOLD' { ext.args = {params.use_gpu ? '' : '--cpu-only'} publishDir = [ - path: { "${params.outdir}/esmfold" }, + [ + path: { "${params.outdir}/esmfold/complete_results" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.*' + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/esmfold" }, + mode: 'copy', + pattern: '*.pdb' ] + ] } withName: 'NFCORE_PROTEINFOLD:ESMFOLD:MULTIQC' { diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config index c3feb113..44130987 100644 --- a/conf/test_split_fasta.config +++ b/conf/test_split_fasta.config @@ -24,12 +24,12 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data to test colabfold with the colabfold webserver analysis - mode = 'colabfold' + mode = 'colabold' colabfold_server = 'local' split_fasta = true colabfold_db = "${projectDir}/assets/dummy_db_dir" - //input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' - input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' + //input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' } process { diff --git a/docs/output.md b/docs/output.md index 9b9a8fb8..291614a5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,10 +23,9 @@ The directories listed below will be created in the output directory after the p
Output files -- `AlphaFold2/` - - `/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings - - `.alphafold.pdb` that is the structure with the highest pLDDT score (ranked first) - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models +- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. + - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `.pdb` that is the structure with the highest pLDDT score (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -91,7 +90,9 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
Output files -- `colabfold/webserver/` or `colabfold/local/` based on the selected mode that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs and scores, prediction metadata, logs and section timings +- `colabfold_webserver/` or `colabfold_local/` based on the selected mode. + - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `.pdb` that is the structure with the highest pLDDT score (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -117,7 +118,8 @@ Below you can find some indicative examples of the output images produced by Col - `esmfold/` - `.pdb` that is the structure with the highest pLDDT score (ranked first) - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue for each of the 5 predicted models + - `complete_results` + - `_plddt_mqc.tsv` that presents the pLDDT scores per residue. - `DBs/` that contains symbolic links to the downloaded database and parameter files diff --git a/main.nf b/main.nf index eaf0eac1..34c1f7e0 100644 --- a/main.nf +++ b/main.nf @@ -64,6 +64,7 @@ workflow NFCORE_PROTEINFOLD { ch_multiqc = Channel.empty() ch_versions = Channel.empty() ch_report_input = Channel.empty() + ch_outputsheet = Channel.empty() // // WORKFLOW: Run alphafold2 @@ -146,7 +147,6 @@ workflow NFCORE_PROTEINFOLD { params.create_colabfold_index ) ch_versions = ch_versions.mix(PREPARE_COLABFOLD_DBS.out.versions) - // // WORKFLOW: Run nf-core/colabfold workflow // @@ -159,6 +159,7 @@ workflow NFCORE_PROTEINFOLD { PREPARE_COLABFOLD_DBS.out.uniref30, params.num_recycles_colabfold ) + ch_multiqc = COLABFOLD.out.multiqc_report ch_versions = ch_versions.mix(COLABFOLD.out.versions) ch_report_input = ch_report_input.mix( @@ -168,6 +169,8 @@ workflow NFCORE_PROTEINFOLD { .join(COLABFOLD.out.msa) .map { it[0]["model"] = "colabfold"; it } ) + // ch_outputsheet = ch_report_input.transpose(by:1).filter{it[1].name.contains("rank_01")} + // ch_outputsheet.view() } // @@ -231,6 +234,7 @@ workflow NFCORE_PROTEINFOLD { ) } + emit: multiqc_report = ch_multiqc } diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index 9c3ebe1c..c3642f50 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -68,17 +68,26 @@ workflow PIPELINE_INITIALISATION { ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) if (params.split_fasta) { - - ch_samplesheet.splitFasta(record: [id:true]) - .map{ record -> record.id.toString() } - .set{ ID }.view() - ch_samplesheet = ch_samplesheet.map{meta, fasta -> fasta} - .splitFasta( by:1, file: true ) - .map{fasta -> [[id:record.id], fasta ]}.view() + // Extract all sequence headers from the fasta file + // to keep track of which sequences belong to which dataset + // and create a new channel [[id:{dataset_id}, sequence:{sequence_id}]] + ch_samplesheet.splitFasta(by:1, record: [header:true]) + .map{meta, record -> [record.header, meta]} + .set{dataset_sequence_mapping} + + // Split the fasta file into individual files for each sequence + ch_samplesheet.map{ meta,fasta -> fasta} + .splitFasta( record: [id: true, sequence: true] ) + .collectFile { item -> + [ "${item["id"]}.fa", ">" + item["id"] + '\n' +item["sequence"] ] + }.map{ + file -> [file.baseName, file] + }.combine(dataset_sequence_mapping, by:0) + .map{ + id, file, meta -> [[id:id, dataset:meta.id], file] + }.set{ch_samplesheet} } - ch_samplesheet.view() - emit: samplesheet = ch_samplesheet versions = ch_versions From 66968b9c6d02e86672c39151fd9ba402593768de Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:10:39 +0200 Subject: [PATCH 03/28] update --- conf/test_split_fasta.config | 5 +-- .../utils_nfcore_proteinfold_pipeline/main.nf | 40 +++++++++++++------ 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config index 44130987..9eca8853 100644 --- a/conf/test_split_fasta.config +++ b/conf/test_split_fasta.config @@ -24,16 +24,15 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data to test colabfold with the colabfold webserver analysis - mode = 'colabold' + mode = 'colabfold' colabfold_server = 'local' split_fasta = true colabfold_db = "${projectDir}/assets/dummy_db_dir" input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' - //input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' } process { withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { container = 'biocontainers/gawk:5.1.0' } -} +} \ No newline at end of file diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index c3642f50..faf0b3ff 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -66,25 +66,20 @@ workflow PIPELINE_INITIALISATION { // Create channel from input file provided through params.input // ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) - if (params.split_fasta) { - // Extract all sequence headers from the fasta file - // to keep track of which sequences belong to which dataset - // and create a new channel [[id:{dataset_id}, sequence:{sequence_id}]] - ch_samplesheet.splitFasta(by:1, record: [header:true]) - .map{meta, record -> [record.header, meta]} - .set{dataset_sequence_mapping} + // here we have to validate that the ids are unique and valid as an extra step + // since it is not done with the samplesheet schema (they are all in the same file) + ch_samplesheet.map { meta, fasta -> + validateFasta(fasta) + } // Split the fasta file into individual files for each sequence ch_samplesheet.map{ meta,fasta -> fasta} - .splitFasta( record: [id: true, sequence: true] ) + .splitFasta( record: [header: true, sequence: true] ) .collectFile { item -> - [ "${item["id"]}.fa", ">" + item["id"] + '\n' +item["sequence"] ] + [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ] }.map{ - file -> [file.baseName, file] - }.combine(dataset_sequence_mapping, by:0) - .map{ - id, file, meta -> [[id:id, dataset:meta.id], file] + file -> [[id: file.baseName], file] }.set{ch_samplesheet} } @@ -235,3 +230,22 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } +def cleanHeader(header) { + return header.replaceAll(" ", "_").replaceAll(",", "").replaceAll(";","") +} + +def validateFasta(fasta) { + // extract headers + def headers = fasta.findAll { it.startsWith('>') } + // if headers are not unique, throw an error + if (headers.size() != headers.unique().size()) { + throw new Exception("Invalid FASTA file. The headers are not unique.") + } + // check headers that are malformed + headers.each { header -> + if (header =~ /[ \t;,]/) { + // warn user that the header contains special characters + log.warn "The header ${header} contains special characters. They have been automatically removed." + } + } +} \ No newline at end of file From a2ab2cedc1532da32c2f1168a8691208a9a216ed Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:18:20 +0200 Subject: [PATCH 04/28] revert main.nf --- main.nf | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 34c1f7e0..eaf0eac1 100644 --- a/main.nf +++ b/main.nf @@ -64,7 +64,6 @@ workflow NFCORE_PROTEINFOLD { ch_multiqc = Channel.empty() ch_versions = Channel.empty() ch_report_input = Channel.empty() - ch_outputsheet = Channel.empty() // // WORKFLOW: Run alphafold2 @@ -147,6 +146,7 @@ workflow NFCORE_PROTEINFOLD { params.create_colabfold_index ) ch_versions = ch_versions.mix(PREPARE_COLABFOLD_DBS.out.versions) + // // WORKFLOW: Run nf-core/colabfold workflow // @@ -159,7 +159,6 @@ workflow NFCORE_PROTEINFOLD { PREPARE_COLABFOLD_DBS.out.uniref30, params.num_recycles_colabfold ) - ch_multiqc = COLABFOLD.out.multiqc_report ch_versions = ch_versions.mix(COLABFOLD.out.versions) ch_report_input = ch_report_input.mix( @@ -169,8 +168,6 @@ workflow NFCORE_PROTEINFOLD { .join(COLABFOLD.out.msa) .map { it[0]["model"] = "colabfold"; it } ) - // ch_outputsheet = ch_report_input.transpose(by:1).filter{it[1].name.contains("rank_01")} - // ch_outputsheet.view() } // @@ -234,7 +231,6 @@ workflow NFCORE_PROTEINFOLD { ) } - emit: multiqc_report = ch_multiqc } From 91dead17fbfa29ddc937fada5677b377efb6a9e4 Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:28:49 +0200 Subject: [PATCH 05/28] fix output folder --- conf/modules_alphafold2.config | 12 ++++++------ conf/modules_colabfold.config | 8 ++++---- conf/modules_esmfold.config | 6 +++--- docs/output.md | 15 ++++++--------- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index c8b4fab3..b9deab54 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -40,14 +40,14 @@ if (params.alphafold2_mode == 'standard') { params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' ].join(' ').trim() publishDir = [ - [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + [ + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*.1.alphafold.pdb' @@ -63,7 +63,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') { withName: 'RUN_ALPHAFOLD2_MSA' { ext.args = params.max_template_date ? "--max_template_date ${params.max_template_date}" : '' publishDir = [ - path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -74,13 +74,13 @@ if (params.alphafold2_mode == 'split_msa_prediction') { ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/complete_results" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: 'ranked_0.pdb' diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index ecf87d75..922a3da5 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -31,13 +31,13 @@ if (params.colabfold_server == 'webserver') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' @@ -76,13 +76,13 @@ if (params.colabfold_server == 'local') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/complete_results" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/best_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index ba523450..92c2405a 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -15,13 +15,13 @@ process { ext.args = {params.use_gpu ? '' : '--cpu-only'} publishDir = [ [ - path: { "${params.outdir}/esmfold/complete_results" }, + path: { "${params.outdir}/esmfold" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*.tsv' + pattern: '*' ], [ - path: { "${params.outdir}/esmfold" }, + path: { "${params.outdir}/esmfold/best_structures" }, mode: 'copy', pattern: '*.pdb' ] diff --git a/docs/output.md b/docs/output.md index 291614a5..faa7da7f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,9 +23,8 @@ The directories listed below will be created in the output directory after the p
Output files -- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. - - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - - `.pdb` that is the structure with the highest pLDDT score (ranked first) +- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -90,9 +89,8 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
Output files -- `colabfold_webserver/` or `colabfold_local/` based on the selected mode. - - `complete_results/` that contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - - `.pdb` that is the structure with the highest pLDDT score (ranked first) +- `colabfold_webserver/` or `colabfold_local/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -117,9 +115,8 @@ Below you can find some indicative examples of the output images produced by Col Output files - `esmfold/` - - `.pdb` that is the structure with the highest pLDDT score (ranked first) - - `complete_results` - - `_plddt_mqc.tsv` that presents the pLDDT scores per residue. + contains the predicted structures. + - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files From a22a92d8d25afaec9700dffa933fab793bce9d3a Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:44:01 +0200 Subject: [PATCH 06/28] update --- conf/modules_alphafold2.config | 2 +- conf/test_split_fasta.config | 2 +- .../local/utils_nfcore_proteinfold_pipeline/main.nf | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index b9deab54..c27defbd 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -73,7 +73,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') { if(params.use_gpu) { accelerator = 1 } ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ - [ + [ path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config index 9eca8853..b7e5ead0 100644 --- a/conf/test_split_fasta.config +++ b/conf/test_split_fasta.config @@ -35,4 +35,4 @@ process { withName: 'MMSEQS_COLABFOLDSEARCH|COLABFOLD_BATCH' { container = 'biocontainers/gawk:5.1.0' } -} \ No newline at end of file +} diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index faf0b3ff..6611eefe 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -235,17 +235,17 @@ def cleanHeader(header) { } def validateFasta(fasta) { - // extract headers + // extract headers def headers = fasta.findAll { it.startsWith('>') } // if headers are not unique, throw an error if (headers.size() != headers.unique().size()) { throw new Exception("Invalid FASTA file. The headers are not unique.") } - // check headers that are malformed + // check headers that are malformed headers.each { header -> if (header =~ /[ \t;,]/) { // warn user that the header contains special characters log.warn "The header ${header} contains special characters. They have been automatically removed." } } -} \ No newline at end of file +} From 58a19c891ccfd2d95489c9ea62430c6fb52dd17a Mon Sep 17 00:00:00 2001 From: luisas Date: Fri, 25 Oct 2024 17:45:40 +0200 Subject: [PATCH 07/28] fix lintin --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index faa7da7f..cc74c1d0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -115,7 +115,7 @@ Below you can find some indicative examples of the output images produced by Col Output files - `esmfold/` - contains the predicted structures. + contains the predicted structures. - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files From 250095188c7c7f7c10e7bfefb8cde810878db5de Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:24:20 +0100 Subject: [PATCH 08/28] Update conf/modules_alphafold2.config Co-authored-by: Jose Espinosa-Carrasco --- conf/modules_alphafold2.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index c27defbd..90031abd 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -47,7 +47,7 @@ if (params.alphafold2_mode == 'standard') { pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/best_structures" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*.1.alphafold.pdb' From ace5aef6c384a87e66a4e7e83e445abae1c61a4e Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:24:28 +0100 Subject: [PATCH 09/28] Update conf/modules_alphafold2.config Co-authored-by: Jose Espinosa-Carrasco --- conf/modules_alphafold2.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index 90031abd..cfc063a7 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -80,7 +80,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') { pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/best_structures" }, + path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: 'ranked_0.pdb' From b32702e050d82e1034d8161a453c995c0b64549b Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:24:38 +0100 Subject: [PATCH 10/28] Update conf/modules_colabfold.config Co-authored-by: Jose Espinosa-Carrasco --- conf/modules_colabfold.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index 922a3da5..c1d89c7c 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -37,7 +37,7 @@ if (params.colabfold_server == 'webserver') { pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/best_structures" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' From 47b252f8317da60c1fcbf262a636526a0d6daf59 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:24:45 +0100 Subject: [PATCH 11/28] Update conf/modules_colabfold.config Co-authored-by: Jose Espinosa-Carrasco --- conf/modules_colabfold.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index c1d89c7c..cfd1b7a1 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -82,7 +82,7 @@ if (params.colabfold_server == 'local') { pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/best_structures" }, + path: { "${params.outdir}/colabfold_${params.colabfold_server}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_01.pdb' From d6d798e67f58289dd5ba0115f17163d10eb6561e Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:24:52 +0100 Subject: [PATCH 12/28] Update conf/modules_esmfold.config Co-authored-by: Jose Espinosa-Carrasco --- conf/modules_esmfold.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index 92c2405a..9af290fc 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -21,7 +21,7 @@ process { pattern: '*' ], [ - path: { "${params.outdir}/esmfold/best_structures" }, + path: { "${params.outdir}/esmfold/top_ranked_structures" }, mode: 'copy', pattern: '*.pdb' ] From b987430625bc54f9d0cb677b2bee8ab117559548 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:25:06 +0100 Subject: [PATCH 13/28] Update docs/output.md Co-authored-by: Jose Espinosa-Carrasco --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index cc74c1d0..fef008eb 100644 --- a/docs/output.md +++ b/docs/output.md @@ -24,7 +24,7 @@ The directories listed below will be created in the output directory after the p Output files - `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) + - `top_ranked_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files From 1a09418690a9534d7bf1d0152b594e82e026cbe2 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:25:17 +0100 Subject: [PATCH 14/28] Update docs/usage.md Co-authored-by: Jose Espinosa-Carrasco --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index d4502203..8f3f5105 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -35,7 +35,7 @@ The samplesheet can have as many columns as you desire, however, there is a stri An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -Each FASTA file is assumed to contain a single protein sequence unless you are using multimer mode. If you want to provide a FASTA file with multiple protein sequences, each to be folded individually, you can supply one or more FASTA files containing one or more sequences and use the --split_fasta parameter. In this case, each sequence in the FASTA file will be folded individually and in parallel, as if you had listed each sequence separately in the samplesheet. +Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the --split_fasta parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet. ## Running the pipeline From 1bf0d3b17f2eb2b5f8252cc6f3016e2e7abb0368 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Mon, 18 Nov 2024 10:25:33 +0100 Subject: [PATCH 15/28] Update nextflow_schema.json Co-authored-by: Jose Espinosa-Carrasco --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 8df979ce..d8191d6c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,7 +42,7 @@ }, "split_fasta": { "type": "boolean", - "description": "Split input fasta file in multiple fasta files each of them containing one sequence to be folded", + "description": "Split input multi-fasta file in separated fasta files each of them containing one sequence to be folded", "fa_icon": "fas fa-microchip" }, "email": { From c8de3181c80deb180e2e801d10a08b57fce1d516 Mon Sep 17 00:00:00 2001 From: luisas Date: Mon, 18 Nov 2024 11:25:39 +0100 Subject: [PATCH 16/28] update config af2 --- conf/modules_alphafold2.config | 4 ++-- conf/modules_esmfold.config | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index cfc063a7..18e1022b 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -50,7 +50,7 @@ if (params.alphafold2_mode == 'standard') { path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, - pattern: '*.1.alphafold.pdb' + pattern: '*_alphafold2.pdb' ] ] } @@ -83,7 +83,7 @@ if (params.alphafold2_mode == 'split_msa_prediction') { path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, - pattern: 'ranked_0.pdb' + pattern: '*_alphafold2.pdb' ] ] } diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index 9af290fc..30c80772 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -23,6 +23,7 @@ process { [ path: { "${params.outdir}/esmfold/top_ranked_structures" }, mode: 'copy', + saveAs: { "${meta.id}.pdb" }, pattern: '*.pdb' ] ] From b53fa81e37a97f04a009c6f15a379439dfbbc726 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Thu, 21 Nov 2024 14:24:25 +0000 Subject: [PATCH 17/28] fix --- conf/modules.config | 8 -------- 1 file changed, 8 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5f6fbd9f..c56b11eb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,14 +50,6 @@ process { ] } - withName: 'GENERATE_REPORT'{ - publishDir = [ - path: { "${params.outdir}/report" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: 'FOLDSEEK_EASYSEARCH' { ext.args = { params.foldseek_easysearch_arg ? "$params.foldseek_easysearch_arg" : "--format-mode 3" } publishDir = [ From 673d30417b266ef393878be3488aa756d1b8925d Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 21 Nov 2024 17:58:09 +0100 Subject: [PATCH 18/28] update --- conf/modules_colabfold.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index cfd1b7a1..21021900 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -40,7 +40,7 @@ if (params.colabfold_server == 'webserver') { path: { "${params.outdir}/colabfold_${params.colabfold_server}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, - pattern: '*_relaxed_rank_01.pdb' + pattern: '*_relaxed_rank_001*.pdb' ] ] } @@ -85,7 +85,7 @@ if (params.colabfold_server == 'local') { path: { "${params.outdir}/colabfold_${params.colabfold_server}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, - pattern: '*_relaxed_rank_01.pdb' + pattern: '*_relaxed_rank_001*.pdb' ], ] } From 3f22c4291c2a3a47a33997e6076653a7f62e269d Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 11:08:42 +0100 Subject: [PATCH 19/28] Update conf/test_split_fasta.config Co-authored-by: Jose Espinosa-Carrasco --- conf/test_split_fasta.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_split_fasta.config b/conf/test_split_fasta.config index b7e5ead0..a1c3c683 100644 --- a/conf/test_split_fasta.config +++ b/conf/test_split_fasta.config @@ -28,7 +28,7 @@ params { colabfold_server = 'local' split_fasta = true colabfold_db = "${projectDir}/assets/dummy_db_dir" - input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' + input = params.pipelines_testdata_base_path + 'proteinfold/testdata/samplesheet/v1.0/samplesheet_multimer.csv' } process { From 4e748ad97031b3946649f46169208d3dc94ccfdb Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 11:09:14 +0100 Subject: [PATCH 20/28] Update subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf Co-authored-by: Jose Espinosa-Carrasco --- subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index 6611eefe..2b57012b 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -67,7 +67,7 @@ workflow PIPELINE_INITIALISATION { // ch_samplesheet = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) if (params.split_fasta) { - // here we have to validate that the ids are unique and valid as an extra step + // TODO: here we have to validate that the ids are unique and valid as an extra step // since it is not done with the samplesheet schema (they are all in the same file) ch_samplesheet.map { meta, fasta -> validateFasta(fasta) From 15ac126a289d80792da886032ea21b6030a0e5e2 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 11:10:21 +0100 Subject: [PATCH 21/28] Update subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf Co-authored-by: Jose Espinosa-Carrasco --- .../utils_nfcore_proteinfold_pipeline/main.nf | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf index 2b57012b..c9bd0d57 100644 --- a/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinfold_pipeline/main.nf @@ -74,13 +74,16 @@ workflow PIPELINE_INITIALISATION { } // Split the fasta file into individual files for each sequence - ch_samplesheet.map{ meta,fasta -> fasta} - .splitFasta( record: [header: true, sequence: true] ) - .collectFile { item -> - [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ] - }.map{ - file -> [[id: file.baseName], file] - }.set{ch_samplesheet} + ch_samplesheet + .map { meta,fasta -> fasta } + .splitFasta( record: [header: true, sequence: true] ) + .collectFile { item -> + [ "${cleanHeader(item["header"])}.fa", ">" + cleanHeader(item["header"]) + '\n' +item["sequence"] ] + } + .map { + file -> [[id: file.baseName], file] + } + .set { ch_samplesheet } } emit: From c80c6c18bd06ee201e77c299283e0f58796cf284 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 11:10:32 +0100 Subject: [PATCH 22/28] Update docs/usage.md Co-authored-by: Jose Espinosa-Carrasco --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 5ac9cb66..3ac88ecd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -35,7 +35,7 @@ The samplesheet can have as many columns as you desire, however, there is a stri An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the --split_fasta parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet. +Each FASTA file should contain a single protein sequence unless using multimer mode. To provide a FASTA file with multiple sequences for individual folding, you can use one or more FASTA files with the `--split_fasta` parameter. This will treat each sequence in the FASTA file as a separate entry, folding them individually and in parallel, as if each sequence were listed separately in the samplesheet. ## Running the pipeline From 5e97599fefb0a10c3ace18023380742f1c5fc3ff Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 11:10:44 +0100 Subject: [PATCH 23/28] Update docs/output.md Co-authored-by: Jose Espinosa-Carrasco --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index fef008eb..c9f35bd3 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,7 +23,7 @@ The directories listed below will be created in the output directory after the p
Output files -- `alphafold_standard/` or `alphafold_split_msa_prediction/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. +- `alphafold2_standard/` or `alphafold2_split_msa_prediction/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - `top_ranked_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files From f88c9dfce66449e7076f90e058a4b2ceb3b2cfdb Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 10:11:54 +0000 Subject: [PATCH 24/28] fix leftover --- conf/modules_esmfold.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index 30c80772..967d77f7 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -18,7 +18,7 @@ process { path: { "${params.outdir}/esmfold" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: '*' + pattern: '*.*' ], [ path: { "${params.outdir}/esmfold/top_ranked_structures" }, From ec35c2b31d116e42d877cb4b55fd47c5f92a3149 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 10:13:45 +0000 Subject: [PATCH 25/28] update samplesheet --- assets/samplesheet.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 assets/samplesheet.csv diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv new file mode 100644 index 00000000..b458d604 --- /dev/null +++ b/assets/samplesheet.csv @@ -0,0 +1,3 @@ +id,fasta +T1024,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1024.fasta +T1026,https://raw.githubusercontent.com/nf-core/test-datasets/proteinfold/testdata/sequences/T1026.fasta From 1ffad8ff05ea0bf29e454e46de43854ba8afec65 Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 10:20:10 +0000 Subject: [PATCH 26/28] fix review --- conf/modules_alphafold2.config | 8 ++++---- conf/modules_colabfold.config | 8 ++++---- conf/modules_esmfold.config | 4 ++-- docs/output.md | 10 +++++----- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/conf/modules_alphafold2.config b/conf/modules_alphafold2.config index 18e1022b..a12105ab 100644 --- a/conf/modules_alphafold2.config +++ b/conf/modules_alphafold2.config @@ -41,13 +41,13 @@ if (params.alphafold2_mode == 'standard') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/top_ranked_structures" }, + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_alphafold2.pdb' @@ -74,13 +74,13 @@ if (params.alphafold2_mode == 'split_msa_prediction') { ext.args = params.use_gpu ? '--use_gpu_relax=true' : '--use_gpu_relax=false' publishDir = [ [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}" }, + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/alphafold2_${params.alphafold2_mode}/top_ranked_structures" }, + path: { "${params.outdir}/alphafold2/${params.alphafold2_mode}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_alphafold2.pdb' diff --git a/conf/modules_colabfold.config b/conf/modules_colabfold.config index 21021900..c37214d3 100644 --- a/conf/modules_colabfold.config +++ b/conf/modules_colabfold.config @@ -31,13 +31,13 @@ if (params.colabfold_server == 'webserver') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/top_ranked_structures" }, + path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_001*.pdb' @@ -76,13 +76,13 @@ if (params.colabfold_server == 'local') { ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}" }, + path: { "${params.outdir}/colabfold/${params.colabfold_server}" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/colabfold_${params.colabfold_server}/top_ranked_structures" }, + path: { "${params.outdir}/colabfold/${params.colabfold_server}/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*_relaxed_rank_001*.pdb' diff --git a/conf/modules_esmfold.config b/conf/modules_esmfold.config index 967d77f7..3468718f 100644 --- a/conf/modules_esmfold.config +++ b/conf/modules_esmfold.config @@ -15,13 +15,13 @@ process { ext.args = {params.use_gpu ? '' : '--cpu-only'} publishDir = [ [ - path: { "${params.outdir}/esmfold" }, + path: { "${params.outdir}/esmfold/default" }, mode: 'copy', saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: '*.*' ], [ - path: { "${params.outdir}/esmfold/top_ranked_structures" }, + path: { "${params.outdir}/esmfold/default/top_ranked_structures" }, mode: 'copy', saveAs: { "${meta.id}.pdb" }, pattern: '*.pdb' diff --git a/docs/output.md b/docs/output.md index c9f35bd3..542c8140 100644 --- a/docs/output.md +++ b/docs/output.md @@ -23,7 +23,7 @@ The directories listed below will be created in the output directory after the p
Output files -- `alphafold2_standard/` or `alphafold2_split_msa_prediction/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. +- `alphafold2/standard/` or `alphafold2/split_msa_prediction/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - `top_ranked_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files @@ -89,8 +89,8 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
Output files -- `colabfold_webserver/` or `colabfold_local/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) +- `colabfold/webserver/` or `colabfold/local/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. + - `top_ranked_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
@@ -114,9 +114,9 @@ Below you can find some indicative examples of the output images produced by Col
Output files -- `esmfold/` +- `esmfold/default` contains the predicted structures. - - `best_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) + - `top_ranked_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files
From 4ddb088f27abc31277904b81d17f8ff0a4f47e09 Mon Sep 17 00:00:00 2001 From: Jose Espinosa-Carrasco Date: Tue, 26 Nov 2024 11:39:51 +0100 Subject: [PATCH 27/28] Update docs/output.md --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 542c8140..df1591b4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -89,7 +89,7 @@ Below you can find an indicative example of the TSV file with the pLDDT scores p
Output files -- `colabfold/webserver/` or `colabfold/local/` based on the selected mode. Contain the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. +- `colabfold/webserver/` or `colabfold/local/` based on the selected mode. It contains the computed MSAs, unrelaxed structures, relaxed structures, ranked structures, raw model outputs, prediction metadata, and section timings. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the 5 predicted models. - `top_ranked_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files From 3742d4aa8f150557b46b19fffe0b2aaffb20beeb Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Tue, 26 Nov 2024 10:45:49 +0000 Subject: [PATCH 28/28] fix output --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 542c8140..0f114a2d 100644 --- a/docs/output.md +++ b/docs/output.md @@ -115,7 +115,7 @@ Below you can find some indicative examples of the output images produced by Col Output files - `esmfold/default` - contains the predicted structures. + contains the predicted structures. Specifically, `_plddt_mqc.tsv` presents the pLDDT scores per residue for each of the predicted models. - `top_ranked_structures/.pdb` that is the structure with the highest pLDDT score per input (ranked first) - `DBs/` that contains symbolic links to the downloaded database and parameter files