diff --git a/.editorconfig b/.editorconfig
index b78de6e6..b6b31907 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -8,7 +8,7 @@ trim_trailing_whitespace = true
 indent_size = 4
 indent_style = space

-[*.{md,yml,yaml,html,css,scss,js,cff}]
+[*.{md,yml,yaml,html,css,scss,js}]
 indent_size = 2

 # These files are edited and tested upstream in nf-core/modules
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
index fea6264d..9a85c826 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -45,6 +45,6 @@ body:
         * Nextflow version _(eg. 22.10.1)_
         * Hardware _(eg. HPC, Desktop, Cloud)_
         * Executor _(eg. slurm, local, awsbatch)_
-        * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter or Charliecloud)_
+        * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_
         * OS _(eg. CentOS Linux, macOS, Linux Mint)_
         * Version of nf-core/fetchngs _(eg. 1.1, 1.5, 1.8.2)_
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index c9f23b88..a8f29dc0 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -15,7 +15,8 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/fetc
 - [ ] This comment contains a description of changes (with reason).
 - [ ] If you've fixed a bug or added code that should be tested, add tests!
-- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/fetchngs/tree/master/.github/CONTRIBUTING.md)- [ ] If necessary, also make a PR on the nf-core/fetchngs _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository.
+- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/fetchngs/tree/master/.github/CONTRIBUTING.md)
+- [ ] If necessary, also make a PR on the nf-core/fetchngs _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository.
 - [ ] Make sure your code lints (`nf-core lint`).
 - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`).
 - [ ] Usage Documentation in `docs/usage.md` is updated.
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
deleted file mode 100644
index d648893c..00000000
--- a/.github/workflows/awsfulltest.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: nf-core AWS full size tests
-# This workflow is triggered on published releases.
-# It can be additionally triggered manually with GitHub actions workflow dispatch button.
-# It runs the -profile 'test_full' on AWS batch
-
-on:
-  release:
-    types: [published]
-  workflow_dispatch:
-jobs:
-  run-tower:
-    name: Run AWS full tests
-    if: github.repository == 'nf-core/fetchngs'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Launch workflow via tower
-        uses: nf-core/tower-action@v3
-        with:
-          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
-          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
-          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/fetchngs/work-${{ github.sha }}
-          parameters: |
-            {
-              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-${{ github.sha }}"
-            }
-          profiles: test_full,aws_tower
-      - uses: actions/upload-artifact@v3
-        with:
-          name: Tower debug log file
-          path: tower_action_*.log
diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml
deleted file mode 100644
index 6e6a8c52..00000000
--- a/.github/workflows/awstest.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: nf-core AWS test
-# This workflow can be triggered manually with the GitHub actions workflow dispatch button.
-# It runs the -profile 'test' on AWS batch
-
-on:
-  workflow_dispatch:
-jobs:
-  run-tower:
-    name: Run AWS tests
-    if: github.repository == 'nf-core/fetchngs'
-    runs-on: ubuntu-latest
-    steps:
-      # Launch workflow using Tower CLI tool action
-      - name: Launch workflow via tower
-        uses: nf-core/tower-action@v3
-        with:
-          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
-          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
-          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/fetchngs/work-${{ github.sha }}
-          parameters: |
-            {
-              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-test-${{ github.sha }}"
-            }
-          profiles: test,aws_tower
-      - uses: actions/upload-artifact@v3
-        with:
-          name: Tower debug log file
-          path: tower_action_*.log
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml
index 708158fa..3747c012 100644
--- a/.github/workflows/branch.yml
+++ b/.github/workflows/branch.yml
@@ -13,7 +13,7 @@ jobs:
       - name: Check PRs
         if: github.repository == 'nf-core/fetchngs'
         run: |
-          { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/fetchngs ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]
+          { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/fetchngs ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]]

 # If the above check failed, post a comment on the PR explaining the failure
 # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets
diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml
new file mode 100644
index 00000000..694e90ec
--- /dev/null
+++ b/.github/workflows/clean-up.yml
@@ -0,0 +1,24 @@
+name: "Close user-tagged issues and PRs"
+on:
+  schedule:
+    - cron: "0 0 * * 0" # Once a week
+
+jobs:
+  clean-up:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v7
+        with:
+          stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days."
+          stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful."
+          close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity."
+          days-before-stale: 30
+          days-before-close: 20
+          days-before-pr-close: -1
+          any-of-labels: "awaiting-changes,awaiting-feedback"
+          exempt-issue-labels: "WIP"
+          exempt-pr-labels: "WIP"
+          repo-token: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.github/workflows/cloud_tests_full.yml b/.github/workflows/cloud_tests_full.yml
new file mode 100644
index 00000000..e25a97aa
--- /dev/null
+++ b/.github/workflows/cloud_tests_full.yml
@@ -0,0 +1,81 @@
+name: full-sized tests on cloud providers
+run-name: Submitting workflow to all cloud providers using full sized data
+on:
+  release:
+    types: [published]
+  workflow_dispatch:
+    inputs:
+      platform:
+        description: "Platform to run test"
+        required: true
+        default: "all"
+        type: choice
+        options:
+          - all
+          - aws
+          - azure
+          - gcp
+jobs:
+  run-full-tests-on-aws:
+    if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' || !github.event.inputs }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: seqeralabs/action-tower-launch@v1
+        with:
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+          compute_env: ${{ secrets.TOWER_CE_AWS_CPU }}
+          workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}"
+          run_name: "aws_fetchngs_full"
+          profiles: test_full,public_aws_ecr
+          parameters: |
+            {
+              "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}",
+              "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/fetchngs/results-${{ github.sha }}"
+            }
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: tower_action_*.log
+  run-full-tests-on-gcp:
+    if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'gcp' || !github.event.inputs }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: seqeralabs/action-tower-launch@v1
+        with:
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+          compute_env: ${{ secrets.TOWER_CE_GCP_CPU }}
+          workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/fetchngs/work-${{ github.sha }}"
+          run_name: "gcp_fetchngs_full"
+          profiles: test_full
+          parameters: |
+            {
+              "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}",
+              "outdir": "${{ secrets.TOWER_BUCKET_GCP }}/fetchngs/results-${{ github.sha }}"
+            }
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: tower_action_*.log
+  run-full-tests-on-azure:
+    if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' || !github.event.inputs }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: seqeralabs/action-tower-launch@v1
+        with:
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+          compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }}
+          workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/fetchngs/work-${{ github.sha }}"
+          run_name: "azure_fetchngs_full"
+          profiles: test_full
+          parameters: |
+            {
+              "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}",
+              "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/fetchngs/results-${{ github.sha }}"
+            }
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: tower_action_*.log
diff --git a/.github/workflows/cloud_tests_small.yml b/.github/workflows/cloud_tests_small.yml
new file mode 100644
index 00000000..e7636fdc
--- /dev/null
+++ b/.github/workflows/cloud_tests_small.yml
@@ -0,0 +1,76 @@
+name: small-sized tests on cloud providers
+run-name: Submitting workflow to all cloud providers using small sized data
+on:
+  workflow_dispatch:
+    inputs:
+      platform:
+        description: "Platform to run test"
+        required: true
+        default: "all"
+        type: choice
+        options:
+          - all
+          - aws
+          - azure
+          - gcp
+jobs:
+  run-small-tests-on-aws:
+    if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: seqeralabs/action-tower-launch@v1
+        with:
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+          compute_env: ${{ secrets.TOWER_CE_AWS_CPU }}
+          workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}"
+          run_name: "aws_fetchngs_small"
+          profiles: test,public_aws_ecr
+          parameters: |
+            {
+              "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/fetchngs/results-test-${{ github.sha }}"
+            }
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: tower_action_*.log
+  run-small-tests-on-gcp:
+    if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'gcp' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: seqeralabs/action-tower-launch@v1
+        with:
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+          compute_env: ${{ secrets.TOWER_CE_GCP_CPU }}
+          workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/fetchngs/work-${{ github.sha }}"
+          run_name: "gcp_fetchngs_small"
+          profiles: test
+          parameters: |
+            {
+              "outdir": "${{ secrets.TOWER_BUCKET_GCP }}/fetchngs/results-test-${{ github.sha }}"
+            }
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: tower_action_*.log
+  run-small-tests-on-azure:
+    if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: seqeralabs/action-tower-launch@v1
+        with:
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
+          compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }}
+          workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/fetchngs/work-${{ github.sha }}"
+          run_name: "azure_fetchngs_small"
+          profiles: test
+          parameters: |
+            {
+              "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/fetchngs/results-test-${{ github.sha }}"
+            }
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: tower_action_*.log
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
index 858d622e..888cb4bc 100644
--- a/.github/workflows/linting.yml
+++ b/.github/workflows/linting.yml
@@ -78,7 +78,7 @@ jobs:

       - uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.8"
          architecture: "x64"

       - name: Install dependencies
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..0c31cdb9
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: "v2.7.1"
+    hooks:
+      - id: prettier
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54242118..12ef2d50 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,46 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-16
+
+### Credits
+
+Special thanks to the following for their contributions to the release:
+
+- [Adam Talbot](https://github.com/adamrtalbot)
+- [Esha Joshi](https://github.com/ejseqera)
+- [Maxime Garcia](https://github.com/maxulysse)
+- [Moritz E. Beber](https://github.com/Midnighter)
+- [Rob Syme](https://github.com/robsyme)
+- [sirclockalot](https://github.com/sirclockalot)
+
+Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form.
+
+### Enhancements & fixes
+
+- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress
+- [#104](https://github.com/nf-core/fetchngs/issues/104) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7)
+- [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids
+- [#138](https://github.com/nf-core/fetchngs/issues/138) - Add support for downloading protected dbGAP data using a JWT file
+- [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data
+- [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files
+- [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8)
+- [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0
+- [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower
+- [PR #157](https://github.com/nf-core/fetchngs/pull/157) - Add `public_aws_ecr.config` to source mulled containers when using `public.ecr.aws` Docker Biocontainer registry
+
+### Software dependencies
+
+| Dependency      | Old version | New version |
+| --------------- | ----------- | ----------- |
+| `synapseclient` | 2.6.0       | 2.7.1       |
+
+> **NB:** Dependency has been **updated** if both old and new version information is present.
+>
+> **NB:** Dependency has been **added** if just the new version information is present.
+>
+> **NB:** Dependency has been **removed** if new version information isn't present.
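> Reviewer note on [#104]: the restored GEO support resolves accessions through NCBI e-utils in a GSE → GDS UID → GSM → SRX chain (see the new `_gse_to_srx`, `_gds_to_gsm` and `_gsm_to_srx` helpers in `bin/sra_ids_to_runinfo.py` further down in this diff). A condensed, standalone sketch of that chain for orientation — the function names here are illustrative, not the script's API:

```python
import json
from urllib.parse import urlencode
from urllib.request import urlopen

EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

def esearch_uids(term, db):
    # esearch resolves a free-text accession (e.g. GSE147507) to NCBI UIDs.
    params = urlencode({"term": term, "db": db, "retmode": "json"})
    with urlopen(f"{EUTILS}/esearch.fcgi?{params}") as response:
        return json.load(response)["esearchresult"]["idlist"]

def gse_to_gsm_accessions(gse_id):
    # Each GDS UID's esummary payload lists the GSM samples of the series;
    # the pipeline then resolves every GSM to its SRX experiment the same way.
    accessions = []
    for uid in esearch_uids(gse_id, "gds"):
        params = urlencode({"id": uid, "db": "gds", "retmode": "json"})
        with urlopen(f"{EUTILS}/esummary.fcgi?{params}") as response:
            summary = json.load(response)
        accessions += [sample["accession"] for sample in summary["result"][uid]["samples"]]
    return accessions

# e.g. gse_to_gsm_accessions("GSE147507") should include GSM accessions
# such as GSM4432381 (both ids appear in the updated usage docs below).
```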
+
 ## [[1.9](https://github.com/nf-core/fetchngs/releases/tag/1.9)] - 2022-12-21

 ### Enhancements & fixes
diff --git a/README.md b/README.md
index fdc60a60..4d6c2d35 100644
--- a/README.md
+++ b/README.md
@@ -8,21 +8,55 @@
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
 [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/fetchngs)

-[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23fetchngs-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/fetchngs)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)
+[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23fetchngs-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/fetchngs)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)

 ## Introduction

-**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)).
+**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)).

-The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies.
+## Usage

-On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/fetchngs/results).
+> **Note**
+> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how
+> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline)
+> with `-profile test` before running the workflow on actual data.

-## Pipeline summary
+First, prepare a samplesheet with your input data that looks as follows:
+
+`ids.csv`:
+
+```csv
+SRR9984183
+SRR13191702
+ERR1160846
+ERR1109373
+DRR028935
+DRR026872
+```
+
+Each line represents a database id. Please see the next section for supported ids.
+
+Now, you can run the pipeline using:
+
+```bash
+nextflow run nf-core/fetchngs \
+   -profile <docker/singularity/.../institute> \
+   --input ids.csv \
+   --outdir <OUTDIR>
+```
+
+> **Warning:**
+> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
+> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
+> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
+
+For more details, please refer to the [usage documentation](https://nf-co.re/fetchngs/usage) and the [parameter documentation](https://nf-co.re/fetchngs/parameters).
+
+## Supported ids

 Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv)) the pipeline performs the following steps:

-### SRA / ENA / DDBJ ids
+### SRA / ENA / DDBJ / GEO ids

 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html)
 2. Fetch extensive id metadata via ENA API
@@ -31,18 +65,6 @@ Via a single file of ids, provided one-per-line (see [example input file](https:
    - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ
 4. Collate id metadata and paths to FastQ files in a single samplesheet

-### GEO ids
-
-Support for GEO ids was dropped in [[v1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102).
-
-As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline instead:
-
-- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo)
-- Click `SRA Run Selector` at the bottom of the GEO accession page
-- Select the desired samples in the `SRA Run Selector` and then download the `Accession List`
-
-This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline once renamed with a .csv extension e.g. `--input SRR_Acc_List.csv`.
-
 ### Synapse ids

 1. Resolve Synapse directory ids to their corresponding FastQ files ids via the `synapse list` command.
 2. Retrieve FastQ file metadata and MD5 checksums via the `synapse show` command.
@@ -50,45 +72,18 @@ This downloads a text file called `SRR_Acc_List.txt` that can be directly provid
 3. Download FastQ files in parallel via `synapse get`
 4. Collate paths to FastQ files in a single samplesheet

-### Samplesheet format
+## Pipeline output

-The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include:
+The columns in the output samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines (see [usage docs](https://nf-co.re/fetchngs/usage#samplesheet-format)), these currently include:

 - [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input)
 - [nf-core/atacseq](https://nf-co.re/atacseq/usage#samplesheet-input)
 - Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format)
 - [nf-core/taxprofiler](https://nf-co.re/nf-core/taxprofiler)

-See [usage docs](https://nf-co.re/fetchngs/1.8/usage#samplesheet-format) for more details.
-
-## Quick Start
-
-1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=22.10.1`)
-
-2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) (you can follow [this tutorial](https://singularity-tutorial.github.io/01-installation/)), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(you can use [`Conda`](https://conda.io/miniconda.html) both to install Nextflow itself and also to manage software within pipelines. Please only use it within pipelines as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_.
-
-3. Download the pipeline and test it on a minimal dataset with a single command:
-
-   ```bash
-   nextflow run nf-core/fetchngs -profile test,YOURPROFILE --outdir <OUTDIR>
-   ```
-
-   Note that some form of configuration will be needed so that Nextflow knows how to fetch the required software. This is usually done in the form of a config profile (`YOURPROFILE` in the example command above). You can chain multiple config profiles in a comma-separated string.
-
-   > - The pipeline comes with config profiles called `docker`, `singularity`, `podman`, `shifter`, `charliecloud` and `conda` which instruct the pipeline to use the named tool for software management. For example, `-profile test,docker`.
-   > - Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile <institute>` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
-   > - If you are using `singularity`, please use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to download images first, before running the pipeline. Setting the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options enables you to store and re-use the images from a central location for future pipeline runs.
-   > - If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs.
-
-4. Start running your own analysis!
-
-   ```bash
-   nextflow run nf-core/fetchngs --input ids.csv --outdir <OUTDIR> -profile <docker/singularity/podman/shifter/charliecloud/conda/institute>
-   ```
-
-## Documentation
-
-The nf-core/fetchngs pipeline comes with documentation about the pipeline [usage](https://nf-co.re/fetchngs/usage), [parameters](https://nf-co.re/fetchngs/parameters) and [output](https://nf-co.re/fetchngs/output).
+To see the results of a test run with a full size dataset refer to the [results](https://nf-co.re/fetchngs/results) tab on the nf-core website pipeline page.
+For more details about the output files and reports, please refer to the
+[output documentation](https://nf-co.re/fetchngs/output).

 ## Credits
diff --git a/assets/schema_input.json b/assets/schema_input.json
index 71f0f976..9a800216 100644
--- a/assets/schema_input.json
+++ b/assets/schema_input.json
@@ -8,8 +8,8 @@
     "type": "array",
     "items": {
         "type": "string",
-        "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(syn))(\\d+)$",
-        "errorMessage": "Please provide a valid SRA, ENA, DDBJ identifier"
+        "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$",
+        "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier"
     }
 }
diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py
index 70627791..0ffafba8 100755
--- a/bin/sra_ids_to_runinfo.py
+++ b/bin/sra_ids_to_runinfo.py
@@ -14,7 +14,8 @@
 from urllib.error import HTTPError, URLError
 from urllib.parse import urlencode
 from urllib.request import urlopen
-
+import json
+import time

 logger = logging.getLogger()
@@ -57,14 +58,12 @@
 # Full list of accepted fields can be obtained here:
 # https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run
 ENA_METADATA_FIELDS = (
-    "accession",
     "run_accession",
     "experiment_accession",
     "sample_accession",
     "secondary_sample_accession",
     "study_accession",
     "secondary_study_accession",
-    "parent_study",
     "submission_accession",
     "run_alias",
     "experiment_alias",
@@ -84,7 +83,6 @@
     "sample_title",
     "experiment_title",
     "study_title",
-    "description",
     "sample_description",
     "fastq_md5",
     "fastq_bytes",
@@ -191,10 +189,9 @@ def is_valid(cls, identifier):
 class DatabaseResolver:
     """Define a service class for resolving various identifiers to experiments."""

-    _GEO_PREFIXES = {"GSE", "GSM"}
+    _GEO_GSM_PREFIXES = {"GSM"}
+    _GEO_GSE_PREFIXES = {"GDS", "GSE"}
     _SRA_PREFIXES = {
-        "PRJNA",
-        "SAMN",
         "DRA",
         "DRP",
         "DRS",
@@ -202,7 +199,7 @@ class DatabaseResolver:
         "PRJDB",
         "SAMD",
     }
-    _ENA_PREFIXES = {"ERR", "SRR", "DRR"}
+    _ENA_PREFIXES = {"ERR", "SRR", "SAMN", "DRR"}

     @classmethod
     def expand_identifier(cls, identifier):
@@ -218,7 +215,9 @@
         """
         prefix = ID_REGEX.match(identifier).group(1)
-        if prefix in cls._GEO_PREFIXES:
+        if prefix in cls._GEO_GSM_PREFIXES:
+            return cls._gsm_to_srx(identifier)
+        elif prefix in cls._GEO_GSE_PREFIXES:
             return cls._gse_to_srx(identifier)
         elif prefix in cls._SRA_PREFIXES:
             return cls._id_to_srx(identifier)
@@ -243,21 +242,44 @@ def _id_to_srx(cls, identifier):
         return [row["Experiment"] for row in open_table(response, delimiter=",")]

     @classmethod
-    def _gse_to_srx(cls, identifier):
-        """Resolve the identifier to SRA experiments."""
+    def _gsm_to_srx(cls, identifier):
+        """Resolve the GEO identifier to SRA experiments."""
         ids = []
-        params = {"id": identifier, "db": "gds", "rettype": "runinfo", "retmode": "text"}
-        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}")
+        params = {"term": identifier, "db": "sra", "retmode": "json"}
+        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}")
         cls._content_check(response, identifier)
-        gsm_ids = [
-            line.split("=")[1].strip()
-            for line in response.text().splitlines()
-            if line.split("=")[1].strip().startswith("GSM")
-        ]
+        r_json = json.loads(response.text())
+        gsm_ids = r_json["esearchresult"]["idlist"]
         for gsm_id in gsm_ids:
             ids += cls._id_to_srx(gsm_id)
         return ids

+    @classmethod
+    def _gds_to_gsm(cls, identifier):
+        """Resolve the GEO UIDs to GSM IDs to then resolve to SRA IDs."""
+        ids = []
+        params = {"id": identifier, "db": "gds", "retmode": "json", "retmax": 10}
+        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{urlencode(params)}")
+        cls._content_check(response, identifier)
+        r_json = json.loads(response.text())
+
+        for each in r_json["result"][identifier]["samples"][0:]:
+            ids += cls._gsm_to_srx(each["accession"])
+        return ids
+
+    @classmethod
+    def _gse_to_srx(cls, identifier):
+        """Resolve the GSE identifier to GEO UIDs."""
+        ids = []
+        params = {"term": identifier, "db": "gds", "retmode": "json"}
+        response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}")
+        cls._content_check(response, identifier)
+        r_json = json.loads(response.text())
+        gds_uids = r_json["esearchresult"]["idlist"]
+        for gds_uid in gds_uids:
+            ids += cls._gds_to_gsm(gds_uid)
+        return ids
+
     @classmethod
     def _id_to_erx(cls, identifier):
         """Resolve the identifier to ENA experiments."""
@@ -366,23 +388,52 @@ def validate_fields_parameter(param, valid_vals, param_desc):
     if len(set(user_vals) & set(valid_vals)) == len(user_vals):
         return user_vals
     else:
+        invalid_vals = [x for x in user_vals if x not in valid_vals]
         logger.error(
             f"Please provide a valid value for {param_desc}!\n"
             f"Provided values = {param}\n"
-            f"Accepted values = {','.join(valid_vals)}"
+            f"Accepted values = {','.join(valid_vals)}\n"
+            f"The following values are invalid: {','.join(invalid_vals)}\n"
         )
         sys.exit(1)


 def fetch_url(url):
     """Return a response object for the given URL and handle errors appropriately."""
+    sleep_time = 5  # Hardcode sleep duration in seconds
+    max_num_attempts = 3  # Hardcode max number of request attempts
+    attempt = 0
+
     try:
         with urlopen(url) as response:
             return Response(response=response)
+
     except HTTPError as e:
-        logger.error("The server couldn't fulfill the request.")
-        logger.error(f"Status: {e.code} {e.reason}")
-        sys.exit(1)
+        if e.status == 429:
+            # If the response is 429, sleep and retry
+            if "Retry-After" in e.headers:
+                retry_after = int(e.headers["Retry-After"])
+                logging.warning(f"Received 429 response from server. Retrying after {retry_after} seconds...")
+                time.sleep(retry_after)
+            else:
+                logging.warning(f"Received 429 response from server. Retrying in {sleep_time} seconds...")
+                time.sleep(sleep_time)
+                sleep_time *= 2  # Increment sleep time
+            attempt += 1
+            return fetch_url(url)  # Recursive call to retry request
+
+        elif e.status == 500:
+            # If the response is 500, sleep and retry max 3 times
+            if attempt <= max_num_attempts:
+                logging.warning(f"Received 500 response from server. Retrying in {sleep_time} seconds...")
+                time.sleep(sleep_time)
+                sleep_time *= 2
+                attempt += 1
+                return fetch_url(url)
+            else:
+                logging.error("Exceeded max request attempts. Exiting.")
+                sys.exit(1)
+
     except URLError as e:
         logger.error("We failed to reach a server.")
         logger.error(f"Reason: {e.reason}")
diff --git a/conf/base.config b/conf/base.config
index 4382da20..4767e522 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -14,7 +14,7 @@ process {
     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
     time   = { check_max( 4.h  * task.attempt, 'time'   ) }

-    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
     maxRetries    = 1
     maxErrors     = '-1'
diff --git a/conf/modules.config b/conf/modules.config
index c42f4d60..f5cb1c77 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -77,6 +77,7 @@ if (params.input_type == 'sra') {
         }

         withName: SRATOOLS_FASTERQDUMP {
+            ext.args = '--split-files --include-technical'
             publishDir = [
                 path: { "${params.outdir}/fastq" },
                 mode: params.publish_dir_mode,
diff --git a/conf/public_aws_ecr.config b/conf/public_aws_ecr.config
new file mode 100644
index 00000000..14b577d7
--- /dev/null
+++ b/conf/public_aws_ecr.config
@@ -0,0 +1,33 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    AWS ECR Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config to set public AWS ECR images wherever possible
+    This improves speed when running on AWS infrastructure.
+    Use this as an example template when using your own private registry.
+----------------------------------------------------------------------------------------
+*/
+
+docker.registry = 'public.ecr.aws'
+podman.registry = 'public.ecr.aws'
+
+process {
+    withName: '.*:SRA_IDS_TO_RUNINFO' {
+        container = 'quay.io/biocontainers/python:3.9--1'
+    }
+    withName: '.*:SRA_RUNINFO_TO_FTP' {
+        container = 'quay.io/biocontainers/python:3.9--1'
+    }
+    withName: '.*:MULTIQC_MAPPINGS_CONFIG' {
+        container = 'quay.io/biocontainers/python:3.9--1'
+    }
+    withName: '.*:SRATOOLS_FASTERQDUMP' {
+        container = 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0'
+    }
+    withName: '.*:SRA_MERGE_SAMPLESHEET' {
+        container = 'quay.io/nf-core/ubuntu:20.04'
+    }
+    withName: '.*:SYNAPSE_MERGE_SAMPLESHEET' {
+        container = 'quay.io/nf-core/ubuntu:20.04'
+    }
+}
diff --git a/conf/test_full.config b/conf/test_full.config
index 2f0303ea..fed693e0 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -10,10 +10,12 @@
 ----------------------------------------------------------------------------------------
 */

+cleanup = true
+
 params {
     config_profile_name        = 'Full test profile'
     config_profile_description = 'Full test dataset to check pipeline function'

     // Input data for full size test
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test_full.csv'
 }
diff --git a/docs/output.md b/docs/output.md
index daaca914..7402976c 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -9,19 +9,19 @@ This document describes the output produced by the pipeline. The directories lis
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided:

 - Download FastQ files and create samplesheet from:
-  1. [SRA / ENA / DDBJ ids](#sra--ena--ddbj-ids)
+  1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids)
   2. [Synapse ids](#synapse-ids)
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution

 Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline.

-### SRA / ENA / DDBJ ids
+### SRA / ENA / DDBJ / GEO ids

 <details markdown="1">
 <summary>Output files</summary>

 - `fastq/`
-  - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ.
+  - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO.
 - `fastq/md5/`
   - `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA.
 - `samplesheet/`
diff --git a/docs/usage.md b/docs/usage.md
index 9172b403..0720a680 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -8,15 +8,15 @@

 The pipeline has been set-up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported:

-| `SRA`        | `ENA`        | `DDBJ`       | `Synapse`   |
-| ------------ | ------------ | ------------ | ----------- |
-| SRR11605097  | ERR4007730   | DRR171822    | syn26240435 |
-| SRX8171613   | ERX4009132   | DRX162434    |             |
-| SRS6531847   | ERS4399630   | DRS090921    |             |
-| SAMN14689442 | SAMEA6638373 | SAMD00114846 |             |
-| SRP256957    | ERP120836    | DRP004793    |             |
-| SRA1068758   | ERA2420837   | DRA008156    |             |
-| PRJNA625551  | PRJEB37513   | PRJDB4176    |             |
+| `SRA`        | `ENA`        | `DDBJ`       | `GEO`      | `Synapse`   |
+| ------------ | ------------ | ------------ | ---------- | ----------- |
+| SRR11605097  | ERR4007730   | DRR171822    | GSM4432381 | syn26240435 |
+| SRX8171613   | ERX4009132   | DRX162434    | GSE147507  |             |
+| SRS6531847   | ERS4399630   | DRS090921    |            |             |
+| SAMN14689442 | SAMEA6638373 | SAMD00114846 |            |             |
+| SRP256957    | ERP120836    | DRP004793    |            |             |
+| SRA1068758   | ERA2420837   | DRA008156    |            |             |
+| PRJNA625551  | PRJEB37513   | PRJDB4176    |            |             |

 ### SRR / ERR / DRR ids

@@ -70,12 +70,25 @@ From v1.9 of this pipeline the default `strandedness` in the output samplesheet

 If FTP connections are blocked on your network use the [`--force_sratools_download`](https://nf-co.re/fetchngs/parameters#force_sratools_download) parameter to force the pipeline to download data using sra-tools instead of the ENA FTP.

+### Downloading dbGAP data with JWT
+
+As of v1.10.0, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/) on a supported cloud computing environment (Amazon Web Services or Google Cloud Platform). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`.
+
+Note that due to the way the pipeline resolves SRA IDs down to the experiment to be able to merge multiple runs, your JWT cart file must be generated for _all_ runs in an experiment. Otherwise, upon running `prefetch` and `fasterq-dump`, the pipeline will return a `403` error when trying to download data for runs under an experiment that the provided JWT cart file does not cover.
+
+Users can log into the [SRA Run Selector](https://www.ncbi.nlm.nih.gov/Traces/study/), search for the dbGAP study they have been granted access to using the phs identifier, and select all available runs to activate the `JWT Cart` button to download the file. One way to enumerate all the runs under an experiment is sketched below.
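> Reviewer aside: because the cart must cover _all_ runs in an experiment, it can help to enumerate those runs up front. A small sketch against the public ENA Portal API — the same API the pipeline queries for metadata. The `filereport` endpoint and fields are standard ENA, but treat this exact usage as an assumption rather than pipeline code:

```python
import csv
import io
from urllib.parse import urlencode
from urllib.request import urlopen

def runs_for_experiment(experiment_accession):
    # The ENA filereport endpoint lists every run under an experiment,
    # which is exactly the set the JWT cart needs to cover.
    params = urlencode({
        "accession": experiment_accession,
        "result": "read_run",
        "fields": "run_accession",
        "format": "tsv",
    })
    with urlopen(f"https://www.ebi.ac.uk/ena/portal/api/filereport?{params}") as response:
        reader = csv.DictReader(io.TextIOWrapper(response, encoding="utf-8"), delimiter="\t")
        return [row["run_accession"] for row in reader]

# For the dbGAP test study described below, runs_for_experiment("SRX512039")
# should return SRR1219865 and SRR1219902.
```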
+
+To test this functionality in your cloud computing environment, you can use the protected dbGAP cloud testing study with experiment accession `SRX512039`:
+
+- On the [SRA Run Selector page for `SRX512039`](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRX512039&o=acc_s%3Aa), select the two available runs (`SRR1219865` and `SRR1219902`) and click on `JWT Cart` to download a key file called `cart.jwt` that can be directly provided to the pipeline with `--dbgap_key cart.jwt`
+- Click on `Accession List` to download a text file called `SRR_Acc_List.txt` with the SRR IDs that can be directly provided to the pipeline with `--input SRR_Acc_List.txt`
+
 ## Running the pipeline

 The typical command for running the pipeline is as follows:

 ```bash
-nextflow run nf-core/fetchngs --input ids.csv --outdir <OUTDIR> -profile docker
+nextflow run nf-core/fetchngs --input ./ids.csv --outdir ./results -profile docker
 ```

 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.

@@ -89,6 +102,28 @@ work                # Directory containing the nextflow working files
                     # Other nextflow hidden files, eg. history of pipeline runs and old logs.
 ```

+If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
+
+Pipeline settings can be provided in a `yaml` or `json` file via `-params-file <file>`.
+
+> ⚠️ Do not use `-c <file>` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args).
+
+The above pipeline run specified with a params file in yaml format:
+
+```bash
+nextflow run nf-core/fetchngs -profile docker -params-file params.yaml
+```
+
+with `params.yaml` containing:
+
+```yaml
+input: './ids.csv'
+outdir: './results/'
+<...>
+```
+
+You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
+
 ### Updating the pipeline

 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

@@ -105,6 +140,10 @@ First, go to the [nf-core/fetchngs releases page](https://github.com/nf-core/fet

 This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.

+To further assist in reproducibility, you can share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter.
+
+> 💡 If you wish to share such a profile (such as uploading it as supplementary material for academic publications), make sure to NOT include cluster-specific paths to files, nor institutional-specific profiles.
+
 ## Core Nextflow arguments

 > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).
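> Reviewer aside on the `fetch_url` retry logic added to `bin/sra_ids_to_runinfo.py` earlier in this diff: because the function retries by calling itself, the local `attempt` and `sleep_time` counters are re-initialised on every recursive call, so the 500 branch can in principle retry indefinitely rather than stopping after `max_num_attempts`. An iterative sketch of the same backoff idea (not the script's code) keeps the counters alive across attempts:

```python
import logging
import time
from urllib.error import HTTPError
from urllib.request import urlopen

def fetch_url_with_backoff(url, max_attempts=3, base_sleep=5):
    """Retry 429/500 responses with exponential backoff, honouring Retry-After."""
    sleep_time = base_sleep
    for attempt in range(1, max_attempts + 1):
        try:
            return urlopen(url)
        except HTTPError as exc:
            # Give up on non-transient status codes or once the budget is spent.
            if exc.code not in (429, 500) or attempt == max_attempts:
                raise
            retry_after = exc.headers.get("Retry-After")
            delay = int(retry_after) if retry_after else sleep_time
            logging.warning("HTTP %s from server, retrying in %s s...", exc.code, delay)
            time.sleep(delay)
            sleep_time *= 2  # back off for the next attempt
```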
@@ -113,7 +152,7 @@ This version number will be logged in reports when you run the pipeline, so that Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. @@ -137,8 +176,10 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. + - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. ### `-resume` @@ -156,102 +197,19 @@ Specify the path to a specific config file (this is a core Nextflow command). Se Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. -For example, if the nf-core/rnaseq pipeline is failing after multiple re-submissions of the `STAR_ALIGN` process due to an exit code of `137` this would indicate that there is an out of memory issue: - -```console -[62/149eb0] NOTE: Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) -- Execution is retried (1) -Error executing process > 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)' - -Caused by: - Process `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) - -Command executed: - STAR \ - --genomeDir star \ - --readFilesIn WT_REP1_trimmed.fq.gz \ - --runThreadN 2 \ - --outFileNamePrefix WT_REP1. \ - - -Command exit status: - 137 - -Command output: - (empty) - -Command error: - .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. 
-Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb - -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` -``` - -#### For beginners - -A first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefor you can try to increase the resource for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, you than can try to run your pipeline again with `--max_memory 200GB -resume` to skip all process, that were already calculated. If you can not increase the resource of the complete pipeline, you can try to adapt the resource for a single process as mentioned below. - -#### Advanced option on process level - -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). -We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`. -If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). -The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. -The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. -Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. -The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. - -```nextflow -process { - withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { - memory = 100.GB - } -} -``` - -> **NB:** We specify the full process name i.e. `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. -> -> If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. 
- -### Updating containers (advanced users) - -The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. - -1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19) -2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) -3. Create the custom config accordingly: - - - For Docker: +To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. - ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +### Custom Containers - - For Singularity: +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version may be out of date. - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. - - For Conda: +### Custom Tool Arguments - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` +A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. -> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. 
+To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. ### nf-core/configs diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 33cd4f6e..9b34804d 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -2,6 +2,7 @@ // This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. // +import nextflow.Nextflow import org.everit.json.schema.Schema import org.everit.json.schema.loader.SchemaLoader import org.everit.json.schema.ValidationException @@ -83,6 +84,7 @@ class NfcoreSchema { 'stub-run', 'test', 'w', + 'with-apptainer', 'with-charliecloud', 'with-conda', 'with-dag', @@ -177,7 +179,7 @@ class NfcoreSchema { } if (has_error) { - System.exit(1) + Nextflow.error('Exiting!') } } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 66126c5f..99858d2a 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the main.nf workflow in the nf-core/fetchngs pipeline // +import nextflow.Nextflow + class WorkflowMain { // @@ -20,7 +22,7 @@ class WorkflowMain { // // Generate help string // - public static String help(workflow, params, log) { + public static String help(workflow, params) { def command = "nextflow run ${workflow.manifest.name} --input ids.csv -profile docker" def help_string = '' help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) @@ -33,7 +35,7 @@ class WorkflowMain { // // Generate parameter summary log string // - public static String paramsSummaryLog(workflow, params, log) { + public static String paramsSummaryLog(workflow, params) { def summary_log = '' summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) @@ -49,7 +51,7 @@ class WorkflowMain { // Print help to screen if required if (params.help) { - log.info help(workflow, params, log) + log.info help(workflow, params) System.exit(0) } @@ -61,7 +63,7 @@ class WorkflowMain { } // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) + log.info paramsSummaryLog(workflow, params) // Validate workflow parameters via the JSON schema if (params.validate_params) { @@ -81,20 +83,18 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input file containing ids to the pipeline - one per line e.g. '--input ids.csv'" - System.exit(1) + Nextflow.error("Please provide an input file containing ids to the pipeline - one per line e.g. '--input ids.csv'") } // Check valid input_type has been provided def input_types = ['sra', 'synapse'] if (!input_types.contains(params.input_type)) { - log.error "Invalid option: '${params.input_type}'. Valid options for '--input_type': ${input_types.join(', ')}." - System.exit(1) + Nextflow.error("Invalid option: '${params.input_type}'. 
Valid options for '--input_type': ${input_types.join(', ')}.") } } // Check if input ids are from the SRA - public static Boolean isSraId(input, log) { + public static Boolean isSraId(input) { def is_sra = false def total_ids = 0 def no_match_ids = [] @@ -111,15 +111,14 @@ class WorkflowMain { if (num_match == total_ids) { is_sra = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" - System.exit(1) + Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / GEO / DDBJ or Synapse ids!") } } return is_sra } // Check if input ids are from the Synapse platform - public static Boolean isSynapseId(input, log) { + public static Boolean isSynapseId(input) { def is_synapse = false def total_ids = 0 def no_match_ids = [] @@ -136,8 +135,7 @@ class WorkflowMain { if (num_match == total_ids) { is_synapse = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" - System.exit(1) + Nextflow.error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / GEO / DDBJ or Synapse ids!") } } return is_synapse diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy index 90d86f1c..3c092a6f 100755 --- a/lib/WorkflowSra.groovy +++ b/lib/WorkflowSra.groovy @@ -2,17 +2,18 @@ // This file holds several functions specific to the workflow/sra.nf in the nf-core/fetchngs pipeline // +import nextflow.Nextflow + class WorkflowSra { // // Check and validate parameters // - public static void initialise(params, log, valid_params) { + public static void initialise(params, valid_params) { // Check minimal ENA fields are provided to download FastQ files def ena_metadata_fields = params.ena_metadata_fields ? params.ena_metadata_fields.split(',').collect{ it.trim().toLowerCase() } : valid_params['ena_metadata_fields'] if (!ena_metadata_fields.containsAll(valid_params['ena_metadata_fields'])) { - log.error "Invalid option: '${params.ena_metadata_fields}'. Minimally required fields for '--ena_metadata_fields': '${valid_params['ena_metadata_fields'].join(',')}'" - System.exit(1) + Nextflow.error("Invalid option: '${params.ena_metadata_fields}'. 
Minimally required fields for '--ena_metadata_fields': '${valid_params['ena_metadata_fields'].join(',')}'") } } @@ -29,21 +30,4 @@ class WorkflowSra { " running nf-core/other pipelines.\n" + "===================================================================================" } - - // Fail pipeline if input ids are from the GEO - public static void isGeoFail(ids, log) { - def pattern = /^(GS[EM])(\d+)$/ - for (id in ids) { - if (id =~ pattern) { - log.error "===================================================================================\n" + - " GEO id detected: ${id}\n" + - " Support for GEO ids was dropped in v1.7 due to breaking changes in the NCBI API.\n" + - " Please remove any GEO ids from the input samplesheet.\n\n" + - " Please see:\n" + - " https://github.com/nf-core/fetchngs/pull/102\n" + - "===================================================================================" - System.exit(1) - } - } - } } diff --git a/main.nf b/main.nf index 2c4b52f2..6da732be 100644 --- a/main.nf +++ b/main.nf @@ -39,12 +39,12 @@ Channel // Auto-detect input id type def input_type = '' -if (WorkflowMain.isSraId(ch_input, log)) { +if (WorkflowMain.isSraId(ch_input)) { input_type = 'sra' -} else if (WorkflowMain.isSynapseId(ch_input, log)) { +} else if (WorkflowMain.isSynapseId(ch_input)) { input_type = 'synapse' } else { - exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ or Synapse ids!' + exit 1, 'Ids provided via --input not recognised! Please make sure they are either SRA / ENA / GEO / DDBJ or Synapse ids!' } if (params.input_type == input_type) { @@ -63,7 +63,7 @@ if (params.input_type == input_type) { workflow NFCORE_FETCHNGS { // - // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ ids + // WORKFLOW: Download FastQ files for SRA / ENA / GEO / DDBJ ids // if (params.input_type == 'sra') { SRA ( ch_ids ) diff --git a/modules.json b/modules.json index c8cd187a..3137b88b 100644 --- a/modules.json +++ b/modules.json @@ -7,22 +7,22 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "custom/sratoolsncbisettings": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] }, "sratools/fasterqdump": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "6712754854ae2832abfff3f0800cdb4a6a60bfca", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] }, "sratools/prefetch": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "6712754854ae2832abfff3f0800cdb4a6a60bfca", "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] } } @@ -31,7 +31,7 @@ "nf-core": { "fastq_download_prefetch_fasterqdump_sratools": { "branch": "master", - "git_sha": "03711bcb7fa2a7088eb54abb1fca326d30e602c2", + "git_sha": "6712754854ae2832abfff3f0800cdb4a6a60bfca", "installed_by": ["subworkflows"] } } diff --git a/modules/local/multiqc_mappings_config.nf b/modules/local/multiqc_mappings_config.nf index 4f6c95bd..8efe1caa 100644 --- a/modules/local/multiqc_mappings_config.nf +++ b/modules/local/multiqc_mappings_config.nf @@ -4,7 +4,7 @@ process MULTIQC_MAPPINGS_CONFIG { conda "conda-forge::python=3.9.5" container "${ workflow.containerEngine == 'singularity' && 
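With the `isGeoFail` guard deleted, `GSE`/`GSM` accessions flow through the normal SRA route again, which is why `GEO` is added to the id regex and to every user-facing message. The auto-detected type still has to agree with `--input_type`; a sketch of that cross-check in `main.nf` follows (the mismatch message is illustrative, since the diff truncates that hunk):

```groovy
// Sketch: auto-detect the id type, then cross-check it against --input_type.
def input_type = ''
if (WorkflowMain.isSraId(ch_input)) {
    input_type = 'sra'
} else if (WorkflowMain.isSynapseId(ch_input)) {
    input_type = 'synapse'
} else {
    exit 1, 'Ids provided via --input not recognised! Please make sure they are either SRA / ENA / GEO / DDBJ or Synapse ids!'
}
if (params.input_type != input_type) {
    // Illustrative message; the original hunk is cut off in this diff.
    exit 1, "Ids detected as '${input_type}' but --input_type was set to '${params.input_type}'"
}
```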
!task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: path csv diff --git a/modules/local/sra_fastq_ftp.nf b/modules/local/sra_fastq_ftp.nf index 464a327e..2b7769ff 100644 --- a/modules/local/sra_fastq_ftp.nf +++ b/modules/local/sra_fastq_ftp.nf @@ -7,7 +7,7 @@ process SRA_FASTQ_FTP { conda "bioconda::sra-tools=2.11.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5321ha49a11a_3' : - 'quay.io/biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" + 'biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" input: tuple val(meta), val(fastq) diff --git a/modules/local/sra_ids_to_runinfo.nf b/modules/local/sra_ids_to_runinfo.nf index 49c83554..7d47f5e3 100644 --- a/modules/local/sra_ids_to_runinfo.nf +++ b/modules/local/sra_ids_to_runinfo.nf @@ -6,7 +6,7 @@ process SRA_IDS_TO_RUNINFO { conda "conda-forge::python=3.9.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: val id diff --git a/modules/local/sra_merge_samplesheet.nf b/modules/local/sra_merge_samplesheet.nf index 4b94a823..1c2ee7df 100644 --- a/modules/local/sra_merge_samplesheet.nf +++ b/modules/local/sra_merge_samplesheet.nf @@ -3,7 +3,7 @@ process SRA_MERGE_SAMPLESHEET { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path ('samplesheets/*') diff --git a/modules/local/sra_runinfo_to_ftp.nf b/modules/local/sra_runinfo_to_ftp.nf index 3a060f7b..9c83cf53 100644 --- a/modules/local/sra_runinfo_to_ftp.nf +++ b/modules/local/sra_runinfo_to_ftp.nf @@ -4,7 +4,7 @@ process SRA_RUNINFO_TO_FTP { conda "conda-forge::python=3.9.5" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" + 'biocontainers/python:3.9--1' }" input: path runinfo diff --git a/modules/local/synapse_get.nf b/modules/local/synapse_get.nf index f43e4d5e..c8a6d7a4 100644 --- a/modules/local/synapse_get.nf +++ b/modules/local/synapse_get.nf @@ -4,10 +4,10 @@ process SYNAPSE_GET { label 'process_low' label 'error_retry' - conda "bioconda::synapseclient=2.6.0" + conda "bioconda::synapseclient=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/synapseclient:2.6.0--pyh5e36f6f_0' : - 'quay.io/biocontainers/synapseclient:2.6.0--pyh5e36f6f_0' }" + 'https://depot.galaxyproject.org/singularity/synapseclient:2.7.1--pyh7cba7a3_0' : + 'biocontainers/synapseclient:2.7.1--pyh7cba7a3_0' }" input: val meta diff --git a/modules/local/synapse_list.nf b/modules/local/synapse_list.nf index f42357ab..0c03f8b2 100644 --- a/modules/local/synapse_list.nf +++ b/modules/local/synapse_list.nf @@ -3,10 +3,10 @@ process SYNAPSE_LIST { tag "$id" label 'process_low' - conda "bioconda::synapseclient=2.6.0" + conda "bioconda::synapseclient=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/synapseclient:2.6.0--pyh5e36f6f_0' : - 'quay.io/biocontainers/synapseclient:2.6.0--pyh5e36f6f_0' }" + 'https://depot.galaxyproject.org/singularity/synapseclient:2.7.1--pyh7cba7a3_0' : + 'biocontainers/synapseclient:2.7.1--pyh7cba7a3_0' }" input: val id diff --git a/modules/local/synapse_merge_samplesheet.nf b/modules/local/synapse_merge_samplesheet.nf index f46a1fbf..4cb2abc3 100644 --- a/modules/local/synapse_merge_samplesheet.nf +++ b/modules/local/synapse_merge_samplesheet.nf @@ -4,7 +4,7 @@ process SYNAPSE_MERGE_SAMPLESHEET { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: path ('samplesheets/*') diff --git a/modules/local/synapse_show.nf b/modules/local/synapse_show.nf index 0bd6cc12..e1f756a5 100644 --- a/modules/local/synapse_show.nf +++ b/modules/local/synapse_show.nf @@ -3,10 +3,10 @@ process SYNAPSE_SHOW { tag "$id" label 'process_low' - conda "bioconda::synapseclient=2.6.0" + conda "bioconda::synapseclient=2.7.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/synapseclient:2.6.0--pyh5e36f6f_0' : - 'quay.io/biocontainers/synapseclient:2.6.0--pyh5e36f6f_0' }" + 'https://depot.galaxyproject.org/singularity/synapseclient:2.7.1--pyh7cba7a3_0' : + 'biocontainers/synapseclient:2.7.1--pyh7cba7a3_0' }" input: val id diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df21765..ebc87273 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
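All of these module headers follow one rule: the conda pin, the Singularity image and the Docker tag move in lockstep (as with synapseclient 2.6.0 to 2.7.1 here), and the Docker URI loses its hard-coded `quay.io/` prefix so the registry is supplied once via `docker.registry` in `nextflow.config` (see the hunk further down). A sketch of the resulting header; the process name is made up:

```groovy
process SYNAPSE_EXAMPLE {
    // Keep all three pins on the same tool version and build.
    conda "bioconda::synapseclient=2.7.1"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/synapseclient:2.7.1--pyh7cba7a3_0' :
        'biocontainers/synapseclient:2.7.1--pyh7cba7a3_0' }"  // no registry prefix

    input:
    val id

    script:
    """
    synapse --version
    """
}
```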
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : + 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a0..c32657de 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: diff --git a/modules/nf-core/custom/sratoolsncbisettings/main.nf b/modules/nf-core/custom/sratoolsncbisettings/main.nf index 36be10d3..5deb8892 100644 --- a/modules/nf-core/custom/sratoolsncbisettings/main.nf +++ b/modules/nf-core/custom/sratoolsncbisettings/main.nf @@ -5,7 +5,7 @@ process CUSTOM_SRATOOLSNCBISETTINGS { conda "bioconda::sra-tools=2.11.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5321ha49a11a_3' : - 'quay.io/biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" + 'biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" output: path('*.mkfg') , emit: ncbi_settings diff --git a/modules/nf-core/sratools/fasterqdump/main.nf b/modules/nf-core/sratools/fasterqdump/main.nf index ca5ee763..2d9090e2 100644 --- a/modules/nf-core/sratools/fasterqdump/main.nf +++ b/modules/nf-core/sratools/fasterqdump/main.nf @@ -5,15 +5,16 @@ process SRATOOLS_FASTERQDUMP { conda "bioconda::sra-tools=2.11.0 conda-forge::pigz=2.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : - 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" + 'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" input: tuple val(meta), path(sra) path ncbi_settings + path certificate output: - tuple val(meta), path(fastq), emit: reads - path "versions.yml" , emit: versions + tuple val(meta), path('*.fastq.gz'), emit: reads + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -22,12 +23,8 @@ process SRATOOLS_FASTERQDUMP { def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - - // WARNING: Paired-end data extracted by fasterq-dump (--split-3 the default) - // always creates *_1.fastq *_2.fastq files but sometimes also - // an additional *.fastq file for unpaired reads which we ignore here. - fastq = meta.single_end ? '*.fastq.gz' : '*_{1,2}.fastq.gz' def outfile = meta.single_end ? "${prefix}.fastq" : prefix + def key_file = certificate ? 
"--perm ${certificate}" : '' """ export NCBI_SETTINGS="\$PWD/${ncbi_settings}" @@ -35,6 +32,7 @@ process SRATOOLS_FASTERQDUMP { $args \\ --threads $task.cpus \\ --outfile $outfile \\ + ${key_file} \\ ${sra.name} pigz \\ diff --git a/modules/nf-core/sratools/fasterqdump/meta.yml b/modules/nf-core/sratools/fasterqdump/meta.yml index d6fbd444..629bdca5 100644 --- a/modules/nf-core/sratools/fasterqdump/meta.yml +++ b/modules/nf-core/sratools/fasterqdump/meta.yml @@ -27,7 +27,11 @@ input: description: > An NCBI user settings file. pattern: "*.mkfg" - + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + pattern: "*.cart" output: - meta: type: map diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf index 57e8a3c9..ba7be4bd 100644 --- a/modules/nf-core/sratools/prefetch/main.nf +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -5,11 +5,12 @@ process SRATOOLS_PREFETCH { conda "bioconda::sra-tools=2.11.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/sra-tools:2.11.0--pl5321ha49a11a_3' : - 'quay.io/biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" + 'biocontainers/sra-tools:2.11.0--pl5321ha49a11a_3' }" input: tuple val(meta), val(id) path ncbi_settings + path certificate output: tuple val(meta), path(id), emit: sra @@ -20,6 +21,7 @@ process SRATOOLS_PREFETCH { shell: args = task.ext.args ?: '' + args += certificate ? " --perm ${certificate}" : '' args2 = task.ext.args2 ?: '5 1 100' // template 'retry_with_backoff.sh' } diff --git a/modules/nf-core/sratools/prefetch/meta.yml b/modules/nf-core/sratools/prefetch/meta.yml index a3a26522..9817b0b2 100644 --- a/modules/nf-core/sratools/prefetch/meta.yml +++ b/modules/nf-core/sratools/prefetch/meta.yml @@ -19,7 +19,7 @@ input: Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - id: - type: val + type: string description: > A string denoting an SRA id. - ncbi_settings: @@ -27,7 +27,11 @@ input: description: > An NCBI user settings file. 
pattern: "*.mkfg" - + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + pattern: "*.cart" output: - meta: type: map diff --git a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh index cec0ab43..e08dbb6a 100755 --- a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh +++ b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh @@ -47,6 +47,8 @@ retry_with_backoff !{args2} \ !{args} \ !{id} +[ -f !{id}.sralite ] && { mkdir -p !{id}; mv "!{id}.sralite" !{id}/; } + vdb-validate !{id} cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index d6686ae4..1e9aaed0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,10 +15,11 @@ params { nf_core_pipeline = null nf_core_rnaseq_strandedness = 'auto' ena_metadata_fields = null - sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description' + sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description' synapse_config = null force_sratools_download = false skip_fastq_download = false + dbgap_key = null // Boilerplate options outdir = null @@ -70,7 +71,11 @@ try { // } profiles { - debug { process.beforeScript = 'echo $HOSTNAME' } + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + } conda { conda.enabled = true docker.enabled = false @@ -78,6 +83,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } mamba { conda.enabled = true @@ -87,14 +93,17 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } docker { docker.enabled = true docker.userEmulation = true + conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' @@ -102,37 +111,57 @@ profiles { singularity { singularity.enabled = true singularity.autoMounts = true + conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } shifter { shifter.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false charliecloud.enabled = false + apptainer.enabled = false } charliecloud { charliecloud.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false + apptainer.enabled = false + } + apptainer { + apptainer.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } gitpod { executor.name = 'local' executor.cpus = 16 executor.memory = 60.GB } + public_aws_ecr { + includeConfig 'conf/public_aws_ecr.config' + } test { includeConfig 'conf/test.config' } test_synapse { includeConfig 
'conf/test_synapse.config' } test_full { includeConfig 'conf/test_full.config' } @@ -152,6 +181,12 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] +// Set default registry for Docker and Podman independent of -profile +// Will not be used unless Docker / Podman are enabled +// Set to your registry if you have a mirror of containers +docker.registry = 'quay.io' +podman.registry = 'quay.io' + def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true @@ -177,7 +212,7 @@ manifest { description = """Pipeline to fetch metadata and raw FastQ files from public databases""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.9' + version = '1.10.0' doi = 'https://doi.org/10.5281/zenodo.5070524' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 696914e4..ce2c13cc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ "pattern": "^\\S+\\.(csv|tsv|txt)$", "schema": "assets/schema_input.json", "fa_icon": "fas fa-file-excel", - "description": "File containing SRA/ENA/DDBJ identifiers one per line to download their associated metadata and FastQ files." + "description": "File containing SRA/ENA/GEO/DDBJ identifiers one per line to download their associated metadata and FastQ files." }, "input_type": { "type": "string", @@ -32,13 +32,13 @@ "type": "string", "fa_icon": "fas fa-columns", "description": "Comma-separated list of ENA metadata fields to fetch before downloading data.", - "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. Full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run])." + "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. `'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. Full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run)." }, "sample_mapping_fields": { "type": "string", "fa_icon": "fas fa-globe-americas", "description": "Comma-separated list of ENA metadata fields used to create a separate 'id_mappings.csv' and 'multiqc_config.yml' with selected fields that can be used to rename samples in general and in MultiQC.", - "default": "experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description,description" + "default": "experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description" }, "nf_core_pipeline": { "type": "string", @@ -64,6 +64,12 @@ "fa_icon": "fas fa-fast-forward", "description": "Only download metadata for public data database ids and don't download the FastQ files." 
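Because `docker.registry = 'quay.io'` (and the Podman equivalent) now sets the default registry globally, the unprefixed `biocontainers/...` and `nf-core/...` image names in the modules resolve to the same quay.io URIs as before, and a site with its own mirror can repoint every container in one place. A sketch of such an override; the mirror hostname is hypothetical:

```groovy
// e.g. in a site config supplied with -c mirror.config
docker.registry = 'mirror.example.com'   // hypothetical internal mirror
podman.registry = 'mirror.example.com'
// 'biocontainers/python:3.9--1' is then pulled as
// 'mirror.example.com/biocontainers/python:3.9--1'
```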
}, + "dbgap_key": { + "type": "string", + "fa_icon": "fas fa-address-card", + "help_text": "Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit. Users with granted access to controlled data can download the JWT cart file for the study from the SRA Run Selector upon logging in. The JWT file can only be used on cloud platforms and is valid for 1 hour upon creation.", + "format": "file-path" + }, "outdir": { "type": "string", "format": "directory-path", diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf index 1e1d0d7b..de31637e 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -7,7 +7,8 @@ include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/ // workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { take: - ch_sra_ids // channel: [ val(meta), val(id) ] + ch_sra_ids // channel: [ val(meta), val(id) ] + ch_dbgap_key // channel: [ path(dbgap_key) ] main: @@ -17,22 +18,22 @@ workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { // Detect existing NCBI user settings or create new ones. // CUSTOM_SRATOOLSNCBISETTINGS() - def settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings // value channel: path(settings) + ch_ncbi_settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) // // Prefetch sequencing reads in SRA format. // - SRATOOLS_PREFETCH ( ch_sra_ids, settings ) + SRATOOLS_PREFETCH ( ch_sra_ids, ch_ncbi_settings, ch_dbgap_key ) ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) // // Convert the SRA format into one or more compressed FASTQ files. // - SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings ) + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, ch_ncbi_settings, ch_dbgap_key ) ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) emit: - reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] - versions = ch_versions // channel: [ versions.yml ] + reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml index c385ca21..6ff9442b 100644 --- a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json name: fastq_download_prefetch_fasterqdump_sratools description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). keywords: @@ -21,6 +22,11 @@ input: type: string description: > SRA run identifier. 
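Tying the new schema entry to the plumbing above: `--dbgap_key` takes the JWT cart file downloaded from the SRA Run Selector, and per the help text it only works on cloud platforms and expires an hour after creation. An illustrative invocation; the ids file and cart path are placeholders:

```bash
nextflow run nf-core/fetchngs \
    --input ids.csv \
    --dbgap_key /path/to/cart.jwt \
    --outdir results \
    -profile docker
```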
+ - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + pattern: "*.cart" # TODO Update when we decide on a standard for subworkflow docs output: - meta: diff --git a/workflows/sra.nf b/workflows/sra.nf index 1085598b..44ff8346 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -11,7 +11,7 @@ def valid_params = [ def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) // Validate input parameters -WorkflowSra.initialise(params, log, valid_params) +WorkflowSra.initialise(params, valid_params) /* ======================================================================================== @@ -50,13 +50,6 @@ workflow SRA { main: ch_versions = Channel.empty() - // - // Fail the pipeline if GEO ids detected - // - ids - .collect() - .map { WorkflowSra.isGeoFail(it, log) } - // // MODULE: Get SRA run information for public database ids // @@ -113,7 +106,8 @@ workflow SRA { // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. // FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( - ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } + ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] }, + params.dbgap_key ? file(params.dbgap_key, checkIfExists: true) : [] ) ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first()) @@ -123,7 +117,7 @@ workflow SRA { .mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads) .map { meta, fastq -> - def reads = meta.single_end ? [ fastq ] : fastq + def reads = fastq instanceof List ? fastq.flatten() : [ fastq ] def meta_clone = meta.clone() meta_clone.fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : '' meta_clone.fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : ''
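The closing hunk replaces the `meta.single_end` branch with a type check because `SRATOOLS_FASTERQDUMP` now emits a plain `*.fastq.gz` glob: a sample can arrive as one path or as a list of paths, and dispatching on the actual type is more robust than trusting the metadata. A standalone sketch of the normalisation (the trailing `return` is inferred, as the diff cuts off here):

```groovy
// Normalise FastQ output to a list and record publish-dir paths on a
// copy of the meta map, mirroring the final hunk above.
def normalise(Map meta, fastq, String outdir) {
    def reads      = fastq instanceof List ? fastq.flatten() : [ fastq ]
    def meta_clone = meta.clone()
    meta_clone.fastq_1 = reads[0] ? "${outdir}/fastq/${reads[0].getName()}" : ''
    meta_clone.fastq_2 = reads[1] && !meta.single_end ? "${outdir}/fastq/${reads[1].getName()}" : ''
    return meta_clone
}
```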