From 132fe5f32de1f458657128e1d1b6ee79c0b46503 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 14:13:58 +0100 Subject: [PATCH 01/44] Add lots of CI files --- .editorconfig | 12 +++ .git-blame-ignore-revs | 1 + .github/workflows/check-log.yml | 26 +++++++ .github/workflows/lint-code.yml | 130 ++++++++++++++++++++++++++++++++ .gitignore | 2 + .pre-commit-config.yaml | 19 +++++ pyproject.toml | 27 +++++++ requirements-dev.txt | 5 ++ 8 files changed, 222 insertions(+) create mode 100644 .editorconfig create mode 100644 .git-blame-ignore-revs create mode 100644 .github/workflows/check-log.yml create mode 100644 .github/workflows/lint-code.yml create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000..70c7a9a8 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_size = 4 +indent_style = space + +[*.{md,yml,yaml,cff}] +indent_size = 2 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..a99e5d13 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +# Start adding here diff --git a/.github/workflows/check-log.yml b/.github/workflows/check-log.yml new file mode 100644 index 00000000..1447daba --- /dev/null +++ b/.github/workflows/check-log.yml @@ -0,0 +1,26 @@ +name: Check VERSIONLOG.MD has been updated +on: [pull_request] + +jobs: + check-versionlog: + runs-on: ubuntu-latest + steps: + - name: Checkout PR + uses: actions/checkout@v3 + with: + fetch-depth: 0 # Fetch all history for all branches and tags + + - name: Check for VERSIONLOG.MD changes + id: versionlog_check + # 1) Find the common ancestor between the current HEAD and the base branch + # 2) Then see if the versionlog has been updated in the PR since it diverged + # from the common ancestor + run: | + PR_BASE_SHA=$(git merge-base HEAD ${{ github.event.pull_request.base.sha }}) + FILE_CHANGED=$(git diff --name-only $PR_BASE_SHA HEAD | grep 'VERSIONLOG.md' || true) + if [ -n "$FILE_CHANGED" ]; then + echo "VERSIONLOG.MD has been changed." + else + echo "VERSIONLOG.MD has NOT been changed." + exit 1 # Fail the workflow if no changes in VERSIONLOG.MD + fi diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml new file mode 100644 index 00000000..73fb6f97 --- /dev/null +++ b/.github/workflows/lint-code.yml @@ -0,0 +1,130 @@ +name: Lint code +on: [push, pull_request] + +jobs: + # Use ruff to check for code style violations + ruff-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + - name: ruff --> Check for style violations + # Configured in pyproject.toml + run: ruff check . + + # Use ruff to check code formatting + ruff-format: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + - name: ruff --> Check code formatting + run: ruff format --check . 
+ + # Use mypy for static type checking + mypy-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install mypy + # Start by installing type stubs + - name: mypy --> Install stubs + run: echo -e "y" | mypy --install-types **/*.py || exit 0 + - name: mypy --> Static type checking + # Configured in pyprojet.toml + run: mypy **/*.py + + # Use pipreqs to check for missing dependencies + pipreqs-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install pipreqs + run: pip install pipreqs + + - name: Install requirements + run: pip install -r requirements.txt + + - name: Run pipreqs + run: pipreqs --savepath pipreqs.txt + + - name: Compare requirements + run: | + # Extract and sort package names + awk '{print $1}' $1 | sort -u > "$1".compare + awk -F'==' '{print $1}' $2 | sort -u > "$2".compare + + # Compare package lists + if cmp -s "$1".compare "$2".compare + then + echo "Requirements are the same" + exit 0 + else + echo "Requirements are different" + exit 1 + fi + + # Use Prettier to check various file formats + prettier: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup node + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install Prettier + run: npm install -g prettier + + - name: Run Prettier --check + run: prettier --check . + + # Use editorconfig to check all remaining file formats + editorconfig: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Setup node + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install editorconfig-checker + run: npm install -g editorconfig-checker + + - name: editorconfig --> Lint files + run: editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html') diff --git a/.gitignore b/.gitignore index eb7ce2ba..91b6d2ab 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,5 @@ _build .benchmarks .coverage __pycache__ +.pytest_cache +.vscode diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..1c09ed2d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +# .pre-commit-config.yaml +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.6 + hooks: + - id: ruff + - id: ruff-format + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v1.7.1" + hooks: + - id: mypy + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v4.0.0-alpha.8" + hooks: + - id: prettier + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: "2.7.2" + hooks: + - id: editorconfig-checker diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..17ba1fbc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +title = "taca" + + +[tool.ruff.lint] +select =[ + # Ruff default rules + # ------------------------------ + "E4", # pycodestyle Imports + "E7", # pycodestyle Statements + "E9", # pycodestyle Runtime + "F", # Pyflakes + + # Additional Comment + # ------------------------------------------------------ + "I", # isort Best-practice sorting of imports + "UP", # pyupgrade Make sure syntax is up-to-date +] 
+ignore = [ + "E402", # Module level import not at top of file + "E722", # Do not use bare 'except' + "E741", # Ambiguous variable name +] + + +[tool.mypy] +ignore_missing_imports = true +follow_imports = 'skip' diff --git a/requirements-dev.txt b/requirements-dev.txt index af58407f..9118bd64 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,8 @@ mock sphinx sphinx-rtd-theme pytest +ipython +ipdb +ruff +mypy +pipreqs From 8f1f5d3ee27cacf53d82788420ec947cbd378f2a Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 14:23:07 +0100 Subject: [PATCH 02/44] update outdated readme --- README.md | 103 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 50ce07c3..776051d6 100644 --- a/README.md +++ b/README.md @@ -4,34 +4,105 @@

-## Tool for the Automation of Cleanup and Analyses
+# Tool for the Automation of Cleanup and Analyses
[![PyPI version](https://badge.fury.io/py/taca.svg)](http://badge.fury.io/py/taca)
[![Build Status](https://travis-ci.org/SciLifeLab/TACA.svg?branch=master)](https://travis-ci.org/SciLifeLab/TACA)
[![Documentation Status](https://readthedocs.org/projects/taca/badge/?version=latest)](https://readthedocs.org/projects/taca/?badge=latest)
[![codecov](https://codecov.io/gh/scilifelab/taca/branch/master/graph/badge.svg)](https://codecov.io/gh/scilifelab/taca)
-This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://portal.scilifelab.se/genomics/) in Stockholm, Sweden.
+This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://ngisweden.scilifelab.se/) in Stockholm, Sweden.
-### Install for development
-You can install your own fork of taca in for instance a local conda environment for development. Provided you have conda installed:
+## Installation
+
+Inside the repo, run `pip install .`
+
+## Development
+
+Run `pip install -r requirements-dev.txt` to install packages used for development and `pip install -e .` to make the installation editable.
+
+### Automated linting
+
+This repo is configured for automated linting. Linter parameters are defined in `pyproject.toml`.
+
+As of now, we use:
+
+- [ruff](https://docs.astral.sh/ruff/) to perform automated formatting and a variety of lint checks.
+  - Run with `ruff check .` and `ruff format .`
+- [mypy](https://mypy.readthedocs.io/en/stable/) for static type checking and to prevent contradictory type annotations.
+  - Run with `mypy **/*.py`
+- [pipreqs](https://github.com/bndr/pipreqs) to check that the requirement files are up-to-date with the code.
+
+  - This is run with a custom Bash script in GitHub Actions which will only compare the list of package names.
+
+    ```
+    # Extract and sort package names
+    awk '{print $1}' $1 | sort -u > "$1".compare
+    awk -F'==' '{print $1}' $2 | sort -u > "$2".compare
+
+    # Compare package lists
+    if cmp -s "$1".compare "$2".compare
+    then
+      echo "Requirements are the same"
+      exit 0
+    else
+      echo "Requirements are different"
+      exit 1
+    fi
+    ```
+
+- [prettier](https://prettier.io/) to format common languages.
+  - Run with `prettier .`
+- [editorconfig-checker](https://github.com/editorconfig-checker/editorconfig-checker) to enforce `.editorconfig` rules for all files not covered by the tools above.
+  - Run with
+    ```
+    editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html')
+    ```
+
+#### [GitHub Actions](https://docs.github.com/en/actions)
+
+Configured in `.github/workflows/lint-code.yml`. Will test all commits in pushes or pull requests, but not change code or prevent merges.
+
+#### [Pre-commit](https://pre-commit.com/)
+
+Will prevent local commits that fail linting checks. Configured in `.pre-commit-config.yaml`.
+
+To set up pre-commit checking:
+
+1. Run `pip install pre-commit`
+2. Navigate to the repo root
+3. 
Run `pre-commit install`
+
+This can be disabled with `pre-commit uninstall`.
+
+#### VS Code automation
+
+To enable automated linting in VS Code, go to the user `settings.json` and include the following lines:
```
-# clone the repo
-git clone https://github.com//TACA.git
+"[python]": {
+  "editor.defaultFormatter": "charliermarsh.ruff",
+}
+```
-# create an environment
-conda create -n taca_dev python=2.7
-conda activate taca_dev
+This will run the `ruff`-mediated linting with the same parameters as the `GitHub Actions` and `pre-commit` every time VS Code is used to format the code in the repository.
-# install TACA and dependencies for developoment
-cd TACA
-python setup.py develop
-pip install -r ./requirements-dev.txt
+To run formatting on save, include the lines:
-# Check that tests pass:
-cd tests && nosetests -v -s
```
+"[python]": {
+  "editor.formatOnSave": true,
+}
+```
+
+### Git blame suppression
+
+When a non-invasive tool is used to tidy up a lot of code, it is useful to suppress the Git blame for that particular commit, so the original author can still be traced.
+
+To do this, add the hash of the commit containing the changes to `.git-blame-ignore-revs`, headed by an explanatory comment.
+
+
+### Deliver command
There is also a [plugin for the deliver command](https://github.com/SciLifeLab/taca-ngi-pipeline). To install this in the same development environment:
@@ -43,7 +114,7 @@ python setup.py develop
pip install -r ./requirements-dev.txt
# add required config files and env for taca delivery plugin
-echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
+echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml
export CHARON_BASE_URL="http://tracking.database.org"
export CHARON_API_TOKEN="charonapitokengoeshere"
From b9ee704ad4da26790e539b8fe1d39aa71f831ef1 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Wed, 17 Jan 2024 14:25:39 +0100
Subject: [PATCH 03/44] ruff check safe fixes
---
doc/conf.py | 26 ++--
setup.py | 11 +-
taca/analysis/analysis.py | 84 +++++++------
taca/analysis/analysis_nanopore.py | 10 +-
taca/backup/backup.py | 72 +++++------
taca/backup/cli.py | 2 +
taca/cleanup/cleanup.py | 72 +++++------
taca/cleanup/cli.py | 2 +
taca/cli.py | 6 +-
taca/illumina/MiSeq_Runs.py | 24 ++--
taca/illumina/Runs.py | 160 ++++++++++++-------------
taca/illumina/Standard_Runs.py | 64 +++++-----
taca/nanopore/ONT_run_classes.py | 32 ++---
taca/nanopore/instrument_transfer.py | 16 ++-
taca/server_status/cli.py | 7 +-
taca/server_status/cronjobs.py | 12 +-
taca/server_status/server_status.py | 10 +-
taca/testing/cli.py | 4 +-
taca/testing/create_uppmax_like_env.py | 121 ++++++++++---------
taca/utils/bioinfo_tab.py | 45 +++----
taca/utils/cli.py | 2 +
taca/utils/config.py | 11 +-
taca/utils/filesystem.py | 2 +-
taca/utils/misc.py | 22 ++--
taca/utils/statusdb.py | 22 ++--
taca/utils/transfer.py | 28 ++---
tests/test_analysis.py | 6 +-
tests/test_analysis_nanopore.py | 7 +-
tests/test_backup.py | 6 +-
tests/test_cleanup.py | 2 +-
tests/test_illumina.py | 24 ++--
tests/test_instrument_transfer.py | 22 ++--
tests/test_nanopore.py | 6 +-
tests/test_server_status.py | 13 +-
tests/test_utils.py | 20 ++--
35 files changed, 489 insertions(+), 484 deletions(-)
diff --git a/doc/conf.py b/doc/conf.py
index cb58a377..5c1d130e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
#
# TACA documentation build configuration file, created by
# sphinx-quickstart on Wed Sep 17 12:39:41 2014.
@@ -12,7 +11,6 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os # If extensions (or modules to document with autodoc) are in another directory, @@ -49,8 +47,8 @@ master_doc = 'index' # General information about the project. -project = u'TACA' -copyright = u'2014, Guillermo Carrasco' +project = 'TACA' +copyright = '2014, Guillermo Carrasco' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -207,8 +205,8 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'TACA.tex', u'TACA Documentation', - u'Guillermo Carrasco', 'manual'), + ('index', 'TACA.tex', 'TACA Documentation', + 'Guillermo Carrasco', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -237,8 +235,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'taca', u'TACA Documentation', - [u'Guillermo Carrasco'], 1) + ('index', 'taca', 'TACA Documentation', + ['Guillermo Carrasco'], 1) ] # If true, show URL addresses after external links. @@ -251,8 +249,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'TACA', u'TACA Documentation', - u'Guillermo Carrasco', 'TACA', 'One line description of project.', + ('index', 'TACA', 'TACA Documentation', + 'Guillermo Carrasco', 'TACA', 'One line description of project.', 'Miscellaneous'), ] @@ -272,10 +270,10 @@ # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. -epub_title = u'TACA' -epub_author = u'Guillermo Carrasco' -epub_publisher = u'Guillermo Carrasco' -epub_copyright = u'2014, Guillermo Carrasco' +epub_title = 'TACA' +epub_author = 'Guillermo Carrasco' +epub_publisher = 'Guillermo Carrasco' +epub_copyright = '2014, Guillermo Carrasco' # The basename for the epub file. It defaults to the project name. 
#epub_basename = u'TACA' diff --git a/setup.py b/setup.py index cc05b49c..d8962c06 100644 --- a/setup.py +++ b/setup.py @@ -1,21 +1,20 @@ -from setuptools import setup, find_packages import glob -import os -import sys +from io import open + +from setuptools import find_packages, setup from taca import __version__ -from io import open try: with open("requirements.txt", "r") as f: install_requires = [x.strip() for x in f.readlines()] -except IOError: +except OSError: install_requires = [] try: with open("dependency_links.txt", "r") as f: dependency_links = [x.strip() for x in f.readlines()] -except IOError: +except OSError: dependency_links = [] diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py index c817b064..2ef4aafd 100755 --- a/taca/analysis/analysis.py +++ b/taca/analysis/analysis.py @@ -2,22 +2,20 @@ import glob import logging import os -import sys import subprocess +import sys +from io import open +from shutil import copyfile, copytree + +from flowcell_parser.classes import RunParametersParser -from shutil import copyfile -from shutil import copytree -from taca.illumina.Standard_Runs import Standard_Run from taca.illumina.MiSeq_Runs import MiSeq_Run from taca.illumina.NextSeq_Runs import NextSeq_Run from taca.illumina.NovaSeq_Runs import NovaSeq_Run from taca.illumina.NovaSeqXPlus_Runs import NovaSeqXPlus_Run +from taca.utils import statusdb from taca.utils.config import CONFIG from taca.utils.transfer import RsyncAgent -from taca.utils import statusdb - -from flowcell_parser.classes import RunParametersParser -from io import open logger = logging.getLogger(__name__) @@ -37,15 +35,15 @@ def get_runObj(run, software): elif os.path.exists(os.path.join(run, 'RunParameters.xml')): run_parameters_file = 'RunParameters.xml' else: - logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run)) + logger.error(f'Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}') return run_parameters_path = os.path.join(run, run_parameters_file) try: run_parameters = RunParametersParser(run_parameters_path) except OSError: - logger.warn('Problems parsing the runParameters.xml file at {}. ' - 'This is quite unexpected. please archive the run {} manually'.format(run_parameters_path, run)) + logger.warn(f'Problems parsing the runParameters.xml file at {run_parameters_path}. ' + f'This is quite unexpected. please archive the run {run} manually') else: # Do a case by case test because there are so many version of RunParameters that there is no real other way runtype = run_parameters.data['RunParameters'].get('InstrumentType', @@ -110,8 +108,8 @@ def _upload_to_statusdb(run): try: PFclusters = parser.obj['Undetermined'][lane]['unknown'] except KeyError: - logger.error('While taking extra care of lane {} of NoIndex type ' \ - 'I found out that not all values were available'.format(lane)) + logger.error(f'While taking extra care of lane {lane} of NoIndex type ' \ + 'I found out that not all values were available') continue # In Lanes_stats fix the lane yield parser.obj['illumina']['Demultiplex_Stats']['Lanes_stats'][int(lane) - 1]['PF Clusters'] = str(PFclusters) @@ -122,9 +120,9 @@ def _upload_to_statusdb(run): updated += 1 sample['PF Clusters'] = str(PFclusters) if updated != 1: - logger.error('While taking extra care of lane {} of NoIndex type ' + logger.error(f'While taking extra care of lane {lane} of NoIndex type ' 'I updated more than once the barcode_lane. 
' - 'This is too much to continue so I will fail.'.format(lane)) + 'This is too much to continue so I will fail.') os.sys.exit() # If I am here it means I changed the HTML representation to something # else to accomodate the wired things we do @@ -144,7 +142,7 @@ def transfer_run(run_dir): mail_recipients = CONFIG.get('mail', {}).get('recipients') if runObj is None: mail_recipients = CONFIG.get('mail', {}).get('recipients') - logger.error('Trying to force a transfer of run {} but the sequencer was not recognized.'.format(run_dir)) + logger.error(f'Trying to force a transfer of run {run_dir} but the sequencer was not recognized.') else: runObj.transfer_run(os.path.join('nosync', CONFIG['analysis']['status_dir'], 'transfer.tsv'), mail_recipients) @@ -170,7 +168,7 @@ def transfer_runfolder(run_dir, pid, exclude_lane): try: with open(new_sample_sheet, 'w') as nss: nss.write(extract_project_samplesheet(original_sample_sheet, pid_list)) - except IOError as e: + except OSError as e: logger.error('An error occured while parsing the samplesheet. ' 'Please check the sample sheet and try again.') raise e @@ -185,14 +183,14 @@ def transfer_runfolder(run_dir, pid, exclude_lane): dir_for_excluding_lane = [] lane_to_exclude = exclude_lane.split(',') for lane in lane_to_exclude: - if os.path.isdir('{}/{}/Thumbnail_Images/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Thumbnail_Images/L00{}'.format(lane)]) - if os.path.isdir('{}/{}/Images/Focus/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Images/Focus/L00{}'.format(lane)]) - if os.path.isdir('{}/{}/Data/Intensities/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/L00{}'.format(lane)]) - if os.path.isdir('{}/{}/Data/Intensities/BaseCalls/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/BaseCalls/L00{}'.format(lane)]) + if os.path.isdir(f'{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}'): + dir_for_excluding_lane.extend(['--exclude', f'Thumbnail_Images/L00{lane}']) + if os.path.isdir(f'{run_dir_path}/{dir_name}/Images/Focus/L00{lane}'): + dir_for_excluding_lane.extend(['--exclude', f'Images/Focus/L00{lane}']) + if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}'): + dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/L00{lane}']) + if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}'): + dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/BaseCalls/L00{lane}']) try: exclude_options_for_tar = ['--exclude', 'Demultiplexing*', @@ -244,7 +242,7 @@ def transfer_runfolder(run_dir, pid, exclude_lane): os.remove(new_sample_sheet) os.remove(archive) os.remove(md5file) - except IOError as e: + except OSError as e: logger.error('Was not able to delete all temporary files') raise e return @@ -271,32 +269,32 @@ def _process(run): :param taca.illumina.Run run: Run to be processed and transferred """ - logger.info('Checking run {}'.format(run.id)) + logger.info(f'Checking run {run.id}') transfer_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv') if run.is_transferred(transfer_file): # Transfer is ongoing or finished. Do nothing. 
Sometimes caused by runs that are copied back from NAS after a reboot - logger.info('Run {} already transferred to analysis server, skipping it'.format(run.id)) + logger.info(f'Run {run.id} already transferred to analysis server, skipping it') return if run.get_run_status() == 'SEQUENCING': - logger.info('Run {} is not finished yet'.format(run.id)) + logger.info(f'Run {run.id} is not finished yet') if 'statusdb' in CONFIG: _upload_to_statusdb(run) elif run.get_run_status() == 'TO_START': if run.get_run_type() == 'NON-NGI-RUN': # For now MiSeq specific case. Process only NGI-run, skip all the others (PhD student runs) - logger.warn('Run {} marked as {}, ' + logger.warn(f'Run {run.id} marked as {run.get_run_type()}, ' 'TACA will skip this and move the run to ' - 'no-sync directory'.format(run.id, run.get_run_type())) + 'no-sync directory') if 'storage' in CONFIG: run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type]) return - logger.info(('Starting BCL to FASTQ conversion and demultiplexing for run {}'.format(run.id))) + logger.info(f'Starting BCL to FASTQ conversion and demultiplexing for run {run.id}') if 'statusdb' in CONFIG: _upload_to_statusdb(run) run.demultiplex_run() elif run.get_run_status() == 'IN_PROGRESS': - logger.info(('BCL conversion and demultiplexing process in ' - 'progress for run {}, skipping it'.format(run.id))) + logger.info('BCL conversion and demultiplexing process in ' + f'progress for run {run.id}, skipping it') # Upload to statusDB if applies if 'statusdb' in CONFIG: _upload_to_statusdb(run) @@ -307,7 +305,7 @@ def _process(run): # a cycle take the last if out of the elif if run.get_run_status() == 'COMPLETED': run.check_run_status() - logger.info(('Preprocessing of run {} is finished, transferring it'.format(run.id))) + logger.info(f'Preprocessing of run {run.id} is finished, transferring it') # Upload to statusDB if applies if 'statusdb' in CONFIG: _upload_to_statusdb(run) @@ -317,10 +315,10 @@ def _process(run): demux_summary_message.append("Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(demux_id, demux_log['errors'], demux_log['warnings'])) demux_summary_message.append("\n".join(demux_log['error_and_warning_messages'][:5])) if len(demux_log['error_and_warning_messages'])>5: - demux_summary_message.append("...... Only the first 5 errors or warnings are displayed for Demultiplexing_{}.".format(demux_id)) + demux_summary_message.append(f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}.") # Notify with a mail run completion and stats uploaded if demux_summary_message: - sbt = ("{} Demultiplexing Completed with ERRORs or WARNINGS!".format(run.id)) + sbt = (f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!") msg = """The run {run} has been demultiplexed with errors or warnings! {errors_warnings} @@ -331,7 +329,7 @@ def _process(run): """.format(errors_warnings='\n'.join(demux_summary_message), run=run.id) else: - sbt = ("{} Demultiplexing Completed!".format(run.id)) + sbt = (f"{run.id} Demultiplexing Completed!") msg = """The run {run} has been demultiplexed without any error or warning. The Run will be transferred to the analysis cluster for further analysis. 
@@ -345,7 +343,7 @@ def _process(run): if 'mfs_path' in CONFIG['analysis']: try: mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id) - logger.info('Copying demultiplex stats, InterOp metadata and XML files for run {} to {}'.format(run.id, mfs_dest)) + logger.info(f'Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}') if not os.path.exists(mfs_dest): os.mkdir(mfs_dest) demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports', @@ -364,7 +362,7 @@ def _process(run): if os.path.exists(interop_src): copytree(interop_src, os.path.join(mfs_dest, 'InterOp'), dirs_exist_ok=True) except: - logger.warn('Could not copy demultiplex stats, InterOp metadata or XML files for run {}'.format(run.id)) + logger.warn(f'Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}') # Transfer to analysis server if flag is True if run.transfer_to_analysis_server: @@ -383,7 +381,7 @@ def _process(run): # Determine the run type runObj = get_runObj(run, software) if not runObj: - raise RuntimeError("Unrecognized instrument type or incorrect run folder {}".format(run)) + raise RuntimeError(f"Unrecognized instrument type or incorrect run folder {run}") else: _process(runObj) else: @@ -394,12 +392,12 @@ def _process(run): for _run in runs: runObj = get_runObj(_run, software) if not runObj: - logger.warning('Unrecognized instrument type or incorrect run folder {}'.format(run)) + logger.warning(f'Unrecognized instrument type or incorrect run folder {run}') else: try: _process(runObj) except: # This function might throw and exception, # it is better to continue processing other runs - logger.warning('There was an error processing the run {}'.format(run)) + logger.warning(f'There was an error processing the run {run}') pass diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py index 74e4c3ef..9141551a 100644 --- a/taca/analysis/analysis_nanopore.py +++ b/taca/analysis/analysis_nanopore.py @@ -1,17 +1,17 @@ """Nanopore analysis methods for TACA.""" -import os import logging +import os import re import traceback -from taca.utils.config import CONFIG -from taca.utils.misc import send_mail from taca.nanopore.ONT_run_classes import ( + ONT_RUN_PATTERN, + ONT_qc_run, ONT_run, ONT_user_run, - ONT_qc_run, - ONT_RUN_PATTERN, ) +from taca.utils.config import CONFIG +from taca.utils.misc import send_mail logger = logging.getLogger(__name__) diff --git a/taca/backup/backup.py b/taca/backup/backup.py index 037b1ea6..88a4188f 100644 --- a/taca/backup/backup.py +++ b/taca/backup/backup.py @@ -1,31 +1,31 @@ """Backup methods and utilities.""" +import csv import logging import os import re import shutil import subprocess as sp import time -import csv - from datetime import datetime -from taca.utils.config import CONFIG -from taca.utils import statusdb, filesystem, misc from io import open +from taca.utils import filesystem, misc, statusdb +from taca.utils.config import CONFIG + logger = logging.getLogger(__name__) -class run_vars(object): +class run_vars: """A simple variable storage class.""" def __init__(self, run, archive_path): self.abs_path = os.path.abspath(run) self.path, self.name = os.path.split(self.abs_path) self.name = self.name.split('.', 1)[0] self.zip = os.path.join(archive_path, f'{self.name}.tar.gz') - self.key = '{}.key'.format(self.name) - self.key_encrypted = '{}.key.gpg'.format(self.name) + self.key = f'{self.name}.key' + self.key_encrypted = f'{self.name}.key.gpg' 
self.zip_encrypted = os.path.join(archive_path, f'{self.name}.tar.gz.gpg') -class backup_utils(object): +class backup_utils: """A class object with main utility methods related to backing up.""" def __init__(self, run=None): @@ -49,7 +49,7 @@ def fetch_config_info(self): self.copy_complete_indicator = CONFIG.get('storage', {}).get('copy_complete_indicator', 'CopyComplete.txt') self.archive_log_location = CONFIG['backup']['archive_log'] except KeyError as e: - logger.error('Config file is missing the key {}, make sure it have all required information'.format(str(e))) + logger.error(f'Config file is missing the key {str(e)}, make sure it have all required information') raise SystemExit def collect_runs(self, ext=None, filter_by_ext=False): @@ -60,14 +60,14 @@ def collect_runs(self, ext=None, filter_by_ext=False): archive_path = self.archive_dirs[run_type] run = run_vars(self.run, archive_path) if not (re.match(filesystem.RUN_RE, run.name) or re.match(filesystem.RUN_RE_ONT, run.name)): - logger.error('Given run {} did not match a FC pattern'.format(self.run)) + logger.error(f'Given run {self.run} did not match a FC pattern') raise SystemExit if self._is_ready_to_archive(run, ext): self.runs.append(run) else: for adir in self.archive_dirs.values(): if not os.path.isdir(adir): - logger.warn('Path {} does not exist or it is not a directory'.format(adir)) + logger.warn(f'Path {adir} does not exist or it is not a directory') continue for item in os.listdir(adir): if filter_by_ext and not item.endswith(ext): @@ -103,11 +103,11 @@ def avail_disk_space(self, path, run): df_out, df_err = df_proc.communicate() available_size = int(df_out.strip().decode("utf-8").split('\n')[-1].strip().split()[3])/1024/1024 except Exception as e: - logger.error('Evaluation of disk space failed with error {}'.format(e)) + logger.error(f'Evaluation of disk space failed with error {e}') raise SystemExit if available_size < required_size: - e_msg = 'Required space for encryption is {}GB, but only {}GB available'.format(required_size, available_size) - subjt = 'Low space for encryption - {}'.format(self.host_name) + e_msg = f'Required space for encryption is {required_size}GB, but only {available_size}GB available' + subjt = f'Low space for encryption - {self.host_name}' logger.error(e_msg) misc.send_mail(subjt, e_msg, self.mail_recipients) raise SystemExit @@ -146,7 +146,7 @@ def _get_run_type(self, run): else: run_type = '' except: - logger.warn('Could not fetch run type for run {}'.format(run)) + logger.warn(f'Could not fetch run type for run {run}') return run_type def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_failed=False, tmp_files=[]): @@ -194,7 +194,7 @@ def _check_status(self, cmd, status, err_msg, mail_failed, files_to_remove=[]): if status != 0: self._clean_tmp_files(files_to_remove) if mail_failed: - subjt = 'Command call failed - {}'.format(self.host_name) + subjt = f'Command call failed - {self.host_name}' e_msg = 'Called cmd: {}\n\nError msg: {}'.format(' '.join(cmd), err_msg) misc.send_mail(subjt, e_msg, self.mail_recipients) logger.error('Command "{}" failed with the error "{}"'.format(' '.join(cmd),err_msg)) @@ -215,7 +215,7 @@ def _log_pdc_statusdb(self, run): run_date = run_vals[0][2:] else: run_date = run_vals[0] - run_fc = '{}_{}'.format(run_date, run_vals[-1]) + run_fc = f'{run_date}_{run_vals[-1]}' couch_connection = statusdb.StatusdbSession(self.couch_info).connection db = couch_connection[self.couch_info['db']] fc_names = {e.key:e.id for e in db.view('names/name', 
reduce=False)} @@ -223,9 +223,9 @@ def _log_pdc_statusdb(self, run): doc = db.get(d_id) doc['pdc_archived'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') db.save(doc) - logger.info('Logged "pdc_archived" timestamp for fc {} in statusdb doc "{}"'.format(run, d_id)) + logger.info(f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"') except: - logger.warn('Not able to log "pdc_archived" timestamp for run {}'.format(run)) + logger.warn(f'Not able to log "pdc_archived" timestamp for run {run}') def _is_ready_to_archive(self, run, ext): """Check if the run to be encrypted has finished sequencing and has been copied completely to nas""" @@ -258,7 +258,7 @@ def _move_run_to_archived(self, run): run_type = self._get_run_type(run.name) archived_path = self.archived_dirs[run_type] if os.path.isdir(archived_path): - logger.info('Moving run {} to the archived folder'.format(run.name)) + logger.info(f'Moving run {run.name} to the archived folder') shutil.move(run.name, archived_path) else: logger.warning("Cannot move run to archived, destination does not exist") @@ -306,8 +306,8 @@ def encrypt_runs(cls, run, force): continue # Remove encrypted file if already exists if os.path.exists(run.zip_encrypted): - logger.warn((f'Removing already existing encrypted file for run {run.name}, this is a precaution ' - 'to make sure the file was encrypted with correct key file')) + logger.warn(f'Removing already existing encrypted file for run {run.name}, this is a precaution ' + 'to make sure the file was encrypted with correct key file') bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted]) # Generate random key to use as pasphrase if not bk._call_commands(cmd1='gpg --gen-random 1 256', out_file=run.key, tmp_files=tmp_files): @@ -356,41 +356,41 @@ def pdc_put(cls, run): """Archive the collected runs to PDC.""" bk = cls(run) bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True) - logger.info('In total, found {} run(s) to send PDC'.format(len(bk.runs))) + logger.info(f'In total, found {len(bk.runs)} run(s) to send PDC') for run in bk.runs: - run.flag = '{}.archiving'.format(run.name) + run.flag = f'{run.name}.archiving' run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) if run.path not in bk.archive_dirs.values(): - logger.error(('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate ' - 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name))) + logger.error('Given run is not in one of the archive directories {}. 
Kindly move the run {} to appropriate ' + 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name)) continue if not os.path.exists(run.dst_key_encrypted): - logger.error('Encrypted key file {} is not found for file {}, skipping it'.format(run.dst_key_encrypted, run.zip_encrypted)) + logger.error(f'Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it') continue with filesystem.chdir(run.path): #skip run if being encrypted - if os.path.exists('{}.encrypting'.format(run.name)): - logger.warn('Run {} is currently being encrypted, so skipping now'.format(run.name)) + if os.path.exists(f'{run.name}.encrypting'): + logger.warn(f'Run {run.name} is currently being encrypted, so skipping now') continue # skip run if already ongoing if os.path.exists(run.flag): - logger.warn('Run {} is already being archived, so skipping now'.format(run.name)) + logger.warn(f'Run {run.name} is already being archived, so skipping now') continue if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False): - logger.warn('Seems like files related to run {} already exist in PDC, check and cleanup'.format(run.name)) + logger.warn(f'Seems like files related to run {run.name} already exist in PDC, check and cleanup') continue flag = open(run.flag, 'w').close() - logger.info('Sending file {} to PDC'.format(run.zip_encrypted)) - if bk._call_commands(cmd1='dsmc archive {}'.format(run.zip_encrypted), tmp_files=[run.flag]): + logger.info(f'Sending file {run.zip_encrypted} to PDC') + if bk._call_commands(cmd1=f'dsmc archive {run.zip_encrypted}', tmp_files=[run.flag]): time.sleep(15) # give some time just in case 'dsmc' needs to settle - if bk._call_commands(cmd1='dsmc archive {}'.format(run.dst_key_encrypted), tmp_files=[run.flag]): + if bk._call_commands(cmd1=f'dsmc archive {run.dst_key_encrypted}', tmp_files=[run.flag]): time.sleep(5) # give some time just in case 'dsmc' needs to settle if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted): - logger.info('Successfully sent file {} to PDC, moving file locally from {} to archived folder'.format(run.zip_encrypted, run.path)) + logger.info(f'Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to archived folder') bk.log_archived_run(run.zip_encrypted) if bk.couch_info: bk._log_pdc_statusdb(run.name) bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag]) bk._move_run_to_archived(run) continue - logger.warn('Sending file {} to PDC failed'.format(run.zip_encrypted)) + logger.warn(f'Sending file {run.zip_encrypted} to PDC failed') diff --git a/taca/backup/cli.py b/taca/backup/cli.py index 07cce810..89128002 100644 --- a/taca/backup/cli.py +++ b/taca/backup/cli.py @@ -1,7 +1,9 @@ """CLI for the backup subcommand.""" import click + from taca.backup.backup import backup_utils as bkut + @click.group() @click.pass_context def backup(ctx): diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py index 07600870..80ef1905 100644 --- a/taca/cleanup/cleanup.py +++ b/taca/cleanup/cleanup.py @@ -2,16 +2,16 @@ import logging import os import re - from collections import defaultdict from datetime import datetime from glob import glob - -from taca.utils.config import CONFIG, load_config -from taca.utils import filesystem, misc, statusdb from io import open + from six.moves import map +from taca.utils import filesystem, misc, statusdb +from taca.utils.config import CONFIG, load_config + 
logger = logging.getLogger(__name__) def cleanup_miarka(days_fastq, days_analysis, @@ -59,9 +59,9 @@ def cleanup_miarka(days_fastq, days_analysis, if date: date = datetime.strptime(date, '%Y-%m-%d') except KeyError as e: - logger.error('Config file is missing the key {}, make sure it has all required information'.format(str(e))) + logger.error(f'Config file is missing the key {str(e)}, make sure it has all required information') raise SystemExit - except ValueError as e: + except ValueError: logger.error('Date given with "--date" option is not in required format, see help for more info') raise SystemExit @@ -101,7 +101,7 @@ def cleanup_miarka(days_fastq, days_analysis, fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): - logger.warn('Flowcell {} does not contain a "{}" directory'.format(fc, flowcell_project_source)) + logger.warn(f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory') continue projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \ @@ -113,7 +113,7 @@ def cleanup_miarka(days_fastq, days_analysis, continue fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files)) if fc_undet_files: - logger.info('All projects was cleaned for FC {}, found {} undeterminded files'.format(fc, len(fc_undet_files))) + logger.info(f'All projects was cleaned for FC {fc}, found {len(fc_undet_files)} undeterminded files') all_undet_files.extend(list(map(os.path.abspath, fc_undet_files))) if all_undet_files: undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files))) @@ -142,7 +142,7 @@ def cleanup_miarka(days_fastq, days_analysis, fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): - logger.warn('Flowcell {} do not contain a "{}" direcotry'.format(fc, flowcell_project_source)) + logger.warn(f'Flowcell {fc} do not contain a "{flowcell_project_source}" direcotry') continue projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \ @@ -201,7 +201,7 @@ def cleanup_miarka(days_fastq, days_analysis, _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])])) raise SystemExit - logger.info('Initial list is built with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list))) + logger.info(f'Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}') if misc.query_yes_no('Interactively filter projects for cleanup ?', default='yes'): filtered_project, proj_count = ([], 0) #go through complied project list and remove files @@ -209,15 +209,15 @@ def cleanup_miarka(days_fastq, days_analysis, proj_count += 1 if not misc.query_yes_no('{}Delete files for this project ({}/{})'.format(get_proj_meta_info(info, days_fastq), proj_count, len(project_clean_list)), default='no'): - logger.info('Will not remove files for project {}'.format(proj)) + logger.info(f'Will not remove files for project {proj}') filtered_project.append(proj) # remove projects that were decided not to delete map(project_clean_list.pop, filtered_project) - logger.info('Removed {}/{} projects from initial list'.format(len(filtered_project), proj_count)) + logger.info(f'Removed {len(filtered_project)}/{proj_count} projects from initial list') if not project_clean_list: logger.info('There are no 
projects to clean after filtering') return - logger.info('Final list is created with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list))) + logger.info(f'Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}') if not misc.query_yes_no('Proceed with cleanup ?', default='no'): logger.info('Aborting cleanup') return @@ -226,21 +226,21 @@ def cleanup_miarka(days_fastq, days_analysis, for proj, info in project_clean_list.items(): fastq_info = info.get('fastq_to_remove') if fastq_info and isinstance(fastq_info, dict): - logger.info('Cleaning fastq files for project {}'.format(proj)) + logger.info(f'Cleaning fastq files for project {proj}') fastq_fc = fastq_info.get('flowcells', {}) removed_fc = [] for fc, fc_info in fastq_fc.items(): proj_fc_root = fc_info['proj_root'] - logger.info('Removing fastq files from {}'.format(proj_fc_root)) + logger.info(f'Removing fastq files from {proj_fc_root}') if not dry_run: if _remove_files(fc_info['fq_files']): - logger.info('Removed fastq files from FC {} for project {}, marking it as cleaned'.format(fc, proj)) + logger.info(f'Removed fastq files from FC {fc} for project {proj}, marking it as cleaned') _touch_cleaned(proj_fc_root) removed_fc.append(fc) if len(fastq_fc) == len(removed_fc): try: proj_data_root = fastq_info['proj_data']['proj_data_root'] - logger.info('All flowcells cleaned for this project, marking it as cleaned in {}'.format(proj_data_root)) + logger.info(f'All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}') _touch_cleaned(proj_data_root) except: pass @@ -248,18 +248,18 @@ def cleanup_miarka(days_fastq, days_analysis, analysis_info = info.get('analysis_to_remove') if analysis_info and isinstance(analysis_info, dict): proj_analysis_root = analysis_info['proj_analysis_root'] - logger.info('cleaning analysis data for project {}'.format(proj)) + logger.info(f'cleaning analysis data for project {proj}') removed_qc = [] for qc, files in analysis_info['analysis_files'].items(): - logger.info('Removing files of "{}" from {}'.format(qc, proj_analysis_root)) + logger.info(f'Removing files of "{qc}" from {proj_analysis_root}') if not dry_run: if _remove_files(files): removed_qc.append(qc) else: - logger.warn('Could not remove some files in qc directory "{}"'.format(qc)) + logger.warn(f'Could not remove some files in qc directory "{qc}"') map(analysis_info['analysis_files'].pop, removed_qc) if len(analysis_info['analysis_files']) == 0: - logger.info('Removed analysis data for project {}, marking it cleaned'.format(proj)) + logger.info(f'Removed analysis data for project {proj}, marking it cleaned') _touch_cleaned(proj_analysis_root) @@ -273,7 +273,7 @@ def get_closed_proj_info(prj, pdoc, tdate=None): if not tdate: tdate = datetime.today() if not pdoc: - logger.warn('Seems like project {} does not have a proper statusdb document, skipping it'.format(prj)) + logger.warn(f'Seems like project {prj} does not have a proper statusdb document, skipping it') elif 'close_date' in pdoc: closed_date = pdoc['close_date'] try: @@ -348,9 +348,9 @@ def get_proj_meta_info(info, days_fastq): template = '\n' def _get_template_string(h, v): try: - v = '{}: {}\n'.format(h, v) + v = f'{h}: {v}\n' except: - v = '{}: Problem getting this'.format(h) + v = f'{h}: Problem getting this' return v template += _get_template_string('Project overview', info.get('name')) template += _get_template_string('Project ID', info.get('pid')) @@ -367,13 +367,13 @@ def 
_get_template_string(h, v): elif isinstance(analysis_info, dict): f_stat = [] for qc_type, files in analysis_info['analysis_files'].items(): - f_stat.append('{} ({} files)'.format(qc_type, len(files))) + f_stat.append(f'{qc_type} ({len(files)} files)') template += 'Project analyzed: {}\n'.format(', '.join(f_stat)) # set fastq info based upon what we have fq_info = info.get('fastq_to_remove') if isinstance(fq_info, str) and fq_info == "young": - template += 'Project been closed less than {} days, so will not remove any fastq files\n'.format(days_fastq) + template += f'Project been closed less than {days_fastq} days, so will not remove any fastq files\n' elif isinstance(fq_info, dict): proj_fq_info = fq_info.get('proj_data') if not proj_fq_info: @@ -385,7 +385,7 @@ def _get_template_string(h, v): fc_fq_info = fq_info.get('flowcells', {}) fc_num = len(fc_fq_info.keys()) fc_files = sum(map(len, [fc_info.get('fq_files', [])for fc_info in fc_fq_info.values()])) - template += 'Flowcells: There are {} FC with total {} fastq files\n'.format(fc_num, fc_files) + template += f'Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n' template += 'Estimated data size: {}\n'.format(_def_get_size_unit(info.get('fastq_size',0) + info.get('fastq_size', 0))) return template @@ -394,8 +394,8 @@ def get_files_size_text(plist): """Get project list dict and give back string with overll sizes.""" fsize = _def_get_size_unit(sum([i.get('fastq_size',0) for i in plist.values()])) asize = _def_get_size_unit(sum([i.get('analysis_size',0) for i in plist.values()])) - return '({f}{s}{a}) '.format(f = '~{} fastq data'.format(fsize) if fsize else '', - a = '~{} analysis data'.format(asize) if asize else '', + return '({f}{s}{a}) '.format(f = f'~{fsize} fastq data' if fsize else '', + a = f'~{asize} analysis data' if asize else '', s = ' and ' if fsize and asize else '') def _def_get_size_unit(s): @@ -405,15 +405,15 @@ def _def_get_size_unit(s): gb = mb * 1000 tb = gb * 1000 if s > tb: - s = '~{}tb'.format(int(s/tb)) + s = f'~{int(s/tb)}tb' elif s > gb: - s = '~{}gb'.format(int(s/gb)) + s = f'~{int(s/gb)}gb' elif s > mb: - s = '~{}mb'.format(int(s/mb)) + s = f'~{int(s/mb)}mb' elif s > kb: - s = '~{}kb'.format(int(s/kb)) + s = f'~{int(s/kb)}kb' elif s > 0: - s = '~{}b'.format(int(s/b)) + s = f'~{int(s/b)}b' return str(s) def _remove_files(files): @@ -423,7 +423,7 @@ def _remove_files(files): try: os.remove(fl) except Exception as e: - logger.warn('Could not remove file {} due to "{}"'.format(fl, e.message)) + logger.warn(f'Could not remove file {fl} due to "{e.message}"') status = False return status @@ -432,4 +432,4 @@ def _touch_cleaned(path): try: open(os.path.join(path, 'cleaned'), 'w').close() except Exception as e: - logger.warn('Could not create "cleaned" file in path {} due to "{}"'.format(path, e.message)) + logger.warn(f'Could not create "cleaned" file in path {path} due to "{e.message}"') diff --git a/taca/cleanup/cli.py b/taca/cleanup/cli.py index 65abaf50..6410567b 100644 --- a/taca/cleanup/cli.py +++ b/taca/cleanup/cli.py @@ -1,8 +1,10 @@ """CLI for the storage subcommand.""" import click + from taca.cleanup import cleanup as cln from taca.utils import misc + @click.group() @click.pass_context @click.option('--status_db_config', diff --git a/taca/cli.py b/taca/cli.py index 1c78dabc..ad8d59b6 100644 --- a/taca/cli.py +++ b/taca/cli.py @@ -1,10 +1,10 @@ -# -*- coding: utf-8 -*- import logging import os -from pkg_resources import iter_entry_points + import click -import taca.log +from pkg_resources 
import iter_entry_points +import taca.log from taca import __version__ from taca.utils import config as conf diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py index b90d734b..f6585801 100644 --- a/taca/illumina/MiSeq_Runs.py +++ b/taca/illumina/MiSeq_Runs.py @@ -1,8 +1,10 @@ +import logging import os import re import shutil -import logging + from flowcell_parser.classes import SampleSheetParser + from taca.illumina.Standard_Runs import Standard_Run logger = logging.getLogger(__name__) @@ -63,10 +65,10 @@ def _copy_samplesheet(self): # Copy the original samplesheet locally. # Copy again if already done as there might have been changes to the samplesheet try: - shutil.copy(ssname, os.path.join(self.run_dir, '{}.csv'.format(self.flowcell_id))) + shutil.copy(ssname, os.path.join(self.run_dir, f'{self.flowcell_id}.csv')) ssname = os.path.join(self.run_dir, os.path.split(ssname)[1]) except: - raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir)) + raise RuntimeError(f"unable to copy file {ssname} to destination {self.run_dir}") # This sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready # to be used it needs some editing. @@ -86,7 +88,7 @@ def _copy_samplesheet(self): except Exception as e: logger.error(e) return False - logger.info(('Created SampleSheet_copy.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest))) + logger.info(f'Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} ') # SampleSheet.csv generated # When demultiplexing SampleSheet.csv is the one I need to use self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet_copy.csv')) @@ -99,7 +101,7 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None Will also replace 10X or Smart-seq indicies (e.g. 
SI-GA-A3 into TGTGCGGG) Note that the index 2 of 10X or Smart-seq dual indexes will be converted to RC """ - output = u'' + output = '' compl = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} # Expand the ssparser if there are lanes with 10X or Smart-seq samples index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) @@ -143,12 +145,12 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None if not fields_to_remove: fields_to_remove = [] # Header - output += '[Header]{}'.format(os.linesep) + output += f'[Header]{os.linesep}' for field in sorted(ssparser.header): - output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip()) + output += f'{field.rstrip()},{ssparser.header[field].rstrip()}' output += os.linesep # Data - output += '[Data]{}'.format(os.linesep) + output += f'[Data]{os.linesep}' datafields = [] for field in ssparser.datafields: if field not in fields_to_remove: @@ -163,13 +165,13 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None try: if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR: # Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix - value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm])) + value = re.sub('__qPCR_$', '', f'Sample_{line[ssparser.dfield_snm]}') else: # Substitute SampleID with SampleName, add Sample_ as prefix - value ='Sample_{}'.format(line[ssparser.dfield_snm]) + value =f'Sample_{line[ssparser.dfield_snm]}' except: # Otherwise add Sample_ as prefix - value = 'Sample_{}'.format(line[ssparser.dfield_sid]) + value = f'Sample_{line[ssparser.dfield_sid]}' elif rename_qPCR_suffix and field in fields_qPCR: value = re.sub('__qPCR_$', '', line[field]) line_ar.append(value) diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index e479e800..fa4618c8 100644 --- a/taca/illumina/Runs.py +++ b/taca/illumina/Runs.py @@ -1,27 +1,27 @@ -import os -import re import csv -import logging -import subprocess -import shutil import glob import json - +import logging +import os +import re +import shutil +import subprocess from datetime import datetime +from flowcell_parser.classes import LaneBarcodeParser, RunParser, SampleSheetParser + from taca.utils import misc from taca.utils.misc import send_mail -from flowcell_parser.classes import RunParser, LaneBarcodeParser, SampleSheetParser logger = logging.getLogger(__name__) -class Run(object): +class Run: """ Defines an Illumina run """ def __init__(self, run_dir, software, configuration): if not os.path.exists(run_dir): - raise RuntimeError("Could not locate run directory {}".format(run_dir)) + raise RuntimeError(f"Could not locate run directory {run_dir}") if 'analysis_server' not in configuration or \ 'bcl2fastq' not in configuration or \ @@ -35,7 +35,7 @@ def __init__(self, run_dir, software, configuration): logger.warning("Creating link from runParameters.xml to RunParameters.xml") os.symlink('RunParameters.xml', os.path.join(run_dir, 'runParameters.xml')) elif not os.path.exists(os.path.join(run_dir, 'runParameters.xml')): - raise RuntimeError("Could not locate runParameters.xml in run directory {}".format(run_dir)) + raise RuntimeError(f"Could not locate runParameters.xml in run directory {run_dir}") self.run_dir = os.path.abspath(run_dir) self.software = software @@ -67,38 +67,38 @@ def check_run_status(self): if self.software == 'bcl2fastq': legacy_path = '' elif self.software == 'bclconvert': - legacy_path = "Reports/{}".format(self.legacy_dir) + legacy_path = 
f"Reports/{self.legacy_dir}" # Check the status of running demux # Collect all samplesheets generated before samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) # A single digit, this hypothesis should hold for a while all_demux_done = True for samplesheet in samplesheets: demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] - demux_folder = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id)) + demux_folder = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}") # Check if this job is done if os.path.exists(os.path.join(self.run_dir, demux_folder, legacy_path, 'Stats', 'DemultiplexingStats.xml')): all_demux_done = all_demux_done and True if self.software == 'bcl2fastq': - demux_log = os.path.join(self.run_dir, "demux_{}_bcl2fastq.err".format(demux_id)) + demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl2fastq.err") elif self.software == 'bclconvert': - demux_log = os.path.join(self.run_dir, "demux_{}_bcl-convert.err".format(demux_id)) + demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl-convert.err") else: raise RuntimeError("Unrecognized software!") if os.path.isfile(demux_log): errors, warnings, error_and_warning_messages = self._check_demux_log(demux_id, demux_log) else: - raise RuntimeError("No demux log file found for sub-demultiplexing {}!".format(demux_id)) + raise RuntimeError(f"No demux log file found for sub-demultiplexing {demux_id}!") self.demux_summary[demux_id] = {'errors' : errors, 'warnings' : warnings, 'error_and_warning_messages' : error_and_warning_messages } if errors or warnings: - logger.info("Sub-Demultiplexing in {} completed with {} errors and {} warnings!".format(demux_folder, errors, warnings)) + logger.info(f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!") else: - logger.info("Sub-Demultiplexing in {} completed without any error or warning.".format(demux_folder)) + logger.info(f"Sub-Demultiplexing in {demux_folder} completed without any error or warning.") else: all_demux_done = all_demux_done and False - logger.info("Sub-Demultiplexing in {} not completed yet.".format(demux_folder)) + logger.info(f"Sub-Demultiplexing in {demux_folder} not completed yet.") # All demux jobs finished and all stats aggregated under Demultiplexing # Aggreate all the results in the Demultiplexing folder @@ -119,7 +119,7 @@ def _check_demux_log(self, demux_id, demux_log): This function checks the log files of bcl2fastq/bclconvert Errors or warnings will be captured and email notifications will be sent """ - with open(demux_log, 'r') as demux_log_file: + with open(demux_log) as demux_log_file: demux_log_content = demux_log_file.readlines() if self.software == 'bcl2fastq': pattern = r'Processing completed with (\d+) errors and (\d+) warnings' @@ -134,7 +134,7 @@ def _check_demux_log(self, demux_id, demux_log): error_and_warning_messages.append(line) return errors, warnings, error_and_warning_messages else: - raise RuntimeError("Bad format with log file demux_{}_bcl2fastq.err".format(demux_id)) + raise RuntimeError(f"Bad format with log file demux_{demux_id}_bcl2fastq.err") elif self.software == 'bclconvert': errors = 0 warnings = 0 @@ -182,7 +182,7 @@ def _get_samplesheet(self): samplesheets_dir = os.path.join(self.CONFIG['samplesheets_dir'], current_year) - ssname = os.path.join(samplesheets_dir, '{}.csv'.format(self.flowcell_id)) + ssname = os.path.join(samplesheets_dir, f'{self.flowcell_id}.csv') if os.path.exists(ssname): return ssname else: @@ -262,27 +262,27 
@@ def transfer_run(self, t_file, mail_recipients=None): command_line.append("--exclude=Demultiplexing_*/*_*") command_line.append("--include=*/") for to_include in self.CONFIG['analysis_server']['sync']['include']: - command_line.append("--include={}".format(to_include)) + command_line.append(f"--include={to_include}") command_line.extend(["--exclude=*", "--prune-empty-dirs"]) r_user = self.CONFIG['analysis_server']['user'] r_host = self.CONFIG['analysis_server']['host'] r_dir = self.CONFIG['analysis_server']['sync']['data_archive'] - remote = "{}@{}:{}".format(r_user, r_host, r_dir) + remote = f"{r_user}@{r_host}:{r_dir}" command_line.extend([self.run_dir, remote]) # Create temp file indicating that the run is being transferred try: open(os.path.join(self.run_dir, 'transferring'), 'w').close() - except IOError as e: - logger.error("Cannot create a file in {}. " - "Check the run name, and the permissions.".format(self.id)) + except OSError as e: + logger.error(f"Cannot create a file in {self.id}. " + "Check the run name, and the permissions.") raise e - started = ("Started transfer of run {} on {}".format(self.id, datetime.now())) + started = (f"Started transfer of run {self.id} on {datetime.now()}") logger.info(started) # In this particular case we want to capture the exception because we want # to delete the transfer file try: - msge_text="I am about to transfer with this command \n{}".format(command_line) + msge_text=f"I am about to transfer with this command \n{command_line}" logger.info(msge_text) misc.call_external_command(command_line, with_log_files=True, prefix="", log_dir=self.run_dir) @@ -290,16 +290,16 @@ def transfer_run(self, t_file, mail_recipients=None): os.remove(os.path.join(self.run_dir, 'transferring')) #Send an email notifying that the transfer failed runname = self.id - sbt = ("Rsync of run {} failed".format(runname)) - msg= """ Rsync of data for run {run} has failed! - Raised the following exception: {e} - """.format(run=runname, e=exception) + sbt = (f"Rsync of run {runname} failed") + msg= f""" Rsync of data for run {runname} has failed! + Raised the following exception: {exception} + """ if mail_recipients: send_mail(sbt, msg, mail_recipients) raise exception - logger.info('Adding run {} to {}'.format(self.id, t_file)) + logger.info(f'Adding run {self.id} to {t_file}') with open(t_file, 'a') as tranfer_file: tsv_writer = csv.writer(tranfer_file, delimiter='\t') tsv_writer.writerow([self.id, str(datetime.now())]) @@ -307,7 +307,7 @@ def transfer_run(self, t_file, mail_recipients=None): #Send an email notifying that the transfer was successful runname = self.id - sbt = ("Rsync of data for run {} to the analysis cluster has finished".format(runname)) + sbt = (f"Rsync of data for run {runname} to the analysis cluster has finished") msg= """ Rsync of data for run {run} to the analysis cluster has finished! 
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run} @@ -320,7 +320,7 @@ def archive_run(self, destination): :param str destination: the destination folder """ if destination and os.path.isdir(destination): - logger.info('archiving run {}'.format(self.id)) + logger.info(f'archiving run {self.id}') shutil.move(self.run_dir, os.path.join(destination, self.id)) else: logger.warning("Cannot move run to archive, destination does not exist") @@ -331,7 +331,7 @@ def send_mail(self, sbt, msg, rcp): already_seen = False runname = self.id if not sbt: - sbt = "{}".format(runname) + sbt = f"{runname}" misc.send_mail(sbt, msg, rcp) def is_transferred(self, transfer_file): @@ -340,7 +340,7 @@ def is_transferred(self, transfer_file): :param str transfer_file: Path to file with information about transferred runs """ try: - with open(transfer_file, 'r') as file_handle: + with open(transfer_file) as file_handle: transfer_file_contents = csv.reader(file_handle, delimiter='\t') for row in transfer_file_contents: # Rows have two columns: run and transfer date @@ -349,7 +349,7 @@ def is_transferred(self, transfer_file): if os.path.exists(os.path.join(self.run_dir, 'transferring')): return True return False - except IOError: + except OSError: return False def is_unpooled_lane(self, lane): @@ -388,7 +388,7 @@ def _rename_undet(self, lane, samples_per_lane): :param samples_per_lane: lane:sample dict :type status: dict """ - for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, "Undetermined*L0?{}*".format(lane))): + for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*")): old_name=os.path.basename(file) old_name_comps=old_name.split("_") old_name_comps[1]=old_name_comps[0]# replace S0 with Undetermined @@ -398,7 +398,7 @@ def _rename_undet(self, lane, samples_per_lane): old_name_comps[index]=comp.replace('L00','L01')#adds a 1 as the second lane number in order to differentiate undetermined from normal in piper new_name="_".join(old_name_comps) - logger.info("Renaming {} to {}".format(file, os.path.join(os.path.dirname(file), new_name))) + logger.info(f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}") os.rename(file, os.path.join(os.path.dirname(file), new_name)) def _classify_lanes(self, samplesheets): @@ -455,16 +455,16 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le sample_dest = os.path.join(project_dest, sample) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))): + for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")): old_name = os.path.basename(file) old_name_comps = old_name.split("_") - new_name_comps = [sample.replace('Sample_',''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:] + new_name_comps = [sample.replace('Sample_',''), f'S{str(sample_counter)}'] + old_name_comps[2:] new_name = "_".join(new_name_comps) os.symlink(file, os.path.join(sample_dest, new_name)) logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_',''), old_name, new_name)) sample_counter += 1 # Make a softlink of lane.html - html_report_lane_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html") + html_report_lane_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", 
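# --- Editor's sketch, not part of the patch: transfer_run() appends a
# tab-separated "<run id>\t<timestamp>" row to the transfer log and
# is_transferred() reads it back, also treating a leftover 'transferring'
# marker as handled. Standalone helpers under those assumptions; names are
# illustrative. ---
import csv
import os
from datetime import datetime

def mark_transferred(run_id, transfer_file):
    with open(transfer_file, "a") as handle:
        csv.writer(handle, delimiter="\t").writerow([run_id, str(datetime.now())])

def already_transferred(run_id, transfer_file, run_dir=None):
    try:
        with open(transfer_file) as handle:
            for row in csv.reader(handle, delimiter="\t"):
                if row and row[0] == run_id:
                    return True
        if run_dir and os.path.exists(os.path.join(run_dir, "transferring")):
            return True
        return False
    except OSError:
        return False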
legacy_path, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html") html_report_lane_dest = os.path.join(demux_folder, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html") if not os.path.isdir(os.path.dirname(html_report_lane_dest)): os.makedirs(os.path.dirname(html_report_lane_dest)) @@ -472,7 +472,7 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le # Modify the laneBarcode.html file html_report_laneBarcode = os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), + f"Demultiplexing_{demux_id}", legacy_path, "Reports", "html", @@ -511,7 +511,7 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le if not os.path.exists(os.path.join(demux_folder, "Stats")): os.makedirs(os.path.join(demux_folder, "Stats")) # Modify the Stats.json file - stat_json_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json") + stat_json_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json") stat_json_new = os.path.join(demux_folder, "Stats", "Stats.json") with open(stat_json_source) as json_data: data = json.load(json_data) @@ -528,15 +528,15 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le json.dump(data, stat_json_new_file) def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_lanes): - elements = [element for element in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) ] + elements = [element for element in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) ] for element in elements: if "Stats" not in element and "Reports" not in element: #skip this folder and treat it differently to take into account the NoIndex case - source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), element) + source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", element) dest = os.path.join(self.run_dir, self.demux_dir, element) os.symlink(source, dest) os.makedirs(os.path.join(self.run_dir, self.demux_dir, "Stats")) # Fetch the lanes that have NoIndex - statsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "*" )) + statsFiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*" )) for source in statsFiles: source_name = os.path.split(source)[1] if source_name not in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]: @@ -545,15 +545,15 @@ def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_ dest = os.path.join(self.run_dir, self.demux_dir, "Stats", source_name) os.symlink(source, dest) for file in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]: - source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", file) + source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file) dest = os.path.join(self.run_dir, self.demux_dir, "Stats", file) os.symlink(source, dest) - source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports") + source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports") dest = os.path.join(self.run_dir, self.demux_dir, "Reports") if os.path.exists(dest): try: os.rmdir(dest) - except NotADirectoryError as e: + except NotADirectoryError: 
os.unlink(dest) os.symlink(source, dest) @@ -567,7 +567,7 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple lanesInReport = [Lane['Lane'] for Lane in html_report_lane_parser.sample_data] next_html_report_lane_parser = LaneBarcodeParser(next_html_report_lane) for entry in next_html_report_lane_parser.sample_data: - if not entry['Lane'] in lanesInReport: + if entry['Lane'] not in lanesInReport: # If this is a new lane not included before html_report_lane_parser.sample_data.append(entry) # Now all lanes have been inserted @@ -589,9 +589,9 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple entry['% Perfectbarcode'] = None entry['% One mismatchbarcode'] = None # Update the values in Flowcell Summary - html_report_lane_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw) - html_report_lane_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF) - html_report_lane_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases) + html_report_lane_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}' + html_report_lane_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}' + html_report_lane_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}' # Add lanes not present in this demux # Create the new lane.html new_html_report_lane_dir = _create_folder_structure(demux_folder, ['Reports', 'html', self.flowcell_id, 'all', 'all', 'all']) @@ -664,9 +664,9 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple key=lambda k: (k['Lane'].lower(), k['Sample'])) # Update the values in Flowcell Summary - html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw) - html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF) - html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases) + html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}' + html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}' + html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}' # Generate the new report for laneBarcode.html new_html_report_laneBarcode = os.path.join(new_html_report_lane_dir, 'laneBarcode.html') _generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser) @@ -774,11 +774,11 @@ def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheet # Create DemuxSummary.txt files for complex lanes if len(DemuxSummaryFiles_complex_lanes) > 0: for key, value in DemuxSummaryFiles_complex_lanes.items(): - with open(os.path.join(DemultiplexingStats_xml_dir, 'DemuxSummaryF1L{}.txt'.format(key)), 'w') as DemuxSummaryFile: + with open(os.path.join(DemultiplexingStats_xml_dir, f'DemuxSummaryF1L{key}.txt'), 'w') as DemuxSummaryFile: DemuxSummaryFile.write('### Most Popular Unknown Index Sequences\n') DemuxSummaryFile.write('### Columns: Index_Sequence Hit_Count\n') for idx, count in value['Barcodes'].items(): - DemuxSummaryFile.write('{}\t{}\n'.format(idx, count)) + DemuxSummaryFile.write(f'{idx}\t{count}\n') open(os.path.join(DemultiplexingStats_xml_dir, 'DemultiplexingStats.xml'), 'a').close() @@ -790,7 +790,7 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p ssparser = SampleSheetParser(samplesheet) demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] html_report_lane = os.path.join(self.run_dir, - 
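# --- Editor's sketch, not part of the patch: the complex-lane branch above
# writes one DemuxSummaryF1L<lane>.txt per lane from a dict of unknown-index
# hit counts. The same layout as a standalone helper; the argument names are
# illustrative. ---
import os

def write_demux_summary(stats_dir, lane, barcode_counts):
    path = os.path.join(stats_dir, f"DemuxSummaryF1L{lane}.txt")
    with open(path, "w") as handle:
        handle.write("### Most Popular Unknown Index Sequences\n")
        handle.write("### Columns: Index_Sequence Hit_Count\n")
        for idx, count in barcode_counts.items():
            handle.write(f"{idx}\t{count}\n")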
"Demultiplexing_{}".format(demux_id), + f"Demultiplexing_{demux_id}", legacy_path, "Reports", "html", @@ -803,10 +803,10 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p if os.path.exists(html_report_lane): html_reports_lane.append(html_report_lane) else: - raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_lane)) + raise RuntimeError(f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing") html_report_laneBarcode = os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), + f"Demultiplexing_{demux_id}", legacy_path, "Reports", "html", @@ -819,13 +819,13 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p if os.path.exists(html_report_laneBarcode): html_reports_laneBarcode.append(html_report_laneBarcode) else: - raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_laneBarcode)) + raise RuntimeError(f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing") - stat_json = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json") + stat_json = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json") if os.path.exists(stat_json): stats_json.append(stat_json) else: - raise RuntimeError("Not able to find Stats.json report {}: possible cause is problem in demultiplexing".format(stat_json)) + raise RuntimeError(f"Not able to find Stats.json report {stat_json}: possible cause is problem in demultiplexing") # Aggregate fastq lanes_samples = dict() @@ -848,21 +848,21 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p sample_dest = os.path.join(project_dest, sample) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))): + for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")): old_name = os.path.basename(file) old_name_comps = old_name.split("_") - new_name_comps = [sample.replace('Sample_', ''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:] + new_name_comps = [sample.replace('Sample_', ''), f'S{str(sample_counter)}'] + old_name_comps[2:] new_name = "_".join(new_name_comps) os.symlink(file, os.path.join(sample_dest, new_name)) logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_', ''), old_name, new_name)) sample_counter += 1 # Ordinary cases else: - projects = [project for project in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) if os.path.isdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project))] + projects = [project for project in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) if os.path.isdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project))] for project in projects: if project in "Reports" or project in "Stats": continue - project_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project) + project_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project) project_dest = os.path.join(demux_folder, project) if not os.path.exists(project_dest): # There might be project seqeunced with multiple index lengths @@ -881,7 +881,7 @@ def 
_process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p # Copy fastq files for undetermined and the undetermined stats for simple lanes only lanes_in_sub_samplesheet = [] header = ['[Header]','[Data]','FCID','Lane', 'Sample_ID', 'Sample_Name', 'Sample_Ref', 'index', 'index2', 'Description', 'Control', 'Recipe', 'Operator', 'Sample_Project'] - with open(samplesheet, mode='r') as sub_samplesheet_file: + with open(samplesheet) as sub_samplesheet_file: sub_samplesheet_reader = csv.reader(sub_samplesheet_file) for row in sub_samplesheet_reader: if row[0] not in header: @@ -890,15 +890,15 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p for lane in lanes_in_sub_samplesheet: if lane in simple_lanes.keys(): undetermined_fastq_files = glob.glob(os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), - "Undetermined_S0_L00{}*.fastq*".format(lane))) # Contains only simple lanes undetermined + f"Demultiplexing_{demux_id}", + f"Undetermined_S0_L00{lane}*.fastq*")) # Contains only simple lanes undetermined for fastqfile in undetermined_fastq_files: os.symlink(fastqfile, os.path.join(demux_folder, os.path.split(fastqfile)[1])) DemuxSummaryFiles = glob.glob(os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), + f"Demultiplexing_{demux_id}", legacy_path, "Stats", - "*L{}*txt".format(lane))) + f"*L{lane}*txt")) if not os.path.exists(os.path.join(demux_folder, "Stats")): os.makedirs(os.path.join(demux_folder, "Stats")) for DemuxSummaryFile in DemuxSummaryFiles: @@ -913,7 +913,7 @@ def _aggregate_demux_results_simple_complex(self): if self.software == 'bcl2fastq': legacy_path = '' elif self.software == 'bclconvert': - legacy_path = "Reports/{}".format(self.legacy_dir) + legacy_path = f"Reports/{self.legacy_dir}" else: raise RuntimeError("Unrecognized software!") @@ -982,11 +982,11 @@ def _generate_lane_html(html_file, html_report_lane_parser): html.write('\n') fc_keys = sorted(list(html_report_lane_parser.flowcell_data.keys())) for key in fc_keys: - html.write('{}\n'.format(key)) + html.write(f'{key}\n') html.write('\n') html.write('\n') for key in fc_keys: - html.write('{}\n'.format(html_report_lane_parser.flowcell_data[key])) + html.write(f'{html_report_lane_parser.flowcell_data[key]}\n') html.write('\n') html.write('\n') # LANE SUMMARY TABLE @@ -995,13 +995,13 @@ def _generate_lane_html(html_file, html_report_lane_parser): html.write('\n') lane_keys = sorted(list(html_report_lane_parser.sample_data[0].keys())) for key in lane_keys: - html.write('{}\n'.format(key)) + html.write(f'{key}\n') html.write('\n') for sample in html_report_lane_parser.sample_data: html.write('\n') for key in lane_keys: - html.write('{}\n'.format(sample[key])) + html.write(f'{sample[key]}\n') html.write('\n') html.write('\n') # FOOTER diff --git a/taca/illumina/Standard_Runs.py b/taca/illumina/Standard_Runs.py index ca331115..f2699426 100755 --- a/taca/illumina/Standard_Runs.py +++ b/taca/illumina/Standard_Runs.py @@ -1,13 +1,14 @@ +import logging import os import re -import logging from datetime import datetime +from io import open + +from flowcell_parser.classes import SampleSheetParser -from taca.utils.filesystem import chdir from taca.illumina.Runs import Run from taca.utils import misc -from flowcell_parser.classes import SampleSheetParser -from io import open +from taca.utils.filesystem import chdir logger = logging.getLogger(__name__) @@ -64,9 +65,9 @@ def _copy_samplesheet(self): rename_qPCR_suffix = True, fields_qPCR=[ssparser.dfield_snm])) 
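# --- Editor's sketch, not part of the patch: _generate_lane_html() above
# renders the flowcell summary as one header row of sorted keys and one row of
# their values. A minimal standalone rendering of that idea; the table markup
# here is an assumption, only the key/value iteration mirrors the hunk. ---
def write_flowcell_summary(html, flowcell_data):
    fc_keys = sorted(flowcell_data.keys())
    html.write("<table>\n<tr>\n")
    for key in fc_keys:
        html.write(f"<th>{key}</th>\n")
    html.write("</tr>\n<tr>\n")
    for key in fc_keys:
        html.write(f"<td>{flowcell_data[key]}</td>\n")
    html.write("</tr>\n</table>\n")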
except Exception as e: - logger.error('Encountered the following exception {}'.format(e)) + logger.error(f'Encountered the following exception {e}') return False - logger.info(('Created SampleSheet.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest))) + logger.info(f'Created SampleSheet.csv for Flowcell {self.id} in {samplesheet_dest} ') # SampleSheet.csv generated # When demultiplexing SampleSheet.csv is the one I need to use @@ -258,8 +259,8 @@ def demultiplex_run(self): samples_to_include[lane].append(sample_name) else: samples_to_include.update({lane:[sample_name]}) - except (KeyError, IndexError) as err: - logger.info(('No corresponding mask in lane {}. Skip it.'.format(lane))) + except (KeyError, IndexError): + logger.info(f'No corresponding mask in lane {lane}. Skip it.') continue elif self.software == 'bclconvert': mask = unique_masks[i] @@ -299,7 +300,7 @@ def demultiplex_run(self): base_mask = [] # Make sub-samplesheet with chdir(self.run_dir): - samplesheet_dest='SampleSheet_{}.csv'.format(bcl_cmd_counter) + samplesheet_dest=f'SampleSheet_{bcl_cmd_counter}.csv' with open(samplesheet_dest, 'w') as fcd: fcd.write(self._generate_samplesheet_subset(self.runParserObj.samplesheet, samples_to_include, runSetup, self.software, sample_type, index1_size, index2_size, base_mask, self.CONFIG)) @@ -317,10 +318,9 @@ def demultiplex_run(self): bcl_cmd_counter) misc.call_external_command_detached(cmd, with_log_files = True, - prefix='demux_{}'.format(bcl_cmd_counter)) - logger.info(('BCL to FASTQ conversion and demultiplexing ' \ - 'started for run {} on {}'.format(os.path.basename(self.id), - datetime.now()))) + prefix=f'demux_{bcl_cmd_counter}') + logger.info('BCL to FASTQ conversion and demultiplexing ' \ + f'started for run {os.path.basename(self.id)} on {datetime.now()}') # Demutiplexing done for one mask type and scripts will continue # Working with the next type. 
Command counter should increase by 1 @@ -346,7 +346,7 @@ def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter): for lane in sorted(lanes): # Iterate thorugh each lane and add the correct --use-bases-mask for that lane base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # Get the base_mask - base_mask_expr = '{}:'.format(lane) + ','.join(base_mask) + base_mask_expr = f'{lane}:' + ','.join(base_mask) cl.extend(['--use-bases-mask', base_mask_expr]) # Case with bclconvert elif self.software == 'bclconvert': @@ -355,12 +355,12 @@ def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter): else: raise RuntimeError("Unrecognized software!") # Output dir - output_dir = os.path.join(self.run_dir, 'Demultiplexing_{}'.format(bcl_cmd_counter)) + output_dir = os.path.join(self.run_dir, f'Demultiplexing_{bcl_cmd_counter}') if not os.path.exists(output_dir): os.makedirs(output_dir) cl.extend(['--output-dir', output_dir]) # Samplesheet - cl.extend(['--sample-sheet', os.path.join(os.path.join(self.run_dir, 'SampleSheet_{}.csv'.format(bcl_cmd_counter)))]) + cl.extend(['--sample-sheet', os.path.join(os.path.join(self.run_dir, f'SampleSheet_{bcl_cmd_counter}.csv'))]) # Demux options cl_options = [] if 'options' in self.CONFIG.get(self.software): @@ -374,9 +374,9 @@ def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter): if isinstance(option, dict): opt, val = list(option.items())[0] if 'output-dir' not in opt: - cl.extend(['--{}'.format(opt), str(val).lower()]) + cl.extend([f'--{opt}', str(val).lower()]) else: - cl.append('--{}'.format(option)) + cl.append(f'--{option}') return cl def _generate_per_lane_base_mask(self, sample_type, mask_table): @@ -548,7 +548,7 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name Will also replace 10X or Smart-seq indicies (e.g. 
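# --- Editor's sketch, not part of the patch: the option handling in
# generate_bcl_command() above accepts either {key: value} dicts (rendered as
# "--key value") or bare strings (rendered as "--flag"). The same loop in
# isolation, with the 'output-dir' exclusion left out for brevity. ---
def extend_with_options(cl, options):
    for option in options:
        if isinstance(option, dict):
            opt, val = list(option.items())[0]
            cl.extend([f"--{opt}", str(val).lower()])
        else:
            cl.append(f"--{option}")
    return cl

# extend_with_options(["bcl-convert"], [{"no-lane-splitting": True}, "force"])
# -> ["bcl-convert", "--no-lane-splitting", "true", "--force"]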
SI-GA-A3 into TGTGCGGG) """ - output = u'' + output = '' # Expand the ssparser if there are lanes with 10X or Smart-seq samples index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq']) @@ -591,12 +591,12 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None if not fields_to_remove: fields_to_remove = [] # Header - output += '[Header]{}'.format(os.linesep) + output += f'[Header]{os.linesep}' for field in sorted(ssparser.header): - output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip()) + output += f'{field.rstrip()},{ssparser.header[field].rstrip()}' output += os.linesep # Data - output += '[Data]{}'.format(os.linesep) + output += f'[Data]{os.linesep}' datafields = [] for field in ssparser.datafields: if field not in fields_to_remove: @@ -611,13 +611,13 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None try: if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR: # Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix - value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm])) + value = re.sub('__qPCR_$', '', f'Sample_{line[ssparser.dfield_snm]}') else: # Substitute SampleID with SampleName, add Sample_ as prefix - value ='Sample_{}'.format(line[ssparser.dfield_snm]) + value =f'Sample_{line[ssparser.dfield_snm]}' except: # Otherwise add Sample_ as prefix - value = 'Sample_{}'.format(line[ssparser.dfield_sid]) + value = f'Sample_{line[ssparser.dfield_sid]}' elif rename_qPCR_suffix and field in fields_qPCR: value = re.sub('__qPCR_$', '', line[field]) line_ar.append(value) @@ -626,7 +626,7 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None return output def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, software, sample_type, index1_size, index2_size, base_mask, CONFIG): - output = u'' + output = '' # Prepare index cycles index_cycles = [0, 0] for read in runSetup: @@ -636,13 +636,13 @@ def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, s else: index_cycles[1] = int(read['NumCycles']) # Header - output += '[Header]{}'.format(os.linesep) + output += f'[Header]{os.linesep}' for field in sorted(ssparser.header): - output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip()) + output += f'{field.rstrip()},{ssparser.header[field].rstrip()}' output += os.linesep # Settings for BCL Convert if software == 'bclconvert': - output += '[Settings]{}'.format(os.linesep) + output += f'[Settings]{os.linesep}' output += 'OverrideCycles,{}{}'.format(';'.join(base_mask), os.linesep) if CONFIG.get('bclconvert'): @@ -651,15 +651,15 @@ def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, s if CONFIG['bclconvert']['settings'].get('common'): for setting in CONFIG['bclconvert']['settings']['common']: for k, v in setting.items(): - output += '{},{}{}'.format(k, v, os.linesep) + output += f'{k},{v}{os.linesep}' # Put special settings: if sample_type in CONFIG['bclconvert']['settings'].keys(): for setting in CONFIG['bclconvert']['settings'][sample_type]: for k, v in setting.items(): if (k == 'BarcodeMismatchesIndex1' and index1_size != 0) or (k == 'BarcodeMismatchesIndex2' and index2_size != 0) or 'BarcodeMismatchesIndex' not in k: - output += '{},{}{}'.format(k, v, os.linesep) + output += f'{k},{v}{os.linesep}' # Data - output += '[Data]{}'.format(os.linesep) + output += 
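# --- Editor's sketch, not part of the patch: the sample renaming above adds a
# "Sample_" prefix and strips a trailing "__qPCR_" marker. Shown in isolation;
# the example sample name is made up. ---
import re

def clean_sample_name(sample_name):
    return re.sub("__qPCR_$", "", f"Sample_{sample_name}")

# clean_sample_name("P123_1001__qPCR_") -> "Sample_P123_1001"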
f'[Data]{os.linesep}' datafields = [] for field in ssparser.datafields: datafields.append(field) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 5f839058..a6743f04 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -1,18 +1,18 @@ -import os -import logging import csv -import shutil import glob -import re import json -import pandas as pd -import subprocess +import logging import os +import re +import shutil +import subprocess +from datetime import datetime from typing import Union -from taca.utils.statusdb import NanoporeRunsConnection -from datetime import datetime +import pandas as pd + from taca.utils.config import CONFIG +from taca.utils.statusdb import NanoporeRunsConnection from taca.utils.transfer import RsyncAgent, RsyncError logger = logging.getLogger(__name__) @@ -22,7 +22,7 @@ ) -class ONT_run(object): +class ONT_run: """General Nanopore run. Expects instantiation from absolute path of run directory on preprocessing server. @@ -39,7 +39,7 @@ def __init__(self, run_abspath: str): ), f"Run {self.run_name} doesn't look like a run dir" # Parse MinKNOW sample and experiment name - with open(self.get_file("/run_path.txt"), "r") as stream: + with open(self.get_file("/run_path.txt")) as stream: self.experiment_name, self.sample_name, _ = stream.read().split("/") # Get info from run name @@ -122,7 +122,7 @@ def assert_contents(self): def is_transferred(self) -> bool: """Return True if run ID in transfer.tsv, else False.""" - with open(self.transfer_details["transfer_log"], "r") as f: + with open(self.transfer_details["transfer_log"]) as f: return self.run_name in f.read() # DB update @@ -230,7 +230,7 @@ def parse_minknow_json(self, db_update): logger.info(f"{self.run_name}:Parsing report JSON...") - dict_json_report = json.load(open(self.get_file("/report*.json"), "r")) + dict_json_report = json.load(open(self.get_file("/report*.json"))) # Initialize return dict parsed_data = {} @@ -352,10 +352,10 @@ def update_transfer_log(self): with open(self.transfer_details["transfer_log"], "a") as f: tsv_writer = csv.writer(f, delimiter="\t") tsv_writer.writerow([self.run_name, str(datetime.now())]) - except IOError: + except OSError: msg = f"{self.run_name}: Could not update the transfer logfile {self.transfer_details['transfer_log']}" logger.error(msg) - raise IOError(msg) + raise OSError(msg) # Archive run @@ -404,7 +404,7 @@ def get_anglerfish_exit_code(self) -> Union[int, None]: Return exit code or None. 
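# --- Editor's sketch, not part of the patch: ONT_run reads MinKNOW's
# run_path.txt, a single line of three "/"-separated fields of which only the
# first two (experiment and sample name) are used. A standalone version of that
# parse; the function name is illustrative. ---
def parse_run_path(run_path_file):
    with open(run_path_file) as stream:
        experiment_name, sample_name, _ = stream.read().strip().split("/")
    return experiment_name, sample_name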
""" if os.path.exists(self.anglerfish_done_abspath): - return int(open(self.anglerfish_done_abspath, "r").read()) + return int(open(self.anglerfish_done_abspath).read()) else: return None @@ -413,7 +413,7 @@ def get_anglerfish_pid(self) -> Union[str, None]: Return process ID or None.""" if os.path.exists(self.anglerfish_ongoing_abspath): - return str(open(self.anglerfish_ongoing_abspath, "r").read()) + return str(open(self.anglerfish_ongoing_abspath).read()) else: return None diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py index 75c2d56d..130a88c8 100644 --- a/taca/nanopore/instrument_transfer.py +++ b/taca/nanopore/instrument_transfer.py @@ -2,14 +2,14 @@ """ __version__ = "1.0.13" +import argparse import logging import os import re import shutil -import argparse import subprocess -from glob import glob from datetime import datetime as dt +from glob import glob def main(args): @@ -53,14 +53,14 @@ def main(args): if run_path.split(os.sep)[-2][0:3] == "QC_": # For QC runs, the sample name should start with "QC_" - logging.info(f"Run categorized as QC.") + logging.info("Run categorized as QC.") rsync_dest = args.dest_dir_qc else: rsync_dest = args.dest_dir - logging.info(f"Dumping run path...") + logging.info("Dumping run path...") dump_path(run_path) - logging.info(f"Dumping QC and MUX history...") + logging.info("Dumping QC and MUX history...") dump_pore_count_history(run_path, pore_counts) if not sequencing_finished(run_path): @@ -119,7 +119,7 @@ def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log: """Do a final sync of the run to storage, then archive it. Skip if rsync is already running on the run.""" - logging.info("Performing a final sync of {} to storage".format(run_dir)) + logging.info(f"Performing a final sync of {run_dir} to storage") command = [ "run-one", @@ -140,9 +140,7 @@ def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log: archive_finished_run(run_dir, archive_dir) else: logging.info( - "Previous rsync might be running still. Skipping {} for now.".format( - run_dir - ) + f"Previous rsync might be running still. Skipping {run_dir} for now." 
) return diff --git a/taca/server_status/cli.py b/taca/server_status/cli.py index 723410df..4b786fc1 100644 --- a/taca/server_status/cli.py +++ b/taca/server_status/cli.py @@ -1,9 +1,12 @@ -import click import logging +import click + +from taca.server_status import ( + cronjobs as cj, # to avoid similar names with command, otherwise exception +) from taca.server_status import server_status as status from taca.utils.config import CONFIG -from taca.server_status import cronjobs as cj # to avoid similar names with command, otherwise exception @click.group(name='server_status') diff --git a/taca/server_status/cronjobs.py b/taca/server_status/cronjobs.py index 9b808bd8..80fd59fc 100644 --- a/taca/server_status/cronjobs.py +++ b/taca/server_status/cronjobs.py @@ -1,20 +1,22 @@ +import datetime +import getpass import logging import platform -import getpass -import datetime from crontab import CronTab + from taca.utils import statusdb from taca.utils.config import CONFIG + def _parse_crontab(): result = {} user = getpass.getuser() - logging.info('Getting crontab for user {}'.format(user)) + logging.info(f'Getting crontab for user {user}') try: crontab = CronTab(user=user) except Exception as e: - logging.error('Cannot get a crontab for user: {}'.format(user)) + logging.error(f'Cannot get a crontab for user: {user}') logging.error(e.message) else: result[user] = [] @@ -71,7 +73,7 @@ def update_cronjob_db(): except Exception as e: logging.error(e.message) else: - logging.info('{} has been successfully updated'.format(server)) + logging.info(f'{server} has been successfully updated') else: logging.warning('Document has not been created/updated') diff --git a/taca/server_status/server_status.py b/taca/server_status/server_status.py index a03a107a..36b6f27a 100644 --- a/taca/server_status/server_status.py +++ b/taca/server_status/server_status.py @@ -1,6 +1,6 @@ -import subprocess -import logging import datetime +import logging +import subprocess from taca.utils import statusdb from taca.utils.config import CONFIG @@ -26,7 +26,7 @@ def get_nases_disk_space(): else: user = config['user'] # Connect via ssh to server and execute the command - command = ['ssh', '-t', '{}@{}'.format(user, server_url), command] + command = ['ssh', '-t', f'{user}@{server_url}', command] result[server_url] = _run_cmd(command) @@ -81,7 +81,7 @@ def _parse_output(output): # for nases 'mounted_on': 'NaN', 'filesystem': 'NaN' } - logging.error('Can not parse the output: {}'.format(output)) + logging.error(f'Can not parse the output: {output}') return result @@ -116,7 +116,7 @@ def update_status_db(data, server_type=None): logging.error(e.message) raise else: - logging.info('{}: Server status has been updated'.format(key)) + logging.info(f'{key}: Server status has been updated') def check_promethion_status(): config = CONFIG.get('promethion_status') diff --git a/taca/testing/cli.py b/taca/testing/cli.py index 63b89a35..4856b75d 100644 --- a/taca/testing/cli.py +++ b/taca/testing/cli.py @@ -1,11 +1,13 @@ """ CLI for the testing commands """ -from __future__ import print_function import os + import click + import taca.testing.create_uppmax_like_env as createupp + @click.group(name='uppmax_env') def uppmax_env(): """ Create a local set of folders that resembles the uppmax-ngi env. Creates config file for ngi_pipeline, taca, and taca ngi-pipeline. 
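# --- Editor's sketch, not part of the patch: get_nases_disk_space() above
# wraps a disk-usage command in an ssh call per server. A standalone version
# under the assumption that the remote command is a df invocation; only the
# ssh wrapper ["ssh", "-t", "user@host", ...] is taken from the hunk. ---
import subprocess

def disk_space_over_ssh(user, server_url, remote_command="df -h"):
    command = ["ssh", "-t", f"{user}@{server_url}", remote_command]
    return subprocess.run(command, capture_output=True, text=True).stdout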
Only a minimal taca config is needed (statusdb and log) diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py index e4852b42..9e936857 100644 --- a/taca/testing/create_uppmax_like_env.py +++ b/taca/testing/create_uppmax_like_env.py @@ -1,19 +1,18 @@ """ Load and parse configuration file.""" -from __future__ import print_function +import datetime import logging import os -import datetime import random import subprocess +from io import open from dateutil.relativedelta import relativedelta -from taca.utils.config import CONFIG + from taca.utils import config as conf from taca.utils import filesystem as fs from taca.utils import statusdb -from io import open - +from taca.utils.config import CONFIG logger = logging.getLogger(__name__) @@ -21,34 +20,34 @@ def create_version_report(path): # Creates the file version_report.txt for stuff run ngi_pipeline with open(os.path.join(path, 'version_report.txt'), 'w') as VERSION_REPORT: - VERSION_REPORT.write(u'******\n') - VERSION_REPORT.write(u'README\n') - VERSION_REPORT.write(u'******\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. The pipeline used was Piper, see below for more information.\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'The versions of programs and references used:\n') - VERSION_REPORT.write(u'piper: unknown\n') - VERSION_REPORT.write(u'bwa: 0.7.12\n') - VERSION_REPORT.write(u'samtools: 0.1.19\n') - VERSION_REPORT.write(u'qualimap: v2.2\n') - VERSION_REPORT.write(u'snpEff: 4.1\n') - VERSION_REPORT.write(u'snpEff reference: GRCh37.75\n') - VERSION_REPORT.write(u'gatk: 3.3-0-geee94ec\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'reference: human_g1k_v37.fasta\n') - VERSION_REPORT.write(u'db_snp: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'hapmap: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'omni: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'1000G_indels: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n') - VERSION_REPORT.write(u'indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'piper\n') - VERSION_REPORT.write(u'-----\n') - VERSION_REPORT.write(u'Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n') + VERSION_REPORT.write('******\n') + VERSION_REPORT.write('README\n') + VERSION_REPORT.write('******\n') + VERSION_REPORT.write('\n') + VERSION_REPORT.write('Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. 
The pipeline used was Piper, see below for more information.\n') + VERSION_REPORT.write('\n') + VERSION_REPORT.write('The versions of programs and references used:\n') + VERSION_REPORT.write('piper: unknown\n') + VERSION_REPORT.write('bwa: 0.7.12\n') + VERSION_REPORT.write('samtools: 0.1.19\n') + VERSION_REPORT.write('qualimap: v2.2\n') + VERSION_REPORT.write('snpEff: 4.1\n') + VERSION_REPORT.write('snpEff reference: GRCh37.75\n') + VERSION_REPORT.write('gatk: 3.3-0-geee94ec\n') + VERSION_REPORT.write('\n') + VERSION_REPORT.write('reference: human_g1k_v37.fasta\n') + VERSION_REPORT.write('db_snp: gatk-bundle/2.8\n') + VERSION_REPORT.write('hapmap: gatk-bundle/2.8\n') + VERSION_REPORT.write('omni: gatk-bundle/2.8\n') + VERSION_REPORT.write('1000G_indels: gatk-bundle/2.8\n') + VERSION_REPORT.write('Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n') + VERSION_REPORT.write('\n') + VERSION_REPORT.write('indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n') + VERSION_REPORT.write('indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n') + VERSION_REPORT.write('\n') + VERSION_REPORT.write('piper\n') + VERSION_REPORT.write('-----\n') + VERSION_REPORT.write('Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n') def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None ): # Create something like 160217_ST-E00201_0063_AHJHNYCCXX @@ -83,8 +82,8 @@ def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None # Create dir structure fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', project_name, sample_id)) # Now create the data - fastq_1_dest = '{}_S{}_L00{}_R1_001.fastq.gz'.format(sample_name, counter, lane) - fastq_2_dest = '{}_S{}_L00{}_R2_001.fastq.gz'.format(sample_name, counter, lane) + fastq_1_dest = f'{sample_name}_S{counter}_L00{lane}_R1_001.fastq.gz' + fastq_2_dest = f'{sample_name}_S{counter}_L00{lane}_R2_001.fastq.gz' counter += 1 if fastq_1 is None: fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name, @@ -98,17 +97,17 @@ def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None project_name, sample_id, fastq_2_dest)) with open(os.path.join(path_to_fc, 'SampleSheet.csv'), 'w') as Samplesheet_file: - Samplesheet_file.write(u'[Header]\n') - Samplesheet_file.write(u'Date,2016-03-29\n') - Samplesheet_file.write(u'Investigator Name,Christian Natanaelsson\n') - Samplesheet_file.write(u'[Data]\n') + Samplesheet_file.write('[Header]\n') + Samplesheet_file.write('Date,2016-03-29\n') + Samplesheet_file.write('Investigator Name,Christian Natanaelsson\n') + Samplesheet_file.write('[Data]\n') for key in header: - Samplesheet_file.write(u'{},'.format(key)) - Samplesheet_file.write(u'\n') + Samplesheet_file.write(f'{key},') + Samplesheet_file.write('\n') for line in samplesheet: for key in header: - Samplesheet_file.write(u'{},'.format(line[key])) - Samplesheet_file.write(u'\n') + Samplesheet_file.write(f'{line[key]},') + Samplesheet_file.write('\n') def create_uppmax_env(ngi_config): paths = {} @@ -122,7 +121,7 @@ def create_uppmax_env(ngi_config): top_dir = ngi_config['analysis']['top_dir'] paths['top_dir'] = top_dir except KeyError as e: - raise SystemExit('Config file is missing the key {}, make sure it have all required information'.format(str(e))) + raise SystemExit(f'Config 
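# --- Editor's sketch, not part of the patch: create_FC() above fabricates
# Illumina-style fastq names from sample name, sample counter and lane before
# touching the files. The naming rule in isolation; the example values are
# made up. ---
def fastq_pair_names(sample_name, counter, lane):
    read1 = f"{sample_name}_S{counter}_L00{lane}_R1_001.fastq.gz"
    read2 = f"{sample_name}_S{counter}_L00{lane}_R2_001.fastq.gz"
    return read1, read2

# fastq_pair_names("P123_1001", 1, 2)
# -> ("P123_1001_S1_L002_R1_001.fastq.gz", "P123_1001_S1_L002_R2_001.fastq.gz")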
file is missing the key {str(e)}, make sure it have all required information') if 'environment' not in ngi_config: sys.exit('ERROR: environment must be a field of NGI_CONFIG.') try: @@ -131,10 +130,10 @@ def create_uppmax_env(ngi_config): flowcell_inbox = flowcell_inboxes[0] # I assume there is only one paths['flowcell_inbox'] = flowcell_inbox except ValueError as e: - sys.exit('key error, flowcell_inbox not found in "{}": {}'.format(ngi_config, e)) + sys.exit(f'key error, flowcell_inbox not found in "{ngi_config}": {e}') # Now I need to create the folders for this if not os.path.exists(base_root): - sys.exit('base_root needs to exists: {}'.format(base_root)) + sys.exit(f'base_root needs to exists: {base_root}') fs.create_folder(flowcell_inbox) if sthlm_root is None: path_to_analysis = os.path.join(base_root, top_dir) @@ -190,11 +189,11 @@ def produce_analysis_piper(ngi_config, project_id): fs.create_folder(current_dir) if piper_dir == '05_processed_alignments': for sample_id in os.listdir(data_dir): - bam_file = '{}.clean.dedup.bam'.format(sample_id) + bam_file = f'{sample_id}.clean.dedup.bam' fs.touch(os.path.join(current_dir, bam_file)) if piper_dir == '07_variant_calls': for sample_id in os.listdir(data_dir): - vcf_file = '{}.clean.dedup.recal.bam.raw.indel.vcf.gz'.format(sample_id) + vcf_file = f'{sample_id}.clean.dedup.recal.bam.raw.indel.vcf.gz' fs.touch(os.path.join(current_dir, vcf_file)) current_dir = os.path.join(piper_ngi_dir, 'sbatch') fs.create_folder(current_dir) @@ -278,7 +277,7 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): 'application': application, 'no_samples': row['value']['no_samples']} else: - print('status {}'.format(project_status)) + print(f'status {project_status}') ## Now I can parse the x_flowcell db to check what I can and cannot use whole_genome_projects = int(2*projects/3) projects_to_reproduce = [] @@ -326,16 +325,16 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): 'noWGreseq_open') # Create ngi_pipeline enviroment - print('#NGI_CONFIG varaible is {}. This variable needs to be in the .bashrc file'.format(ngi_config_file)) - print('NGI_CONFIG={}'.format(ngi_config_file)) + print(f'#NGI_CONFIG varaible is {ngi_config_file}. This variable needs to be in the .bashrc file') + print(f'NGI_CONFIG={ngi_config_file}') try: ngi_config = conf.load_config(ngi_config_file) - except IOError as e: - print('ERROR: {}'.format(e.message)) + except OSError as e: + print(f'ERROR: {e.message}') # Create uppmax env paths = create_uppmax_env(ngi_config) - print('#Going to reproduce {} projects (if this number is different from the one you specified.... trust me... do not worry'.format(len(projects_to_reproduce))) + print(f'#Going to reproduce {len(projects_to_reproduce)} projects (if this number is different from the one you specified.... trust me... 
do not worry') # Scan over x_flowcell and reproduce FCs flowcellDB = couch_connection['x_flowcells'] reproduced_projects = {} @@ -363,25 +362,25 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): if project not in reproduced_projects: reproduced_projects[project] = [] reproduced_projects[project].append(flowcellDB[fc_doc]['RunInfo']['Id']) - print('#Reproduced {} project (if the numbers diffear do not worry, most likely we selected projects without runs)'.format(len(reproduced_projects))) + print(f'#Reproduced {len(reproduced_projects)} project (if the numbers diffear do not worry, most likely we selected projects without runs)') for project in projects_to_reproduce: if project[0] in reproduced_projects: - print('# {}: {}'.format(project[0], project[1])) + print(f'# {project[0]}: {project[1]}') # Need to output the command to organise to_be_deleted = [] for project in reproduced_projects: for FC in reproduced_projects[project]: - print('Running: ngi_pipeline_start.py organize flowcell {} -p {}'.format(FC, project)) + print(f'Running: ngi_pipeline_start.py organize flowcell {FC} -p {project}') with open('ngi_pipeline_local.logs', 'w') as NGILOGS: return_value = subprocess.call(['ngi_pipeline_start.py', 'organize', 'flowcell', - '{}'.format(FC), + f'{FC}', '-p', - '{}'.format(project)], + f'{project}'], stdout=NGILOGS, stderr=NGILOGS) if return_value > 0: - print('#project {} not organised: have a look to the logs, but most likely this projec is not in charon'.format(project)) + print(f'#project {project} not organised: have a look to the logs, but most likely this projec is not in charon') if project not in to_be_deleted: to_be_deleted.append(project) @@ -399,4 +398,4 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): with open('projects.txt', 'w') as PROJECTS: for project in projects_to_reproduce: if project[0] in reproduced_projects: - PROJECTS.write(u'{}:{}\n'.format(project[0], project[1])) + PROJECTS.write(f'{project[0]}:{project[1]}\n') diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index 47da90a9..8ec36614 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -1,13 +1,14 @@ -import os +import datetime import glob -import re import logging -import datetime +import os +import re +from collections import OrderedDict, defaultdict + +from flowcell_parser.classes import RunParametersParser, SampleSheetParser -from taca.utils.config import CONFIG from taca.utils import statusdb -from flowcell_parser.classes import SampleSheetParser, RunParametersParser -from collections import defaultdict, OrderedDict +from taca.utils.config import CONFIG from taca.utils.misc import send_mail logger = logging.getLogger(__name__) @@ -32,7 +33,7 @@ def collect_runs(): for run_dir in potential_run_dirs: if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir): found_runs.append(os.path.basename(run_dir)) - logger.info('Working on {}'.format(run_dir)) + logger.info(f'Working on {run_dir}') update_statusdb(run_dir) nosync_data_dir = os.path.join(data_dir, 'nosync') potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, '*')) @@ -158,7 +159,7 @@ def get_ss_projects(run_dir): elif os.path.exists(os.path.join(run_dir, 'RunParameters.xml')): run_parameters_file = 'RunParameters.xml' else: - logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run_dir)) + logger.error(f'Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run_dir}') return [] 
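# --- Editor's sketch, not part of the patch: the organize step above shells
# out to ngi_pipeline_start.py with stdout/stderr redirected to a local log
# file and treats a non-zero return code as "not organised". The same pattern
# as a helper; the log file name is the one used in the hunk. ---
import subprocess

def organize_flowcell(flowcell, project, log_path="ngi_pipeline_local.logs"):
    with open(log_path, "w") as logs:
        return_value = subprocess.call(
            ["ngi_pipeline_start.py", "organize", "flowcell", flowcell, "-p", project],
            stdout=logs,
            stderr=logs,
        )
    return return_value == 0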
rp = RunParametersParser(os.path.join(run_dir, run_parameters_file)) if 'Setup' in rp.data['RunParameters']: @@ -182,7 +183,7 @@ def get_ss_projects(run_dir): elif os.path.exists(os.path.join(run_dir, 'SampleSheet.csv')): FCID_samplesheet_origin = os.path.join(run_dir, 'SampleSheet.csv') else: - logger.warn('No samplesheet found for {}'.format(run_dir)) + logger.warn(f'No samplesheet found for {run_dir}') miseq = True lanes = str(1) # Pattern is a bit more rigid since we're no longer also checking for lanes @@ -191,29 +192,29 @@ def get_ss_projects(run_dir): # HiSeq X case elif 'HiSeq X' in runtype: FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['xten_samplesheets'], - current_year, '{}.csv'.format(FCID)) + current_year, f'{FCID}.csv') data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # HiSeq 2500 case elif 'HiSeq' in runtype or 'TruSeq' in runtype: FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['hiseq_samplesheets'], - current_year, '{}.csv'.format(FCID)) + current_year, f'{FCID}.csv') data = parse_samplesheet(FCID_samplesheet_origin, run_dir) elif 'NovaSeqXPlus' in runtype: FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseqxplus_samplesheets'], - current_year, '{}.csv'.format(FCID)) + current_year, f'{FCID}.csv') data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # NovaSeq 6000 case elif 'NovaSeq' in runtype: FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseq_samplesheets'], - current_year, '{}.csv'.format(FCID)) + current_year, f'{FCID}.csv') data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # NextSeq Case elif 'NextSeq' in runtype: FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['nextseq_samplesheets'], - current_year, '{}.csv'.format(FCID)) + current_year, f'{FCID}.csv') data = parse_samplesheet(FCID_samplesheet_origin, run_dir) else: - logger.warn('Cannot locate the samplesheet for run {}'.format(run_dir)) + logger.warn(f'Cannot locate the samplesheet for run {run_dir}') return [] # If samplesheet is empty, don't bother going through it @@ -244,7 +245,7 @@ def get_ss_projects(run_dir): lane = False if list(proj_tree.keys()) == []: - logger.info('INCORRECTLY FORMATTED SAMPLESHEET, CHECK {}'.format(run_name)) + logger.info(f'INCORRECTLY FORMATTED SAMPLESHEET, CHECK {run_name}') return proj_tree def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False): @@ -256,13 +257,13 @@ def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False): ss_reader = SampleSheetParser(FCID_samplesheet_origin) data = ss_reader.data except: - logger.warn('Cannot initialize SampleSheetParser for {}. Most likely due to poor comma separation'.format(run_dir)) + logger.warn(f'Cannot initialize SampleSheetParser for {run_dir}. Most likely due to poor comma separation') return [] if is_miseq: - if not 'Description' in ss_reader.header or not \ + if 'Description' not in ss_reader.header or not \ ('Production' in ss_reader.header['Description'] or 'Application' in ss_reader.header['Description']): - logger.warn('Run {} not labelled as production or application. Disregarding it.'.format(run_dir)) + logger.warn(f'Run {run_dir} not labelled as production or application. 
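# --- Editor's sketch, not part of the patch: get_ss_projects() above picks a
# samplesheet directory from CONFIG['bioinfo_tab'] based on the run type and
# builds "<year>/<FCID>.csv" under it. The dispatch as a lookup table; order
# matters so that "HiSeq X" and "NovaSeqXPlus" match before their shorter
# prefixes. The MiSeq branch and the 'TruSeq' alias for HiSeq are left out
# for brevity. ---
import os

SAMPLESHEET_DIR_KEYS = [
    ("HiSeq X", "xten_samplesheets"),
    ("NovaSeqXPlus", "novaseqxplus_samplesheets"),
    ("NovaSeq", "novaseq_samplesheets"),
    ("NextSeq", "nextseq_samplesheets"),
    ("HiSeq", "hiseq_samplesheets"),
]

def samplesheet_path(config, runtype, current_year, fcid):
    for marker, key in SAMPLESHEET_DIR_KEYS:
        if marker in runtype:
            return os.path.join(config["bioinfo_tab"][key], current_year, f"{fcid}.csv")
    return None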
Disregarding it.') # Skip this run return [] return data @@ -308,11 +309,11 @@ def fail_run(runid, project): if project is not None: view = bioinfo_db.view('full_doc/pj_run_to_doc') rows = view[[project, runid]].rows - logger.info('Updating status of {} objects with flowcell_id: {} and project_id {}'.format(len(rows), runid, project)) + logger.info(f'Updating status of {len(rows)} objects with flowcell_id: {runid} and project_id {project}') else: view = bioinfo_db.view('full_doc/run_id_to_doc') rows = view[[runid]].rows - logger.info('Updating status of {} objects with flowcell_id: {}'.format(len(rows), runid)) + logger.info(f'Updating status of {len(rows)} objects with flowcell_id: {runid}') new_timestamp = datetime.datetime.now().isoformat() updated = 0 @@ -327,4 +328,4 @@ def fail_run(runid, project): logger.error('Cannot update object project-sample-run-lane: {}-{}-{}-{}'.format(row.value.get('project_id'), row.value.get('sample'), row.value.get('run_id'), row.value.get('lane'))) logger.error(e) raise e - logger.info('Successfully updated {} objects'.format(updated)) + logger.info(f'Successfully updated {updated} objects') diff --git a/taca/utils/cli.py b/taca/utils/cli.py index bbfdb819..4fa3eafa 100644 --- a/taca/utils/cli.py +++ b/taca/utils/cli.py @@ -1,7 +1,9 @@ """CLI for the bioinfo subcommand.""" import click + import taca.utils.bioinfo_tab as bt + @click.group(name='bioinfo_deliveries') def bioinfo_deliveries(): """Update statusdb with information about FC entry point.""" diff --git a/taca/utils/config.py b/taca/utils/config.py index 74b8876f..004d163a 100644 --- a/taca/utils/config.py +++ b/taca/utils/config.py @@ -1,7 +1,8 @@ """Load and parse configuration file.""" -import yaml from io import open +import yaml + CONFIG = {} def load_config(config_file): @@ -12,8 +13,8 @@ def load_config(config_file): content = yaml.load(f, Loader=yaml.FullLoader) config.update(content) return content - except IOError as e: - e.message = 'Could not open configuration file "{}".'.format(config_file) + except OSError as e: + e.message = f'Could not open configuration file "{config_file}".' raise e def load_yaml_config(config_file): @@ -30,6 +31,6 @@ def load_yaml_config(config_file): content = yaml.load(f, Loader=yaml.FullLoader) CONFIG.update(content) return content - except IOError as e: - e.message = 'Could not open configuration file "{}".'.format(config_file) + except OSError as e: + e.message = f'Could not open configuration file "{config_file}".' raise e diff --git a/taca/utils/filesystem.py b/taca/utils/filesystem.py index f1db6968..957bf818 100644 --- a/taca/utils/filesystem.py +++ b/taca/utils/filesystem.py @@ -26,7 +26,7 @@ def create_folder(target_folder): """ try: os.makedirs(target_folder) - except OSError as e: + except OSError: pass return os.path.exists(target_folder) diff --git a/taca/utils/misc.py b/taca/utils/misc.py index 3f9bec6a..946723e5 100755 --- a/taca/utils/misc.py +++ b/taca/utils/misc.py @@ -5,13 +5,15 @@ import smtplib import subprocess import sys - from datetime import datetime from email.mime.text import MIMEText -from taca.utils import statusdb from io import open + from six.moves import input +from taca.utils import statusdb + + def send_mail(subject, content, receiver): """Sends an email. 
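# --- Editor's sketch, not part of the patch: load_config()/load_yaml_config()
# above parse a YAML file and, when it cannot be opened, attach a readable
# message to the raised OSError. The same pattern in isolation. ---
import yaml

def load_yaml(config_file):
    try:
        with open(config_file) as f:
            return yaml.load(f, Loader=yaml.FullLoader)
    except OSError as e:
        e.message = f'Could not open configuration file "{config_file}".'
        raise e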
@@ -22,7 +24,7 @@ def send_mail(subject, content, receiver): if not receiver: raise SystemExit('No receiver was given to send mail') msg = MIMEText(content) - msg['Subject'] = 'TACA - {}'.format(subject) + msg['Subject'] = f'TACA - {subject}' msg['From'] = 'TACA@scilifelab.se' msg['to'] = receiver @@ -45,7 +47,7 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''): stderr = sys.stderr if with_log_files: if prefix: - logFile = '{}_{}'.format(prefix, logFile) + logFile = f'{prefix}_{logFile}' # Create log dir if it didn't exist in CWD if log_dir and not os.path.exists(log_dir): os.mkdir(log_dir) @@ -53,8 +55,8 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''): stdout = open(logFile + '.out', 'a') stderr = open(logFile + '.err', 'a') started = 'Started command {} on {}'.format(' '.join(cl), datetime.now()) - stdout.write(started + u'\n') - stdout.write(''.join(['=']*len(cl)) + u'\n') + stdout.write(started + '\n') + stdout.write(''.join(['=']*len(cl)) + '\n') try: subprocess.check_call(cl, stdout=stdout, stderr=stderr) @@ -80,12 +82,12 @@ def call_external_command_detached(cl, with_log_files=False, prefix=None): if with_log_files: if prefix: - command = '{}_{}'.format(prefix, command) + command = f'{prefix}_{command}' stdout = open(command + '.out', 'a') stderr = open(command + '.err', 'a') started = 'Started command {} on {}'.format(' '.join(cl), datetime.now()) - stdout.write(started + u'\n') - stdout.write(''.join(['=']*len(cl)) + u'\n') + stdout.write(started + '\n') + stdout.write(''.join(['=']*len(cl)) + '\n') try: p_handle = subprocess.Popen(cl, stdout=stdout, stderr=stderr) @@ -205,7 +207,7 @@ def run_is_demuxed(run, couch_info=None, seq_run_type=None): if len(run_date)>6: run_date = run_date[2:] run_fc = run_terms[-1] - run_name = '{}_{}'.format(run_date, run_fc) + run_name = f'{run_date}_{run_fc}' try: couch_connection = statusdb.StatusdbSession(couch_info).connection fc_db = couch_connection[couch_info['xten_db']] diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 3ae4d291..c02d7ac6 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -1,24 +1,24 @@ """Classes for handling connection to StatusDB.""" -import couchdb -import logging import csv - +import logging from datetime import datetime +import couchdb + logger = logging.getLogger(__name__) -class StatusdbSession(object): +class StatusdbSession: """Wrapper class for couchdb.""" def __init__(self, config, db=None): user = config.get('username') password = config.get('password') url = config.get('url') - url_string = 'https://{}:{}@{}'.format(user, password, url) + url_string = f'https://{user}:{password}@{url}' display_url_string = 'https://{}:{}@{}'.format(user, '*********', url) self.connection = couchdb.Server(url=url_string) if not self.connection: - raise Exception('Couchdb connection failed for url {}'.format(display_url_string)) + raise Exception(f'Couchdb connection failed for url {display_url_string}') if db: self.db_connection = self.connection[db] @@ -40,7 +40,7 @@ def save_db_doc(self, doc, db=None): db = db or self.db db.save(doc) except Exception as e: - raise Exception('Failed saving document due to {}'.format(e)) + raise Exception(f'Failed saving document due to {e}') def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format='%Y-%m-%d'): """From information available in flowcell db connection, @@ -111,10 +111,10 @@ def create_ongoing_run( self, ont_run, run_path_file: str, pore_count_history_file: str ): - 
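# --- Editor's sketch, not part of the patch: StatusdbSession above embeds the
# credentials in the CouchDB URL but keeps a masked copy for the error path so
# the password never ends up in logs. The same idea as a free function. ---
import couchdb

def connect_couchdb(user, password, url):
    url_string = f"https://{user}:{password}@{url}"
    display_url_string = "https://{}:{}@{}".format(user, "*********", url)
    connection = couchdb.Server(url=url_string)
    if not connection:
        raise Exception(f"Couchdb connection failed for url {display_url_string}")
    return connection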
run_path = open(run_path_file, "r").read().strip() + run_path = open(run_path_file).read().strip() pore_counts = [] - with open(pore_count_history_file, "r") as stream: + with open(pore_count_history_file) as stream: for line in csv.DictReader(stream): pore_counts.append(line) @@ -170,8 +170,8 @@ def merge_dicts(d1, d2): elif d1[key] == d2[key]: pass # same leaf value else: - logger.debug('Values for key {key} in d1 and d2 differ, ' - 'using the value of d1'.format(key=key)) + logger.debug(f'Values for key {key} in d1 and d2 differ, ' + 'using the value of d1') else: d1[key] = d2[key] return d1 diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py index 34e6b314..dfefcdea 100644 --- a/taca/utils/transfer.py +++ b/taca/utils/transfer.py @@ -4,15 +4,15 @@ import os import shutil import subprocess +from io import open from taca.utils.filesystem import create_folder -from taca.utils.misc import hashfile, call_external_command -from io import open +from taca.utils.misc import call_external_command, hashfile logger = logging.getLogger(__name__) -class TransferAgent(object): +class TransferAgent: """ (Abstract) superclass representing an Agent that performs file transfers. Agents implementing specific methods for transferring files should extend @@ -64,7 +64,7 @@ def format_options(self): if type(val) == str: val = [val] for v in val: - cmdopts.append('{}={}'.format(param,v)) + cmdopts.append(f'{param}={v}') return cmdopts def transfer(self): @@ -82,7 +82,7 @@ def validate_src_path(self): dest_path=self.dest_path) if not os.path.exists(self.src_path): raise TransferError( - msg='src_path "{}" does not exist'.format(self.src_path), + msg=f'src_path "{self.src_path}" does not exist', src_path=self.src_path, dest_path=self.dest_path) @@ -173,10 +173,10 @@ def remote_path(self): [remote_user]@[remote_host]:[dest_path] """ return '{}{}{}'.format( - '{}@'.format(self.remote_user) \ + f'{self.remote_user}@' \ if self.remote_user is not None \ else '', - '{}:'.format(self.remote_host) \ + f'{self.remote_host}:' \ if self.remote_host is not None \ else '', self.dest_path \ @@ -227,7 +227,7 @@ def validate_transfer(self): tfile, hasher=hasher): return False - except TypeError as e: + except TypeError: raise RsyncValidationError( 'no digest file specified', self.src_path, @@ -269,12 +269,12 @@ def transfer(self): # source, we're all good if self.validate_transfer(): logger.debug('target exists and points to the correct ' - 'source path: "{}"'.format(self.src_path)) + f'source path: "{self.src_path}"') return True # If we are not overwriting, return False if not self.overwrite: - logger.debug('target "{}" exists and will not be ' - 'overwritten'.format(self.dest_path)) + logger.debug(f'target "{self.dest_path}" exists and will not be ' + 'overwritten') return False # If the target is a mount, let's not mess with it if os.path.ismount(self.dest_path): @@ -282,8 +282,7 @@ def transfer(self): # If the target is a link or a file, we remove it if os.path.islink(self.dest_path) or \ os.path.isfile(self.dest_path): - logger.debug('removing existing target file "{}"' - .format(self.dest_path)) + logger.debug(f'removing existing target file "{self.dest_path}"') try: os.unlink(self.dest_path) except OSError as e: @@ -291,8 +290,7 @@ def transfer(self): # If the target is a directory, we remove it and # everything underneath elif os.path.isdir(self.dest_path): - logger.debug('removing existing target folder "{}"' - .format(self.dest_path)) + logger.debug(f'removing existing target folder "{self.dest_path}"') 
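# --- Editor's note: illustrative sketch, not part of the patch ----------------
# The statusdb.py and transfer.py hunks above bundle the other recurring
# cleanups in this series: imports regrouped (stdlib, then third-party, then
# local packages; ruff's isort "I" rule), classes declared without an explicit
# `object` base, open() called without the redundant "r" mode, and IOError
# replaced by OSError (its Python 3 alias). A small self-contained module in
# the "after" style; everything below is hypothetical, not TACA code:
import csv
import logging
from datetime import datetime

logger = logging.getLogger(__name__)


class Session:  # Python 3 classes are new-style; no (object) base needed
    def __init__(self, url):
        self.url = url


def read_rows(path):
    """Return the rows of a CSV file, or an empty list if it cannot be read."""
    try:
        with open(path) as stream:  # text "r" mode is already the default
            return list(csv.DictReader(stream))
    except OSError:  # IOError is an alias of OSError since Python 3.3
        logger.error(f"could not read {path} at {datetime.now()}")
        return []
# ------------------------------------------------------------------------------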
try: shutil.rmtree(self.dest_path) except OSError as e: diff --git a/tests/test_analysis.py b/tests/test_analysis.py index c3150f1d..173c57dd 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -1,10 +1,10 @@ #!/usr/bin/env python +import json import os -import tempfile import shutil -import json +import tempfile import unittest -import mock +from unittest import mock from taca.analysis import analysis as an from taca.utils import config diff --git a/tests/test_analysis_nanopore.py b/tests/test_analysis_nanopore.py index 1b3158dc..66688701 100644 --- a/tests/test_analysis_nanopore.py +++ b/tests/test_analysis_nanopore.py @@ -1,14 +1,11 @@ #!/usr/bin/env python import unittest -import logging -import mock -import os +from unittest import mock from taca.analysis.analysis_nanopore import * from taca.nanopore.minion import MinIONqc from taca.utils import config as conf - CONFIG = conf.load_yaml_config('data/taca_test_nanopore_cfg.yaml') class TestNanoporeAnalysis(unittest.TestCase): @@ -68,6 +65,6 @@ def test_process_minion_run_fail_analysis(self, mock_mail): minion_run.qc_run = True process_minion_qc_run(minion_run) email_subject = ('Analysis failed for run 20200108_1412_MN19414_AAU648_68125dc2') - email_message = 'The nanoseq analysis failed for run {}.'.format(minion_run.run_id) + email_message = f'The nanoseq analysis failed for run {minion_run.run_id}.' email_recipients = 'test@test.com' mock_mail.assert_called_once_with(email_subject, email_message, email_recipients) diff --git a/tests/test_backup.py b/tests/test_backup.py index c170f79f..4d3a6bad 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -import unittest -import mock -import tempfile import os import shutil +import tempfile +import unittest +from unittest import mock from taca.backup import backup from taca.utils import config as conf diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index d7e04869..2b4f365a 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -4,8 +4,8 @@ import shutil import tempfile import unittest -import mock from datetime import datetime +from unittest import mock from taca.cleanup import cleanup from taca.utils import config as conf diff --git a/tests/test_illumina.py b/tests/test_illumina.py index 5bbf323a..e26a48e5 100644 --- a/tests/test_illumina.py +++ b/tests/test_illumina.py @@ -1,25 +1,21 @@ #!/usr/bin/env python +import filecmp +import json import os -import io import shutil -import tempfile -import unittest -import csv -import json -import mock -import filecmp import subprocess -from datetime import datetime import sys +import tempfile +import unittest +from unittest import mock + +from flowcell_parser.classes import LaneBarcodeParser from taca.analysis.analysis import * -from taca.illumina.Runs import Run, _create_folder_structure, _generate_lane_html -from taca.illumina.Standard_Runs import Standard_Runs, _generate_clean_samplesheet, _classify_samples, parse_10X_indexes, parse_smartseq_indexes, _generate_samplesheet_subset -from taca.illumina.MiSeq_Runs import MiSeq_Run -from taca.illumina.NovaSeq_Runs import NovaSeq_Run from taca.illumina.NextSeq_Runs import NextSeq_Run -from flowcell_parser.classes import LaneBarcodeParser, SampleSheetParser +from taca.illumina.NovaSeq_Runs import NovaSeq_Run +from taca.illumina.Runs import Run, _create_folder_structure, _generate_lane_html from taca.utils import config as conf if sys.version_info[0] >= 3: @@ -146,7 +142,7 @@ def setUpClass(self): 
open(os.path.join(completed, 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz'), 'w').close() open(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R1_001.fastq.gz'), 'w').close() open(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R2_001.fastq.gz'), 'w').close() - with io.open(os.path.join(completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w', encoding="utf-8") as stats_json: + with open(os.path.join(completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w', encoding="utf-8") as stats_json: stats_json.write(unicode(json.dumps({'silly': 1}, ensure_ascii=False))) # Copy transfer file with the completed run diff --git a/tests/test_instrument_transfer.py b/tests/test_instrument_transfer.py index 60a1533b..3d9b85fc 100644 --- a/tests/test_instrument_transfer.py +++ b/tests/test_instrument_transfer.py @@ -1,10 +1,12 @@ -from taca.nanopore import instrument_transfer -from unittest.mock import patch, mock_open, call, Mock, MagicMock -import tempfile -import pytest +import json import os import re -import json +import tempfile +from unittest.mock import Mock, call, mock_open, patch + +import pytest + +from taca.nanopore import instrument_transfer DUMMY_RUN_NAME = "20240112_2342_MN19414_TEST12345_randomhash" @@ -156,7 +158,7 @@ def test_main(mock_sync, mock_final_sync, setup_test_fixture, finished, qc): # Check path was dumped assert os.path.exists(run_path + "/run_path.txt") - assert open(run_path + "/run_path.txt", "r").read() == "/".join( + assert open(run_path + "/run_path.txt").read() == "/".join( run_path.split("/")[-3:] ) @@ -179,7 +181,7 @@ def test_main(mock_sync, mock_final_sync, setup_test_fixture, finished, qc): ) + "\n" ) - assert open(run_path + "/pore_count_history.csv", "r").read() == template + assert open(run_path + "/pore_count_history.csv").read() == template def test_sequencing_finished(): @@ -389,7 +391,7 @@ def test_dump_pore_count_history(setup_test_fixture): run_path = tmp.name + f"/experiment/sample/{DUMMY_RUN_NAME.replace('TEST','FLG')}" os.makedirs(run_path) new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts) - assert open(new_file, "r").read() == "" + assert open(new_file).read() == "" tmp.cleanup() # Nothing to add, file is present @@ -398,7 +400,7 @@ def test_dump_pore_count_history(setup_test_fixture): os.makedirs(run_path) open(run_path + "/pore_count_history.csv", "w").write("test") new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts) - assert open(new_file, "r").read() == "test" + assert open(new_file).read() == "test" tmp.cleanup() # Something to add @@ -424,5 +426,5 @@ def test_dump_pore_count_history(setup_test_fixture): + "\n" ) - assert open(new_file, "r").read() == template + assert open(new_file).read() == template tmp.cleanup() diff --git a/tests/test_nanopore.py b/tests/test_nanopore.py index cb1e1a15..0220f6de 100644 --- a/tests/test_nanopore.py +++ b/tests/test_nanopore.py @@ -1,12 +1,12 @@ #!/usr/bin/env python -import unittest -import mock import filecmp import os import subprocess +import unittest +from unittest import mock -from taca.nanopore.ONT_run_classes import ONT_run from taca.nanopore.minion_run_class import MinIONqc +from taca.nanopore.ONT_run_classes import ONT_run from taca.utils import config CONFIG = config.load_yaml_config("data/taca_test_nanopore_cfg.yaml") diff --git a/tests/test_server_status.py b/tests/test_server_status.py index 2d24c83d..781adbe9 
100644 --- a/tests/test_server_status.py +++ b/tests/test_server_status.py @@ -1,9 +1,10 @@ #!/usr/bin/env python import unittest -import mock +from unittest import mock + import crontab -from taca.server_status import server_status, cronjobs +from taca.server_status import cronjobs, server_status from taca.utils import config CONFIG = config.load_yaml_config('data/taca_test_cfg.yaml') @@ -62,9 +63,9 @@ def test_parse_crontab(self, mock_getpass, mock_crontab): mock_crontab.return_value = crontab.CronTab(tab=INITAL_TAB) mock_getpass.return_value = 'test_user' expected_crontab = {'test_user': - [{'Comment': u'First Comment', + [{'Comment': 'First Comment', 'Day of month': '*', - 'Command': u'firstcommand', + 'Command': 'firstcommand', 'Hour': '*', 'Day of week': '*', 'Enabled': True, @@ -83,9 +84,9 @@ def test_parse_crontab(self, mock_getpass, mock_crontab): def test_update_cronjob_db(self, mock_parser, mock_platform, mock_logging, mock_statusdb): """Update couchdb with cronjobs.""" mock_parser.return_value = {'test_user': - [{'Comment': u'First Comment', + [{'Comment': 'First Comment', 'Day of month': '*', - 'Command': u'firstcommand', + 'Command': 'firstcommand', 'Hour': '*', 'Day of week': '*', 'Enabled': True, diff --git a/tests/test_utils.py b/tests/test_utils.py index c4f6f2d9..e16abbd0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,18 +1,18 @@ """Unit tests for the utils helper functions.""" -import hashlib -import mock import os import shutil import subprocess import tempfile -import unittest import time -import couchdb +import unittest from collections import defaultdict -from taca.utils import misc, filesystem, transfer, config, bioinfo_tab, statusdb +from unittest import mock + from six.moves import map +from taca.utils import bioinfo_tab, config, filesystem, misc, statusdb, transfer + class TestMisc(unittest.TestCase): """Test class for the misc functions.""" @@ -262,8 +262,8 @@ def setUpClass(self): self.rootdir = tempfile.mkdtemp(prefix='test_taca_symlink_src') path = self.rootdir for n in range(3): - open(os.path.join(path, 'file{}'.format(n)), 'w').close() - path = os.path.join(path, 'folder{}'.format(n)) + open(os.path.join(path, f'file{n}'), 'w').close() + path = os.path.join(path, f'folder{n}') os.mkdir(path) @classmethod @@ -424,7 +424,7 @@ def setUpClass(cls): # create a digest file def _write_digest(rootdir, fhandle, fpath): - fhandle.write('{} {}\n'.format(misc.hashfile(fpath), os.path.relpath(fpath, rootdir))) + fhandle.write(f'{misc.hashfile(fpath)} {os.path.relpath(fpath, rootdir)}\n') cls.digestfile = os.path.join(cls.rootdir, 'digestfile.sha1') with open(cls.digestfile, 'w') as digesth: @@ -500,12 +500,12 @@ def test_rsync_agent_dest_paths_constructed(self): 'and empty destination host') self.agent.remote_host = 'localhost' self.assertEqual( - 'localhost:{}'.format(self.destdir), + f'localhost:{self.destdir}', self.agent.remote_path(), 'Destination path was not correct for empty remote user') self.agent.remote_user = 'user' self.assertEqual( - 'user@localhost:{}'.format(self.destdir), + f'user@localhost:{self.destdir}', self.agent.remote_path(), 'Destination path was not correct for non-empty remote user') self.agent.dest_path = None From 6e54689979d39ae58d91ae119055584cf9aab64a Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 14:27:33 +0100 Subject: [PATCH 04/44] ruff check unsafe fixes --- setup.py | 5 ++--- taca/analysis/analysis.py | 1 - taca/backup/backup.py | 5 ++--- taca/cleanup/cleanup.py | 9 +++------ 
taca/illumina/MiSeq_Runs.py | 2 +- taca/illumina/NovaSeqXPlus_Runs.py | 2 +- taca/illumina/NovaSeq_Runs.py | 2 +- taca/illumina/Runs.py | 1 - taca/illumina/Standard_Runs.py | 8 +++----- taca/nanopore/ONT_run_classes.py | 6 +++--- taca/nanopore/instrument_transfer.py | 2 +- taca/testing/create_uppmax_like_env.py | 1 - taca/utils/bioinfo_tab.py | 6 +++--- taca/utils/config.py | 5 ++--- taca/utils/misc.py | 5 +---- taca/utils/statusdb.py | 8 ++++---- taca/utils/transfer.py | 7 +++---- tests/test_analysis_nanopore.py | 3 --- tests/test_backup.py | 2 +- tests/test_utils.py | 8 +++----- 20 files changed, 34 insertions(+), 54 deletions(-) diff --git a/setup.py b/setup.py index d8962c06..74f35bed 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,17 @@ import glob -from io import open from setuptools import find_packages, setup from taca import __version__ try: - with open("requirements.txt", "r") as f: + with open("requirements.txt") as f: install_requires = [x.strip() for x in f.readlines()] except OSError: install_requires = [] try: - with open("dependency_links.txt", "r") as f: + with open("dependency_links.txt") as f: dependency_links = [x.strip() for x in f.readlines()] except OSError: dependency_links = [] diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py index 2ef4aafd..4caf9a48 100755 --- a/taca/analysis/analysis.py +++ b/taca/analysis/analysis.py @@ -4,7 +4,6 @@ import os import subprocess import sys -from io import open from shutil import copyfile, copytree from flowcell_parser.classes import RunParametersParser diff --git a/taca/backup/backup.py b/taca/backup/backup.py index 88a4188f..97fdec7b 100644 --- a/taca/backup/backup.py +++ b/taca/backup/backup.py @@ -7,7 +7,6 @@ import subprocess as sp import time from datetime import datetime -from io import open from taca.utils import filesystem, misc, statusdb from taca.utils.config import CONFIG @@ -287,7 +286,7 @@ def encrypt_runs(cls, run, force): if os.path.exists(run.flag): logger.warn(f'Run {run.name} is already being encrypted, so skipping now') continue - flag = open(run.flag, 'w').close() + open(run.flag, 'w').close() # zip the run directory if os.path.exists(run.zip): if os.path.isdir(run.name): @@ -379,7 +378,7 @@ def pdc_put(cls, run): if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False): logger.warn(f'Seems like files related to run {run.name} already exist in PDC, check and cleanup') continue - flag = open(run.flag, 'w').close() + open(run.flag, 'w').close() logger.info(f'Sending file {run.zip_encrypted} to PDC') if bk._call_commands(cmd1=f'dsmc archive {run.zip_encrypted}', tmp_files=[run.flag]): time.sleep(15) # give some time just in case 'dsmc' needs to settle diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py index 80ef1905..498ffcf9 100644 --- a/taca/cleanup/cleanup.py +++ b/taca/cleanup/cleanup.py @@ -5,9 +5,6 @@ from collections import defaultdict from datetime import datetime from glob import glob -from io import open - -from six.moves import map from taca.utils import filesystem, misc, statusdb from taca.utils.config import CONFIG, load_config @@ -74,7 +71,7 @@ def cleanup_miarka(days_fastq, days_analysis, exclude_list = [] if exclude_projects: if os.path.isfile(exclude_projects): - with open(exclude_projects, 'r') as in_file: + with open(exclude_projects) as in_file: exclude_list.extend([p.strip() for p in in_file.readlines()]) else: exclude_list.extend(exclude_projects.split(',')) @@ -119,12 +116,12 @@ def cleanup_miarka(days_fastq, 
days_analysis, undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files))) if misc.query_yes_no('In total found {} undetermined files which are {} in size, delete now ?'.format(len(all_undet_files), undet_size), default='no'): - removed = _remove_files(all_undet_files) + _remove_files(all_undet_files) return elif only_analysis: for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \ not os.path.exists(os.path.join(analysis_dir, d, 'cleaned'))]: - proj_abs_path = os.path.join(analysis_dir, pid) + os.path.join(analysis_dir, pid) proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date) if proj_info and proj_info['closed_days'] >= days_analysis: # move on if this project has to be excluded diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py index f6585801..ff7d1095 100644 --- a/taca/illumina/MiSeq_Runs.py +++ b/taca/illumina/MiSeq_Runs.py @@ -17,7 +17,7 @@ class MiSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(MiSeq_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() self._get_samplesheet() diff --git a/taca/illumina/NovaSeqXPlus_Runs.py b/taca/illumina/NovaSeqXPlus_Runs.py index 4acff5e4..f3b34b2e 100644 --- a/taca/illumina/NovaSeqXPlus_Runs.py +++ b/taca/illumina/NovaSeqXPlus_Runs.py @@ -3,7 +3,7 @@ class NovaSeqXPlus_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(NovaSeqXPlus_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() diff --git a/taca/illumina/NovaSeq_Runs.py b/taca/illumina/NovaSeq_Runs.py index f6ba71e8..cce59eef 100644 --- a/taca/illumina/NovaSeq_Runs.py +++ b/taca/illumina/NovaSeq_Runs.py @@ -3,7 +3,7 @@ class NovaSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(NovaSeq_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index fa4618c8..56724ac7 100644 --- a/taca/illumina/Runs.py +++ b/taca/illumina/Runs.py @@ -328,7 +328,6 @@ def archive_run(self, destination): def send_mail(self, sbt, msg, rcp): """ Sends mail about run completion """ - already_seen = False runname = self.id if not sbt: sbt = f"{runname}" diff --git a/taca/illumina/Standard_Runs.py b/taca/illumina/Standard_Runs.py index f2699426..98741bb4 100755 --- a/taca/illumina/Standard_Runs.py +++ b/taca/illumina/Standard_Runs.py @@ -2,7 +2,6 @@ import os import re from datetime import datetime -from io import open from flowcell_parser.classes import SampleSheetParser @@ -22,7 +21,7 @@ class Standard_Run(Run): def __init__(self, run_dir, software, configuration): - super(Standard_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() self._copy_samplesheet() @@ -82,7 +81,7 @@ def _parse_10X_indexes(self, indexfile): Todo: Set it up to take the file from config instead """ index_dict = {} - with open(indexfile, 'r') as f: + with open(indexfile) as f: for line in f: line_ = line.rstrip().split(',') index_dict[line_[0]] = line_[1:5] @@ -94,7 +93,7 @@ def _parse_smartseq_indexes(self, indexfile): Todo: Set it up to take the file from config instead """ index_dict = {} - with 
open(indexfile, 'r') as f: + with open(indexfile) as f: for line in f: line_ = line.rstrip().split(',') if index_dict.get(line_[0]): @@ -425,7 +424,6 @@ def _compute_base_mask(self, runSetup, sample_type, index1_size, is_dual_index, - if runSetup is of size 4, then dual index run """ bm = [] - dual_index_run = False if len(runSetup) > 4: raise RuntimeError("when generating base_masks looks like there are" \ " more than 4 reads in the RunSetup.xml") diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index a6743f04..dfbdf3b8 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -159,7 +159,7 @@ def update_db_entry(self, force_update=False): self.touch_db_entry() # If the run document is marked as "ongoing" or database is being manually updated - if self.db.check_run_status(self) == "ongoing" or force_update == True: + if self.db.check_run_status(self) == "ongoing" or force_update is True: logger.info( f"{self.run_name}: Run exists in the database with run status: {self.db.check_run_status(self)}." ) @@ -372,7 +372,7 @@ class ONT_user_run(ONT_run): def __init__(self, run_abspath: str): self.run_type = "user_run" - super(ONT_user_run, self).__init__(run_abspath) + super().__init__(run_abspath) class ONT_qc_run(ONT_run): @@ -380,7 +380,7 @@ class ONT_qc_run(ONT_run): def __init__(self, run_abspath: str): self.run_type = "qc_run" - super(ONT_qc_run, self).__init__(run_abspath) + super().__init__(run_abspath) # Get Anglerfish attributes from run self.anglerfish_done_abspath = f"{self.run_abspath}/.anglerfish_done" diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py index 130a88c8..a7cabdf1 100644 --- a/taca/nanopore/instrument_transfer.py +++ b/taca/nanopore/instrument_transfer.py @@ -153,7 +153,7 @@ def archive_finished_run(run_dir: str, archive_dir: str): sample_dir = os.path.dirname(run_dir) exp_dir = os.path.dirname(sample_dir) - run_name = os.path.basename(run_dir) + os.path.basename(run_dir) sample_name = os.path.basename(sample_dir) exp_name = os.path.basename(exp_dir) diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py index 9e936857..f0a10ea9 100644 --- a/taca/testing/create_uppmax_like_env.py +++ b/taca/testing/create_uppmax_like_env.py @@ -5,7 +5,6 @@ import os import random import subprocess -from io import open from dateutil.relativedelta import relativedelta diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index 8ec36614..022ac93e 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -17,7 +17,7 @@ class Tree(defaultdict): """Constructor for a search tree.""" def __init__(self, value=None): - super(Tree, self).__init__(Tree) + super().__init__(Tree) self.value = value @@ -105,7 +105,7 @@ def update_statusdb(run_dir): # Creates record db.save(obj) # Sets FC error flag - if not project_info[flowcell].value == None: + if project_info[flowcell].value is not None: if (('Failed' in project_info[flowcell].value and 'Failed' not in sample_status) or ('Failed' in sample_status and 'Failed' not in project_info[flowcell].value)): project_info[flowcell].value = 'Ambiguous' @@ -113,7 +113,7 @@ def update_statusdb(run_dir): project_info[flowcell].value = sample_status # Checks if a flowcell needs partial re-doing # Email error per flowcell - if not project_info[flowcell].value == None: + if project_info[flowcell].value is not None: if 'Ambiguous' in project_info[flowcell].value: error_emailer('failed_run', run_id) diff 
--git a/taca/utils/config.py b/taca/utils/config.py index 004d163a..1a6fd6a1 100644 --- a/taca/utils/config.py +++ b/taca/utils/config.py @@ -1,5 +1,4 @@ """Load and parse configuration file.""" -from io import open import yaml @@ -9,7 +8,7 @@ def load_config(config_file): """Loads a configuration file.""" config = {} try: - with open(config_file, 'r') as f: + with open(config_file) as f: content = yaml.load(f, Loader=yaml.FullLoader) config.update(content) return content @@ -27,7 +26,7 @@ def load_yaml_config(config_file): :raises IOError: If the config file cannot be opened. """ try: - with open(config_file, 'r') as f: + with open(config_file) as f: content = yaml.load(f, Loader=yaml.FullLoader) CONFIG.update(content) return content diff --git a/taca/utils/misc.py b/taca/utils/misc.py index 946723e5..3a4a1d68 100755 --- a/taca/utils/misc.py +++ b/taca/utils/misc.py @@ -7,9 +7,6 @@ import sys from datetime import datetime from email.mime.text import MIMEText -from io import open - -from six.moves import input from taca.utils import statusdb @@ -154,7 +151,7 @@ def query_yes_no(question, default='yes', force=False): """ valid = {'yes': True, 'y': True, 'ye': True, 'no': False, 'n': False} - if default == None: + if default is None: prompt = ' [y/n] ' elif default == 'yes': prompt = ' [Y/n] ' diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index c02d7ac6..4bbb70a4 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -67,7 +67,7 @@ def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format=' class ProjectSummaryConnection(StatusdbSession): def __init__(self, config, dbname='projects'): - super(ProjectSummaryConnection, self).__init__(config) + super().__init__(config) self.db = self.connection[dbname] self.name_view = {k.key: k.id for k in self.db.view('project/project_name', reduce=False)} self.id_view = {k.key: k.id for k in self.db.view('project/project_id', reduce=False)} @@ -75,7 +75,7 @@ def __init__(self, config, dbname='projects'): class FlowcellRunMetricsConnection(StatusdbSession): def __init__(self, config, dbname='flowcells'): - super(FlowcellRunMetricsConnection, self).__init__(config) + super().__init__(config) self.db = self.connection[dbname] self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)} self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key} @@ -83,7 +83,7 @@ def __init__(self, config, dbname='flowcells'): class X_FlowcellRunMetricsConnection(StatusdbSession): def __init__(self, config, dbname='x_flowcells'): - super(X_FlowcellRunMetricsConnection, self).__init__(config) + super().__init__(config) self.db = self.connection[dbname] self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)} self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key} @@ -92,7 +92,7 @@ def __init__(self, config, dbname='x_flowcells'): class NanoporeRunsConnection(StatusdbSession): def __init__(self, config, dbname='nanopore_runs'): - super(NanoporeRunsConnection, self).__init__(config) + super().__init__(config) self.db = self.connection[dbname] def check_run_exists(self, ont_run) -> bool: diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py index dfefcdea..2968d0f9 100644 --- a/taca/utils/transfer.py +++ b/taca/utils/transfer.py @@ -4,7 +4,6 @@ import os import shutil import subprocess -from io import open from taca.utils.filesystem import create_folder from taca.utils.misc import 
call_external_command, hashfile @@ -135,7 +134,7 @@ def __init__( algorithm will be inferred from the extension of the digest file :param opts: options that will be passed to the rsync command """ - super(RsyncAgent, self).__init__( + super().__init__( src_path=src_path, dest_path=dest_path, opts=opts or self.DEFAULT_OPTS, @@ -246,7 +245,7 @@ def __init__(self, src_path, dest_path, overwrite=True, relative=True, **kwargs) be overwritten if it already exists :param bool relative: if true, the destination symlink will be relative """ - super(SymlinkAgent,self).__init__( + super().__init__( src_path=src_path, dest_path=dest_path, **kwargs) @@ -323,7 +322,7 @@ def validate_transfer(self): class TransferError(Exception): def __init__(self, msg, src_path=None, dest_path=None): - super(TransferError, self).__init__(msg) + super().__init__(msg) self.src_path = src_path self.dest_path = dest_path diff --git a/tests/test_analysis_nanopore.py b/tests/test_analysis_nanopore.py index 66688701..f1b02676 100644 --- a/tests/test_analysis_nanopore.py +++ b/tests/test_analysis_nanopore.py @@ -45,9 +45,6 @@ def test_process_minion_run_transfer(self, mock_mail, mock_archive, mock_update, mock_cp.return_value = True run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' minion_run = MinIONqc(run_dir, 'dummy/path', None) - email_subject = ('Run successfully processed: 20200104_1412_MN19414_AAU644_68125dc2') - email_message = 'Run 20200104_1412_MN19414_AAU644_68125dc2 has been analysed, transferred and archived successfully.' - email_recipients = 'test@test.com' process_minion_qc_run(minion_run) expected_calls = [mock.call('Anglerfish successfully processed run 20200104_1412_MN19414_AAU644_68125dc2', 'Anglerfish has successfully finished for run 20200104_1412_MN19414_AAU644_68125dc2. 
Please finish the QC step in lims.', diff --git a/tests/test_backup.py b/tests/test_backup.py index 4d3a6bad..f1addeb9 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -104,7 +104,7 @@ def test_call_commands_double(self): os.makedirs(tmp_dir) cmd1 = 'ls data/nas/miseq.lab' cmd2 = 'ls data/nas/miseq.lab' - got_output = backup_object._call_commands(cmd1, cmd2, out_file=tmp_file, mail_failed=False) + backup_object._call_commands(cmd1, cmd2, out_file=tmp_file, mail_failed=False) self.assertTrue(os.path.isfile(tmp_file)) shutil.rmtree(tmp_dir) diff --git a/tests/test_utils.py b/tests/test_utils.py index e16abbd0..2b52f37d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,8 +9,6 @@ from collections import defaultdict from unittest import mock -from six.moves import map - from taca.utils import bioinfo_tab, config, filesystem, misc, statusdb, transfer @@ -119,7 +117,7 @@ def test_run_is_demuxed(self, mock_couch): 'username': 'username', 'password': 'pwd', 'db': 'db'} - is_demultiplexed = misc.run_is_demuxed(run, couch_info=couch_info) + misc.run_is_demuxed(run, couch_info=couch_info) #TODO: should add a check here but not sure how to mock this properly class TestFilesystem(unittest.TestCase): @@ -596,7 +594,7 @@ def test_load_yaml_config(self): {'file': 'data/taca.log'}} self.assertEqual(expexted_config_data, got_config_data) with self.assertRaises(IOError): - missing_config_data = config.load_yaml_config('data/missing_file.yaml)') + config.load_yaml_config('data/missing_file.yaml)') def test_load_config(self): """Load a config file.""" @@ -609,7 +607,7 @@ def test_load_config(self): {'file': 'data/taca.log'}} self.assertEqual(expexted_config_data, got_config_data) with self.assertRaises(IOError): - missing_config_data = config.load_config('data/missing_file.yaml)') + config.load_config('data/missing_file.yaml)') class TestBioinfoTab(unittest.TestCase): """Test class for bioinfo_tab.""" From d5330f615b237beadcec22d5422dff3c02aa54ff Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 14:28:37 +0100 Subject: [PATCH 05/44] ruff format --- doc/conf.py | 199 ++-- setup.py | 43 +- taca/analysis/analysis.py | 436 +++++--- taca/analysis/analysis_nanopore.py | 4 - taca/analysis/cli.py | 57 +- taca/backup/backup.py | 400 +++++--- taca/backup/cli.py | 58 +- taca/cleanup/cleanup.py | 518 ++++++---- taca/cleanup/cli.py | 140 ++- taca/cli.py | 28 +- taca/illumina/MiSeq_Runs.py | 157 ++- taca/illumina/NextSeq_Runs.py | 2 +- taca/illumina/Runs.py | 1295 ++++++++++++++++-------- taca/illumina/Standard_Runs.py | 799 ++++++++++----- taca/illumina/__init__.py | 2 +- taca/log/__init__.py | 21 +- taca/nanopore/ONT_run_classes.py | 3 - taca/nanopore/__init__.py | 2 +- taca/nanopore/instrument_transfer.py | 11 +- taca/server_status/cli.py | 28 +- taca/server_status/cronjobs.py | 59 +- taca/server_status/server_status.py | 107 +- taca/testing/cli.py | 132 ++- taca/testing/create_uppmax_like_env.py | 557 ++++++---- taca/utils/bioinfo_tab.py | 335 +++--- taca/utils/cli.py | 15 +- taca/utils/config.py | 2 + taca/utils/filesystem.py | 21 +- taca/utils/misc.py | 106 +- taca/utils/statusdb.py | 115 ++- taca/utils/transfer.py | 264 ++--- tests/test_analysis.py | 193 ++-- tests/test_analysis_nanopore.py | 87 +- tests/test_backup.py | 165 +-- tests/test_cleanup.py | 181 ++-- tests/test_illumina.py | 619 +++++++---- tests/test_instrument_transfer.py | 22 +- tests/test_nanopore.py | 290 ++++-- tests/test_server_status.py | 129 ++- tests/test_utils.py | 656 ++++++------ 40 files changed, 
5223 insertions(+), 3035 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 5c1d130e..01abb472 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -16,162 +16,163 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.todo', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.todo", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -#templates_path = ['_templates'] +# templates_path = ['_templates'] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'TACA' -copyright = '2014, Guillermo Carrasco' +project = "TACA" +copyright = "2014, Guillermo Carrasco" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '1.0' +version = "1.0" # The full version, including alpha/beta/rc tags. -release = '1.0' +release = "1.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. 
-#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' +html_theme = "default" +on_rtd = os.environ.get("READTHEDOCS", None) == "True" if not on_rtd: import sphinx_rtd_theme - html_theme = 'sphinx_rtd_theme' + + html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. html_show_copyright = False @@ -179,68 +180,62 @@ # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. 
-#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'TACAdoc' +htmlhelp_basename = "TACAdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'TACA.tex', 'TACA Documentation', - 'Guillermo Carrasco', 'manual'), + ("index", "TACA.tex", "TACA Documentation", "Guillermo Carrasco", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'taca', 'TACA Documentation', - ['Guillermo Carrasco'], 1) -] +man_pages = [("index", "taca", "TACA Documentation", ["Guillermo Carrasco"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -249,89 +244,95 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'TACA', 'TACA Documentation', - 'Guillermo Carrasco', 'TACA', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "TACA", + "TACA Documentation", + "Guillermo Carrasco", + "TACA", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. 
-epub_title = 'TACA' -epub_author = 'Guillermo Carrasco' -epub_publisher = 'Guillermo Carrasco' -epub_copyright = '2014, Guillermo Carrasco' +epub_title = "TACA" +epub_author = "Guillermo Carrasco" +epub_publisher = "Guillermo Carrasco" +epub_copyright = "2014, Guillermo Carrasco" # The basename for the epub file. It defaults to the project name. -#epub_basename = u'TACA' +# epub_basename = u'TACA' # The HTML theme for the epub output. Since the default themes are not optimized # for small screen space, using the same theme for HTML and epub output is # usually not wise. This defaults to 'epub', a theme designed to save visual # space. -#epub_theme = 'epub' +# epub_theme = 'epub' # The language of the text. It defaults to the language option # or en if the language is not set. -#epub_language = '' +# epub_language = '' # The scheme of the identifier. Typical schemes are ISBN or URL. -#epub_scheme = '' +# epub_scheme = '' # The unique identifier of the text. This can be a ISBN number # or the project homepage. -#epub_identifier = '' +# epub_identifier = '' # A unique identification for the text. -#epub_uid = '' +# epub_uid = '' # A tuple containing the cover image and cover page html template filenames. -#epub_cover = () +# epub_cover = () # A sequence of (type, uri, title) tuples for the guide element of content.opf. -#epub_guide = () +# epub_guide = () # HTML files that should be inserted before the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_pre_files = [] +# epub_pre_files = [] # HTML files shat should be inserted after the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_post_files = [] +# epub_post_files = [] # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # The depth of the table of contents in toc.ncx. -#epub_tocdepth = 3 +# epub_tocdepth = 3 # Allow duplicate toc entries. -#epub_tocdup = True +# epub_tocdup = True # Choose between 'default' and 'includehidden'. -#epub_tocscope = 'default' +# epub_tocscope = 'default' # Fix unsupported image types using the PIL. -#epub_fix_images = False +# epub_fix_images = False # Scale large images. -#epub_max_image_width = 0 +# epub_max_image_width = 0 # How to display URL addresses: 'footnote', 'no', or 'inline'. -#epub_show_urls = 'inline' +# epub_show_urls = 'inline' # If false, no index is generated. 
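# --- Editor's note: illustrative sketch, not part of the patch ----------------
# The doc/conf.py hunks above and below are purely cosmetic: the formatter
# normalises quoting and spacing (single to double quotes, a space after "#"
# in commented-out settings) without changing any configuration value.
# A compressed before/after recap of rewrites visible in these hunks:
#
#   before: #keep_warnings = False          after: # keep_warnings = False
#   before: project = 'TACA'                after: project = "TACA"
#   before: exclude_patterns = ['_build']   after: exclude_patterns = ["_build"]
project = "TACA"
exclude_patterns = ["_build"]
# ------------------------------------------------------------------------------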
-#epub_use_index = True +# epub_use_index = True diff --git a/setup.py b/setup.py index 74f35bed..e278a522 100644 --- a/setup.py +++ b/setup.py @@ -17,32 +17,33 @@ dependency_links = [] -setup(name='taca', +setup( + name="taca", version=__version__, description="Tool for the Automation of Cleanup and Analyses", - long_description='This package contains a set of functionalities that are ' - 'useful in the day-to-day tasks of bioinformatitians in ' - 'National Genomics Infrastructure in Stockholm, Sweden.', - keywords='bioinformatics', - author='NGI-stockholm', - author_email='ngi_pipeline_operators@scilifelab.se', - url='http://taca.readthedocs.org/en/latest/', - license='MIT', - packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), - scripts=glob.glob('scripts/*.py'), + long_description="This package contains a set of functionalities that are " + "useful in the day-to-day tasks of bioinformatitians in " + "National Genomics Infrastructure in Stockholm, Sweden.", + keywords="bioinformatics", + author="NGI-stockholm", + author_email="ngi_pipeline_operators@scilifelab.se", + url="http://taca.readthedocs.org/en/latest/", + license="MIT", + packages=find_packages(exclude=["ez_setup", "examples", "tests"]), + scripts=glob.glob("scripts/*.py"), include_package_data=True, zip_safe=False, entry_points={ - 'console_scripts': ['taca = taca.cli:cli'], - 'taca.subcommands': [ - 'cleanup = taca.cleanup.cli:cleanup', - 'analysis = taca.analysis.cli:analysis', - 'bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries', - 'server_status = taca.server_status.cli:server_status', - 'backup = taca.backup.cli:backup', - 'create_env = taca.testing.cli:uppmax_env' - ] + "console_scripts": ["taca = taca.cli:cli"], + "taca.subcommands": [ + "cleanup = taca.cleanup.cli:cleanup", + "analysis = taca.analysis.cli:analysis", + "bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries", + "server_status = taca.server_status.cli:server_status", + "backup = taca.backup.cli:backup", + "create_env = taca.testing.cli:uppmax_env", + ], }, install_requires=install_requires, - dependency_links=dependency_links + dependency_links=dependency_links, ) diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py index 4caf9a48..c615b3f9 100755 --- a/taca/analysis/analysis.py +++ b/taca/analysis/analysis.py @@ -29,54 +29,68 @@ def get_runObj(run, software): None if the sequencer type is unknown of there was an error """ - if os.path.exists(os.path.join(run, 'runParameters.xml')): - run_parameters_file = 'runParameters.xml' - elif os.path.exists(os.path.join(run, 'RunParameters.xml')): - run_parameters_file = 'RunParameters.xml' + if os.path.exists(os.path.join(run, "runParameters.xml")): + run_parameters_file = "runParameters.xml" + elif os.path.exists(os.path.join(run, "RunParameters.xml")): + run_parameters_file = "RunParameters.xml" else: - logger.error(f'Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}') + logger.error( + f"Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}" + ) return run_parameters_path = os.path.join(run, run_parameters_file) try: run_parameters = RunParametersParser(run_parameters_path) except OSError: - logger.warn(f'Problems parsing the runParameters.xml file at {run_parameters_path}. ' - f'This is quite unexpected. please archive the run {run} manually') + logger.warn( + f"Problems parsing the runParameters.xml file at {run_parameters_path}. " + f"This is quite unexpected. 
please archive the run {run} manually" + ) else: # Do a case by case test because there are so many version of RunParameters that there is no real other way - runtype = run_parameters.data['RunParameters'].get('InstrumentType', - run_parameters.data['RunParameters'].get('ApplicationName', - run_parameters.data['RunParameters'].get('Application', - ''))) - if 'Setup' in run_parameters.data['RunParameters']: + runtype = run_parameters.data["RunParameters"].get( + "InstrumentType", + run_parameters.data["RunParameters"].get( + "ApplicationName", + run_parameters.data["RunParameters"].get("Application", ""), + ), + ) + if "Setup" in run_parameters.data["RunParameters"]: # This is the HiSeq2500, MiSeq, and HiSeqX case try: # Works for recent control software - runtype = run_parameters.data['RunParameters']['Setup']['Flowcell'] + runtype = run_parameters.data["RunParameters"]["Setup"]["Flowcell"] except KeyError: # Use this as second resource but print a warning in the logs - logger.warn('Parsing runParameters to fecth instrument type, ' - 'not found Flowcell information in it. Using ApplicationName') + logger.warn( + "Parsing runParameters to fecth instrument type, " + "not found Flowcell information in it. Using ApplicationName" + ) # Here makes sense to use get with default value '' -> # so that it doesn't raise an exception in the next lines # (in case ApplicationName is not found, get returns None) - runtype = run_parameters.data['RunParameters']['Setup'].get('ApplicationName', '') - - if 'MiSeq' in runtype: - return MiSeq_Run(run, software, CONFIG['analysis']['MiSeq']) - elif 'NextSeq' in runtype: - return NextSeq_Run(run, software, CONFIG['analysis']['NextSeq']) - elif 'NovaSeqXPlus' in runtype: - return NovaSeqXPlus_Run(run, software, CONFIG['analysis']['NovaSeqXPlus']) - elif 'NovaSeq' in runtype: - return NovaSeq_Run(run, software, CONFIG['analysis']['NovaSeq']) + runtype = run_parameters.data["RunParameters"]["Setup"].get( + "ApplicationName", "" + ) + + if "MiSeq" in runtype: + return MiSeq_Run(run, software, CONFIG["analysis"]["MiSeq"]) + elif "NextSeq" in runtype: + return NextSeq_Run(run, software, CONFIG["analysis"]["NextSeq"]) + elif "NovaSeqXPlus" in runtype: + return NovaSeqXPlus_Run(run, software, CONFIG["analysis"]["NovaSeqXPlus"]) + elif "NovaSeq" in runtype: + return NovaSeq_Run(run, software, CONFIG["analysis"]["NovaSeq"]) else: - logger.warn('Unrecognized run type {}, cannot archive the run {}. ' - 'Someone as likely bought a new sequencer without telling ' - 'it to the bioinfo team'.format(runtype, run)) + logger.warn( + "Unrecognized run type {}, cannot archive the run {}. " + "Someone as likely bought a new sequencer without telling " + "it to the bioinfo team".format(runtype, run) + ) return None + def upload_to_statusdb(run_dir, software): """Function to upload run_dir informations to statusDB directly from click interface. @@ -90,60 +104,80 @@ def upload_to_statusdb(run_dir, software): # Make the actual upload _upload_to_statusdb(runObj) + def _upload_to_statusdb(run): """Triggers the upload to statusdb using the dependency flowcell_parser. 
:param Run run: the object run """ - couch_conf = CONFIG['statusdb'] + couch_conf = CONFIG["statusdb"] couch_connection = statusdb.StatusdbSession(couch_conf).connection - db = couch_connection[couch_conf['xten_db']] + db = couch_connection[couch_conf["xten_db"]] parser = run.runParserObj # Check if I have NoIndex lanes - for element in parser.obj['samplesheet_csv']: - if 'NoIndex' in element['index'] or not element['index']: # NoIndex in the case of HiSeq, empty in the case of HiSeqX - lane = element['Lane'] # This is a lane with NoIndex + for element in parser.obj["samplesheet_csv"]: + if ( + "NoIndex" in element["index"] or not element["index"] + ): # NoIndex in the case of HiSeq, empty in the case of HiSeqX + lane = element["Lane"] # This is a lane with NoIndex # In this case PF Cluster is the number of undetermined reads try: - PFclusters = parser.obj['Undetermined'][lane]['unknown'] + PFclusters = parser.obj["Undetermined"][lane]["unknown"] except KeyError: - logger.error(f'While taking extra care of lane {lane} of NoIndex type ' \ - 'I found out that not all values were available') + logger.error( + f"While taking extra care of lane {lane} of NoIndex type " + "I found out that not all values were available" + ) continue # In Lanes_stats fix the lane yield - parser.obj['illumina']['Demultiplex_Stats']['Lanes_stats'][int(lane) - 1]['PF Clusters'] = str(PFclusters) + parser.obj["illumina"]["Demultiplex_Stats"]["Lanes_stats"][int(lane) - 1][ + "PF Clusters" + ] = str(PFclusters) # Now fix Barcode lane stats - updated = 0 # Check that only one update is made - for sample in parser.obj['illumina']['Demultiplex_Stats']['Barcode_lane_statistics']: - if lane in sample['Lane']: + updated = 0 # Check that only one update is made + for sample in parser.obj["illumina"]["Demultiplex_Stats"][ + "Barcode_lane_statistics" + ]: + if lane in sample["Lane"]: updated += 1 - sample['PF Clusters'] = str(PFclusters) + sample["PF Clusters"] = str(PFclusters) if updated != 1: - logger.error(f'While taking extra care of lane {lane} of NoIndex type ' - 'I updated more than once the barcode_lane. ' - 'This is too much to continue so I will fail.') + logger.error( + f"While taking extra care of lane {lane} of NoIndex type " + "I updated more than once the barcode_lane. " + "This is too much to continue so I will fail." + ) os.sys.exit() # If I am here it means I changed the HTML representation to something # else to accomodate the wired things we do # someone told me that in such cases it is better to put a place holder for this - parser.obj['illumina']['Demultiplex_Stats']['NotOriginal'] = 'True' + parser.obj["illumina"]["Demultiplex_Stats"]["NotOriginal"] = "True" # Update info about bcl2fastq tool - if not parser.obj.get('DemultiplexConfig'): - parser.obj['DemultiplexConfig'] = {'Setup': {'Software': run.CONFIG.get('bcl2fastq', {})}} + if not parser.obj.get("DemultiplexConfig"): + parser.obj["DemultiplexConfig"] = { + "Setup": {"Software": run.CONFIG.get("bcl2fastq", {})} + } statusdb.update_doc(db, parser.obj, over_write_db_entry=True) + def transfer_run(run_dir): """Interface for click to force a transfer a run to uppmax. 
:param: string run_dir: the run to tranfer """ runObj = get_runObj(run_dir, software) - mail_recipients = CONFIG.get('mail', {}).get('recipients') + mail_recipients = CONFIG.get("mail", {}).get("recipients") if runObj is None: - mail_recipients = CONFIG.get('mail', {}).get('recipients') - logger.error(f'Trying to force a transfer of run {run_dir} but the sequencer was not recognized.') + mail_recipients = CONFIG.get("mail", {}).get("recipients") + logger.error( + f"Trying to force a transfer of run {run_dir} but the sequencer was not recognized." + ) else: - runObj.transfer_run(os.path.join('nosync', CONFIG['analysis']['status_dir'], 'transfer.tsv'), mail_recipients) + runObj.transfer_run( + os.path.join("nosync", CONFIG["analysis"]["status_dir"], "transfer.tsv"), + mail_recipients, + ) + def transfer_runfolder(run_dir, pid, exclude_lane): """Transfer the entire run folder for a specified project and run to uppmax. @@ -156,82 +190,105 @@ def transfer_runfolder(run_dir, pid, exclude_lane): # Validate whether run_dir exists or is valid run_dir = os.path.abspath(run_dir) if not os.path.exists(run_dir) or not os.path.isdir(run_dir): - logger.error('Unable to locate the specified run directory for transfer.') + logger.error("Unable to locate the specified run directory for transfer.") sys.exit() - original_sample_sheet = os.path.join(run_dir, 'SampleSheet.csv') - pid_list = list(set([x.strip() for x in pid.split(',')])) - new_sample_sheet = os.path.join(run_dir, '_'.join(pid_list) + '_SampleSheet.txt') + original_sample_sheet = os.path.join(run_dir, "SampleSheet.csv") + pid_list = list(set([x.strip() for x in pid.split(",")])) + new_sample_sheet = os.path.join(run_dir, "_".join(pid_list) + "_SampleSheet.txt") # Write new sample sheet including only rows for the specified project try: - with open(new_sample_sheet, 'w') as nss: + with open(new_sample_sheet, "w") as nss: nss.write(extract_project_samplesheet(original_sample_sheet, pid_list)) except OSError as e: - logger.error('An error occured while parsing the samplesheet. ' - 'Please check the sample sheet and try again.') + logger.error( + "An error occured while parsing the samplesheet. " + "Please check the sample sheet and try again." 
+ ) raise e # Create a tar archive of the runfolder dir_name = os.path.basename(run_dir) - archive = run_dir + '.tar.gz' + archive = run_dir + ".tar.gz" run_dir_path = os.path.dirname(run_dir) # Prepare the options for excluding lanes - if exclude_lane != '': + if exclude_lane != "": dir_for_excluding_lane = [] - lane_to_exclude = exclude_lane.split(',') + lane_to_exclude = exclude_lane.split(",") for lane in lane_to_exclude: - if os.path.isdir(f'{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}'): - dir_for_excluding_lane.extend(['--exclude', f'Thumbnail_Images/L00{lane}']) - if os.path.isdir(f'{run_dir_path}/{dir_name}/Images/Focus/L00{lane}'): - dir_for_excluding_lane.extend(['--exclude', f'Images/Focus/L00{lane}']) - if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}'): - dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/L00{lane}']) - if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}'): - dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/BaseCalls/L00{lane}']) + if os.path.isdir(f"{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}"): + dir_for_excluding_lane.extend( + ["--exclude", f"Thumbnail_Images/L00{lane}"] + ) + if os.path.isdir(f"{run_dir_path}/{dir_name}/Images/Focus/L00{lane}"): + dir_for_excluding_lane.extend(["--exclude", f"Images/Focus/L00{lane}"]) + if os.path.isdir(f"{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}"): + dir_for_excluding_lane.extend( + ["--exclude", f"Data/Intensities/L00{lane}"] + ) + if os.path.isdir( + f"{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}" + ): + dir_for_excluding_lane.extend( + ["--exclude", f"Data/Intensities/BaseCalls/L00{lane}"] + ) try: - exclude_options_for_tar = ['--exclude', 'Demultiplexing*', - '--exclude', 'demux_*', - '--exclude', 'rsync*', - '--exclude', '*.csv'] - if exclude_lane != '': + exclude_options_for_tar = [ + "--exclude", + "Demultiplexing*", + "--exclude", + "demux_*", + "--exclude", + "rsync*", + "--exclude", + "*.csv", + ] + if exclude_lane != "": exclude_options_for_tar += dir_for_excluding_lane - subprocess.call(['tar'] + exclude_options_for_tar + ['-cvzf', archive, '-C', run_dir_path, dir_name]) + subprocess.call( + ["tar"] + + exclude_options_for_tar + + ["-cvzf", archive, "-C", run_dir_path, dir_name] + ) except subprocess.CalledProcessError as e: - logger.error('Error creating tar archive') + logger.error("Error creating tar archive") raise e # Generate the md5sum under the same folder as run_dir - md5file = archive + '.md5' + md5file = archive + ".md5" try: - f = open(md5file, 'w') + f = open(md5file, "w") os.chdir(run_dir_path) - subprocess.call(['md5sum', os.path.basename(archive)], stdout=f) + subprocess.call(["md5sum", os.path.basename(archive)], stdout=f) f.close() except subprocess.CalledProcessError as e: - logger.error('Error creating md5 file') + logger.error("Error creating md5 file") raise e # Rsync the files to the analysis cluster - destination = CONFIG['analysis']['deliver_runfolder'].get('destination') - rsync_opts = {'-LtDrv': None, - '--chmod': 'g+rw'} - connection_details = CONFIG['analysis']['deliver_runfolder'].get('analysis_server') - archive_transfer = RsyncAgent(archive, - dest_path=destination, - remote_host=connection_details['host'], - remote_user=connection_details['user'], - validate=False, - opts=rsync_opts) - md5_transfer = RsyncAgent(md5file, - dest_path=destination, - remote_host=connection_details['host'], - remote_user=connection_details['user'], - validate=False, - 
opts=rsync_opts) + destination = CONFIG["analysis"]["deliver_runfolder"].get("destination") + rsync_opts = {"-LtDrv": None, "--chmod": "g+rw"} + connection_details = CONFIG["analysis"]["deliver_runfolder"].get("analysis_server") + archive_transfer = RsyncAgent( + archive, + dest_path=destination, + remote_host=connection_details["host"], + remote_user=connection_details["user"], + validate=False, + opts=rsync_opts, + ) + md5_transfer = RsyncAgent( + md5file, + dest_path=destination, + remote_host=connection_details["host"], + remote_user=connection_details["user"], + validate=False, + opts=rsync_opts, + ) archive_transfer.transfer() md5_transfer.transfer() @@ -242,82 +299,107 @@ def transfer_runfolder(run_dir, pid, exclude_lane): os.remove(archive) os.remove(md5file) except OSError as e: - logger.error('Was not able to delete all temporary files') + logger.error("Was not able to delete all temporary files") raise e return + def extract_project_samplesheet(sample_sheet, pid_list): - header_line = '' - project_entries = '' + header_line = "" + project_entries = "" with open(sample_sheet) as f: for line in f: - if line.split(',')[0] in ('Lane', 'FCID'): # include the header + if line.split(",")[0] in ("Lane", "FCID"): # include the header header_line += line elif any(pid in line for pid in pid_list): - project_entries += line # include only lines related to the specified project + project_entries += ( + line # include only lines related to the specified project + ) new_samplesheet_content = header_line + project_entries return new_samplesheet_content + def run_preprocessing(run, software): """Run demultiplexing in all data directories. :param str run: Process a particular run instead of looking for runs """ + def _process(run): """Process a run/flowcell and transfer to analysis server. :param taca.illumina.Run run: Run to be processed and transferred """ - logger.info(f'Checking run {run.id}') - transfer_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv') - if run.is_transferred(transfer_file): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot - logger.info(f'Run {run.id} already transferred to analysis server, skipping it') + logger.info(f"Checking run {run.id}") + transfer_file = os.path.join(CONFIG["analysis"]["status_dir"], "transfer.tsv") + if run.is_transferred( + transfer_file + ): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot + logger.info( + f"Run {run.id} already transferred to analysis server, skipping it" + ) return - if run.get_run_status() == 'SEQUENCING': - logger.info(f'Run {run.id} is not finished yet') - if 'statusdb' in CONFIG: + if run.get_run_status() == "SEQUENCING": + logger.info(f"Run {run.id} is not finished yet") + if "statusdb" in CONFIG: _upload_to_statusdb(run) - elif run.get_run_status() == 'TO_START': - if run.get_run_type() == 'NON-NGI-RUN': + elif run.get_run_status() == "TO_START": + if run.get_run_type() == "NON-NGI-RUN": # For now MiSeq specific case. 
Process only NGI-run, skip all the others (PhD student runs) - logger.warn(f'Run {run.id} marked as {run.get_run_type()}, ' - 'TACA will skip this and move the run to ' - 'no-sync directory') - if 'storage' in CONFIG: - run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type]) + logger.warn( + f"Run {run.id} marked as {run.get_run_type()}, " + "TACA will skip this and move the run to " + "no-sync directory" + ) + if "storage" in CONFIG: + run.archive_run( + CONFIG["storage"]["archive_dirs"][run.sequencer_type] + ) return - logger.info(f'Starting BCL to FASTQ conversion and demultiplexing for run {run.id}') - if 'statusdb' in CONFIG: + logger.info( + f"Starting BCL to FASTQ conversion and demultiplexing for run {run.id}" + ) + if "statusdb" in CONFIG: _upload_to_statusdb(run) run.demultiplex_run() - elif run.get_run_status() == 'IN_PROGRESS': - logger.info('BCL conversion and demultiplexing process in ' - f'progress for run {run.id}, skipping it') + elif run.get_run_status() == "IN_PROGRESS": + logger.info( + "BCL conversion and demultiplexing process in " + f"progress for run {run.id}, skipping it" + ) # Upload to statusDB if applies - if 'statusdb' in CONFIG: + if "statusdb" in CONFIG: _upload_to_statusdb(run) # This function checks if demux is done run.check_run_status() # Previous elif might change the status to COMPLETED, therefore to avoid skipping # a cycle take the last if out of the elif - if run.get_run_status() == 'COMPLETED': + if run.get_run_status() == "COMPLETED": run.check_run_status() - logger.info(f'Preprocessing of run {run.id} is finished, transferring it') + logger.info(f"Preprocessing of run {run.id} is finished, transferring it") # Upload to statusDB if applies - if 'statusdb' in CONFIG: + if "statusdb" in CONFIG: _upload_to_statusdb(run) demux_summary_message = [] for demux_id, demux_log in run.demux_summary.items(): - if demux_log['errors'] or demux_log['warnings']: - demux_summary_message.append("Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(demux_id, demux_log['errors'], demux_log['warnings'])) - demux_summary_message.append("\n".join(demux_log['error_and_warning_messages'][:5])) - if len(demux_log['error_and_warning_messages'])>5: - demux_summary_message.append(f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}.") + if demux_log["errors"] or demux_log["warnings"]: + demux_summary_message.append( + "Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format( + demux_id, demux_log["errors"], demux_log["warnings"] + ) + ) + demux_summary_message.append( + "\n".join(demux_log["error_and_warning_messages"][:5]) + ) + if len(demux_log["error_and_warning_messages"]) > 5: + demux_summary_message.append( + f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}." + ) # Notify with a mail run completion and stats uploaded if demux_summary_message: - sbt = (f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!") + sbt = f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!" msg = """The run {run} has been demultiplexed with errors or warnings! 
{errors_warnings} @@ -326,9 +408,11 @@ def _process(run): The run is available at : https://genomics-status.scilifelab.se/flowcells/{run} - """.format(errors_warnings='\n'.join(demux_summary_message), run=run.id) + """.format( + errors_warnings="\n".join(demux_summary_message), run=run.id + ) else: - sbt = (f"{run.id} Demultiplexing Completed!") + sbt = f"{run.id} Demultiplexing Completed!" msg = """The run {run} has been demultiplexed without any error or warning. The Run will be transferred to the analysis cluster for further analysis. @@ -336,67 +420,103 @@ def _process(run): The run is available at : https://genomics-status.scilifelab.se/flowcells/{run} """.format(run=run.id) - run.send_mail(sbt, msg, rcp=CONFIG['mail']['recipients']) + run.send_mail(sbt, msg, rcp=CONFIG["mail"]["recipients"]) # Copy demultiplex stats file, InterOp meta data and run xml files to shared file system for LIMS purpose - if 'mfs_path' in CONFIG['analysis']: + if "mfs_path" in CONFIG["analysis"]: try: - mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id) - logger.info(f'Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}') + mfs_dest = os.path.join( + CONFIG["analysis"]["mfs_path"][run.sequencer_type.lower()], + run.id, + ) + logger.info( + f"Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}" + ) if not os.path.exists(mfs_dest): os.mkdir(mfs_dest) - demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports', - 'html', run.flowcell_id, 'all', 'all', 'all', 'laneBarcode.html') - copyfile(demulti_stat_src, os.path.join(mfs_dest, 'laneBarcode.html')) + demulti_stat_src = os.path.join( + run.run_dir, + run.demux_dir, + "Reports", + "html", + run.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) + copyfile( + demulti_stat_src, os.path.join(mfs_dest, "laneBarcode.html") + ) # Copy RunInfo.xml - run_info_xml_src = os.path.join(run.run_dir, 'RunInfo.xml') + run_info_xml_src = os.path.join(run.run_dir, "RunInfo.xml") if os.path.isfile(run_info_xml_src): - copyfile(run_info_xml_src, os.path.join(mfs_dest, 'RunInfo.xml')) + copyfile( + run_info_xml_src, os.path.join(mfs_dest, "RunInfo.xml") + ) # Copy RunParameters.xml - run_parameters_xml_src = os.path.join(run.run_dir, 'RunParameters.xml') + run_parameters_xml_src = os.path.join( + run.run_dir, "RunParameters.xml" + ) if os.path.isfile(run_info_xml_src): - copyfile(run_parameters_xml_src, os.path.join(mfs_dest, 'RunParameters.xml')) + copyfile( + run_parameters_xml_src, + os.path.join(mfs_dest, "RunParameters.xml"), + ) # Copy InterOp - interop_src = os.path.join(run.run_dir, 'InterOp') + interop_src = os.path.join(run.run_dir, "InterOp") if os.path.exists(interop_src): - copytree(interop_src, os.path.join(mfs_dest, 'InterOp'), dirs_exist_ok=True) + copytree( + interop_src, + os.path.join(mfs_dest, "InterOp"), + dirs_exist_ok=True, + ) except: - logger.warn(f'Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}') + logger.warn( + f"Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}" + ) # Transfer to analysis server if flag is True if run.transfer_to_analysis_server: - mail_recipients = CONFIG.get('mail', {}).get('recipients') - logger.info('Transferring run {} to {} into {}' - .format(run.id, - run.CONFIG['analysis_server']['host'], - run.CONFIG['analysis_server']['sync']['data_archive'])) + mail_recipients = CONFIG.get("mail", {}).get("recipients") + logger.info( + 
"Transferring run {} to {} into {}".format( + run.id, + run.CONFIG["analysis_server"]["host"], + run.CONFIG["analysis_server"]["sync"]["data_archive"], + ) + ) run.transfer_run(transfer_file, mail_recipients) # Archive the run if indicated in the config file - if 'storage' in CONFIG: #TODO: make sure archiving to PDC is not ongoing - run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type]) + if "storage" in CONFIG: # TODO: make sure archiving to PDC is not ongoing + run.archive_run(CONFIG["storage"]["archive_dirs"][run.sequencer_type]) if run: # Determine the run type runObj = get_runObj(run, software) if not runObj: - raise RuntimeError(f"Unrecognized instrument type or incorrect run folder {run}") + raise RuntimeError( + f"Unrecognized instrument type or incorrect run folder {run}" + ) else: _process(runObj) else: - data_dirs = CONFIG.get('analysis').get('data_dirs') + data_dirs = CONFIG.get("analysis").get("data_dirs") for data_dir in data_dirs: # Run folder looks like DATE_*_*_*, the last section is the FC name. - runs = glob.glob(os.path.join(data_dir, '[1-9]*_*_*_*')) + runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*")) for _run in runs: runObj = get_runObj(_run, software) if not runObj: - logger.warning(f'Unrecognized instrument type or incorrect run folder {run}') + logger.warning( + f"Unrecognized instrument type or incorrect run folder {run}" + ) else: try: _process(runObj) except: # This function might throw and exception, # it is better to continue processing other runs - logger.warning(f'There was an error processing the run {run}') + logger.warning(f"There was an error processing the run {run}") pass diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py index 9141551a..c9dea404 100644 --- a/taca/analysis/analysis_nanopore.py +++ b/taca/analysis/analysis_nanopore.py @@ -36,7 +36,6 @@ def find_run_dirs(dir_to_search: str, skip_dirs: list): def send_error_mail(run_name, error: BaseException): - email_subject = f"Run processed with errors: {run_name}" email_message = f"{str(error)}\n\n{traceback.format_exc()}" email_recipients = CONFIG["mail"]["recipients"] @@ -75,7 +74,6 @@ def process_user_run(ont_user_run: ONT_user_run): if not ont_user_run.is_synced(): logger.info(f"{ont_user_run.run_name}: Run is not fully synced, skipping.") else: - if ont_user_run.is_transferred(): logger.warning( f"{ont_user_run.run_name}: Run is already logged as transferred, sending mail." 
@@ -157,7 +155,6 @@ def process_qc_run(ont_qc_run: ONT_qc_run): if not ont_qc_run.is_synced(): logger.info(f"{ont_qc_run.run_name}: Run is not fully synced, skipping.") else: - # Assert all files are in place logger.info(f"{ont_qc_run.run_name}: Asserting run contents...") ont_qc_run.assert_contents() @@ -257,7 +254,6 @@ def ont_transfer(run_abspath: str or None, qc: bool = False): # If no run is specified, locate all runs else: - for run_type in ["user_run", "qc_run"]: logger.info(f"Looking for runs of type '{run_type}'...") diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index ba101d66..52b6423b 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -13,21 +13,42 @@ def analysis(): # Illumina analysis subcommands + @analysis.command() -@click.option('-r', '--run', type=click.Path(exists=True), default=None, - help='Demultiplex only a particular run') -@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq', - help='Available software for demultiplexing: bcl2fastq (default), bclconvert') +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + default=None, + help="Demultiplex only a particular run", +) +@click.option( + "-s", + "--software", + type=click.Choice(["bcl2fastq", "bclconvert"]), + default="bcl2fastq", + help="Available software for demultiplexing: bcl2fastq (default), bclconvert", +) def demultiplex(run, software): - """Demultiplex and transfer all runs present in the data directories.""" - an.run_preprocessing(run, software) + """Demultiplex and transfer all runs present in the data directories.""" + an.run_preprocessing(run, software) + @analysis.command() -@click.option('--runfolder-project', is_flag=False, help='Project IDs for runfolder transfer separated by comma') -@click.option('--exclude-lane', default='', help='Lanes to exclude separated by comma') -@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq', - help='Available software for demultiplexing: bcl2fastq (default), bclconvert') -@click.argument('rundir') +@click.option( + "--runfolder-project", + is_flag=False, + help="Project IDs for runfolder transfer separated by comma", +) +@click.option("--exclude-lane", default="", help="Lanes to exclude separated by comma") +@click.option( + "-s", + "--software", + type=click.Choice(["bcl2fastq", "bclconvert"]), + default="bcl2fastq", + help="Available software for demultiplexing: bcl2fastq (default), bclconvert", +) +@click.argument("rundir") def transfer(rundir, runfolder_project, exclude_lane, software): """Transfers the run without qc.""" if not runfolder_project: @@ -35,10 +56,16 @@ def transfer(rundir, runfolder_project, exclude_lane, software): else: an.transfer_runfolder(rundir, pid=runfolder_project, exclude_lane=exclude_lane) + @analysis.command() -@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq', - help='Available software for demultiplexing: bcl2fastq (default), bclconvert') -@click.argument('rundir') +@click.option( + "-s", + "--software", + type=click.Choice(["bcl2fastq", "bclconvert"]), + default="bcl2fastq", + help="Available software for demultiplexing: bcl2fastq (default), bclconvert", +) +@click.argument("rundir") def updatedb(rundir, software): """Save the run to statusdb.""" an.upload_to_statusdb(rundir, software) @@ -46,6 +73,7 @@ def updatedb(rundir, software): # Nanopore analysis subcommands + @analysis.command() @click.option( "-r", @@ -65,6 +93,7 @@ def ont_transfer(run, 
qc): """Find and process all runs""" analysis_nanopore.ont_transfer(run, qc) + @analysis.command() @click.argument("run") def ont_updatedb(run): diff --git a/taca/backup/backup.py b/taca/backup/backup.py index 97fdec7b..8d43a558 100644 --- a/taca/backup/backup.py +++ b/taca/backup/backup.py @@ -13,16 +13,19 @@ logger = logging.getLogger(__name__) + class run_vars: """A simple variable storage class.""" + def __init__(self, run, archive_path): self.abs_path = os.path.abspath(run) self.path, self.name = os.path.split(self.abs_path) - self.name = self.name.split('.', 1)[0] - self.zip = os.path.join(archive_path, f'{self.name}.tar.gz') - self.key = f'{self.name}.key' - self.key_encrypted = f'{self.name}.key.gpg' - self.zip_encrypted = os.path.join(archive_path, f'{self.name}.tar.gz.gpg') + self.name = self.name.split(".", 1)[0] + self.zip = os.path.join(archive_path, f"{self.name}.tar.gz") + self.key = f"{self.name}.key" + self.key_encrypted = f"{self.name}.key.gpg" + self.zip_encrypted = os.path.join(archive_path, f"{self.name}.tar.gz.gpg") + class backup_utils: """A class object with main utility methods related to backing up.""" @@ -30,25 +33,31 @@ class backup_utils: def __init__(self, run=None): self.run = run self.fetch_config_info() - self.host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0] + self.host_name = os.getenv("HOSTNAME", os.uname()[1]).split(".", 1)[0] def fetch_config_info(self): """Try to fecth required info from the config file. Log and exit if any neccesary info is missing.""" try: - self.data_dirs = CONFIG['backup']['data_dirs'] - self.archive_dirs = CONFIG['backup']['archive_dirs'] - self.archived_dirs = CONFIG['backup']['archived_dirs'] - self.exclude_list = CONFIG['backup']['exclude_list'] - self.keys_path = CONFIG['backup']['keys_path'] - self.gpg_receiver = CONFIG['backup']['gpg_receiver'] - self.mail_recipients = CONFIG['mail']['recipients'] - self.check_demux = CONFIG.get('backup', {}).get('check_demux', False) - self.couch_info = CONFIG.get('statusdb') - self.finished_run_indicator = CONFIG.get('storage', {}).get('finished_run_indicator', 'RTAComplete.txt') - self.copy_complete_indicator = CONFIG.get('storage', {}).get('copy_complete_indicator', 'CopyComplete.txt') - self.archive_log_location = CONFIG['backup']['archive_log'] + self.data_dirs = CONFIG["backup"]["data_dirs"] + self.archive_dirs = CONFIG["backup"]["archive_dirs"] + self.archived_dirs = CONFIG["backup"]["archived_dirs"] + self.exclude_list = CONFIG["backup"]["exclude_list"] + self.keys_path = CONFIG["backup"]["keys_path"] + self.gpg_receiver = CONFIG["backup"]["gpg_receiver"] + self.mail_recipients = CONFIG["mail"]["recipients"] + self.check_demux = CONFIG.get("backup", {}).get("check_demux", False) + self.couch_info = CONFIG.get("statusdb") + self.finished_run_indicator = CONFIG.get("storage", {}).get( + "finished_run_indicator", "RTAComplete.txt" + ) + self.copy_complete_indicator = CONFIG.get("storage", {}).get( + "copy_complete_indicator", "CopyComplete.txt" + ) + self.archive_log_location = CONFIG["backup"]["archive_log"] except KeyError as e: - logger.error(f'Config file is missing the key {str(e)}, make sure it have all required information') + logger.error( + f"Config file is missing the key {str(e)}, make sure it have all required information" + ) raise SystemExit def collect_runs(self, ext=None, filter_by_ext=False): @@ -58,24 +67,30 @@ def collect_runs(self, ext=None, filter_by_ext=False): run_type = self._get_run_type(self.run) archive_path = self.archive_dirs[run_type] run 
= run_vars(self.run, archive_path) - if not (re.match(filesystem.RUN_RE, run.name) or re.match(filesystem.RUN_RE_ONT, run.name)): - logger.error(f'Given run {self.run} did not match a FC pattern') + if not ( + re.match(filesystem.RUN_RE, run.name) + or re.match(filesystem.RUN_RE_ONT, run.name) + ): + logger.error(f"Given run {self.run} did not match a FC pattern") raise SystemExit if self._is_ready_to_archive(run, ext): self.runs.append(run) else: for adir in self.archive_dirs.values(): if not os.path.isdir(adir): - logger.warn(f'Path {adir} does not exist or it is not a directory') + logger.warn(f"Path {adir} does not exist or it is not a directory") continue for item in os.listdir(adir): if filter_by_ext and not item.endswith(ext): continue elif item.endswith(ext): - item = item.replace(ext, '') + item = item.replace(ext, "") elif not os.path.isdir(os.path.join(adir, item)): continue - if (re.match(filesystem.RUN_RE, item) or re.match(filesystem.RUN_RE_ONT, item)) and item not in self.runs: + if ( + re.match(filesystem.RUN_RE, item) + or re.match(filesystem.RUN_RE_ONT, item) + ) and item not in self.runs: run_type = self._get_run_type(item) archive_path = self.archive_dirs[run_type] run = run_vars(os.path.join(adir, item), archive_path) @@ -85,7 +100,14 @@ def collect_runs(self, ext=None, filter_by_ext=False): def avail_disk_space(self, path, run): """Check the space on file system based on parent directory of the run.""" # not able to fetch runtype use the max size as precaution, size units in GB - illumina_run_sizes = {'novaseq': 1800, 'miseq': 20, 'nextseq': 250, 'NovaSeqXPlus': 3600, 'promethion': 3000, 'minion': 1000} + illumina_run_sizes = { + "novaseq": 1800, + "miseq": 20, + "nextseq": 250, + "NovaSeqXPlus": 3600, + "promethion": 3000, + "minion": 1000, + } required_size = illumina_run_sizes.get(self._get_run_type(run), 900) * 2 # check for any ongoing runs and add up the required size accrdingly for ddir in self.data_dirs.values(): @@ -94,19 +116,25 @@ def avail_disk_space(self, path, run): for item in os.listdir(ddir): if not re.match(filesystem.RUN_RE, item): continue - if not os.path.exists(os.path.join(ddir, item, 'RTAComplete.txt')): - required_size += illumina_run_sizes.get(self._get_run_type(run), 900) + if not os.path.exists(os.path.join(ddir, item, "RTAComplete.txt")): + required_size += illumina_run_sizes.get( + self._get_run_type(run), 900 + ) # get available free space from the file system try: - df_proc = sp.Popen(['df', path], stdout=sp.PIPE, stderr=sp.PIPE) + df_proc = sp.Popen(["df", path], stdout=sp.PIPE, stderr=sp.PIPE) df_out, df_err = df_proc.communicate() - available_size = int(df_out.strip().decode("utf-8").split('\n')[-1].strip().split()[3])/1024/1024 + available_size = ( + int(df_out.strip().decode("utf-8").split("\n")[-1].strip().split()[3]) + / 1024 + / 1024 + ) except Exception as e: - logger.error(f'Evaluation of disk space failed with error {e}') + logger.error(f"Evaluation of disk space failed with error {e}") raise SystemExit if available_size < required_size: - e_msg = f'Required space for encryption is {required_size}GB, but only {available_size}GB available' - subjt = f'Low space for encryption - {self.host_name}' + e_msg = f"Required space for encryption is {required_size}GB, but only {available_size}GB available" + subjt = f"Low space for encryption - {self.host_name}" logger.error(e_msg) misc.send_mail(subjt, e_msg, self.mail_recipients) raise SystemExit @@ -117,47 +145,63 @@ def file_in_pdc(self, src_file, silent=True): # non-zero/False 
though cmd is execudted but file not found src_file_abs = os.path.abspath(src_file) try: - sp.check_call(['dsmc', 'query', 'archive', src_file_abs], stdout=sp.PIPE, stderr=sp.PIPE) + sp.check_call( + ["dsmc", "query", "archive", src_file_abs], + stdout=sp.PIPE, + stderr=sp.PIPE, + ) value = True except sp.CalledProcessError: value = False if not silent: - msg = 'File {} {} in PDC'.format(src_file_abs, 'exist' if value else 'do not exist') + msg = "File {} {} in PDC".format( + src_file_abs, "exist" if value else "do not exist" + ) logger.info(msg) return value def _get_run_type(self, run): """Returns run type based on the flowcell name.""" - run_type = '' + run_type = "" try: - if '_A0' in run: - run_type = 'novaseq' - elif '-' in run.split('_')[-1]: - run_type = 'miseq' - elif '_NS' in run or '_VH' in run: - run_type = 'nextseq' - elif '_LH' in run: - run_type = 'NovaSeqXPlus' - elif '_MN' in run: - run_type = 'minion' - elif re.match("^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$",run): - run_type = 'promethion' + if "_A0" in run: + run_type = "novaseq" + elif "-" in run.split("_")[-1]: + run_type = "miseq" + elif "_NS" in run or "_VH" in run: + run_type = "nextseq" + elif "_LH" in run: + run_type = "NovaSeqXPlus" + elif "_MN" in run: + run_type = "minion" + elif re.match( + "^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$", run + ): + run_type = "promethion" else: - run_type = '' + run_type = "" except: - logger.warn(f'Could not fetch run type for run {run}') + logger.warn(f"Could not fetch run type for run {run}") return run_type - def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_failed=False, tmp_files=[]): + def _call_commands( + self, + cmd1, + cmd2=None, + out_file=None, + return_out=False, + mail_failed=False, + tmp_files=[], + ): """Call an external command(s) with atmost two commands per function call. Given 'out_file' is always used for the later cmd and also stdout can be return for the later cmd. 
In case of failure, the 'tmp_files' are removed""" if out_file: if not cmd2: - stdout1 = open(out_file, 'w') + stdout1 = open(out_file, "w") else: stdout1 = sp.PIPE - stdout2 = open(out_file, 'w') + stdout2 = open(out_file, "w") else: stdout1 = sp.PIPE stdout2 = sp.PIPE @@ -170,7 +214,9 @@ def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_ p2 = sp.Popen(cmd2, stdin=p1.stdout, stdout=stdout2, stderr=sp.PIPE) p2_stat = p2.wait() p2_out, p2_err = p2.communicate() - if not self._check_status(cmd2, p2_stat, p2_err, mail_failed, tmp_files): + if not self._check_status( + cmd2, p2_stat, p2_err, mail_failed, tmp_files + ): return (False, p2_err) if return_out else False p1_stat = p1.wait() p1_out, p1_err = p1.communicate() @@ -193,10 +239,12 @@ def _check_status(self, cmd, status, err_msg, mail_failed, files_to_remove=[]): if status != 0: self._clean_tmp_files(files_to_remove) if mail_failed: - subjt = f'Command call failed - {self.host_name}' - e_msg = 'Called cmd: {}\n\nError msg: {}'.format(' '.join(cmd), err_msg) + subjt = f"Command call failed - {self.host_name}" + e_msg = "Called cmd: {}\n\nError msg: {}".format(" ".join(cmd), err_msg) misc.send_mail(subjt, e_msg, self.mail_recipients) - logger.error('Command "{}" failed with the error "{}"'.format(' '.join(cmd),err_msg)) + logger.error( + 'Command "{}" failed with the error "{}"'.format(" ".join(cmd), err_msg) + ) return False return True @@ -209,20 +257,22 @@ def _clean_tmp_files(self, files): def _log_pdc_statusdb(self, run): """Log the time stamp in statusDB if a file is succussfully sent to PDC.""" try: - run_vals = run.split('_') + run_vals = run.split("_") if len(run_vals[0]) == 8: run_date = run_vals[0][2:] else: run_date = run_vals[0] - run_fc = f'{run_date}_{run_vals[-1]}' + run_fc = f"{run_date}_{run_vals[-1]}" couch_connection = statusdb.StatusdbSession(self.couch_info).connection - db = couch_connection[self.couch_info['db']] - fc_names = {e.key:e.id for e in db.view('names/name', reduce=False)} + db = couch_connection[self.couch_info["db"]] + fc_names = {e.key: e.id for e in db.view("names/name", reduce=False)} d_id = fc_names[run_fc] doc = db.get(d_id) - doc['pdc_archived'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + doc["pdc_archived"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") db.save(doc) - logger.info(f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"') + logger.info( + f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"' + ) except: logger.warn(f'Not able to log "pdc_archived" timestamp for run {run}') @@ -232,24 +282,35 @@ def _is_ready_to_archive(self, run, ext): run_path = run.abs_path rta_file = os.path.join(run_path, self.finished_run_indicator) cp_file = os.path.join(run_path, self.copy_complete_indicator) - if (os.path.exists(rta_file) and os.path.exists(cp_file) and (not self.file_in_pdc(run.zip_encrypted))) or (self._get_run_type(run.name) in ['promethion', 'minion'] and os.path.exists(os.path.join(run_path, ".sync_finished"))): + if ( + os.path.exists(rta_file) + and os.path.exists(cp_file) + and (not self.file_in_pdc(run.zip_encrypted)) + ) or ( + self._get_run_type(run.name) in ["promethion", "minion"] + and os.path.exists(os.path.join(run_path, ".sync_finished")) + ): # Case for encrypting # Run has NOT been encrypted (run.tar.gz.gpg not exists) - if ext == '.tar.gz' and (not os.path.exists(run.zip_encrypted)): - logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for 
archiving') + if ext == ".tar.gz" and (not os.path.exists(run.zip_encrypted)): + logger.info( + f"Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for archiving" + ) archive_ready = True # Case for putting data to PDC # Run has already been encrypted (run.tar.gz.gpg exists) - elif ext == '.tar.gz.gpg' and os.path.exists(run.zip_encrypted): - logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for sending to PDC') + elif ext == ".tar.gz.gpg" and os.path.exists(run.zip_encrypted): + logger.info( + f"Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for sending to PDC" + ) archive_ready = True return archive_ready def log_archived_run(self, file_name): """Write files archived to PDC to log file""" - with open(self.archive_log_location, 'a') as archive_file: - tsv_writer = csv.writer(archive_file, delimiter='\t') + with open(self.archive_log_location, "a") as archive_file: + tsv_writer = csv.writer(archive_file, delimiter="\t") tsv_writer.writerow([file_name, str(datetime.now())]) def _move_run_to_archived(self, run): @@ -257,7 +318,7 @@ def _move_run_to_archived(self, run): run_type = self._get_run_type(run.name) archived_path = self.archived_dirs[run_type] if os.path.isdir(archived_path): - logger.info(f'Moving run {run.name} to the archived folder') + logger.info(f"Moving run {run.name} to the archived folder") shutil.move(run.name, archived_path) else: logger.warning("Cannot move run to archived, destination does not exist") @@ -266,130 +327,207 @@ def _move_run_to_archived(self, run): def encrypt_runs(cls, run, force): """Encrypt the runs that have been collected.""" bk = cls(run) - bk.collect_runs(ext='.tar.gz') - logger.info(f'In total, found {len(bk.runs)} run(s) to be encrypted') + bk.collect_runs(ext=".tar.gz") + logger.info(f"In total, found {len(bk.runs)} run(s) to be encrypted") for run in bk.runs: - run.flag = f'{run.name}.encrypting' + run.flag = f"{run.name}.encrypting" run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) tmp_files = [run.zip_encrypted, run.key_encrypted, run.key, run.flag] - logger.info(f'Encryption of run {run.name} is now started') + logger.info(f"Encryption of run {run.name} is now started") # Check if there is enough space and exit if not bk.avail_disk_space(run.path, run.name) # Check if the run in demultiplexed if not force and bk.check_demux: - if not misc.run_is_demuxed(run, bk.couch_info, bk._get_run_type(run.name)): - logger.warn(f'Run {run.name} is not demultiplexed yet, so skipping it') + if not misc.run_is_demuxed( + run, bk.couch_info, bk._get_run_type(run.name) + ): + logger.warn( + f"Run {run.name} is not demultiplexed yet, so skipping it" + ) continue - logger.info(f'Run {run.name} is demultiplexed and proceeding with encryption') + logger.info( + f"Run {run.name} is demultiplexed and proceeding with encryption" + ) with filesystem.chdir(run.path): # skip run if already ongoing if os.path.exists(run.flag): - logger.warn(f'Run {run.name} is already being encrypted, so skipping now') + logger.warn( + f"Run {run.name} is already being encrypted, so skipping now" + ) continue - open(run.flag, 'w').close() + open(run.flag, "w").close() # zip the run directory if os.path.exists(run.zip): if os.path.isdir(run.name): - logger.warn(f'Both run source and zipped archive exist for run {run.name}, skipping run as precaution') + logger.warn( + f"Both run source and zipped archive exist for 
run {run.name}, skipping run as precaution" + ) bk._clean_tmp_files([run.flag]) continue - logger.info(f'Zipped archive already exist for run {run.name}, so using it for encryption') + logger.info( + f"Zipped archive already exist for run {run.name}, so using it for encryption" + ) else: - exclude_files = " ".join([f'--exclude {x}' for x in bk.exclude_list]) - logger.info(f'Creating zipped archive for run {run.name}') - if bk._call_commands(cmd1=f'tar {exclude_files} -cf - {run.name}', cmd2='pigz --fast -c -', - out_file=run.zip, mail_failed=True, tmp_files=[run.zip, run.flag]): - logger.info(f'Run {run.name} was successfully compressed and transferred to {run.zip}') + exclude_files = " ".join( + [f"--exclude {x}" for x in bk.exclude_list] + ) + logger.info(f"Creating zipped archive for run {run.name}") + if bk._call_commands( + cmd1=f"tar {exclude_files} -cf - {run.name}", + cmd2="pigz --fast -c -", + out_file=run.zip, + mail_failed=True, + tmp_files=[run.zip, run.flag], + ): + logger.info( + f"Run {run.name} was successfully compressed and transferred to {run.zip}" + ) else: - logger.warn(f'Skipping run {run.name} and moving on') + logger.warn(f"Skipping run {run.name} and moving on") continue # Remove encrypted file if already exists if os.path.exists(run.zip_encrypted): - logger.warn(f'Removing already existing encrypted file for run {run.name}, this is a precaution ' - 'to make sure the file was encrypted with correct key file') - bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted]) + logger.warn( + f"Removing already existing encrypted file for run {run.name}, this is a precaution " + "to make sure the file was encrypted with correct key file" + ) + bk._clean_tmp_files( + [ + run.zip_encrypted, + run.key, + run.key_encrypted, + run.dst_key_encrypted, + ] + ) # Generate random key to use as pasphrase - if not bk._call_commands(cmd1='gpg --gen-random 1 256', out_file=run.key, tmp_files=tmp_files): - logger.warn(f'Skipping run {run.name} and moving on') + if not bk._call_commands( + cmd1="gpg --gen-random 1 256", out_file=run.key, tmp_files=tmp_files + ): + logger.warn(f"Skipping run {run.name} and moving on") continue - logger.info(f'Generated random phrase key for run {run.name}') + logger.info(f"Generated random phrase key for run {run.name}") # Calculate md5 sum pre encryption if not force: - logger.info('Calculating md5sum before encryption') - md5_call, md5_out = bk._call_commands(cmd1=f'md5sum {run.zip}', return_out=True, tmp_files=tmp_files) + logger.info("Calculating md5sum before encryption") + md5_call, md5_out = bk._call_commands( + cmd1=f"md5sum {run.zip}", return_out=True, tmp_files=tmp_files + ) if not md5_call: - logger.warn(f'Skipping run {run.name} and moving on') + logger.warn(f"Skipping run {run.name} and moving on") continue md5_pre_encrypt = md5_out.split()[0] # Encrypt the zipped run file - logger.info('Encrypting the zipped run file') - if not bk._call_commands(cmd1=(f'gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo ' - f'none -o {run.zip_encrypted} {run.zip}'), tmp_files=tmp_files): - logger.warn(f'Skipping run {run.name} and moving on') + logger.info("Encrypting the zipped run file") + if not bk._call_commands( + cmd1=( + f"gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo " + f"none -o {run.zip_encrypted} {run.zip}" + ), + tmp_files=tmp_files, + ): + logger.warn(f"Skipping run {run.name} and moving on") continue # Decrypt and check for md5 if 
not force: - logger.info('Calculating md5sum after encryption') - md5_call, md5_out = bk._call_commands(cmd1=f'gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}', - cmd2='md5sum', return_out=True, tmp_files=tmp_files) + logger.info("Calculating md5sum after encryption") + md5_call, md5_out = bk._call_commands( + cmd1=f"gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}", + cmd2="md5sum", + return_out=True, + tmp_files=tmp_files, + ) if not md5_call: - logger.warn(f'Skipping run {run.name} and moving on') + logger.warn(f"Skipping run {run.name} and moving on") continue md5_post_encrypt = md5_out.split()[0] if md5_pre_encrypt != md5_post_encrypt: - logger.error(f'md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on') + logger.error( + f"md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on" + ) bk._clean_tmp_files(tmp_files) continue - logger.info('Md5sum matches before and after encryption') + logger.info("Md5sum matches before and after encryption") # Encrypt and move the key file - if bk._call_commands(cmd1=f'gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}', tmp_files=tmp_files): + if bk._call_commands( + cmd1=f"gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}", + tmp_files=tmp_files, + ): shutil.move(run.key_encrypted, run.dst_key_encrypted) else: - logger.error('Encryption of key file failed, skipping run') + logger.error("Encryption of key file failed, skipping run") continue bk._clean_tmp_files([run.zip, run.key, run.flag]) - logger.info(f'Encryption of run {run.name} is successfully done, removing zipped run file') + logger.info( + f"Encryption of run {run.name} is successfully done, removing zipped run file" + ) @classmethod def pdc_put(cls, run): """Archive the collected runs to PDC.""" bk = cls(run) - bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True) - logger.info(f'In total, found {len(bk.runs)} run(s) to send PDC') + bk.collect_runs(ext=".tar.gz.gpg", filter_by_ext=True) + logger.info(f"In total, found {len(bk.runs)} run(s) to send PDC") for run in bk.runs: - run.flag = f'{run.name}.archiving' + run.flag = f"{run.name}.archiving" run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) if run.path not in bk.archive_dirs.values(): - logger.error('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate ' - 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name)) + logger.error( + "Given run is not in one of the archive directories {}. 
Kindly move the run {} to appropriate " + "archive dir before sending it to PDC".format( + ",".join(list(bk.archive_dirs.values())), run.name + ) + ) continue if not os.path.exists(run.dst_key_encrypted): - logger.error(f'Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it') + logger.error( + f"Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it" + ) continue with filesystem.chdir(run.path): - #skip run if being encrypted - if os.path.exists(f'{run.name}.encrypting'): - logger.warn(f'Run {run.name} is currently being encrypted, so skipping now') + # skip run if being encrypted + if os.path.exists(f"{run.name}.encrypting"): + logger.warn( + f"Run {run.name} is currently being encrypted, so skipping now" + ) continue # skip run if already ongoing if os.path.exists(run.flag): - logger.warn(f'Run {run.name} is already being archived, so skipping now') + logger.warn( + f"Run {run.name} is already being archived, so skipping now" + ) continue - if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False): - logger.warn(f'Seems like files related to run {run.name} already exist in PDC, check and cleanup') + if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc( + run.dst_key_encrypted, silent=False + ): + logger.warn( + f"Seems like files related to run {run.name} already exist in PDC, check and cleanup" + ) continue - open(run.flag, 'w').close() - logger.info(f'Sending file {run.zip_encrypted} to PDC') - if bk._call_commands(cmd1=f'dsmc archive {run.zip_encrypted}', tmp_files=[run.flag]): - time.sleep(15) # give some time just in case 'dsmc' needs to settle - if bk._call_commands(cmd1=f'dsmc archive {run.dst_key_encrypted}', tmp_files=[run.flag]): - time.sleep(5) # give some time just in case 'dsmc' needs to settle - if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted): - logger.info(f'Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to archived folder') + open(run.flag, "w").close() + logger.info(f"Sending file {run.zip_encrypted} to PDC") + if bk._call_commands( + cmd1=f"dsmc archive {run.zip_encrypted}", tmp_files=[run.flag] + ): + time.sleep(15) # give some time just in case 'dsmc' needs to settle + if bk._call_commands( + cmd1=f"dsmc archive {run.dst_key_encrypted}", + tmp_files=[run.flag], + ): + time.sleep( + 5 + ) # give some time just in case 'dsmc' needs to settle + if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc( + run.dst_key_encrypted + ): + logger.info( + f"Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to archived folder" + ) bk.log_archived_run(run.zip_encrypted) if bk.couch_info: bk._log_pdc_statusdb(run.name) - bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag]) + bk._clean_tmp_files( + [run.zip_encrypted, run.dst_key_encrypted, run.flag] + ) bk._move_run_to_archived(run) continue - logger.warn(f'Sending file {run.zip_encrypted} to PDC failed') + logger.warn(f"Sending file {run.zip_encrypted} to PDC failed") diff --git a/taca/backup/cli.py b/taca/backup/cli.py index 89128002..60d8c442 100644 --- a/taca/backup/cli.py +++ b/taca/backup/cli.py @@ -7,35 +7,69 @@ @click.group() @click.pass_context def backup(ctx): - """ Backup management methods and utilities """ + """Backup management methods and utilities""" pass + @backup.command() -@click.option('-r', '--run', type=click.Path(exists=True), help="A 
run (directory or a zipped archive) to be encrypted") -@click.option('-f', '--force', is_flag=True, help="Ignore the checks and just try encryption. USE IT WITH CAUTION.") +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + help="A run (directory or a zipped archive) to be encrypted", +) +@click.option( + "-f", + "--force", + is_flag=True, + help="Ignore the checks and just try encryption. USE IT WITH CAUTION.", +) @click.pass_context def encrypt(ctx, run, force): bkut.encrypt_runs(run, force) -@backup.command(name='put_data') -@click.option('-r', '--run', type=click.Path(exists=True), help="A run name (without extension) to be sent to PDC") + +@backup.command(name="put_data") +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + help="A run name (without extension) to be sent to PDC", +) @click.pass_context def put_data(ctx, run): bkut.pdc_put(run) -@backup.command(name='get_data') -@click.option('-r', '--run', required=True, help="A run name (without extension) to download from PDC") -@click.option('-o', '--outdir', type=click.Path(exists=True, file_okay=False, writable=True), - help="Optional directory name to save the downloaded file. Directory should exist") + +@backup.command(name="get_data") +@click.option( + "-r", + "--run", + required=True, + help="A run name (without extension) to download from PDC", +) +@click.option( + "-o", + "--outdir", + type=click.Path(exists=True, file_okay=False, writable=True), + help="Optional directory name to save the downloaded file. Directory should exist", +) @click.pass_context def get_data(ctx, run, outdir): ## W I P ## raise NotImplementedError + @backup.command() -@click.option('-r', '--run', required=True, type=click.Path(exists=True, dir_okay=False), help="A encripted run file") -@click.option('-k', '--key', required=True, help="Key file to be used for decryption") -@click.option('-p', '--password', help="To pass decryption passphrase via command line") +@click.option( + "-r", + "--run", + required=True, + type=click.Path(exists=True, dir_okay=False), + help="A encripted run file", +) +@click.option("-k", "--key", required=True, help="Key file to be used for decryption") +@click.option("-p", "--password", help="To pass decryption passphrase via command line") @click.pass_context def decrypt(ctx, run, key, password): ## W I P ## diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py index 498ffcf9..3a58d871 100644 --- a/taca/cleanup/cleanup.py +++ b/taca/cleanup/cleanup.py @@ -11,11 +11,19 @@ logger = logging.getLogger(__name__) -def cleanup_miarka(days_fastq, days_analysis, - only_fastq, only_analysis, - clean_undetermined, status_db_config, - exclude_projects, list_only, - date, dry_run=False): + +def cleanup_miarka( + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + status_db_config, + exclude_projects, + list_only, + date, + dry_run=False, +): """Remove fastq/analysis data for projects that have been closed more than given days (as days_fastq/days_analysis) from the given 'miarka' cluster. 
@@ -46,26 +54,30 @@ def cleanup_miarka(days_fastq, days_analysis, - "*.bam" """ try: - config = CONFIG['cleanup']['miarka'] - flowcell_dir_root = config['flowcell']['root'] - flowcell_project_source = config['flowcell']['relative_project_source'] - flowcell_undet_files = config['flowcell']['undet_file_pattern'] - data_dir = config['data_dir'] - analysis_dir = config['analysis']['root'] - analysis_data_to_remove = config['analysis']['files_to_remove'] + config = CONFIG["cleanup"]["miarka"] + flowcell_dir_root = config["flowcell"]["root"] + flowcell_project_source = config["flowcell"]["relative_project_source"] + flowcell_undet_files = config["flowcell"]["undet_file_pattern"] + data_dir = config["data_dir"] + analysis_dir = config["analysis"]["root"] + analysis_data_to_remove = config["analysis"]["files_to_remove"] if date: - date = datetime.strptime(date, '%Y-%m-%d') + date = datetime.strptime(date, "%Y-%m-%d") except KeyError as e: - logger.error(f'Config file is missing the key {str(e)}, make sure it has all required information') + logger.error( + f"Config file is missing the key {str(e)}, make sure it has all required information" + ) raise SystemExit except ValueError: - logger.error('Date given with "--date" option is not in required format, see help for more info') + logger.error( + 'Date given with "--date" option is not in required format, see help for more info' + ) raise SystemExit # make a connection for project db db_config = load_config(status_db_config) - pcon = statusdb.ProjectSummaryConnection(db_config.get('statusdb')) - assert pcon, 'Could not connect to project database in StatusDB' + pcon = statusdb.ProjectSummaryConnection(db_config.get("statusdb")) + assert pcon, "Could not connect to project database in StatusDB" # make exclude project list if provided exclude_list = [] @@ -74,189 +86,302 @@ def cleanup_miarka(days_fastq, days_analysis, with open(exclude_projects) as in_file: exclude_list.extend([p.strip() for p in in_file.readlines()]) else: - exclude_list.extend(exclude_projects.split(',')) + exclude_list.extend(exclude_projects.split(",")) # sanity check for mentioned project to exculde or valid - invalid_projects = [p for p in exclude_list if p not in pcon.id_view.keys() and p not in pcon.name_view.keys()] + invalid_projects = [ + p + for p in exclude_list + if p not in pcon.id_view.keys() and p not in pcon.name_view.keys() + ] if invalid_projects: - logger.error('"--exclude_projects" was called with some invalid projects "{}", ' - 'provide valid project name/id'.format(','.join(invalid_projects))) + logger.error( + '"--exclude_projects" was called with some invalid projects "{}", ' + "provide valid project name/id".format(",".join(invalid_projects)) + ) raise SystemExit - #compile list for project to delete + # compile list for project to delete project_clean_list, project_processed_list = ({}, []) if not list_only and not clean_undetermined: - logger.info('Building initial project list for removing data...') + logger.info("Building initial project list for removing data...") if only_fastq: - logger.info('Option "--only_fastq" is given, so will not look for analysis data') + logger.info( + 'Option "--only_fastq" is given, so will not look for analysis data' + ) elif only_analysis: - logger.info('Option "--only_analysis" is given, so will not look for fastq data') + logger.info( + 'Option "--only_analysis" is given, so will not look for fastq data' + ) if clean_undetermined: all_undet_files = [] for flowcell_dir in flowcell_dir_root: - for fc in [d for d in 
os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]: + for fc in [ + d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d) + ]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): - logger.warn(f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory') + logger.warn( + f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory' + ) continue - projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ - if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \ - not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))] + projects_in_fc = [ + d + for d in os.listdir(flowcell_project_source) + if re.match(r"^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$", d) + and not os.path.exists( + os.path.join(flowcell_project_source, d, "cleaned") + ) + ] # the above check looked for project directories and also that are not cleaned # so if it could not find any project, means there is no project diretory at all # or all the project directory is already cleaned. Then we can remove the undet if len(projects_in_fc) > 0: continue - fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files)) + fc_undet_files = glob( + os.path.join(flowcell_project_source, flowcell_undet_files) + ) if fc_undet_files: - logger.info(f'All projects was cleaned for FC {fc}, found {len(fc_undet_files)} undeterminded files') - all_undet_files.extend(list(map(os.path.abspath, fc_undet_files))) + logger.info( + f"All projects was cleaned for FC {fc}, found {len(fc_undet_files)} undeterminded files" + ) + all_undet_files.extend( + list(map(os.path.abspath, fc_undet_files)) + ) if all_undet_files: undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files))) - if misc.query_yes_no('In total found {} undetermined files which are {} in size, delete now ?'.format(len(all_undet_files), - undet_size), default='no'): - _remove_files(all_undet_files) + if misc.query_yes_no( + "In total found {} undetermined files which are {} in size, delete now ?".format( + len(all_undet_files), undet_size + ), + default="no", + ): + _remove_files(all_undet_files) return elif only_analysis: - for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \ - not os.path.exists(os.path.join(analysis_dir, d, 'cleaned'))]: + for pid in [ + d + for d in os.listdir(analysis_dir) + if re.match(r"^P\d+$", d) + and not os.path.exists(os.path.join(analysis_dir, d, "cleaned")) + ]: os.path.join(analysis_dir, pid) - proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date) - if proj_info and proj_info['closed_days'] >= days_analysis: + proj_info = get_closed_proj_info( + pid, pcon.get_entry(pid, use_id_view=True), date + ) + if proj_info and proj_info["closed_days"] >= days_analysis: # move on if this project has to be excluded - if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list: + if ( + proj_info["name"] in exclude_list + or proj_info["pid"] in exclude_list + ): continue - analysis_data, analysis_size = collect_analysis_data_miarka(pid, analysis_dir, analysis_data_to_remove) - proj_info['analysis_to_remove'] = analysis_data - proj_info['analysis_size'] = analysis_size - proj_info['fastq_to_remove'] = 'not_selected' - proj_info['fastq_size'] = 0 - project_clean_list[proj_info['name']] = proj_info + analysis_data, analysis_size = collect_analysis_data_miarka( + pid, analysis_dir, analysis_data_to_remove + ) + 
proj_info["analysis_to_remove"] = analysis_data + proj_info["analysis_size"] = analysis_size + proj_info["fastq_to_remove"] = "not_selected" + proj_info["fastq_size"] = 0 + project_clean_list[proj_info["name"]] = proj_info else: for flowcell_dir in flowcell_dir_root: - for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]: + for fc in [ + d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d) + ]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): - logger.warn(f'Flowcell {fc} do not contain a "{flowcell_project_source}" direcotry') + logger.warn( + f'Flowcell {fc} do not contain a "{flowcell_project_source}" direcotry' + ) continue - projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ - if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \ - not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))] + projects_in_fc = [ + d + for d in os.listdir(flowcell_project_source) + if re.match(r"^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$", d) + and not os.path.exists( + os.path.join(flowcell_project_source, d, "cleaned") + ) + ] for _proj in projects_in_fc: - proj = re.sub(r'_+', '.', _proj, 1) + proj = re.sub(r"_+", ".", _proj, 1) # if a project is already processed no need of fetching it again from status db if proj in project_processed_list: # if the project is closed more than threshold days collect the fastq files from FC # no need of looking for analysis data as they would have been collected in the first time - if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq: - fc_fq_files, fq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj)) - project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc] - project_clean_list[proj]['fastq_size'] += fq_size + if ( + proj in project_clean_list + and project_clean_list[proj]["closed_days"] + >= days_fastq + ): + fc_fq_files, fq_size = collect_fastq_data_miarka( + fc_abs_path, + os.path.join(flowcell_project_source, _proj), + ) + project_clean_list[proj]["fastq_to_remove"][ + "flowcells" + ][fc] = fc_fq_files["flowcells"][fc] + project_clean_list[proj]["fastq_size"] += fq_size continue project_processed_list.append(proj) - #by default assume all projects are not old enough for delete - fastq_data, analysis_data = ('young', 'young') + # by default assume all projects are not old enough for delete + fastq_data, analysis_data = ("young", "young") fastq_size, analysis_size = (0, 0) - proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date) + proj_info = get_closed_proj_info( + proj, pcon.get_entry(proj), date + ) if proj_info: # move on if this project has to be excluded - if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list: + if ( + proj_info["name"] in exclude_list + or proj_info["pid"] in exclude_list + ): continue # if project not old enough for fastq files and only fastq files selected move on to next project - if proj_info['closed_days'] >= days_fastq: - fastq_data, fastq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj), - data_dir, proj_info['pid']) + if proj_info["closed_days"] >= days_fastq: + fastq_data, fastq_size = collect_fastq_data_miarka( + fc_abs_path, + os.path.join(flowcell_project_source, _proj), + data_dir, + proj_info["pid"], + ) if not only_fastq: # if project is old enough for fastq files and not 'only_fastq' try 
collect analysis files - if proj_info['closed_days'] >= days_analysis: - analysis_data, analysis_size = collect_analysis_data_miarka(proj_info['pid'], analysis_dir, analysis_data_to_remove) + if proj_info["closed_days"] >= days_analysis: + ( + analysis_data, + analysis_size, + ) = collect_analysis_data_miarka( + proj_info["pid"], + analysis_dir, + analysis_data_to_remove, + ) # if both fastq and analysis files are not old enough move on - if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == 'cleaned') and fastq_data == 'young'): + if (analysis_data == fastq_data) or ( + (not analysis_data or analysis_data == "cleaned") + and fastq_data == "young" + ): continue - elif fastq_data == 'young': + elif fastq_data == "young": continue else: - analysis_data = 'not_selected' - proj_info['fastq_to_remove'] = fastq_data - proj_info['fastq_size'] = fastq_size - proj_info['analysis_to_remove'] = analysis_data - proj_info['analysis_size'] = analysis_size + analysis_data = "not_selected" + proj_info["fastq_to_remove"] = fastq_data + proj_info["fastq_size"] = fastq_size + proj_info["analysis_to_remove"] = analysis_data + proj_info["analysis_size"] = analysis_size project_clean_list[proj] = proj_info if not project_clean_list: - logger.info('There are no projects to clean') + logger.info("There are no projects to clean") return # list only the project and exit if 'list_only' option is selected if list_only: - print('Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size') - for p_info in sorted(list(project_clean_list.values()), key=lambda d: d['closed_days'], reverse=True): - print('\t'.join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'], - str(p_info['closed_days']), p_info['closed_date'], - _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])])) + print( + "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size" + ) + for p_info in sorted( + list(project_clean_list.values()), + key=lambda d: d["closed_days"], + reverse=True, + ): + print( + "\t".join( + [ + p_info["name"], + p_info["pid"], + p_info["bioinfo_responsible"], + str(p_info["closed_days"]), + p_info["closed_date"], + _def_get_size_unit(p_info["fastq_size"]), + _def_get_size_unit(p_info["analysis_size"]), + ] + ) + ) raise SystemExit - logger.info(f'Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}') - if misc.query_yes_no('Interactively filter projects for cleanup ?', default='yes'): + logger.info( + f"Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}" + ) + if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"): filtered_project, proj_count = ([], 0) - #go through complied project list and remove files + # go through complied project list and remove files for proj, info in project_clean_list.items(): proj_count += 1 - if not misc.query_yes_no('{}Delete files for this project ({}/{})'.format(get_proj_meta_info(info, days_fastq), - proj_count, len(project_clean_list)), default='no'): - logger.info(f'Will not remove files for project {proj}') + if not misc.query_yes_no( + "{}Delete files for this project ({}/{})".format( + get_proj_meta_info(info, days_fastq), + proj_count, + len(project_clean_list), + ), + default="no", + ): + logger.info(f"Will not remove files for project {proj}") filtered_project.append(proj) # remove projects that were decided not to delete 
map(project_clean_list.pop, filtered_project) - logger.info(f'Removed {len(filtered_project)}/{proj_count} projects from initial list') + logger.info( + f"Removed {len(filtered_project)}/{proj_count} projects from initial list" + ) if not project_clean_list: - logger.info('There are no projects to clean after filtering') + logger.info("There are no projects to clean after filtering") return - logger.info(f'Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}') - if not misc.query_yes_no('Proceed with cleanup ?', default='no'): - logger.info('Aborting cleanup') + logger.info( + f"Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}" + ) + if not misc.query_yes_no("Proceed with cleanup ?", default="no"): + logger.info("Aborting cleanup") return - logger.info('Will start cleaning up project now') + logger.info("Will start cleaning up project now") for proj, info in project_clean_list.items(): - fastq_info = info.get('fastq_to_remove') + fastq_info = info.get("fastq_to_remove") if fastq_info and isinstance(fastq_info, dict): - logger.info(f'Cleaning fastq files for project {proj}') - fastq_fc = fastq_info.get('flowcells', {}) + logger.info(f"Cleaning fastq files for project {proj}") + fastq_fc = fastq_info.get("flowcells", {}) removed_fc = [] for fc, fc_info in fastq_fc.items(): - proj_fc_root = fc_info['proj_root'] - logger.info(f'Removing fastq files from {proj_fc_root}') + proj_fc_root = fc_info["proj_root"] + logger.info(f"Removing fastq files from {proj_fc_root}") if not dry_run: - if _remove_files(fc_info['fq_files']): - logger.info(f'Removed fastq files from FC {fc} for project {proj}, marking it as cleaned') + if _remove_files(fc_info["fq_files"]): + logger.info( + f"Removed fastq files from FC {fc} for project {proj}, marking it as cleaned" + ) _touch_cleaned(proj_fc_root) removed_fc.append(fc) if len(fastq_fc) == len(removed_fc): try: - proj_data_root = fastq_info['proj_data']['proj_data_root'] - logger.info(f'All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}') + proj_data_root = fastq_info["proj_data"]["proj_data_root"] + logger.info( + f"All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}" + ) _touch_cleaned(proj_data_root) except: pass - analysis_info = info.get('analysis_to_remove') + analysis_info = info.get("analysis_to_remove") if analysis_info and isinstance(analysis_info, dict): - proj_analysis_root = analysis_info['proj_analysis_root'] - logger.info(f'cleaning analysis data for project {proj}') + proj_analysis_root = analysis_info["proj_analysis_root"] + logger.info(f"cleaning analysis data for project {proj}") removed_qc = [] - for qc, files in analysis_info['analysis_files'].items(): + for qc, files in analysis_info["analysis_files"].items(): logger.info(f'Removing files of "{qc}" from {proj_analysis_root}') if not dry_run: if _remove_files(files): removed_qc.append(qc) else: - logger.warn(f'Could not remove some files in qc directory "{qc}"') - map(analysis_info['analysis_files'].pop, removed_qc) - if len(analysis_info['analysis_files']) == 0: - logger.info(f'Removed analysis data for project {proj}, marking it cleaned') + logger.warn( + f'Could not remove some files in qc directory "{qc}"' + ) + map(analysis_info["analysis_files"].pop, removed_qc) + if len(analysis_info["analysis_files"]) == 0: + logger.info( + f"Removed analysis data for project {proj}, marking it cleaned" + ) 
_touch_cleaned(proj_analysis_root) @@ -264,27 +389,38 @@ def cleanup_miarka(days_fastq, days_analysis, # Class helper methods, not exposed as commands/subcommands # ############################################################# + def get_closed_proj_info(prj, pdoc, tdate=None): """Check and return a dict if project is closed.""" pdict = None if not tdate: tdate = datetime.today() if not pdoc: - logger.warn(f'Seems like project {prj} does not have a proper statusdb document, skipping it') - elif 'close_date' in pdoc: - closed_date = pdoc['close_date'] + logger.warn( + f"Seems like project {prj} does not have a proper statusdb document, skipping it" + ) + elif "close_date" in pdoc: + closed_date = pdoc["close_date"] try: - closed_days = tdate - datetime.strptime(closed_date, '%Y-%m-%d') - pdict = {'name' : pdoc.get('project_name'), - 'pid' : pdoc.get('project_id'), - 'closed_date' : closed_date, - 'closed_days' : closed_days.days, - 'bioinfo_responsible' : pdoc.get('project_summary',{}).get('bioinfo_responsible','')} + closed_days = tdate - datetime.strptime(closed_date, "%Y-%m-%d") + pdict = { + "name": pdoc.get("project_name"), + "pid": pdoc.get("project_id"), + "closed_date": closed_date, + "closed_days": closed_days.days, + "bioinfo_responsible": pdoc.get("project_summary", {}).get( + "bioinfo_responsible", "" + ), + } except: - logger.warn('Problem calculating closed days for project {} with close date {}. Skipping it'.format( - pdoc.get('project_name'), closed_date)) + logger.warn( + "Problem calculating closed days for project {} with close date {}. Skipping it".format( + pdoc.get("project_name"), closed_date + ) + ) return pdict + def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}): """Collect the analysis files that have to be removed from Miarka return a tuple with files and total size of collected files.""" @@ -292,42 +428,57 @@ def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}): proj_abs_path = os.path.join(analysis_root, pid) if not os.path.exists(proj_abs_path): file_list = None - elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')): - file_list = 'cleaned' + elif os.path.exists(os.path.join(proj_abs_path, "cleaned")): + file_list = "cleaned" else: - file_list = {'proj_analysis_root':proj_abs_path, - 'analysis_files': defaultdict(list)} - for qc_type,ext in files_ext_to_remove.items(): + file_list = { + "proj_analysis_root": proj_abs_path, + "analysis_files": defaultdict(list), + } + for qc_type, ext in files_ext_to_remove.items(): qc_path = os.path.join(proj_abs_path, qc_type) if os.path.exists(qc_path): - file_list['analysis_files'][qc_type].extend(collect_files_by_ext(qc_path, ext)) + file_list["analysis_files"][qc_type].extend( + collect_files_by_ext(qc_path, ext) + ) try: - size += sum([sum(map(os.path.getsize, fls)) for fls in file_list['analysis_files'].values()]) + size += sum( + [ + sum(map(os.path.getsize, fls)) + for fls in file_list["analysis_files"].values() + ] + ) except: pass return (file_list, size) + def collect_fastq_data_miarka(fc_root, fc_proj_src, proj_root=None, pid=None): """Collect the fastq files that have to be removed from Miarka. 
Return a tuple with files and total size of collected files.""" size = 0 - file_list = {'flowcells': defaultdict(dict)} + file_list = {"flowcells": defaultdict(dict)} fc_proj_path = os.path.join(fc_root, fc_proj_src) fc_id = os.path.basename(fc_root) - file_list['flowcells'][fc_id] = {'proj_root': fc_proj_path, - 'fq_files': collect_files_by_ext(fc_proj_path, '*.fastq.gz')} + file_list["flowcells"][fc_id] = { + "proj_root": fc_proj_path, + "fq_files": collect_files_by_ext(fc_proj_path, "*.fastq.gz"), + } if proj_root and pid: proj_abs_path = os.path.join(proj_root, pid) if not os.path.exists(proj_abs_path): - file_list['proj_data'] = None - elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')): - file_list['proj_data'] = 'cleaned' + file_list["proj_data"] = None + elif os.path.exists(os.path.join(proj_abs_path, "cleaned")): + file_list["proj_data"] = "cleaned" else: - file_list['proj_data'] = {'proj_data_root': proj_abs_path, - 'fastq_files' : collect_files_by_ext(proj_abs_path, '*.fastq.gz')} - size += sum(map(os.path.getsize, file_list['flowcells'][fc_id]['fq_files'])) + file_list["proj_data"] = { + "proj_data_root": proj_abs_path, + "fastq_files": collect_files_by_ext(proj_abs_path, "*.fastq.gz"), + } + size += sum(map(os.path.getsize, file_list["flowcells"][fc_id]["fq_files"])) return (file_list, size) + def collect_files_by_ext(path, ext=[]): """Collect files with a given extension from a given path.""" if isinstance(ext, str): @@ -340,60 +491,79 @@ def collect_files_by_ext(path, ext=[]): collected_files.extend(collect_files_by_ext(d, ext)) return collected_files + def get_proj_meta_info(info, days_fastq): """From given info collect meta info for a project.""" - template = '\n' + template = "\n" + def _get_template_string(h, v): try: - v = f'{h}: {v}\n' + v = f"{h}: {v}\n" except: - v = f'{h}: Problem getting this' + v = f"{h}: Problem getting this" return v - template += _get_template_string('Project overview', info.get('name')) - template += _get_template_string('Project ID', info.get('pid')) - template += _get_template_string('Bioinfo Responsible', info.get('bioinfo_responsible','')) - template += _get_template_string('Closed for (days)', info.get('closed_days')) - template += _get_template_string('Closed from (date)', info.get('closed_date')) + + template += _get_template_string("Project overview", info.get("name")) + template += _get_template_string("Project ID", info.get("pid")) + template += _get_template_string( + "Bioinfo Responsible", info.get("bioinfo_responsible", "") + ) + template += _get_template_string("Closed for (days)", info.get("closed_days")) + template += _get_template_string("Closed from (date)", info.get("closed_date")) # set analysis info based upon what we have - analysis_info = info.get('analysis_to_remove') + analysis_info = info.get("analysis_to_remove") if not analysis_info: - template += 'Project analysis: No analysis directory\n' - elif isinstance(analysis_info, str) and analysis_info == 'cleaned': - template += 'Project analysis: Analysis directory already cleaned\n' + template += "Project analysis: No analysis directory\n" + elif isinstance(analysis_info, str) and analysis_info == "cleaned": + template += "Project analysis: Analysis directory already cleaned\n" elif isinstance(analysis_info, dict): f_stat = [] - for qc_type, files in analysis_info['analysis_files'].items(): - f_stat.append(f'{qc_type} ({len(files)} files)') - template += 'Project analyzed: {}\n'.format(', '.join(f_stat)) + for qc_type, files in 
analysis_info["analysis_files"].items(): + f_stat.append(f"{qc_type} ({len(files)} files)") + template += "Project analyzed: {}\n".format(", ".join(f_stat)) # set fastq info based upon what we have - fq_info = info.get('fastq_to_remove') + fq_info = info.get("fastq_to_remove") if isinstance(fq_info, str) and fq_info == "young": - template += f'Project been closed less than {days_fastq} days, so will not remove any fastq files\n' + template += f"Project been closed less than {days_fastq} days, so will not remove any fastq files\n" elif isinstance(fq_info, dict): - proj_fq_info = fq_info.get('proj_data') + proj_fq_info = fq_info.get("proj_data") if not proj_fq_info: - template += 'Project organized: No organized directory for project\n' + template += "Project organized: No organized directory for project\n" elif isinstance(proj_fq_info, str) and proj_fq_info == "cleaned": - template += 'Project organized: Project directory is already cleaned\n' + template += "Project organized: Project directory is already cleaned\n" elif isinstance(proj_fq_info, dict): - template += 'Project organized: Project is organized with {} fastq files\n'.format(len(proj_fq_info['fastq_files'])) - fc_fq_info = fq_info.get('flowcells', {}) + template += ( + "Project organized: Project is organized with {} fastq files\n".format( + len(proj_fq_info["fastq_files"]) + ) + ) + fc_fq_info = fq_info.get("flowcells", {}) fc_num = len(fc_fq_info.keys()) - fc_files = sum(map(len, [fc_info.get('fq_files', [])for fc_info in fc_fq_info.values()])) - template += f'Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n' - template += 'Estimated data size: {}\n'.format(_def_get_size_unit(info.get('fastq_size',0) + info.get('fastq_size', 0))) + fc_files = sum( + map(len, [fc_info.get("fq_files", []) for fc_info in fc_fq_info.values()]) + ) + template += ( + f"Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n" + ) + template += "Estimated data size: {}\n".format( + _def_get_size_unit(info.get("fastq_size", 0) + info.get("fastq_size", 0)) + ) return template + def get_files_size_text(plist): """Get project list dict and give back string with overll sizes.""" - fsize = _def_get_size_unit(sum([i.get('fastq_size',0) for i in plist.values()])) - asize = _def_get_size_unit(sum([i.get('analysis_size',0) for i in plist.values()])) - return '({f}{s}{a}) '.format(f = f'~{fsize} fastq data' if fsize else '', - a = f'~{asize} analysis data' if asize else '', - s = ' and ' if fsize and asize else '') + fsize = _def_get_size_unit(sum([i.get("fastq_size", 0) for i in plist.values()])) + asize = _def_get_size_unit(sum([i.get("analysis_size", 0) for i in plist.values()])) + return "({f}{s}{a}) ".format( + f=f"~{fsize} fastq data" if fsize else "", + a=f"~{asize} analysis data" if asize else "", + s=" and " if fsize and asize else "", + ) + def _def_get_size_unit(s): """Change the given size to appropriate unit measurement for better readability.""" @@ -402,17 +572,18 @@ def _def_get_size_unit(s): gb = mb * 1000 tb = gb * 1000 if s > tb: - s = f'~{int(s/tb)}tb' + s = f"~{int(s/tb)}tb" elif s > gb: - s = f'~{int(s/gb)}gb' + s = f"~{int(s/gb)}gb" elif s > mb: - s = f'~{int(s/mb)}mb' + s = f"~{int(s/mb)}mb" elif s > kb: - s = f'~{int(s/kb)}kb' + s = f"~{int(s/kb)}kb" elif s > 0: - s = f'~{int(s/b)}b' + s = f"~{int(s/b)}b" return str(s) + def _remove_files(files): """Remove files from given list.""" status = True @@ -424,9 +595,12 @@ def _remove_files(files): status = False return status + def _touch_cleaned(path): """Touch 
    """Touch a 'cleaned' file in a given path."""
     try:
-        open(os.path.join(path, 'cleaned'), 'w').close()
+        open(os.path.join(path, "cleaned"), "w").close()
     except Exception as e:
-        logger.warn(f'Could not create "cleaned" file in path {path} due to "{e.message}"')
+        logger.warn(
+            f'Could not create "cleaned" file in path {path} due to "{e}"'
+        )
diff --git a/taca/cleanup/cli.py b/taca/cleanup/cli.py
index 6410567b..fe7e11ba 100644
--- a/taca/cleanup/cli.py
+++ b/taca/cleanup/cli.py
@@ -7,63 +7,119 @@
 @click.group()
 @click.pass_context
-@click.option('--status_db_config',
-              type=click.Path(exists=True, dir_okay=False),
-              envvar='STATUS_DB_CONFIG',
-              help='Path to statusdb-configuration.')
+@click.option(
+    "--status_db_config",
+    type=click.Path(exists=True, dir_okay=False),
+    envvar="STATUS_DB_CONFIG",
+    help="Path to statusdb-configuration.",
+)
 def cleanup(ctx, status_db_config):
     """Cleaning up servers - management methods and utilities."""
     pass
 
+
 # cleanup subcommands
 @cleanup.command()
-@click.option('-d', '--days', type=click.IntRange(min=1),
-              help='Days to consider as thershold, should not be combined with option "--hours"')
-@click.option('-h', '--hours', type=click.IntRange(min=1),
-              help='Hours to consider as thershold, should not be combined with option "--days"')
+@click.option(
+    "-d",
+    "--days",
+    type=click.IntRange(min=1),
+    help='Days to consider as threshold, should not be combined with option "--hours"',
+)
+@click.option(
+    "-h",
+    "--hours",
+    type=click.IntRange(min=1),
+    help='Hours to consider as threshold, should not be combined with option "--days"',
+)
 @click.pass_context
 def preproc(ctx, days, hours):
     """Do appropriate cleanup on preproc."""
     seconds = misc.to_seconds(days, hours)
     cln.cleanup_processing(seconds)
 
+
 @cleanup.command()
-@click.option('--days_fastq', type=click.IntRange(min=1),
-              help='Days to consider as thershold for removing "fastq" files')
-@click.option('--days_analysis', type=click.IntRange(min=1),
-              help='Days to consider as thershold for removing analysis data')
-@click.option('--only_fastq', is_flag=True,
-              help='Clean only fastq data in "miarka"')
-@click.option('--only_analysis', is_flag=True,
-              help='Clean only analysis data in "miarka"')
-@click.option('--date', type=click.STRING,
-              help='Consider the given date instead of today while collecting closed projects. '
-                   'Date format should be "YYYY-MM-DD", ex: "2016-01-31"')
-@click.option('--exclude_projects', type=click.STRING,
-              help='A project or a file with a list of projects to exclude from deleting. '
-                   'Either name or id can be given. Examples: --exclude_projects P1234 or '
-                   '--exclude_projects P1234,P5678 or '
-                   '--exclude_projects file_with_projects_id.txt')
-@click.option('--clean_undetermined', is_flag=True,
-              help='Remove only the undetermined reads for a flowcell that have '
-                   'all project cleaned. All other parameters are ignored if this '
-                   'flag is called.')
-@click.option('-l', '--list_only', is_flag=True,
-              help='Only build the project list that will be cleaned')
-@click.option('-n', '--dry_run', is_flag=True,
-              help='Perform dry run i.e.
execute nothing but log') +@click.option( + "--days_fastq", + type=click.IntRange(min=1), + help='Days to consider as thershold for removing "fastq" files', +) +@click.option( + "--days_analysis", + type=click.IntRange(min=1), + help="Days to consider as thershold for removing analysis data", +) +@click.option("--only_fastq", is_flag=True, help='Clean only fastq data in "miarka"') +@click.option( + "--only_analysis", is_flag=True, help='Clean only analysis data in "miarka"' +) +@click.option( + "--date", + type=click.STRING, + help="Consider the given date instead of today while collecting closed projects. " + 'Date format should be "YYYY-MM-DD", ex: "2016-01-31"', +) +@click.option( + "--exclude_projects", + type=click.STRING, + help="A project or a file with a list of projects to exclude from deleting. " + "Either name or id can be given. Examples: --exclude_projects P1234 or " + "--exclude_projects P1234,P5678 or " + "--exclude_projects file_with_projects_id.txt", +) +@click.option( + "--clean_undetermined", + is_flag=True, + help="Remove only the undetermined reads for a flowcell that have " + "all project cleaned. All other parameters are ignored if this " + "flag is called.", +) +@click.option( + "-l", + "--list_only", + is_flag=True, + help="Only build the project list that will be cleaned", +) +@click.option( + "-n", "--dry_run", is_flag=True, help="Perform dry run i.e. execute nothing but log" +) @click.pass_context -def miarka(ctx, days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, date, exclude_projects, list_only, dry_run): +def miarka( + ctx, + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + date, + exclude_projects, + list_only, + dry_run, +): """Do appropriate cleanup on Miarka.""" - status_db_config = ctx.parent.params['status_db_config'] + status_db_config = ctx.parent.params["status_db_config"] if only_fastq and only_analysis: - raise SystemExit('ERROR: Both option "only_fastq" and "only_analysis" is given, should only give either one') + raise SystemExit( + 'ERROR: Both option "only_fastq" and "only_analysis" is given, should only give either one' + ) if not days_fastq and not only_analysis and not clean_undetermined: - raise SystemExit('ERROR: "days_fastq" is not given while not selecting "only_analysis" option') + raise SystemExit( + 'ERROR: "days_fastq" is not given while not selecting "only_analysis" option' + ) if not days_analysis and not only_fastq and not clean_undetermined: - raise SystemExit('ERROR: "days_analysis" is not given while not selecting "only_fastq" option') - cln.cleanup_miarka(days_fastq, days_analysis, - only_fastq, only_analysis, - clean_undetermined, status_db_config, - exclude_projects, list_only, - date, dry_run) + raise SystemExit( + 'ERROR: "days_analysis" is not given while not selecting "only_fastq" option' + ) + cln.cleanup_miarka( + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + status_db_config, + exclude_projects, + list_only, + date, + dry_run, + ) diff --git a/taca/cli.py b/taca/cli.py index ad8d59b6..d777884a 100644 --- a/taca/cli.py +++ b/taca/cli.py @@ -10,26 +10,30 @@ logger = logging.getLogger(__name__) + @click.group() @click.version_option(__version__) # Priority for the configuration file is: environment variable > -c option > default -@click.option('-c', '--config-file', - default=os.path.join(os.environ['HOME'], '.taca/taca.yaml'), - envvar='TACA_CONFIG', - type=click.File('r'), - help='Path to TACA configuration file') - 
+@click.option( + "-c", + "--config-file", + default=os.path.join(os.environ["HOME"], ".taca/taca.yaml"), + envvar="TACA_CONFIG", + type=click.File("r"), + help="Path to TACA configuration file", +) @click.pass_context def cli(ctx, config_file): - """ Tool for the Automation of Storage and Analyses """ + """Tool for the Automation of Storage and Analyses""" ctx.obj = {} config = conf.load_yaml_config(config_file.name) - log_file = config.get('log', {}).get('file', None) + log_file = config.get("log", {}).get("file", None) if log_file: - level = config.get('log').get('log_level', 'INFO') + level = config.get("log").get("log_level", "INFO") taca.log.init_logger_file(log_file, level) - logger.debug('starting up CLI') + logger.debug("starting up CLI") + -#Add subcommands dynamically to the CLI -for entry_point in iter_entry_points('taca.subcommands'): +# Add subcommands dynamically to the CLI +for entry_point in iter_entry_points("taca.subcommands"): cli.add_command(entry_point.load()) diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py index ff7d1095..0428db3c 100644 --- a/taca/illumina/MiSeq_Runs.py +++ b/taca/illumina/MiSeq_Runs.py @@ -9,11 +9,12 @@ logger = logging.getLogger(__name__) -TENX_SINGLE_PAT = re.compile('SI-(?:GA|NA)-[A-H][1-9][0-2]?') -TENX_DUAL_PAT = re.compile('SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?') -SMARTSEQ_PAT = re.compile('SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]') -IDT_UMI_PAT = re.compile('([ATCG]{4,}N+$)') -RECIPE_PAT = re.compile('[0-9]+-[0-9]+') +TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?") +TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?") +SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]") +IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)") +RECIPE_PAT = re.compile("[0-9]+-[0-9]+") + class MiSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): @@ -33,8 +34,7 @@ def _get_samplesheet(self): """Locate and parse the samplesheet for a run. In MiSeq case this is located in FC_DIR/SampleSheet.csv """ - ssname = os.path.join(self.run_dir, - 'SampleSheet.csv') + ssname = os.path.join(self.run_dir, "SampleSheet.csv") if os.path.exists(ssname): # If exists parse the SampleSheet return ssname @@ -49,14 +49,14 @@ def _copy_samplesheet(self): # Load index files indexfile = dict() try: - indexfile['tenX'] = self.CONFIG[self.software]['tenX_index_path'] + indexfile["tenX"] = self.CONFIG[self.software]["tenX_index_path"] except KeyError: - logger.error('Path to index file (10X) not found in the config file') + logger.error("Path to index file (10X) not found in the config file") raise RuntimeError try: - indexfile['smartseq'] = self.CONFIG[self.software]['smartseq_index_path'] + indexfile["smartseq"] = self.CONFIG[self.software]["smartseq_index_path"] except KeyError: - logger.error('Path to index file (Smart-seq) not found in the config file') + logger.error("Path to index file (Smart-seq) not found in the config file") raise RuntimeError if ssname is None: return None @@ -65,97 +65,144 @@ def _copy_samplesheet(self): # Copy the original samplesheet locally. 
# Copy again if already done as there might have been changes to the samplesheet try: - shutil.copy(ssname, os.path.join(self.run_dir, f'{self.flowcell_id}.csv')) + shutil.copy(ssname, os.path.join(self.run_dir, f"{self.flowcell_id}.csv")) ssname = os.path.join(self.run_dir, os.path.split(ssname)[1]) except: - raise RuntimeError(f"unable to copy file {ssname} to destination {self.run_dir}") + raise RuntimeError( + f"unable to copy file {ssname} to destination {self.run_dir}" + ) # This sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready # to be used it needs some editing. # This will contain the samplesheet with all the renaiming to be used with bcl2fastq - samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet_copy.csv') + samplesheet_dest = os.path.join(self.run_dir, "SampleSheet_copy.csv") # Check that the samplesheet is not already present. In this case go the next step if os.path.exists(samplesheet_dest): - logger.info('SampleSheet_copy.csv found ... overwriting it') + logger.info("SampleSheet_copy.csv found ... overwriting it") try: - with open(samplesheet_dest, 'w') as fcd: - fcd.write(self._generate_clean_samplesheet(ssparser, - indexfile, - fields_to_remove=None, - rename_samples=True, - rename_qPCR_suffix = True, - fields_qPCR=[ssparser.dfield_snm])) + with open(samplesheet_dest, "w") as fcd: + fcd.write( + self._generate_clean_samplesheet( + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=True, + fields_qPCR=[ssparser.dfield_snm], + ) + ) except Exception as e: logger.error(e) return False - logger.info(f'Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} ') + logger.info( + f"Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} " + ) # SampleSheet.csv generated # When demultiplexing SampleSheet.csv is the one I need to use - self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet_copy.csv')) - if not self.runParserObj.obj.get('samplesheet_csv'): - self.runParserObj.obj['samplesheet_csv'] = self.runParserObj.samplesheet.data + self.runParserObj.samplesheet = SampleSheetParser( + os.path.join(self.run_dir, "SampleSheet_copy.csv") + ) + if not self.runParserObj.obj.get("samplesheet_csv"): + self.runParserObj.obj[ + "samplesheet_csv" + ] = self.runParserObj.samplesheet.data - def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None, rename_samples=True, rename_qPCR_suffix = False, fields_qPCR= None): + def _generate_clean_samplesheet( + self, + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=False, + fields_qPCR=None, + ): """Generate a 'clean' samplesheet, the given fields will be removed. If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name Will also replace 10X or Smart-seq indicies (e.g. 
SI-GA-A3 into TGTGCGGG) Note that the index 2 of 10X or Smart-seq dual indexes will be converted to RC """ - output = '' - compl = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + output = "" + compl = {"A": "T", "C": "G", "G": "C", "T": "A"} # Expand the ssparser if there are lanes with 10X or Smart-seq samples - index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) - index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq']) + index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"]) + index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"]) # Replace 10X or Smart-seq indices for sample in ssparser.data: - if sample['index'] in index_dict_tenX.keys(): - tenX_index = sample['index'] + if sample["index"] in index_dict_tenX.keys(): + tenX_index = sample["index"] # In the case of 10X dual indexes, replace index and index2 if TENX_DUAL_PAT.findall(tenX_index): - sample['index'] = index_dict_tenX[tenX_index][0] - sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_tenX[tenX_index][1].replace(',','').upper() ] ) ) + sample["index"] = index_dict_tenX[tenX_index][0] + sample["index2"] = "".join( + reversed( + [ + compl.get(b, b) + for b in index_dict_tenX[tenX_index][1] + .replace(",", "") + .upper() + ] + ) + ) # In the case of 10X single indexes, replace the index name with the 4 actual indicies else: x = 0 indices_number = len(index_dict_tenX[tenX_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_tenX[tenX_index][x] + new_sample["index"] = index_dict_tenX[tenX_index][x] ssparser.data.append(new_sample) x += 1 # Set the original 10X index to the 4th correct index - sample['index'] = index_dict_tenX[tenX_index][x] - elif SMARTSEQ_PAT.findall(sample['index']): + sample["index"] = index_dict_tenX[tenX_index][x] + elif SMARTSEQ_PAT.findall(sample["index"]): x = 0 - smartseq_index = sample['index'].split('-')[1] + smartseq_index = sample["index"].split("-")[1] indices_number = len(index_dict_smartseq[smartseq_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_smartseq[smartseq_index][x][0] - new_sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) ) + new_sample["index"] = index_dict_smartseq[smartseq_index][x][0] + new_sample["index2"] = "".join( + reversed( + [ + compl.get(b, b) + for b in index_dict_smartseq[smartseq_index][x][1] + .replace(",", "") + .upper() + ] + ) + ) ssparser.data.append(new_sample) x += 1 - sample['index'] = index_dict_smartseq[smartseq_index][x][0] - sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) ) + sample["index"] = index_dict_smartseq[smartseq_index][x][0] + sample["index2"] = "".join( + reversed( + [ + compl.get(b, b) + for b in index_dict_smartseq[smartseq_index][x][1] + .replace(",", "") + .upper() + ] + ) + ) # Sort to get the added indicies from 10x in the right place # Python 3 doesn't support sorting a list of dicts implicitly. 
Sort by lane and then Sample_ID - ssparser.data.sort(key=lambda item: (item.get('Lane'), item.get('Sample_ID'))) + ssparser.data.sort(key=lambda item: (item.get("Lane"), item.get("Sample_ID"))) if not fields_to_remove: fields_to_remove = [] # Header - output += f'[Header]{os.linesep}' + output += f"[Header]{os.linesep}" for field in sorted(ssparser.header): - output += f'{field.rstrip()},{ssparser.header[field].rstrip()}' + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" output += os.linesep # Data - output += f'[Data]{os.linesep}' + output += f"[Data]{os.linesep}" datafields = [] for field in ssparser.datafields: if field not in fields_to_remove: datafields.append(field) - output += ','.join(datafields) + output += ",".join(datafields) output += os.linesep for line in ssparser.data: line_ar = [] @@ -165,16 +212,18 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None try: if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR: # Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix - value = re.sub('__qPCR_$', '', f'Sample_{line[ssparser.dfield_snm]}') + value = re.sub( + "__qPCR_$", "", f"Sample_{line[ssparser.dfield_snm]}" + ) else: # Substitute SampleID with SampleName, add Sample_ as prefix - value =f'Sample_{line[ssparser.dfield_snm]}' + value = f"Sample_{line[ssparser.dfield_snm]}" except: - # Otherwise add Sample_ as prefix - value = f'Sample_{line[ssparser.dfield_sid]}' + # Otherwise add Sample_ as prefix + value = f"Sample_{line[ssparser.dfield_sid]}" elif rename_qPCR_suffix and field in fields_qPCR: - value = re.sub('__qPCR_$', '', line[field]) + value = re.sub("__qPCR_$", "", line[field]) line_ar.append(value) - output += ','.join(line_ar) + output += ",".join(line_ar) output += os.linesep return output diff --git a/taca/illumina/NextSeq_Runs.py b/taca/illumina/NextSeq_Runs.py index d03b1e9e..6dc8cee1 100755 --- a/taca/illumina/NextSeq_Runs.py +++ b/taca/illumina/NextSeq_Runs.py @@ -3,7 +3,7 @@ class NextSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(Standard_Runs, self).__init__( run_dir, software, configuration) + super(Standard_Runs, self).__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() # NextSeq2000 has a different FC ID pattern that ID contains the first letter for position diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index 56724ac7..5fbac30a 100644 --- a/taca/illumina/Runs.py +++ b/taca/illumina/Runs.py @@ -15,32 +15,39 @@ logger = logging.getLogger(__name__) + class Run: - """ Defines an Illumina run - """ + """Defines an Illumina run""" def __init__(self, run_dir, software, configuration): if not os.path.exists(run_dir): raise RuntimeError(f"Could not locate run directory {run_dir}") - if 'analysis_server' not in configuration or \ - 'bcl2fastq' not in configuration or \ - 'bclconvert' not in configuration or \ - 'samplesheets_dir' not in configuration: - raise RuntimeError("configuration missing required entries " - "(analysis_server, bcl2fastq, bclconvert, samplesheets_dir)") - if not os.path.exists(os.path.join(run_dir, 'runParameters.xml')) \ - and os.path.exists(os.path.join(run_dir, 'RunParameters.xml')): + if ( + "analysis_server" not in configuration + or "bcl2fastq" not in configuration + or "bclconvert" not in configuration + or "samplesheets_dir" not in configuration + ): + raise RuntimeError( + "configuration missing required entries " + "(analysis_server, bcl2fastq, bclconvert, 
samplesheets_dir)" + ) + if not os.path.exists( + os.path.join(run_dir, "runParameters.xml") + ) and os.path.exists(os.path.join(run_dir, "RunParameters.xml")): # In NextSeq runParameters is named RunParameters logger.warning("Creating link from runParameters.xml to RunParameters.xml") - os.symlink('RunParameters.xml', os.path.join(run_dir, 'runParameters.xml')) - elif not os.path.exists(os.path.join(run_dir, 'runParameters.xml')): - raise RuntimeError(f"Could not locate runParameters.xml in run directory {run_dir}") + os.symlink("RunParameters.xml", os.path.join(run_dir, "runParameters.xml")) + elif not os.path.exists(os.path.join(run_dir, "runParameters.xml")): + raise RuntimeError( + f"Could not locate runParameters.xml in run directory {run_dir}" + ) self.run_dir = os.path.abspath(run_dir) self.software = software self.id = os.path.basename(os.path.normpath(run_dir)) - pattern = r'(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)' + pattern = r"(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)" m = re.match(pattern, self.id) self.date = m.group(1) self.instrument = m.group(2) @@ -63,51 +70,78 @@ def check_run_status(self): This function checks the status of a run while in progress. In the case of HiSeq check that all demux have been done and in that case perform aggregation """ - dex_status = self.get_run_status() - if self.software == 'bcl2fastq': - legacy_path = '' - elif self.software == 'bclconvert': + dex_status = self.get_run_status() + if self.software == "bcl2fastq": + legacy_path = "" + elif self.software == "bclconvert": legacy_path = f"Reports/{self.legacy_dir}" # Check the status of running demux # Collect all samplesheets generated before - samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) # A single digit, this hypothesis should hold for a while + samplesheets = glob.glob( + os.path.join(self.run_dir, "*_[0-9].csv") + ) # A single digit, this hypothesis should hold for a while all_demux_done = True for samplesheet in samplesheets: demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] demux_folder = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}") # Check if this job is done - if os.path.exists(os.path.join(self.run_dir, demux_folder, legacy_path, 'Stats', 'DemultiplexingStats.xml')): + if os.path.exists( + os.path.join( + self.run_dir, + demux_folder, + legacy_path, + "Stats", + "DemultiplexingStats.xml", + ) + ): all_demux_done = all_demux_done and True - if self.software == 'bcl2fastq': - demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl2fastq.err") - elif self.software == 'bclconvert': - demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl-convert.err") + if self.software == "bcl2fastq": + demux_log = os.path.join( + self.run_dir, f"demux_{demux_id}_bcl2fastq.err" + ) + elif self.software == "bclconvert": + demux_log = os.path.join( + self.run_dir, f"demux_{demux_id}_bcl-convert.err" + ) else: raise RuntimeError("Unrecognized software!") if os.path.isfile(demux_log): - errors, warnings, error_and_warning_messages = self._check_demux_log(demux_id, demux_log) + ( + errors, + warnings, + error_and_warning_messages, + ) = self._check_demux_log(demux_id, demux_log) else: - raise RuntimeError(f"No demux log file found for sub-demultiplexing {demux_id}!") - self.demux_summary[demux_id] = {'errors' : errors, - 'warnings' : warnings, - 'error_and_warning_messages' : error_and_warning_messages - } + raise RuntimeError( + f"No demux log file found for sub-demultiplexing {demux_id}!" 
+ ) + self.demux_summary[demux_id] = { + "errors": errors, + "warnings": warnings, + "error_and_warning_messages": error_and_warning_messages, + } if errors or warnings: - logger.info(f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!") + logger.info( + f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!" + ) else: - logger.info(f"Sub-Demultiplexing in {demux_folder} completed without any error or warning.") + logger.info( + f"Sub-Demultiplexing in {demux_folder} completed without any error or warning." + ) else: all_demux_done = all_demux_done and False logger.info(f"Sub-Demultiplexing in {demux_folder} not completed yet.") # All demux jobs finished and all stats aggregated under Demultiplexing # Aggreate all the results in the Demultiplexing folder - if all_demux_done and dex_status!='COMPLETED': - dex_status = 'COMPLETED' + if all_demux_done and dex_status != "COMPLETED": + dex_status = "COMPLETED" self._aggregate_demux_results() self.runParserObj = RunParser(self.run_dir) # Rename undetermined if needed - lanes = misc.return_unique([lanes['Lane'] for lanes in self.runParserObj.samplesheet.data]) + lanes = misc.return_unique( + [lanes["Lane"] for lanes in self.runParserObj.samplesheet.data] + ) samples_per_lane = self.get_samples_per_lane() for lane in lanes: if self.is_unpooled_lane(lane): @@ -121,8 +155,8 @@ def _check_demux_log(self, demux_id, demux_log): """ with open(demux_log) as demux_log_file: demux_log_content = demux_log_file.readlines() - if self.software == 'bcl2fastq': - pattern = r'Processing completed with (\d+) errors and (\d+) warnings' + if self.software == "bcl2fastq": + pattern = r"Processing completed with (\d+) errors and (\d+) warnings" match = re.search(pattern, demux_log_content[-1]) if match: errors = int(match.group(1)) @@ -130,20 +164,22 @@ def _check_demux_log(self, demux_id, demux_log): error_and_warning_messages = [] if errors or warnings: for line in demux_log_content: - if 'ERROR' in line or 'WARN' in line: + if "ERROR" in line or "WARN" in line: error_and_warning_messages.append(line) return errors, warnings, error_and_warning_messages else: - raise RuntimeError(f"Bad format with log file demux_{demux_id}_bcl2fastq.err") - elif self.software == 'bclconvert': + raise RuntimeError( + f"Bad format with log file demux_{demux_id}_bcl2fastq.err" + ) + elif self.software == "bclconvert": errors = 0 warnings = 0 error_and_warning_messages = [] for line in demux_log_content: - if 'ERROR' in line: + if "ERROR" in line: errors += 1 error_and_warning_messages.append(line) - elif 'WARNING' in line: + elif "WARNING" in line: warnnings += 1 error_and_warning_messages.append(line) return errors, warnings, error_and_warning_messages @@ -170,50 +206,53 @@ def _get_demux_folder(self): def _get_samplesheet(self): """ - Locate and parse the samplesheet for a run. The idea is that there is a folder in - samplesheet_folders that contains a samplesheet named flowecell_id.csv. + Locate and parse the samplesheet for a run. The idea is that there is a folder in + samplesheet_folders that contains a samplesheet named flowecell_id.csv. """ try: # Only implemented for some, (e.g. NovaSeqXPlus) # Will raise AttributeError if not implemented. 
current_year = self._current_year() except AttributeError: - current_year = '20' + self.id[0:2] + current_year = "20" + self.id[0:2] - samplesheets_dir = os.path.join(self.CONFIG['samplesheets_dir'], - current_year) - ssname = os.path.join(samplesheets_dir, f'{self.flowcell_id}.csv') + samplesheets_dir = os.path.join(self.CONFIG["samplesheets_dir"], current_year) + ssname = os.path.join(samplesheets_dir, f"{self.flowcell_id}.csv") if os.path.exists(ssname): return ssname else: - raise RuntimeError("not able to find samplesheet {}.csv in {}".format(self.flowcell_id, self.CONFIG['samplesheets_dir'])) + raise RuntimeError( + "not able to find samplesheet {}.csv in {}".format( + self.flowcell_id, self.CONFIG["samplesheets_dir"] + ) + ) def _is_demultiplexing_done(self): - return os.path.exists(os.path.join(self.run_dir, - self._get_demux_folder(), - 'Stats', - 'Stats.json')) + return os.path.exists( + os.path.join(self.run_dir, self._get_demux_folder(), "Stats", "Stats.json") + ) def _is_demultiplexing_started(self): return os.path.exists(os.path.join(self.run_dir, self._get_demux_folder())) def _is_sequencing_done(self): - return os.path.exists(os.path.join(self.run_dir, 'RTAComplete.txt')) and os.path.exists(os.path.join(self.run_dir, 'CopyComplete.txt')) + return os.path.exists( + os.path.join(self.run_dir, "RTAComplete.txt") + ) and os.path.exists(os.path.join(self.run_dir, "CopyComplete.txt")) def get_run_status(self): - """ Return the current status of the run. - """ + """Return the current status of the run.""" demux_started = self._is_demultiplexing_started() demux_done = self._is_demultiplexing_done() sequencing_done = self._is_sequencing_done() if sequencing_done and demux_done: - return 'COMPLETED' # run is done, transfer might be ongoing. + return "COMPLETED" # run is done, transfer might be ongoing. elif sequencing_done and demux_started and not demux_done: - return 'IN_PROGRESS' + return "IN_PROGRESS" elif sequencing_done and not demux_started: - return 'TO_START' + return "TO_START" elif not sequencing_done: - return 'SEQUENCING' + return "SEQUENCING" else: raise RuntimeError("Unexpected status in get_run_status") @@ -249,49 +288,52 @@ def _compute_base_mask(self): raise NotImplementedError("Please Implement this method") def transfer_run(self, t_file, mail_recipients=None): - """ Transfer a run to the analysis server. Will add group R/W permissions to - the run directory in the destination server so that the run can be processed - by any user/account in that group (i.e a functional account...). - :param str t_file: File where to put the transfer information + """Transfer a run to the analysis server. Will add group R/W permissions to + the run directory in the destination server so that the run can be processed + by any user/account in that group (i.e a functional account...). 
+ :param str t_file: File where to put the transfer information """ # The option -a implies -o and -g which is not the desired behaviour - command_line = ['rsync', '-LtDrv'] + command_line = ["rsync", "-LtDrv"] # Add R/W permissions to the group - command_line.append('--chmod=g+rw') + command_line.append("--chmod=g+rw") # This horrible thing here avoids data dup when we use multiple indexes in a lane/FC command_line.append("--exclude=Demultiplexing_*/*_*") command_line.append("--include=*/") - for to_include in self.CONFIG['analysis_server']['sync']['include']: + for to_include in self.CONFIG["analysis_server"]["sync"]["include"]: command_line.append(f"--include={to_include}") command_line.extend(["--exclude=*", "--prune-empty-dirs"]) - r_user = self.CONFIG['analysis_server']['user'] - r_host = self.CONFIG['analysis_server']['host'] - r_dir = self.CONFIG['analysis_server']['sync']['data_archive'] + r_user = self.CONFIG["analysis_server"]["user"] + r_host = self.CONFIG["analysis_server"]["host"] + r_dir = self.CONFIG["analysis_server"]["sync"]["data_archive"] remote = f"{r_user}@{r_host}:{r_dir}" command_line.extend([self.run_dir, remote]) # Create temp file indicating that the run is being transferred try: - open(os.path.join(self.run_dir, 'transferring'), 'w').close() + open(os.path.join(self.run_dir, "transferring"), "w").close() except OSError as e: - logger.error(f"Cannot create a file in {self.id}. " - "Check the run name, and the permissions.") + logger.error( + f"Cannot create a file in {self.id}. " + "Check the run name, and the permissions." + ) raise e - started = (f"Started transfer of run {self.id} on {datetime.now()}") + started = f"Started transfer of run {self.id} on {datetime.now()}" logger.info(started) # In this particular case we want to capture the exception because we want # to delete the transfer file try: - msge_text=f"I am about to transfer with this command \n{command_line}" - logger.info(msge_text) - misc.call_external_command(command_line, with_log_files=True, - prefix="", log_dir=self.run_dir) + msge_text = f"I am about to transfer with this command \n{command_line}" + logger.info(msge_text) + misc.call_external_command( + command_line, with_log_files=True, prefix="", log_dir=self.run_dir + ) except subprocess.CalledProcessError as exception: - os.remove(os.path.join(self.run_dir, 'transferring')) - #Send an email notifying that the transfer failed + os.remove(os.path.join(self.run_dir, "transferring")) + # Send an email notifying that the transfer failed runname = self.id - sbt = (f"Rsync of run {runname} failed") - msg= f""" Rsync of data for run {runname} has failed! + sbt = f"Rsync of run {runname} failed" + msg = f""" Rsync of data for run {runname} has failed! 
        Raised the following exception:
        {exception}
        """
         if mail_recipients:
@@ -299,16 +341,16 @@ def transfer_run(self, t_file, mail_recipients=None):
             raise exception
 
-        logger.info(f'Adding run {self.id} to {t_file}')
-        with open(t_file, 'a') as tranfer_file:
-            tsv_writer = csv.writer(tranfer_file, delimiter='\t')
+        logger.info(f"Adding run {self.id} to {t_file}")
+        with open(t_file, "a") as transfer_file:
+            tsv_writer = csv.writer(transfer_file, delimiter="\t")
             tsv_writer.writerow([self.id, str(datetime.now())])
-        os.remove(os.path.join(self.run_dir, 'transferring'))
+        os.remove(os.path.join(self.run_dir, "transferring"))
 
-        #Send an email notifying that the transfer was successful
+        # Send an email notifying that the transfer was successful
         runname = self.id
-        sbt = (f"Rsync of data for run {runname} to the analysis cluster has finished")
-        msg= """ Rsync of data for run {run} to the analysis cluster has finished!
+        sbt = f"Rsync of data for run {runname} to the analysis cluster has finished"
+        msg = """ Rsync of data for run {run} to the analysis cluster has finished!
         The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
         """.format(run=runname)
@@ -316,36 +358,35 @@ def transfer_run(self, t_file, mail_recipients=None):
         send_mail(sbt, msg, mail_recipients)
 
     def archive_run(self, destination):
-        """ Move run to the archive folder
-        :param str destination: the destination folder
+        """Move run to the archive folder
+        :param str destination: the destination folder
         """
         if destination and os.path.isdir(destination):
-            logger.info(f'archiving run {self.id}')
+            logger.info(f"archiving run {self.id}")
             shutil.move(self.run_dir, os.path.join(destination, self.id))
         else:
             logger.warning("Cannot move run to archive, destination does not exist")
 
     def send_mail(self, sbt, msg, rcp):
-        """ Sends mail about run completion
-        """
+        """Sends mail about run completion"""
         runname = self.id
         if not sbt:
             sbt = f"{runname}"
         misc.send_mail(sbt, msg, rcp)
 
     def is_transferred(self, transfer_file):
-        """ Checks wether a run has been transferred to the analysis server or not.
-            Returns true in the case in which the tranfer is finished or ongoing.
-            :param str transfer_file: Path to file with information about transferred runs
+        """Checks whether a run has been transferred to the analysis server or not.
+        Returns True if the transfer is finished or ongoing.
+ :param str transfer_file: Path to file with information about transferred runs """ try: with open(transfer_file) as file_handle: - transfer_file_contents = csv.reader(file_handle, delimiter='\t') + transfer_file_contents = csv.reader(file_handle, delimiter="\t") for row in transfer_file_contents: # Rows have two columns: run and transfer date if row[0] == os.path.basename(self.id): return True - if os.path.exists(os.path.join(self.run_dir, 'transferring')): + if os.path.exists(os.path.join(self.run_dir, "transferring")): return True return False except OSError: @@ -353,14 +394,14 @@ def is_transferred(self, transfer_file): def is_unpooled_lane(self, lane): """ - :param lane: lane identifier - :type lane: string - :rtype: boolean - :returns: True if the samplesheet has one entry for that lane, False otherwise + :param lane: lane identifier + :type lane: string + :rtype: boolean + :returns: True if the samplesheet has one entry for that lane, False otherwise """ count = 0 for l in self.runParserObj.samplesheet.data: - if l['Lane'] == lane: + if l["Lane"] == lane: count += 1 return count == 1 @@ -374,7 +415,7 @@ def get_samples_per_lane(self): ss = self.runParserObj.samplesheet d = {} for l in ss.data: - d[l['Lane']] = l[ss.dfield_snm] + d[l["Lane"]] = l[ss.dfield_snm] return d def _rename_undet(self, lane, samples_per_lane): @@ -387,25 +428,35 @@ def _rename_undet(self, lane, samples_per_lane): :param samples_per_lane: lane:sample dict :type status: dict """ - for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*")): - old_name=os.path.basename(file) - old_name_comps=old_name.split("_") - old_name_comps[1]=old_name_comps[0]# replace S0 with Undetermined - old_name_comps[0]=samples_per_lane[lane]#replace Undetermined with samplename + for file in glob.glob( + os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*") + ): + old_name = os.path.basename(file) + old_name_comps = old_name.split("_") + old_name_comps[1] = old_name_comps[0] # replace S0 with Undetermined + old_name_comps[0] = samples_per_lane[ + lane + ] # replace Undetermined with samplename for index, comp in enumerate(old_name_comps): - if comp.startswith('L00'): - old_name_comps[index]=comp.replace('L00','L01')#adds a 1 as the second lane number in order to differentiate undetermined from normal in piper - - new_name="_".join(old_name_comps) - logger.info(f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}") + if comp.startswith("L00"): + old_name_comps[index] = comp.replace( + "L00", "L01" + ) # adds a 1 as the second lane number in order to differentiate undetermined from normal in piper + + new_name = "_".join(old_name_comps) + logger.info( + f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}" + ) os.rename(file, os.path.join(os.path.dirname(file), new_name)) def _classify_lanes(self, samplesheets): # Prepare a list for lanes with NoIndex samples noindex_lanes = [] for entry in self.runParserObj.samplesheet.data: - if entry['index'].upper() == 'NOINDEX' or (entry['index'] == '' and entry['index2'] == ''): - noindex_lanes.append(entry['Lane']) + if entry["index"].upper() == "NOINDEX" or ( + entry["index"] == "" and entry["index2"] == "" + ): + noindex_lanes.append(entry["Lane"]) # Prepare a dict with the lane, demux_id and index_length info based on the sub-samplesheets # This is for the purpose of deciding simple_lanes and complex_lanes, plus we should start with the Stats.json file from which demux_id for each lane lane_demuxid_indexlength 
= dict() @@ -413,10 +464,18 @@ def _classify_lanes(self, samplesheets): demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] ssparser = SampleSheetParser(samplesheet) for row in ssparser.data: - if row['Lane'] not in lane_demuxid_indexlength.keys(): - lane_demuxid_indexlength[row['Lane']] = {demux_id: [len(row.get('index','')), len(row.get('index2',''))]} - elif demux_id not in lane_demuxid_indexlength[row['Lane']].keys(): - lane_demuxid_indexlength[row['Lane']][demux_id] = [len(row.get('index','')), len(row.get('index2',''))] + if row["Lane"] not in lane_demuxid_indexlength.keys(): + lane_demuxid_indexlength[row["Lane"]] = { + demux_id: [ + len(row.get("index", "")), + len(row.get("index2", "")), + ] + } + elif demux_id not in lane_demuxid_indexlength[row["Lane"]].keys(): + lane_demuxid_indexlength[row["Lane"]][demux_id] = [ + len(row.get("index", "")), + len(row.get("index2", "")), + ] else: pass @@ -433,7 +492,12 @@ def _classify_lanes(self, samplesheets): # Dual and longer indexes have higher priority if 0 in list(complex_lanes[key].values())[0] and 0 not in vv: complex_lanes[key] = {vk: vv} - elif (0 in list(complex_lanes[key].values())[0] and 0 in vv) or (0 not in list(complex_lanes[key].values())[0] and 0 not in vv): + elif ( + 0 in list(complex_lanes[key].values())[0] and 0 in vv + ) or ( + 0 not in list(complex_lanes[key].values())[0] + and 0 not in vv + ): if sum(vv) > sum(list(complex_lanes[key].values())[0]): complex_lanes[key] = {vk: vv} else: @@ -441,113 +505,192 @@ def _classify_lanes(self, samplesheets): return noindex_lanes, simple_lanes, complex_lanes - def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, legacy_path): + def _process_noindex_sample_with_fake_index_with_single_demux( + self, demux_id, legacy_path + ): demux_folder = os.path.join(self.run_dir, self.demux_dir) sample_counter = 1 - for entry in sorted(self.runParserObj.samplesheet.data, key=lambda k: k['Lane']): - lane = entry['Lane'] - project = entry['Sample_Project'] - sample = entry['Sample_ID'] + for entry in sorted( + self.runParserObj.samplesheet.data, key=lambda k: k["Lane"] + ): + lane = entry["Lane"] + project = entry["Sample_Project"] + sample = entry["Sample_ID"] project_dest = os.path.join(demux_folder, project) if not os.path.exists(project_dest): os.makedirs(project_dest) sample_dest = os.path.join(project_dest, sample) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")): + for file in glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + f"Undetermined*L0?{lane}*", + ) + ): old_name = os.path.basename(file) old_name_comps = old_name.split("_") - new_name_comps = [sample.replace('Sample_',''), f'S{str(sample_counter)}'] + old_name_comps[2:] + new_name_comps = [ + sample.replace("Sample_", ""), + f"S{str(sample_counter)}", + ] + old_name_comps[2:] new_name = "_".join(new_name_comps) os.symlink(file, os.path.join(sample_dest, new_name)) - logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_',''), old_name, new_name)) + logger.info( + "For undet sample {}, renaming {} to {}".format( + sample.replace("Sample_", ""), old_name, new_name + ) + ) sample_counter += 1 # Make a softlink of lane.html - html_report_lane_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html") - html_report_lane_dest 
= os.path.join(demux_folder, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html") + html_report_lane_source = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "lane.html", + ) + html_report_lane_dest = os.path.join( + demux_folder, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "lane.html", + ) if not os.path.isdir(os.path.dirname(html_report_lane_dest)): os.makedirs(os.path.dirname(html_report_lane_dest)) os.symlink(html_report_lane_source, html_report_lane_dest) # Modify the laneBarcode.html file - html_report_laneBarcode = os.path.join(self.run_dir, - f"Demultiplexing_{demux_id}", - legacy_path, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "laneBarcode.html" - ) + html_report_laneBarcode = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) html_report_laneBarcode_parser = LaneBarcodeParser(html_report_laneBarcode) lane_project_sample = dict() for entry in html_report_laneBarcode_parser.sample_data: - if entry['Sample'] != 'Undetermined': - lane_project_sample[entry['Lane']] = {'Project': entry['Project'], - 'Sample': entry['Sample'] - } + if entry["Sample"] != "Undetermined": + lane_project_sample[entry["Lane"]] = { + "Project": entry["Project"], + "Sample": entry["Sample"], + } for entry in html_report_laneBarcode_parser.sample_data[:]: - if entry['Sample'] == 'Undetermined': - entry['Project'] = lane_project_sample[entry['Lane']]['Project'] - entry['Sample'] = lane_project_sample[entry['Lane']]['Sample'] + if entry["Sample"] == "Undetermined": + entry["Project"] = lane_project_sample[entry["Lane"]]["Project"] + entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"] else: html_report_laneBarcode_parser.sample_data.remove(entry) - html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data, - key=lambda k: (k['Lane'].lower(), k['Sample'])) - new_html_report_laneBarcode = os.path.join(demux_folder, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "laneBarcode.html" - ) + html_report_laneBarcode_parser.sample_data = sorted( + html_report_laneBarcode_parser.sample_data, + key=lambda k: (k["Lane"].lower(), k["Sample"]), + ) + new_html_report_laneBarcode = os.path.join( + demux_folder, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) _generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser) if not os.path.exists(os.path.join(demux_folder, "Stats")): os.makedirs(os.path.join(demux_folder, "Stats")) # Modify the Stats.json file - stat_json_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json") + stat_json_source = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Stats", + "Stats.json", + ) stat_json_new = os.path.join(demux_folder, "Stats", "Stats.json") with open(stat_json_source) as json_data: data = json.load(json_data) # Fix the sample stats per lane - for entry in data['ConversionResults'][:]: - del entry['DemuxResults'][0]['IndexMetrics'] - entry['DemuxResults'][0].update(entry['Undetermined']) - del entry['Undetermined'] + for entry in data["ConversionResults"][:]: + del entry["DemuxResults"][0]["IndexMetrics"] + entry["DemuxResults"][0].update(entry["Undetermined"]) + del 
entry["Undetermined"] # Reset unknown barcodes list - for entry in data['UnknownBarcodes'][:]: - entry['Barcodes'] = {'unknown': 1} + for entry in data["UnknownBarcodes"][:]: + entry["Barcodes"] = {"unknown": 1} # Write to a new Stats.json file - with open(stat_json_new, 'w') as stat_json_new_file: + with open(stat_json_new, "w") as stat_json_new_file: json.dump(data, stat_json_new_file) - def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_lanes): - elements = [element for element in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) ] + def _process_simple_lane_with_single_demux( + self, demux_id, legacy_path, noindex_lanes + ): + elements = [ + element + for element in os.listdir( + os.path.join(self.run_dir, f"Demultiplexing_{demux_id}") + ) + ] for element in elements: - if "Stats" not in element and "Reports" not in element: #skip this folder and treat it differently to take into account the NoIndex case - source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", element) + if ( + "Stats" not in element and "Reports" not in element + ): # skip this folder and treat it differently to take into account the NoIndex case + source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", element + ) dest = os.path.join(self.run_dir, self.demux_dir, element) os.symlink(source, dest) os.makedirs(os.path.join(self.run_dir, self.demux_dir, "Stats")) # Fetch the lanes that have NoIndex - statsFiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*" )) + statsFiles = glob.glob( + os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*" + ) + ) for source in statsFiles: source_name = os.path.split(source)[1] - if source_name not in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]: - lane = os.path.splitext(os.path.split(source)[1])[0][-1] #lane + if source_name not in [ + "DemultiplexingStats.xml", + "AdapterTrimming.txt", + "ConversionStats.xml", + "Stats.json", + ]: + lane = os.path.splitext(os.path.split(source)[1])[0][-1] # lane if lane not in noindex_lanes: - dest = os.path.join(self.run_dir, self.demux_dir, "Stats", source_name) + dest = os.path.join( + self.run_dir, self.demux_dir, "Stats", source_name + ) os.symlink(source, dest) - for file in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]: - source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file) + for file in [ + "DemultiplexingStats.xml", + "AdapterTrimming.txt", + "ConversionStats.xml", + "Stats.json", + ]: + source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file + ) dest = os.path.join(self.run_dir, self.demux_dir, "Stats", file) os.symlink(source, dest) - source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports") + source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports" + ) dest = os.path.join(self.run_dir, self.demux_dir, "Reports") if os.path.exists(dest): try: @@ -556,17 +699,27 @@ def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_ os.unlink(dest) os.symlink(source, dest) - def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode): + def _fix_html_reports_for_complex_lanes( + self, + demux_folder, + index_cycles, + complex_lanes, + noindex_lanes, + 
html_reports_lane, + html_reports_laneBarcode, + ): # Start with the lane html_report_lane_parser = None for next_html_report_lane in html_reports_lane: if html_report_lane_parser is None: html_report_lane_parser = LaneBarcodeParser(next_html_report_lane) else: - lanesInReport = [Lane['Lane'] for Lane in html_report_lane_parser.sample_data] + lanesInReport = [ + Lane["Lane"] for Lane in html_report_lane_parser.sample_data + ] next_html_report_lane_parser = LaneBarcodeParser(next_html_report_lane) for entry in next_html_report_lane_parser.sample_data: - if entry['Lane'] not in lanesInReport: + if entry["Lane"] not in lanesInReport: # If this is a new lane not included before html_report_lane_parser.sample_data.append(entry) # Now all lanes have been inserted @@ -579,193 +732,392 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple Yield_Mbases = 0 for entry in html_report_lane_parser.sample_data: # Update NumberReads for total lane clusters - NumberReads_Summary[entry['Lane']] = {'total_lane_cluster': int(entry['PF Clusters'].replace(',', '')), - 'total_lane_yield': int(entry['Yield (Mbases)'].replace(',', ''))} - Clusters_Raw += int(int(entry['PF Clusters'].replace(',', '')) / float(entry['% PFClusters']) * 100) - Clusters_PF += int(entry['PF Clusters'].replace(',', '')) - Yield_Mbases += int(entry['Yield (Mbases)'].replace(',', '')) - if entry['Lane'] in complex_lanes.keys(): - entry['% Perfectbarcode'] = None - entry['% One mismatchbarcode'] = None + NumberReads_Summary[entry["Lane"]] = { + "total_lane_cluster": int(entry["PF Clusters"].replace(",", "")), + "total_lane_yield": int(entry["Yield (Mbases)"].replace(",", "")), + } + Clusters_Raw += int( + int(entry["PF Clusters"].replace(",", "")) + / float(entry["% PFClusters"]) + * 100 + ) + Clusters_PF += int(entry["PF Clusters"].replace(",", "")) + Yield_Mbases += int(entry["Yield (Mbases)"].replace(",", "")) + if entry["Lane"] in complex_lanes.keys(): + entry["% Perfectbarcode"] = None + entry["% One mismatchbarcode"] = None # Update the values in Flowcell Summary - html_report_lane_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}' - html_report_lane_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}' - html_report_lane_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}' + html_report_lane_parser.flowcell_data["Clusters (Raw)"] = f"{Clusters_Raw:,}" + html_report_lane_parser.flowcell_data["Clusters(PF)"] = f"{Clusters_PF:,}" + html_report_lane_parser.flowcell_data["Yield (MBases)"] = f"{Yield_Mbases:,}" # Add lanes not present in this demux # Create the new lane.html - new_html_report_lane_dir = _create_folder_structure(demux_folder, ['Reports', 'html', self.flowcell_id, 'all', 'all', 'all']) - new_html_report_lane = os.path.join(new_html_report_lane_dir, 'lane.html') + new_html_report_lane_dir = _create_folder_structure( + demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"] + ) + new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html") _generate_lane_html(new_html_report_lane, html_report_lane_parser) # Generate the laneBarcode html_report_laneBarcode_parser = None for next_html_report_laneBarcode in html_reports_laneBarcode: if html_report_laneBarcode_parser is None: - html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode) + html_report_laneBarcode_parser = LaneBarcodeParser( + next_html_report_laneBarcode + ) else: # No need to check samples occuring in more than one file as it would be spotted while softlinking - 
next_html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode) + next_html_report_laneBarcode_parser = LaneBarcodeParser( + next_html_report_laneBarcode + ) for entry in next_html_report_laneBarcode_parser.sample_data: html_report_laneBarcode_parser.sample_data.append(entry) # For complex lanes, set all numbers of undetermined to 0. And only keep one such entry - constant_keys = ['Lane', 'Barcode sequence', 'Project', 'Sample'] + constant_keys = ["Lane", "Barcode sequence", "Project", "Sample"] modified_complex_lanes = [] for entry in html_report_laneBarcode_parser.sample_data: - if entry['Lane'] in list(complex_lanes.keys()) and entry['Project'] in 'default': - if entry['Lane'] not in modified_complex_lanes: + if ( + entry["Lane"] in list(complex_lanes.keys()) + and entry["Project"] in "default" + ): + if entry["Lane"] not in modified_complex_lanes: for key in entry.keys(): if key not in constant_keys: - entry[key] = '0' - modified_complex_lanes.append(entry['Lane']) + entry[key] = "0" + modified_complex_lanes.append(entry["Lane"]) else: html_report_laneBarcode_parser.sample_data.remove(entry) # Update NumberReads for total sample yields for entry in html_report_laneBarcode_parser.sample_data: - if 'total_sample_cluster' not in NumberReads_Summary[entry['Lane']].keys(): - NumberReads_Summary[entry['Lane']]['total_sample_cluster'] = 0 - NumberReads_Summary[entry['Lane']]['total_sample_yield'] = 0 - if entry['Project'] != 'default': - NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', '')) - NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', '')) + if "total_sample_cluster" not in NumberReads_Summary[entry["Lane"]].keys(): + NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] = 0 + NumberReads_Summary[entry["Lane"]]["total_sample_yield"] = 0 + if entry["Project"] != "default": + NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( + entry["PF Clusters"].replace(",", "") + ) + NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( + entry["Yield (Mbases)"].replace(",", "") + ) else: - if entry['Project'] != 'default': - NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', '')) - NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', '')) + if entry["Project"] != "default": + NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( + entry["PF Clusters"].replace(",", "") + ) + NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( + entry["Yield (Mbases)"].replace(",", "") + ) # Calculate the numbers clusters/yields of undet reads for key, value in NumberReads_Summary.items(): - value['undet_cluster'] = value['total_lane_cluster'] - value['total_sample_cluster'] - value['undet_yield'] = value['total_lane_yield'] - value['total_sample_yield'] + value["undet_cluster"] = ( + value["total_lane_cluster"] - value["total_sample_cluster"] + ) + value["undet_yield"] = ( + value["total_lane_yield"] - value["total_sample_yield"] + ) # Update the cluster/yield info of undet for complex lanes for entry in html_report_laneBarcode_parser.sample_data: - if entry['Project'] == 'default' and entry['Lane'] in complex_lanes.keys(): - entry['PF Clusters'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_cluster']) - entry['Yield (Mbases)'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_yield']) + if entry["Project"] == "default" and 
entry["Lane"] in complex_lanes.keys(): + entry["PF Clusters"] = "{:,}".format( + NumberReads_Summary[entry["Lane"]]["undet_cluster"] + ) + entry["Yield (Mbases)"] = "{:,}".format( + NumberReads_Summary[entry["Lane"]]["undet_yield"] + ) # Fix special case that when we assign fake indexes for NoIndex samples if noindex_lanes and index_cycles != [0, 0]: lane_project_sample = dict() for entry in html_report_laneBarcode_parser.sample_data: - if entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined': - lane_project_sample[entry['Lane']] = {'Project': entry['Project'], - 'Sample': entry['Sample']} + if entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined": + lane_project_sample[entry["Lane"]] = { + "Project": entry["Project"], + "Sample": entry["Sample"], + } for entry in html_report_laneBarcode_parser.sample_data[:]: - if entry['Lane'] in noindex_lanes and entry['Sample'] == 'Undetermined': - entry['Project'] = lane_project_sample[entry['Lane']]['Project'] - entry['Sample'] = lane_project_sample[entry['Lane']]['Sample'] - elif entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined': + if entry["Lane"] in noindex_lanes and entry["Sample"] == "Undetermined": + entry["Project"] = lane_project_sample[entry["Lane"]]["Project"] + entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"] + elif ( + entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined" + ): html_report_laneBarcode_parser.sample_data.remove(entry) # Sort sample_data: first by lane then by sample ID - html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data, - key=lambda k: (k['Lane'].lower(), k['Sample'])) + html_report_laneBarcode_parser.sample_data = sorted( + html_report_laneBarcode_parser.sample_data, + key=lambda k: (k["Lane"].lower(), k["Sample"]), + ) # Update the values in Flowcell Summary - html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}' - html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}' - html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}' + html_report_laneBarcode_parser.flowcell_data[ + "Clusters (Raw)" + ] = f"{Clusters_Raw:,}" + html_report_laneBarcode_parser.flowcell_data[ + "Clusters(PF)" + ] = f"{Clusters_PF:,}" + html_report_laneBarcode_parser.flowcell_data[ + "Yield (MBases)" + ] = f"{Yield_Mbases:,}" # Generate the new report for laneBarcode.html - new_html_report_laneBarcode = os.path.join(new_html_report_lane_dir, 'laneBarcode.html') + new_html_report_laneBarcode = os.path.join( + new_html_report_lane_dir, "laneBarcode.html" + ) _generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser) - def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes): + def _fix_demultiplexingstats_xml_dir( + self, + demux_folder, + stats_json, + samplesheets, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ): # Create the DemultiplexingStats.xml (empty it is here only to say thay demux is done) - DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ['Stats']) + DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ["Stats"]) # For creating DemuxSummary.txt files for complex lanes DemuxSummaryFiles_complex_lanes = dict() # Generate the Stats.json - with open(os.path.join(DemultiplexingStats_xml_dir, 'Stats.json'), 'w') as json_data_cumulative: + with open( + 
os.path.join(DemultiplexingStats_xml_dir, "Stats.json"), "w" + ) as json_data_cumulative: stats_list = {} for stat_json in stats_json: - demux_id = re.findall('Demultiplexing_([0-9])', stat_json)[0] + demux_id = re.findall("Demultiplexing_([0-9])", stat_json)[0] with open(stat_json) as json_data_partial: data = json.load(json_data_partial) if len(stats_list) == 0: # First time I do this - stats_list['RunNumber'] = data['RunNumber'] - stats_list['Flowcell'] = data['Flowcell'] - stats_list['RunId'] = data['RunId'] - stats_list['ConversionResults'] = data['ConversionResults'] - stats_list['ReadInfosForLanes'] = data['ReadInfosForLanes'] - stats_list['UnknownBarcodes'] = [] + stats_list["RunNumber"] = data["RunNumber"] + stats_list["Flowcell"] = data["Flowcell"] + stats_list["RunId"] = data["RunId"] + stats_list["ConversionResults"] = data["ConversionResults"] + stats_list["ReadInfosForLanes"] = data["ReadInfosForLanes"] + stats_list["UnknownBarcodes"] = [] else: # Update only the importat fields - lanes_present_in_stats_json = [entry['LaneNumber'] for entry in stats_list['ConversionResults']] - for ReadInfosForLanes_lane in data['ReadInfosForLanes']: - if ReadInfosForLanes_lane['LaneNumber'] not in lanes_present_in_stats_json: - stats_list['ReadInfosForLanes'].extend([ReadInfosForLanes_lane]) - for ConversionResults_lane in data['ConversionResults']: - if ConversionResults_lane['LaneNumber'] in lanes_present_in_stats_json and str(ConversionResults_lane['LaneNumber']) in complex_lanes.keys(): + lanes_present_in_stats_json = [ + entry["LaneNumber"] + for entry in stats_list["ConversionResults"] + ] + for ReadInfosForLanes_lane in data["ReadInfosForLanes"]: + if ( + ReadInfosForLanes_lane["LaneNumber"] + not in lanes_present_in_stats_json + ): + stats_list["ReadInfosForLanes"].extend( + [ReadInfosForLanes_lane] + ) + for ConversionResults_lane in data["ConversionResults"]: + if ( + ConversionResults_lane["LaneNumber"] + in lanes_present_in_stats_json + and str(ConversionResults_lane["LaneNumber"]) + in complex_lanes.keys() + ): # For complex lanes, we set all stats to 0, except for read number and yield which will use values from NumberReads_Summary - ConversionResults_lane['Undetermined']['NumberReads'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_cluster'] - ConversionResults_lane['Undetermined']['Yield'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_yield']*1000000 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['QualityScoreSum'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['TrimmedBases'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['Yield'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['YieldQ30'] = 0 - if len([r for r in self.runParserObj.runinfo.data['Reads'] if r['IsIndexedRead'] == 'N']) == 2: - ConversionResults_lane['Undetermined']['ReadMetrics'][1]['QualityScoreSum'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][1]['TrimmedBases'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][1]['Yield'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][1]['YieldQ30'] = 0 + ConversionResults_lane["Undetermined"][ + "NumberReads" + ] = NumberReads_Summary[ + str(ConversionResults_lane["LaneNumber"]) + ]["undet_cluster"] + ConversionResults_lane["Undetermined"]["Yield"] = ( + NumberReads_Summary[ + str(ConversionResults_lane["LaneNumber"]) + ]["undet_yield"] + * 1000000 + ) + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + 
]["QualityScoreSum"] = 0 + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + ]["TrimmedBases"] = 0 + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + ]["Yield"] = 0 + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + ]["YieldQ30"] = 0 + if ( + len( + [ + r + for r in self.runParserObj.runinfo.data[ + "Reads" + ] + if r["IsIndexedRead"] == "N" + ] + ) + == 2 + ): + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["QualityScoreSum"] = 0 + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["TrimmedBases"] = 0 + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["Yield"] = 0 + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["YieldQ30"] = 0 # Find the list containing info for this lane #TODO: can lane_to_update be removed? - lane_to_update = [entry for entry in stats_list['ConversionResults'] if entry['LaneNumber'] == ConversionResults_lane['LaneNumber']][0] - lane_to_update['DemuxResults'].extend(ConversionResults_lane['DemuxResults']) - lane_to_update['Undetermined'] = ConversionResults_lane['Undetermined'] + lane_to_update = [ + entry + for entry in stats_list["ConversionResults"] + if entry["LaneNumber"] + == ConversionResults_lane["LaneNumber"] + ][0] + lane_to_update["DemuxResults"].extend( + ConversionResults_lane["DemuxResults"] + ) + lane_to_update["Undetermined"] = ConversionResults_lane[ + "Undetermined" + ] else: - stats_list['ConversionResults'].extend([ConversionResults_lane]) - - for unknown_barcode_lane in data['UnknownBarcodes']: - if str(unknown_barcode_lane['Lane']) in simple_lanes.keys(): - stats_list['UnknownBarcodes'].extend([unknown_barcode_lane]) - elif str(unknown_barcode_lane['Lane']) in complex_lanes.keys(): - if list(complex_lanes[str(unknown_barcode_lane['Lane'])].keys())[0] == demux_id: + stats_list["ConversionResults"].extend( + [ConversionResults_lane] + ) + + for unknown_barcode_lane in data["UnknownBarcodes"]: + if str(unknown_barcode_lane["Lane"]) in simple_lanes.keys(): + stats_list["UnknownBarcodes"].extend([unknown_barcode_lane]) + elif str(unknown_barcode_lane["Lane"]) in complex_lanes.keys(): + if ( + list( + complex_lanes[ + str(unknown_barcode_lane["Lane"]) + ].keys() + )[0] + == demux_id + ): # First have the list of unknown indexes from the top priority demux run full_list_unknownbarcodes = unknown_barcode_lane # Remove the samples involved in the other samplesheets for samplesheet in samplesheets: - demux_id_ss = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] + demux_id_ss = os.path.splitext( + os.path.split(samplesheet)[1] + )[0].split("_")[1] if demux_id_ss != demux_id: ssparser = SampleSheetParser(samplesheet) - ssparser_data_lane = [row for row in ssparser.data if row['Lane'] == str(unknown_barcode_lane['Lane'])] + ssparser_data_lane = [ + row + for row in ssparser.data + if row["Lane"] + == str(unknown_barcode_lane["Lane"]) + ] for row in ssparser_data_lane: - sample_idx1 = row.get('index','') - sample_idx2 = row.get('index2','') - idx_copy = tuple(full_list_unknownbarcodes['Barcodes'].keys()) + sample_idx1 = row.get("index", "") + sample_idx2 = row.get("index2", "") + idx_copy = tuple( + full_list_unknownbarcodes[ + "Barcodes" + ].keys() + ) for idx in idx_copy: - unknownbarcode_idx1 = idx.split('+')[0] if '+' in idx else idx - unknownbarcode_idx2 = idx.split('+')[1] if '+' in idx else '' + unknownbarcode_idx1 = ( + idx.split("+")[0] + if "+" in idx + else idx + ) + unknownbarcode_idx2 = ( + idx.split("+")[1] + if "+" in idx + else "" + ) if 
sample_idx1 and sample_idx2: - comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)] - comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx2) else sample_idx2[:len(unknownbarcode_idx2)] - if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)] and comparepart_idx2 == unknownbarcode_idx2[:len(comparepart_idx2)]: - del full_list_unknownbarcodes['Barcodes'][idx] + comparepart_idx1 = ( + sample_idx1 + if len(sample_idx1) + <= len(unknownbarcode_idx1) + else sample_idx1[ + : len(unknownbarcode_idx1) + ] + ) + comparepart_idx2 = ( + sample_idx2 + if len(sample_idx2) + <= len(unknownbarcode_idx2) + else sample_idx2[ + : len(unknownbarcode_idx2) + ] + ) + if ( + comparepart_idx1 + == unknownbarcode_idx1[ + : len(comparepart_idx1) + ] + and comparepart_idx2 + == unknownbarcode_idx2[ + : len(comparepart_idx2) + ] + ): + del full_list_unknownbarcodes[ + "Barcodes" + ][idx] elif sample_idx1 and not sample_idx2: - comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)] - if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)]: - del full_list_unknownbarcodes['Barcodes'][idx] + comparepart_idx1 = ( + sample_idx1 + if len(sample_idx1) + <= len(unknownbarcode_idx1) + else sample_idx1[ + : len(unknownbarcode_idx1) + ] + ) + if ( + comparepart_idx1 + == unknownbarcode_idx1[ + : len(comparepart_idx1) + ] + ): + del full_list_unknownbarcodes[ + "Barcodes" + ][idx] elif not sample_idx1 and sample_idx2: - comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx1) else sample_idx2[:len(unknownbarcode_idx1)] - if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx2)]: - del full_list_unknownbarcodes['Barcodes'][idx] - stats_list['UnknownBarcodes'].extend([full_list_unknownbarcodes]) - DemuxSummaryFiles_complex_lanes[str(unknown_barcode_lane['Lane'])] = full_list_unknownbarcodes + comparepart_idx2 = ( + sample_idx2 + if len(sample_idx2) + <= len(unknownbarcode_idx1) + else sample_idx2[ + : len(unknownbarcode_idx1) + ] + ) + if ( + comparepart_idx1 + == unknownbarcode_idx1[ + : len(comparepart_idx2) + ] + ): + del full_list_unknownbarcodes[ + "Barcodes" + ][idx] + stats_list["UnknownBarcodes"].extend( + [full_list_unknownbarcodes] + ) + DemuxSummaryFiles_complex_lanes[ + str(unknown_barcode_lane["Lane"]) + ] = full_list_unknownbarcodes else: pass # Fix special case that when we assign fake indexes for NoIndex samples if noindex_lanes and index_cycles != [0, 0]: - for entry in stats_list['ConversionResults'][:]: - if str(entry['LaneNumber']) in noindex_lanes: - del entry['DemuxResults'][0]['IndexMetrics'] - entry['DemuxResults'][0].update(entry['Undetermined']) - del entry['Undetermined'] + for entry in stats_list["ConversionResults"][:]: + if str(entry["LaneNumber"]) in noindex_lanes: + del entry["DemuxResults"][0]["IndexMetrics"] + entry["DemuxResults"][0].update(entry["Undetermined"]) + del entry["Undetermined"] # Reset unknown barcodes list - for entry in stats_list['UnknownBarcodes'][:]: - if str(entry['Lane']) in noindex_lanes: - entry['Barcodes'] = {'unknown': 1} + for entry in stats_list["UnknownBarcodes"][:]: + if str(entry["Lane"]) in noindex_lanes: + entry["Barcodes"] = {"unknown": 1} # Write the final version of Stats.json file json.dump(stats_list, json_data_cumulative) @@ -773,100 +1125,161 @@ def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheet # Create 
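The filtering above trims the top-priority demux's unknown-barcode list by dropping every index that is explained by a sample in one of the other sub-samplesheets, comparing index1/index2 only over the shorter of the two lengths. A stand-alone sketch of that prefix comparison; the barcodes, counts and sample indexes below are invented:

# Illustrative sketch only, not part of the patch.
def matches_sample(unknown_idx, sample_idx1, sample_idx2=""):
    """True if an 'unknown' barcode is explained by a known sample index,
    comparing each part only over the shorter of the two lengths."""
    unk1, _, unk2 = unknown_idx.partition("+")

    def prefix_equal(sample, unknown):
        n = min(len(sample), len(unknown))
        return n > 0 and sample[:n] == unknown[:n]

    if sample_idx2:
        return prefix_equal(sample_idx1, unk1) and prefix_equal(sample_idx2, unk2)
    return prefix_equal(sample_idx1, unk1)

unknown_barcodes = {"ACGTACGT+TTACGGCA": 1200, "GGGGGGGG+AAAAAAAA": 900}
# Keep only the unknowns that do not belong to a sample of another sub-demultiplexing:
filtered = {
    idx: count
    for idx, count in unknown_barcodes.items()
    if not matches_sample(idx, "ACGTACGTAC", "TTACGGCATT")
}
print(filtered)  # {'GGGGGGGG+AAAAAAAA': 900}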
DemuxSummary.txt files for complex lanes if len(DemuxSummaryFiles_complex_lanes) > 0: for key, value in DemuxSummaryFiles_complex_lanes.items(): - with open(os.path.join(DemultiplexingStats_xml_dir, f'DemuxSummaryF1L{key}.txt'), 'w') as DemuxSummaryFile: - DemuxSummaryFile.write('### Most Popular Unknown Index Sequences\n') - DemuxSummaryFile.write('### Columns: Index_Sequence Hit_Count\n') - for idx, count in value['Barcodes'].items(): - DemuxSummaryFile.write(f'{idx}\t{count}\n') - - open(os.path.join(DemultiplexingStats_xml_dir, 'DemultiplexingStats.xml'), 'a').close() - - def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes): + with open( + os.path.join( + DemultiplexingStats_xml_dir, f"DemuxSummaryF1L{key}.txt" + ), + "w", + ) as DemuxSummaryFile: + DemuxSummaryFile.write("### Most Popular Unknown Index Sequences\n") + DemuxSummaryFile.write("### Columns: Index_Sequence Hit_Count\n") + for idx, count in value["Barcodes"].items(): + DemuxSummaryFile.write(f"{idx}\t{count}\n") + + open( + os.path.join(DemultiplexingStats_xml_dir, "DemultiplexingStats.xml"), "a" + ).close() + + def _process_demux_with_complex_lanes( + self, + demux_folder, + samplesheets, + legacy_path, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ): html_reports_lane = [] html_reports_laneBarcode = [] stats_json = [] for samplesheet in samplesheets: ssparser = SampleSheetParser(samplesheet) demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] - html_report_lane = os.path.join(self.run_dir, - f"Demultiplexing_{demux_id}", - legacy_path, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "lane.html" - ) + html_report_lane = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "lane.html", + ) if os.path.exists(html_report_lane): html_reports_lane.append(html_report_lane) else: - raise RuntimeError(f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing") - - html_report_laneBarcode = os.path.join(self.run_dir, - f"Demultiplexing_{demux_id}", - legacy_path, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "laneBarcode.html" - ) + raise RuntimeError( + f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing" + ) + + html_report_laneBarcode = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) if os.path.exists(html_report_laneBarcode): html_reports_laneBarcode.append(html_report_laneBarcode) else: - raise RuntimeError(f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing") - - stat_json = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json") + raise RuntimeError( + f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing" + ) + + stat_json = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Stats", + "Stats.json", + ) if os.path.exists(stat_json): stats_json.append(stat_json) else: - raise RuntimeError(f"Not able to find Stats.json report {stat_json}: possible cause is problem in demultiplexing") + raise RuntimeError( + f"Not able to find Stats.json report {stat_json}: possible cause is problem in 
demultiplexing" + ) # Aggregate fastq lanes_samples = dict() for row in ssparser.data: - if row['Lane'] not in lanes_samples.keys(): - lanes_samples[row['Lane']] = [row['Sample_Name']] + if row["Lane"] not in lanes_samples.keys(): + lanes_samples[row["Lane"]] = [row["Sample_Name"]] else: - lanes_samples[row['Lane']].append(row['Sample_Name']) + lanes_samples[row["Lane"]].append(row["Sample_Name"]) # Special case that when we assign fake indexes for NoIndex samples - if (set(list(lanes_samples.keys())) & set(noindex_lanes)) and index_cycles != [0, 0]: + if ( + set(list(lanes_samples.keys())) & set(noindex_lanes) + ) and index_cycles != [0, 0]: sample_counter = 1 - for entry in sorted(ssparser.data, key=lambda k: k['Lane']): - lane = entry['Lane'] - project = entry['Sample_Project'] - sample = entry['Sample_ID'] + for entry in sorted(ssparser.data, key=lambda k: k["Lane"]): + lane = entry["Lane"] + project = entry["Sample_Project"] + sample = entry["Sample_ID"] project_dest = os.path.join(demux_folder, project) if not os.path.exists(project_dest): os.makedirs(project_dest) sample_dest = os.path.join(project_dest, sample) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")): + for file in glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + f"Undetermined*L0?{lane}*", + ) + ): old_name = os.path.basename(file) old_name_comps = old_name.split("_") - new_name_comps = [sample.replace('Sample_', ''), f'S{str(sample_counter)}'] + old_name_comps[2:] + new_name_comps = [ + sample.replace("Sample_", ""), + f"S{str(sample_counter)}", + ] + old_name_comps[2:] new_name = "_".join(new_name_comps) os.symlink(file, os.path.join(sample_dest, new_name)) - logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_', ''), old_name, new_name)) + logger.info( + "For undet sample {}, renaming {} to {}".format( + sample.replace("Sample_", ""), old_name, new_name + ) + ) sample_counter += 1 # Ordinary cases else: - projects = [project for project in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) if os.path.isdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project))] + projects = [ + project + for project in os.listdir( + os.path.join(self.run_dir, f"Demultiplexing_{demux_id}") + ) + if os.path.isdir( + os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", project + ) + ) + ] for project in projects: if project in "Reports" or project in "Stats": continue - project_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project) + project_source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", project + ) project_dest = os.path.join(demux_folder, project) if not os.path.exists(project_dest): # There might be project seqeunced with multiple index lengths os.makedirs(project_dest) - samples = [sample for sample in os.listdir(project_source) if os.path.isdir(os.path.join(project_source, sample))] + samples = [ + sample + for sample in os.listdir(project_source) + if os.path.isdir(os.path.join(project_source, sample)) + ] for sample in samples: sample_source = os.path.join(project_source, sample) sample_dest = os.path.join(project_dest, sample) @@ -874,12 +1287,30 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p # There should never be the same sample sequenced with different index length, # however a sample might be pooled in several lanes and therefore 
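The "ordinary cases" branch above symlinks every project/sample FastQ file from one sub-demultiplexing folder into the aggregated Demultiplexing directory. A reduced sketch of that walk; the directory names are placeholders for self.run_dir/Demultiplexing_{demux_id} and the aggregate folder:

# Illustrative sketch only, not part of the patch.
import glob
import os

def link_fastqs(sub_demux_dir, aggregate_dir):
    """Symlink <project>/<sample>/*.fastq* from one sub-demux dir into the aggregate dir."""
    for project in os.listdir(sub_demux_dir):
        project_source = os.path.join(sub_demux_dir, project)
        # Reports/Stats are handled separately; only project folders are linked here.
        if not os.path.isdir(project_source) or project in ("Reports", "Stats"):
            continue
        for sample in os.listdir(project_source):
            sample_source = os.path.join(project_source, sample)
            if not os.path.isdir(sample_source):
                continue
            sample_dest = os.path.join(aggregate_dir, project, sample)
            os.makedirs(sample_dest, exist_ok=True)
            for fastq in glob.glob(os.path.join(sample_source, "*.fastq*")):
                os.symlink(fastq, os.path.join(sample_dest, os.path.basename(fastq)))

# link_fastqs("Demultiplexing_0", "Demultiplexing")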
sequenced using different samplesheets os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*")) + fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*")) for fastqfile in fastqfiles: - os.symlink(fastqfile, os.path.join(sample_dest, os.path.split(fastqfile)[1])) + os.symlink( + fastqfile, + os.path.join(sample_dest, os.path.split(fastqfile)[1]), + ) # Copy fastq files for undetermined and the undetermined stats for simple lanes only lanes_in_sub_samplesheet = [] - header = ['[Header]','[Data]','FCID','Lane', 'Sample_ID', 'Sample_Name', 'Sample_Ref', 'index', 'index2', 'Description', 'Control', 'Recipe', 'Operator', 'Sample_Project'] + header = [ + "[Header]", + "[Data]", + "FCID", + "Lane", + "Sample_ID", + "Sample_Name", + "Sample_Ref", + "index", + "index2", + "Description", + "Control", + "Recipe", + "Operator", + "Sample_Project", + ] with open(samplesheet) as sub_samplesheet_file: sub_samplesheet_reader = csv.reader(sub_samplesheet_file) for row in sub_samplesheet_reader: @@ -888,68 +1319,119 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p lanes_in_sub_samplesheet = list(set(lanes_in_sub_samplesheet)) for lane in lanes_in_sub_samplesheet: if lane in simple_lanes.keys(): - undetermined_fastq_files = glob.glob(os.path.join(self.run_dir, - f"Demultiplexing_{demux_id}", - f"Undetermined_S0_L00{lane}*.fastq*")) # Contains only simple lanes undetermined + undetermined_fastq_files = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + f"Undetermined_S0_L00{lane}*.fastq*", + ) + ) # Contains only simple lanes undetermined for fastqfile in undetermined_fastq_files: - os.symlink(fastqfile, os.path.join(demux_folder, os.path.split(fastqfile)[1])) - DemuxSummaryFiles = glob.glob(os.path.join(self.run_dir, - f"Demultiplexing_{demux_id}", - legacy_path, - "Stats", - f"*L{lane}*txt")) + os.symlink( + fastqfile, + os.path.join(demux_folder, os.path.split(fastqfile)[1]), + ) + DemuxSummaryFiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Stats", + f"*L{lane}*txt", + ) + ) if not os.path.exists(os.path.join(demux_folder, "Stats")): os.makedirs(os.path.join(demux_folder, "Stats")) for DemuxSummaryFile in DemuxSummaryFiles: - os.symlink(DemuxSummaryFile, os.path.join(demux_folder, "Stats", os.path.split(DemuxSummaryFile)[1])) + os.symlink( + DemuxSummaryFile, + os.path.join( + demux_folder, + "Stats", + os.path.split(DemuxSummaryFile)[1], + ), + ) return html_reports_lane, html_reports_laneBarcode, stats_json def _aggregate_demux_results_simple_complex(self): runSetup = self.runParserObj.runinfo.get_read_configuration() - demux_folder = os.path.join(self.run_dir , self.demux_dir) + demux_folder = os.path.join(self.run_dir, self.demux_dir) samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) - if self.software == 'bcl2fastq': - legacy_path = '' - elif self.software == 'bclconvert': + if self.software == "bcl2fastq": + legacy_path = "" + elif self.software == "bclconvert": legacy_path = f"Reports/{self.legacy_dir}" else: raise RuntimeError("Unrecognized software!") index_cycles = [0, 0] for read in runSetup: - if read['IsIndexedRead'] == 'Y': - if int(read['Number']) == 2: - index_cycles[0] = int(read['NumCycles']) + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) else: - index_cycles[1] = int(read['NumCycles']) + index_cycles[1] = int(read["NumCycles"]) # Classify lanes in 
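As in _classify_samples further down, the index cycle counts come from the RunInfo read configuration: indexed read number 2 fills the first slot, any later indexed read fills the second. A small sketch with a hypothetical 2x151, 10+10 read setup of the same shape:

# Illustrative sketch only, not part of the patch; the read setup below is hypothetical.
run_setup = [
    {"Number": "1", "NumCycles": "151", "IsIndexedRead": "N"},  # read 1
    {"Number": "2", "NumCycles": "10", "IsIndexedRead": "Y"},   # index 1
    {"Number": "3", "NumCycles": "10", "IsIndexedRead": "Y"},   # index 2
    {"Number": "4", "NumCycles": "151", "IsIndexedRead": "N"},  # read 2
]

index_cycles = [0, 0]
for read in run_setup:
    if read["IsIndexedRead"] == "Y":
        # Read number 2 is index 1; any later indexed read is index 2.
        if int(read["Number"]) == 2:
            index_cycles[0] = int(read["NumCycles"])
        else:
            index_cycles[1] = int(read["NumCycles"])

print(index_cycles)  # [10, 10]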
samplesheets - (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes(samplesheets) + (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes( + samplesheets + ) # Case with only one sub-demultiplexing if len(complex_lanes) == 0 and len(samplesheets) == 1: - demux_id = "0" # in this case this is the only demux dir + demux_id = "0" # in this case this is the only demux dir # Special case that when we assign fake indexes for NoIndex samples if noindex_lanes and index_cycles != [0, 0]: # We first softlink the FastQ files of undet as the FastQ files of samples - self._process_noindex_sample_with_fake_index_with_single_demux(demux_id, legacy_path) + self._process_noindex_sample_with_fake_index_with_single_demux( + demux_id, legacy_path + ) # This is the simple case, Demultiplexing dir is simply a symlink to the only sub-demultiplexing dir else: - self._process_simple_lane_with_single_demux(demux_id, legacy_path, noindex_lanes) + self._process_simple_lane_with_single_demux( + demux_id, legacy_path, noindex_lanes + ) return True # Case with multiple sub-demultiplexings - (html_reports_lane, html_reports_laneBarcode, stats_json) = self._process_demux_with_complex_lanes(samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes) + ( + html_reports_lane, + html_reports_laneBarcode, + stats_json, + ) = self._process_demux_with_complex_lanes( + samplesheets, + legacy_path, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ) # Create the html reports - self._fix_html_reports_for_complex_lanes(demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode) + self._fix_html_reports_for_complex_lanes( + demux_folder, + index_cycles, + complex_lanes, + noindex_lanes, + html_reports_lane, + html_reports_laneBarcode, + ) # Fix contents under the DemultiplexingStats folder - self._fix_demultiplexingstats_xml_dir(demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes) + self._fix_demultiplexingstats_xml_dir( + demux_folder, + stats_json, + samplesheets, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ) return True + def _create_folder_structure(root, dirs): """Creates a fodler stucture rooted in root usinf all dirs listed in dirs (a list) returns the path to the deepest directory @@ -961,49 +1443,56 @@ def _create_folder_structure(root, dirs): os.makedirs(path) return path + def _generate_lane_html(html_file, html_report_lane_parser): - with open(html_file, 'w') as html: + with open(html_file, "w") as html: # HEADER - html.write('\n') - html.write('\n') - html.write('\n') - html.write('\n') + html.write( + '\n' + ) + html.write("\n") + html.write( + '\n' + ) + html.write("\n") html.write('\n') - html.write('\n') - html.write('\n') - html.write('

C6L1WANXX /\n')
-        html.write('        [all projects] /\n')
-        html.write('        [all samples] /\n')
-        html.write('        [all barcodes]\n')
-        html.write('show barcodes\n')
+        html.write("C6L1WANXX /\n")
+        html.write("        [all projects] /\n")
+        html.write("        [all samples] /\n")
+        html.write("        [all barcodes]\n")
+        html.write("show barcodes\n")
         # FLOWCELL SUMMARY TABLE
-        html.write('Flowcell Summary\n')
+        html.write("Flowcell Summary\n")
         html.write('\n')
         fc_keys = sorted(list(html_report_lane_parser.flowcell_data.keys()))
         for key in fc_keys:
-            html.write(f'{key}\n')
+            html.write(f"{key}\n")
         for key in fc_keys:
-            html.write(f'{html_report_lane_parser.flowcell_data[key]}\n')
+            html.write(f"{html_report_lane_parser.flowcell_data[key]}\n")
         # LANE SUMMARY TABLE
-        html.write('Lane Summary\n')
+        html.write("Lane Summary\n")
         html.write('\n')
         lane_keys = sorted(list(html_report_lane_parser.sample_data[0].keys()))
         for key in lane_keys:
-            html.write(f'{key}\n')
+            html.write(f"{key}\n")
         for sample in html_report_lane_parser.sample_data:
             for key in lane_keys:
-                html.write(f'{sample[key]}\n')
+                html.write(f"{sample[key]}\n")
         # FOOTER
-        html.write('\n')
-        html.write('\n')
-        html.write('\n')
+        html.write("
\n") + html.write("\n") + html.write("\n") diff --git a/taca/illumina/Standard_Runs.py b/taca/illumina/Standard_Runs.py index 98741bb4..7f051d66 100755 --- a/taca/illumina/Standard_Runs.py +++ b/taca/illumina/Standard_Runs.py @@ -11,15 +11,14 @@ logger = logging.getLogger(__name__) -TENX_SINGLE_PAT = re.compile('SI-(?:GA|NA)-[A-H][1-9][0-2]?') -TENX_DUAL_PAT = re.compile('SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?') -SMARTSEQ_PAT = re.compile('SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]') -IDT_UMI_PAT = re.compile('([ATCG]{4,}N+$)') -RECIPE_PAT = re.compile('[0-9]+-[0-9]+') +TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?") +TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?") +SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]") +IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)") +RECIPE_PAT = re.compile("[0-9]+-[0-9]+") class Standard_Run(Run): - def __init__(self, run_dir, software, configuration): super().__init__(run_dir, software, configuration) self._set_sequencer_type() @@ -27,53 +26,63 @@ def __init__(self, run_dir, software, configuration): self._copy_samplesheet() def _set_sequencer_type(self): - self.sequencer_type = '' + self.sequencer_type = "" def _set_run_type(self): - self.run_type = 'NGI-RUN' + self.run_type = "NGI-RUN" def _copy_samplesheet(self): - ssname = self._get_samplesheet() + ssname = self._get_samplesheet() ssparser = SampleSheetParser(ssname) indexfile = dict() runSetup = self.runParserObj.runinfo.get_read_configuration() # Loading index files try: - indexfile['tenX'] = self.CONFIG[self.software]['tenX_index_path'] + indexfile["tenX"] = self.CONFIG[self.software]["tenX_index_path"] except KeyError: - logger.error('Path to index file (10X) not found in the config file') + logger.error("Path to index file (10X) not found in the config file") raise RuntimeError try: - indexfile['smartseq'] = self.CONFIG[self.software]['smartseq_index_path'] + indexfile["smartseq"] = self.CONFIG[self.software]["smartseq_index_path"] except KeyError: - logger.error('Path to index file (Smart-seq) not found in the config file') + logger.error("Path to index file (Smart-seq) not found in the config file") raise RuntimeError # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default) # If this is not the case then create it and take special care of modification to be done on the SampleSheet - samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet.csv') + samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv") # Function that goes through the original sample sheet and check for sample types self.sample_table = self._classify_samples(indexfile, ssparser, runSetup) # Check that the samplesheet is not already present. 
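The module-level patterns at the top of Standard_Runs.py drive the sample classification below. A quick, self-contained check of what each pattern is meant to pick up; the example index and recipe strings are made up:

# Illustrative sketch only, not part of the patch; example values are made up.
import re

TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?")
TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?")
SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]")
IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")
RECIPE_PAT = re.compile("[0-9]+-[0-9]+")

examples = {
    "SI-GA-A1": TENX_SINGLE_PAT,      # 10X single index
    "SI-TT-B3": TENX_DUAL_PAT,        # 10X dual index
    "SMARTSEQ2-12A": SMARTSEQ_PAT,    # Smart-seq plate/well index
    "ACGTACGTNNNNNNNN": IDT_UMI_PAT,  # IDT index with a UMI part (trailing Ns)
    "151-151": RECIPE_PAT,            # Recipe field: read1-read2 cycles
}
for value, pattern in examples.items():
    assert pattern.findall(value), value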
In this case go the next step if not os.path.exists(samplesheet_dest): try: - with open(samplesheet_dest, 'w') as fcd: - fcd.write(self._generate_clean_samplesheet(ssparser, - indexfile, - fields_to_remove=None, - rename_samples=True, - rename_qPCR_suffix = True, - fields_qPCR=[ssparser.dfield_snm])) + with open(samplesheet_dest, "w") as fcd: + fcd.write( + self._generate_clean_samplesheet( + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=True, + fields_qPCR=[ssparser.dfield_snm], + ) + ) except Exception as e: - logger.error(f'Encountered the following exception {e}') + logger.error(f"Encountered the following exception {e}") return False - logger.info(f'Created SampleSheet.csv for Flowcell {self.id} in {samplesheet_dest} ') + logger.info( + f"Created SampleSheet.csv for Flowcell {self.id} in {samplesheet_dest} " + ) # SampleSheet.csv generated # When demultiplexing SampleSheet.csv is the one I need to use # Need to rewrite so that SampleSheet_0.csv is always used. - self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet.csv')) - if not self.runParserObj.obj.get('samplesheet_csv'): - self.runParserObj.obj['samplesheet_csv'] = self.runParserObj.samplesheet.data + self.runParserObj.samplesheet = SampleSheetParser( + os.path.join(self.run_dir, "SampleSheet.csv") + ) + if not self.runParserObj.obj.get("samplesheet_csv"): + self.runParserObj.obj[ + "samplesheet_csv" + ] = self.runParserObj.samplesheet.data def _parse_10X_indexes(self, indexfile): """ @@ -83,7 +92,7 @@ def _parse_10X_indexes(self, indexfile): index_dict = {} with open(indexfile) as f: for line in f: - line_ = line.rstrip().split(',') + line_ = line.rstrip().split(",") index_dict[line_[0]] = line_[1:5] return index_dict @@ -95,105 +104,138 @@ def _parse_smartseq_indexes(self, indexfile): index_dict = {} with open(indexfile) as f: for line in f: - line_ = line.rstrip().split(',') + line_ = line.rstrip().split(",") if index_dict.get(line_[0]): - index_dict[line_[0]].append((line_[1],line_[2])) + index_dict[line_[0]].append((line_[1], line_[2])) else: - index_dict.update({line_[0]:[(line_[1],line_[2])]}) + index_dict.update({line_[0]: [(line_[1], line_[2])]}) return index_dict def _classify_samples(self, indexfile, ssparser, runSetup): """Given an ssparser object, go through all samples and decide sample types.""" sample_table = dict() - index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) - index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq']) + index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"]) + index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"]) index_cycles = [0, 0] read_cycles = [0, 0] for read in runSetup: - if read['IsIndexedRead'] == 'Y': - if int(read['Number']) == 2: - index_cycles[0] = int(read['NumCycles']) + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) else: - index_cycles[1] = int(read['NumCycles']) - elif read['IsIndexedRead'] == 'N': - if int(read['Number']) == 1: - read_cycles[0] = int(read['NumCycles']) + index_cycles[1] = int(read["NumCycles"]) + elif read["IsIndexedRead"] == "N": + if int(read["Number"]) == 1: + read_cycles[0] = int(read["NumCycles"]) else: - read_cycles[1] = int(read['NumCycles']) + read_cycles[1] = int(read["NumCycles"]) for sample in ssparser.data: - lane = sample['Lane'] - sample_name = sample.get('Sample_Name') or sample.get('SampleName') + lane = sample["Lane"] + sample_name = 
sample.get("Sample_Name") or sample.get("SampleName") umi_length = [0, 0] read_length = read_cycles # Read the length of read 1 and read 2 from the field Recipe - if sample.get('Recipe') and RECIPE_PAT.findall(sample.get('Recipe')): - ss_read_length = [int(sample.get('Recipe').split('-')[0]), int(sample.get('Recipe').split('-')[1])] + if sample.get("Recipe") and RECIPE_PAT.findall(sample.get("Recipe")): + ss_read_length = [ + int(sample.get("Recipe").split("-")[0]), + int(sample.get("Recipe").split("-")[1]), + ] else: ss_read_length = [0, 0] # By default use the read cycles from the sequncing setup. Otherwise use the shorter read length if ss_read_length != [0, 0]: read_length = [min(rd) for rd in zip(ss_read_length, read_length)] # 10X single index - if TENX_SINGLE_PAT.findall(sample['index']): - index_length = [len(index_dict_tenX[sample['index']][0]),0] - sample_type = '10X_SINGLE' + if TENX_SINGLE_PAT.findall(sample["index"]): + index_length = [len(index_dict_tenX[sample["index"]][0]), 0] + sample_type = "10X_SINGLE" # 10X dual index - elif TENX_DUAL_PAT.findall(sample['index']): - index_length = [len(index_dict_tenX[sample['index']][0]),len(index_dict_tenX[sample['index']][1])] - sample_type = '10X_DUAL' + elif TENX_DUAL_PAT.findall(sample["index"]): + index_length = [ + len(index_dict_tenX[sample["index"]][0]), + len(index_dict_tenX[sample["index"]][1]), + ] + sample_type = "10X_DUAL" # IDT UMI samples - elif IDT_UMI_PAT.findall(sample['index']) or IDT_UMI_PAT.findall(sample['index2']): + elif IDT_UMI_PAT.findall(sample["index"]) or IDT_UMI_PAT.findall( + sample["index2"] + ): # Index length after removing "N" part - index_length = [len(sample['index'].replace('N', '')), - len(sample['index2'].replace('N', ''))] - sample_type = 'IDT_UMI' - umi_length = [sample['index'].upper().count('N'), sample['index2'].upper().count('N')] + index_length = [ + len(sample["index"].replace("N", "")), + len(sample["index2"].replace("N", "")), + ] + sample_type = "IDT_UMI" + umi_length = [ + sample["index"].upper().count("N"), + sample["index2"].upper().count("N"), + ] # Smart-seq - elif SMARTSEQ_PAT.findall(sample['index']): - smartseq_index = sample['index'].split('-')[1] - index_length = [len(index_dict_smartseq[smartseq_index][0][0]),len(index_dict_smartseq[smartseq_index][0][1])] - sample_type = 'SMARTSEQ' + elif SMARTSEQ_PAT.findall(sample["index"]): + smartseq_index = sample["index"].split("-")[1] + index_length = [ + len(index_dict_smartseq[smartseq_index][0][0]), + len(index_dict_smartseq[smartseq_index][0][1]), + ] + sample_type = "SMARTSEQ" # No Index case 1. We will write indexes to separate FastQ files - elif sample['index'].upper() == 'NOINDEX' and index_cycles != [0, 0]: + elif sample["index"].upper() == "NOINDEX" and index_cycles != [0, 0]: index_length = index_cycles - sample_type = 'NOINDEX' + sample_type = "NOINDEX" # No Index case 2. 
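For the IDT_UMI branch above, the index length is the index with its "N" (UMI) positions stripped, and the UMI length is the count of those "N"s. A tiny worked example with invented index strings:

# Illustrative sketch only, not part of the patch; index strings are invented.
index1, index2 = "ACGTACGTNNNNNNNN", "TGCATGCA"
index_length = [len(index1.replace("N", "")), len(index2.replace("N", ""))]
umi_length = [index1.upper().count("N"), index2.upper().count("N")]
print(index_length, umi_length)  # [8, 8] [8, 0]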
Both index 1 and 2 are empty, it will be the same index type but will be handled in the next case - elif sample['index'].upper() == 'NOINDEX' and index_cycles == [0, 0]: + elif sample["index"].upper() == "NOINDEX" and index_cycles == [0, 0]: index_length = [0, 0] - sample_type = 'ordinary' + sample_type = "ordinary" # Ordinary samples else: - index_length = [len(sample['index']),len(sample['index2'])] + index_length = [len(sample["index"]), len(sample["index2"])] # Short single index (<=6nt) - if (index_length[0] <= 8 and index_length[1] == 0) or (index_length[0] == 0 and index_length[1] <= 8): - sample_type = 'short_single_index' + if (index_length[0] <= 8 and index_length[1] == 0) or ( + index_length[0] == 0 and index_length[1] <= 8 + ): + sample_type = "short_single_index" else: - sample_type = 'ordinary' + sample_type = "ordinary" # Write in sample table # {'1': [('101', {'sample_type': 'ordinary', 'index_length': [8, 8]}), ('102', {'sample_type': 'ordinary', 'index_length': [8, 8]})]} if sample_table.get(lane): - sample_table[lane].append((sample_name, - {'sample_type': sample_type, - 'index_length': index_length, - 'umi_length': umi_length, - 'read_length': read_length})) + sample_table[lane].append( + ( + sample_name, + { + "sample_type": sample_type, + "index_length": index_length, + "umi_length": umi_length, + "read_length": read_length, + }, + ) + ) else: - sample_table.update({lane:[(sample_name, - {'sample_type': sample_type, - 'index_length': index_length, - 'umi_length': umi_length, - 'read_length': read_length})]}) + sample_table.update( + { + lane: [ + ( + sample_name, + { + "sample_type": sample_type, + "index_length": index_length, + "umi_length": umi_length, + "read_length": read_length, + }, + ) + ] + } + ) return sample_table - def demultiplex_run(self): """ - Demultiplex a run: - - Make sub-samplesheet based on sample classes - - Decide correct bcl2fastq/bclconvert command parameters based on sample classes - - run bcl2fastq/bclconvert conversion + Demultiplex a run: + - Make sub-samplesheet based on sample classes + - Decide correct bcl2fastq/bclconvert command parameters based on sample classes + - run bcl2fastq/bclconvert conversion """ runSetup = self.runParserObj.runinfo.get_read_configuration() # Check sample types @@ -201,7 +243,7 @@ def demultiplex_run(self): for lane, lane_contents in self.sample_table.items(): for sample in lane_contents: sample_detail = sample[1] - sample_type = sample_detail['sample_type'] + sample_type = sample_detail["sample_type"] if sample_type not in sample_type_list: sample_type_list.append(sample_type) @@ -213,21 +255,43 @@ def demultiplex_run(self): for lane, lane_contents in self.sample_table.items(): for sample in lane_contents: sample_detail = sample[1] - sample_type_t = sample_detail['sample_type'] - sample_index_length = sample_detail['index_length'] - sample_umi_length = sample_detail['umi_length'] - sample_read_length = sample_detail['read_length'] + sample_type_t = sample_detail["sample_type"] + sample_index_length = sample_detail["index_length"] + sample_umi_length = sample_detail["umi_length"] + sample_read_length = sample_detail["read_length"] if sample_type_t == sample_type: if lane_table.get(lane): - if (sample_index_length, sample_umi_length, sample_read_length) not in lane_table[lane]: - lane_table[lane].append((sample_index_length, sample_umi_length, sample_read_length)) + if ( + sample_index_length, + sample_umi_length, + sample_read_length, + ) not in lane_table[lane]: + lane_table[lane].append( + ( + 
sample_index_length, + sample_umi_length, + sample_read_length, + ) + ) else: - lane_table.update({lane:[(sample_index_length, sample_umi_length, sample_read_length)]}) + lane_table.update( + { + lane: [ + ( + sample_index_length, + sample_umi_length, + sample_read_length, + ) + ] + } + ) # Determine the number of demux needed for the same sample type - if self.software == 'bcl2fastq': - demux_number_with_the_same_sample_type = len(max([v for k, v in lane_table.items()],key=len)) - elif self.software == 'bclconvert': + if self.software == "bcl2fastq": + demux_number_with_the_same_sample_type = len( + max([v for k, v in lane_table.items()], key=len) + ) + elif self.software == "bclconvert": unique_masks = [] for masks in lane_table.values(): for mask in masks: @@ -235,33 +299,44 @@ def demultiplex_run(self): unique_masks.append(mask) demux_number_with_the_same_sample_type = len(unique_masks) # Prepare sub-samplesheets, masks and commands - for i in range(0,demux_number_with_the_same_sample_type): + for i in range(0, demux_number_with_the_same_sample_type): # Prepare sub-samplesheet # A dictionary with lane and sample IDs to include samples_to_include = dict() # A dictionary with lane and index length for generating masks mask_table = dict() - if self.software == 'bcl2fastq': + if self.software == "bcl2fastq": for lane, lane_contents in self.sample_table.items(): try: - (index_length, umi_length, read_length) = lane_table[lane][i] - mask_table.update({lane: (index_length, umi_length, read_length)}) + (index_length, umi_length, read_length) = lane_table[lane][ + i + ] + mask_table.update( + {lane: (index_length, umi_length, read_length)} + ) for sample in lane_contents: sample_name = sample[0] sample_detail = sample[1] - sample_type_t = sample_detail['sample_type'] - sample_index_length = sample_detail['index_length'] - sample_umi_length = sample_detail['umi_length'] - sample_read_length = sample_detail['read_length'] - if sample_type_t == sample_type and sample_index_length == index_length and sample_umi_length == umi_length and sample_read_length == read_length: + sample_type_t = sample_detail["sample_type"] + sample_index_length = sample_detail["index_length"] + sample_umi_length = sample_detail["umi_length"] + sample_read_length = sample_detail["read_length"] + if ( + sample_type_t == sample_type + and sample_index_length == index_length + and sample_umi_length == umi_length + and sample_read_length == read_length + ): if samples_to_include.get(lane): samples_to_include[lane].append(sample_name) else: - samples_to_include.update({lane:[sample_name]}) + samples_to_include.update({lane: [sample_name]}) except (KeyError, IndexError): - logger.info(f'No corresponding mask in lane {lane}. Skip it.') + logger.info( + f"No corresponding mask in lane {lane}. Skip it." 
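The bookkeeping above reduces to: collect the distinct (index_length, umi_length, read_length) masks per lane, then run one demultiplexing round per mask slot (the longest per-lane list for bcl2fastq, the globally unique masks for bclconvert). A compact sketch with invented masks:

# Illustrative sketch only, not part of the patch; the mask tuples are invented.
lane_table = {
    "1": [([8, 8], [0, 0], [151, 151])],
    "2": [([8, 8], [0, 0], [151, 151]), ([10, 10], [0, 0], [151, 151])],
}

# bcl2fastq: one round per mask slot, driven by the lane that needs the most masks.
rounds_bcl2fastq = len(max(lane_table.values(), key=len))

# bclconvert: one round per globally unique mask.
unique_masks = []
for masks in lane_table.values():
    for mask in masks:
        if mask not in unique_masks:
            unique_masks.append(mask)
rounds_bclconvert = len(unique_masks)

print(rounds_bcl2fastq, rounds_bclconvert)  # 2 2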
+ ) continue - elif self.software == 'bclconvert': + elif self.software == "bclconvert": mask = unique_masks[i] for lane, lane_contents in self.sample_table.items(): if lane_table.get(lane): @@ -270,17 +345,24 @@ def demultiplex_run(self): for sample in lane_contents: sample_name = sample[0] sample_detail = sample[1] - sample_type_t = sample_detail['sample_type'] - sample_index_length = sample_detail['index_length'] - sample_umi_length = sample_detail['umi_length'] - sample_read_length = sample_detail['read_length'] - if sample_type_t == sample_type and sample_index_length == mask[0] and sample_umi_length == mask[1] and sample_read_length == mask[2]: + sample_type_t = sample_detail["sample_type"] + sample_index_length = sample_detail["index_length"] + sample_umi_length = sample_detail["umi_length"] + sample_read_length = sample_detail["read_length"] + if ( + sample_type_t == sample_type + and sample_index_length == mask[0] + and sample_umi_length == mask[1] + and sample_read_length == mask[2] + ): if samples_to_include.get(lane): samples_to_include[lane].append(sample_name) else: - samples_to_include.update({lane:[sample_name]}) + samples_to_include.update( + {lane: [sample_name]} + ) - if self.software == 'bclconvert': + if self.software == "bclconvert": runSetup = self.runParserObj.runinfo.get_read_configuration() (index_length, umi_length, read_length) = mask index1_size = int(index_length[0]) @@ -290,36 +372,61 @@ def demultiplex_run(self): read1_size = int(read_length[0]) read2_size = int(read_length[1]) is_dual_index = False - if (index1_size != 0 and index2_size != 0) or (index1_size == 0 and index2_size != 0): + if (index1_size != 0 and index2_size != 0) or ( + index1_size == 0 and index2_size != 0 + ): is_dual_index = True - base_mask = self._compute_base_mask(runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size) + base_mask = self._compute_base_mask( + runSetup, + sample_type, + index1_size, + is_dual_index, + index2_size, + umi1_size, + umi2_size, + read1_size, + read2_size, + ) else: index1_size = 0 index2_size = 0 base_mask = [] # Make sub-samplesheet with chdir(self.run_dir): - samplesheet_dest=f'SampleSheet_{bcl_cmd_counter}.csv' - with open(samplesheet_dest, 'w') as fcd: - fcd.write(self._generate_samplesheet_subset(self.runParserObj.samplesheet, - samples_to_include, runSetup, self.software, sample_type, index1_size, index2_size, base_mask, self.CONFIG)) + samplesheet_dest = f"SampleSheet_{bcl_cmd_counter}.csv" + with open(samplesheet_dest, "w") as fcd: + fcd.write( + self._generate_samplesheet_subset( + self.runParserObj.samplesheet, + samples_to_include, + runSetup, + self.software, + sample_type, + index1_size, + index2_size, + base_mask, + self.CONFIG, + ) + ) # Prepare demultiplexing dir with chdir(self.run_dir): # Create Demultiplexing dir, this changes the status to IN_PROGRESS - if not os.path.exists('Demultiplexing'): - os.makedirs('Demultiplexing') + if not os.path.exists("Demultiplexing"): + os.makedirs("Demultiplexing") # Prepare demultiplexing command with chdir(self.run_dir): - cmd = self.generate_bcl_command(sample_type, - mask_table, - bcl_cmd_counter) - misc.call_external_command_detached(cmd, - with_log_files = True, - prefix=f'demux_{bcl_cmd_counter}') - logger.info('BCL to FASTQ conversion and demultiplexing ' \ - f'started for run {os.path.basename(self.id)} on {datetime.now()}') + cmd = self.generate_bcl_command( + sample_type, mask_table, bcl_cmd_counter + ) + 
misc.call_external_command_detached( + cmd, with_log_files=True, prefix=f"demux_{bcl_cmd_counter}" + ) + logger.info( + "BCL to FASTQ conversion and demultiplexing " + f"started for run {os.path.basename(self.id)} on {datetime.now()}" + ) # Demutiplexing done for one mask type and scripts will continue # Working with the next type. Command counter should increase by 1 @@ -335,47 +442,59 @@ def _aggregate_demux_results(self): def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter): with chdir(self.run_dir): # Software - cl = [self.CONFIG.get(self.software)['bin']] + cl = [self.CONFIG.get(self.software)["bin"]] # Case with bcl2fastq - if self.software == 'bcl2fastq': - logger.info('Building a bcl2fastq command') - per_lane_base_masks = self._generate_per_lane_base_mask(sample_type, mask_table) + if self.software == "bcl2fastq": + logger.info("Building a bcl2fastq command") + per_lane_base_masks = self._generate_per_lane_base_mask( + sample_type, mask_table + ) # Add the base_mask for each lane lanes = list(mask_table.keys()) for lane in sorted(lanes): # Iterate thorugh each lane and add the correct --use-bases-mask for that lane - base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # Get the base_mask - base_mask_expr = f'{lane}:' + ','.join(base_mask) - cl.extend(['--use-bases-mask', base_mask_expr]) + base_mask = [ + per_lane_base_masks[lane][bm]["base_mask"] + for bm in per_lane_base_masks[lane] + ][0] # Get the base_mask + base_mask_expr = f"{lane}:" + ",".join(base_mask) + cl.extend(["--use-bases-mask", base_mask_expr]) # Case with bclconvert - elif self.software == 'bclconvert': - logger.info('Building a bclconvert command') - cl.extend(['--bcl-input-directory', self.run_dir]) + elif self.software == "bclconvert": + logger.info("Building a bclconvert command") + cl.extend(["--bcl-input-directory", self.run_dir]) else: raise RuntimeError("Unrecognized software!") # Output dir - output_dir = os.path.join(self.run_dir, f'Demultiplexing_{bcl_cmd_counter}') + output_dir = os.path.join(self.run_dir, f"Demultiplexing_{bcl_cmd_counter}") if not os.path.exists(output_dir): os.makedirs(output_dir) - cl.extend(['--output-dir', output_dir]) + cl.extend(["--output-dir", output_dir]) # Samplesheet - cl.extend(['--sample-sheet', os.path.join(os.path.join(self.run_dir, f'SampleSheet_{bcl_cmd_counter}.csv'))]) + cl.extend( + [ + "--sample-sheet", + os.path.join( + os.path.join(self.run_dir, f"SampleSheet_{bcl_cmd_counter}.csv") + ), + ] + ) # Demux options cl_options = [] - if 'options' in self.CONFIG.get(self.software): - if self.CONFIG[self.software]['options'].get('common'): - for option in self.CONFIG[self.software]['options']['common']: + if "options" in self.CONFIG.get(self.software): + if self.CONFIG[self.software]["options"].get("common"): + for option in self.CONFIG[self.software]["options"]["common"]: cl_options.extend([option]) - if self.CONFIG[self.software]['options'].get(sample_type): - for option in self.CONFIG[self.software]['options'][sample_type]: + if self.CONFIG[self.software]["options"].get(sample_type): + for option in self.CONFIG[self.software]["options"][sample_type]: cl_options.extend([option]) for option in cl_options: if isinstance(option, dict): opt, val = list(option.items())[0] - if 'output-dir' not in opt: - cl.extend([f'--{opt}', str(val).lower()]) + if "output-dir" not in opt: + cl.extend([f"--{opt}", str(val).lower()]) else: - cl.append(f'--{option}') + cl.append(f"--{option}") return cl def 
_generate_per_lane_base_mask(self, sample_type, mask_table): @@ -407,199 +526,302 @@ def _generate_per_lane_base_mask(self, sample_type, mask_table): read1_size = lane_contents[2][0] read2_size = lane_contents[2][1] is_dual_index = False - if (index1_size != 0 and index2_size != 0) or (index1_size == 0 and index2_size != 0): + if (index1_size != 0 and index2_size != 0) or ( + index1_size == 0 and index2_size != 0 + ): is_dual_index = True # Compute the basemask - base_mask = self._compute_base_mask(runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size) - base_mask_string = ''.join(base_mask) - - base_masks[lane][base_mask_string] = {'base_mask':base_mask} + base_mask = self._compute_base_mask( + runSetup, + sample_type, + index1_size, + is_dual_index, + index2_size, + umi1_size, + umi2_size, + read1_size, + read2_size, + ) + base_mask_string = "".join(base_mask) + + base_masks[lane][base_mask_string] = {"base_mask": base_mask} return base_masks - def _compute_base_mask(self, runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size): + def _compute_base_mask( + self, + runSetup, + sample_type, + index1_size, + is_dual_index, + index2_size, + umi1_size, + umi2_size, + read1_size, + read2_size, + ): """ - Assumptions: - - if runSetup is of size 3, then single index run - - if runSetup is of size 4, then dual index run + Assumptions: + - if runSetup is of size 3, then single index run + - if runSetup is of size 4, then dual index run """ bm = [] if len(runSetup) > 4: - raise RuntimeError("when generating base_masks looks like there are" \ - " more than 4 reads in the RunSetup.xml") + raise RuntimeError( + "when generating base_masks looks like there are" + " more than 4 reads in the RunSetup.xml" + ) for read in runSetup: - cycles = int(read['NumCycles']) - if read['IsIndexedRead'] == 'N': + cycles = int(read["NumCycles"]) + if read["IsIndexedRead"] == "N": # Prepare the base mask for the 1st read - is_first_read = int(read['Number']) == 1 + is_first_read = int(read["Number"]) == 1 if is_first_read: if cycles > read1_size: r_remainder = cycles - read1_size if read1_size != 0: - bm.append('Y' + str(read1_size) + 'N' + str(r_remainder)) + bm.append("Y" + str(read1_size) + "N" + str(r_remainder)) else: - bm.append('N' + str(cycles)) + bm.append("N" + str(cycles)) else: - bm.append('Y' + str(cycles)) + bm.append("Y" + str(cycles)) else: if cycles > read2_size: r_remainder = cycles - read2_size if read2_size != 0: - bm.append('Y' + str(read2_size) + 'N' + str(r_remainder)) + bm.append("Y" + str(read2_size) + "N" + str(r_remainder)) else: - bm.append('N' + str(cycles)) + bm.append("N" + str(cycles)) else: - bm.append('Y' + str(cycles)) + bm.append("Y" + str(cycles)) else: - is_first_index_read = int(read['Number']) == 2 + is_first_index_read = int(read["Number"]) == 2 # Prepare the base mask for the 1st index read if is_first_index_read: # The size of the index of the sample sheet is larger than the # one specified by RunInfo.xml, somethig must be wrong if index1_size > cycles: - raise RuntimeError("when generating base_masks found index 1 in" \ - " samplesheet larger than the index specifed in RunInfo.xml") + raise RuntimeError( + "when generating base_masks found index 1 in" + " samplesheet larger than the index specifed in RunInfo.xml" + ) i_remainder = cycles - index1_size if i_remainder > 0: - if sample_type == 'IDT_UMI': # Case of IDT UMI + if sample_type == "IDT_UMI": # Case of IDT UMI 
if umi1_size != 0: if i_remainder - umi1_size > 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index1_size) + 'Y' + str(umi1_size) + 'N' + str(i_remainder - umi1_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index1_size) + 'U' + str(umi1_size) + 'N' + str(i_remainder - umi1_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index1_size) + + "Y" + + str(umi1_size) + + "N" + + str(i_remainder - umi1_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index1_size) + + "U" + + str(umi1_size) + + "N" + + str(i_remainder - umi1_size) + ) else: raise RuntimeError("Unrecognized software!") elif i_remainder - umi1_size == 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index1_size) + 'Y' + str(umi1_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index1_size) + 'U' + str(umi1_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index1_size) + + "Y" + + str(umi1_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index1_size) + + "U" + + str(umi1_size) + ) else: raise RuntimeError("Unrecognized software!") else: - raise RuntimeError("when generating base_masks for UMI samples" \ - " some UMI1 length is longer than specified in RunInfo.xml") + raise RuntimeError( + "when generating base_masks for UMI samples" + " some UMI1 length is longer than specified in RunInfo.xml" + ) else: - bm.append('I' + str(index1_size) + 'N' + str(i_remainder)) + bm.append( + "I" + str(index1_size) + "N" + str(i_remainder) + ) elif index1_size == 0: - bm.append('N' + str(cycles)) # Case of NoIndex + bm.append("N" + str(cycles)) # Case of NoIndex else: - bm.append('I' + str(index1_size) + 'N' + str(i_remainder)) + bm.append("I" + str(index1_size) + "N" + str(i_remainder)) else: - bm.append('I' + str(cycles)) + bm.append("I" + str(cycles)) else: # The size of the index of the sample sheet is larger than the # one specified by RunInfo.xml, somethig must be wrong if index2_size > cycles: - raise RuntimeError("when generating base_masks found index 2 in" \ - " samplesheet larger than the index specifed in RunInfo.xml") + raise RuntimeError( + "when generating base_masks found index 2 in" + " samplesheet larger than the index specifed in RunInfo.xml" + ) # When working on the second read index I need to know if the sample is dual index or not - if is_dual_index or sample_type == '10X_SINGLE': - if sample_type == '10X_SINGLE': # Case of 10X single indexes, demultiplex the whole index 2 cycles as FastQ for bcl2fastq. But this has to be ignored for bclconvert - if self.software == 'bcl2fastq': - bm.append('Y' + str(cycles)) - elif self.software == 'bclconvert': - bm.append('N' + str(cycles)) + if is_dual_index or sample_type == "10X_SINGLE": + if ( + sample_type == "10X_SINGLE" + ): # Case of 10X single indexes, demultiplex the whole index 2 cycles as FastQ for bcl2fastq. 
But this has to be ignored for bclconvert + if self.software == "bcl2fastq": + bm.append("Y" + str(cycles)) + elif self.software == "bclconvert": + bm.append("N" + str(cycles)) else: raise RuntimeError("Unrecognized software!") else: i_remainder = cycles - index2_size if i_remainder > 0: - if sample_type == 'IDT_UMI': # Case of IDT UMI + if sample_type == "IDT_UMI": # Case of IDT UMI if umi2_size != 0: if i_remainder - umi2_size > 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index2_size) + 'Y' + str(umi2_size) + 'N' + str(i_remainder - umi2_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index2_size) + 'U' + str(umi2_size) + 'N' + str(i_remainder - umi2_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index2_size) + + "Y" + + str(umi2_size) + + "N" + + str(i_remainder - umi2_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index2_size) + + "U" + + str(umi2_size) + + "N" + + str(i_remainder - umi2_size) + ) else: - raise RuntimeError("Unrecognized software!") + raise RuntimeError( + "Unrecognized software!" + ) elif i_remainder - umi2_size == 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index2_size) + 'Y' + str(umi2_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index2_size) + 'U' + str(umi2_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index2_size) + + "Y" + + str(umi2_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index2_size) + + "U" + + str(umi2_size) + ) else: - raise RuntimeError("Unrecognized software!") + raise RuntimeError( + "Unrecognized software!" + ) else: - raise RuntimeError("when generating base_masks for UMI samples" \ - " some UMI2 length is longer than specified in RunInfo.xml") + raise RuntimeError( + "when generating base_masks for UMI samples" + " some UMI2 length is longer than specified in RunInfo.xml" + ) else: - bm.append('I' + str(index2_size) + 'N' + str(i_remainder)) + bm.append( + "I" + + str(index2_size) + + "N" + + str(i_remainder) + ) elif index2_size == 0: - bm.append('N' + str(cycles)) + bm.append("N" + str(cycles)) else: - bm.append('I' + str(index2_size) + 'N' + str(i_remainder)) + bm.append( + "I" + str(index2_size) + "N" + str(i_remainder) + ) else: - bm.append('I' + str(cycles)) + bm.append("I" + str(cycles)) else: - # If this sample is not dual index but the run is, - # then I need to ignore the second index completely - bm.append('N' + str(cycles)) + # If this sample is not dual index but the run is, + # then I need to ignore the second index completely + bm.append("N" + str(cycles)) return bm - - def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None, rename_samples=True, rename_qPCR_suffix = False, fields_qPCR= None): + def _generate_clean_samplesheet( + self, + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=False, + fields_qPCR=None, + ): """Generate a 'clean' samplesheet, the given fields will be removed. If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name Will also replace 10X or Smart-seq indicies (e.g. 
SI-GA-A3 into TGTGCGGG) """ - output = '' + output = "" # Expand the ssparser if there are lanes with 10X or Smart-seq samples - index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) - index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq']) + index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"]) + index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"]) # Replace 10X or Smart-seq indices for sample in ssparser.data: - if sample['index'] in index_dict_tenX.keys(): - tenX_index = sample['index'] + if sample["index"] in index_dict_tenX.keys(): + tenX_index = sample["index"] # In the case of 10X dual indexes, replace index and index2 if TENX_DUAL_PAT.findall(tenX_index): - sample['index'] = index_dict_tenX[tenX_index][0] - sample['index2'] = index_dict_tenX[tenX_index][1] + sample["index"] = index_dict_tenX[tenX_index][0] + sample["index2"] = index_dict_tenX[tenX_index][1] # In the case of 10X single indexes, replace the index name with the 4 actual indicies else: x = 0 indices_number = len(index_dict_tenX[tenX_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_tenX[tenX_index][x] + new_sample["index"] = index_dict_tenX[tenX_index][x] ssparser.data.append(new_sample) x += 1 # Set the original 10X index to the 4th correct index - sample['index'] = index_dict_tenX[tenX_index][x] - elif SMARTSEQ_PAT.findall(sample['index']): + sample["index"] = index_dict_tenX[tenX_index][x] + elif SMARTSEQ_PAT.findall(sample["index"]): x = 0 - smartseq_index = sample['index'].split('-')[1] + smartseq_index = sample["index"].split("-")[1] indices_number = len(index_dict_smartseq[smartseq_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_smartseq[smartseq_index][x][0] - new_sample['index2'] = index_dict_smartseq[smartseq_index][x][1] + new_sample["index"] = index_dict_smartseq[smartseq_index][x][0] + new_sample["index2"] = index_dict_smartseq[smartseq_index][x][1] ssparser.data.append(new_sample) x += 1 - sample['index'] = index_dict_smartseq[smartseq_index][x][0] - sample['index2'] = index_dict_smartseq[smartseq_index][x][1] + sample["index"] = index_dict_smartseq[smartseq_index][x][0] + sample["index2"] = index_dict_smartseq[smartseq_index][x][1] # Sort to get the added indicies from 10x in the right place # Python 3 doesn't support sorting a list of dicts implicitly. 
Sort by lane and then Sample_ID - ssparser.data.sort(key=lambda item: (item.get('Lane'), item.get('Sample_ID'))) + ssparser.data.sort(key=lambda item: (item.get("Lane"), item.get("Sample_ID"))) if not fields_to_remove: fields_to_remove = [] # Header - output += f'[Header]{os.linesep}' + output += f"[Header]{os.linesep}" for field in sorted(ssparser.header): - output += f'{field.rstrip()},{ssparser.header[field].rstrip()}' + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" output += os.linesep # Data - output += f'[Data]{os.linesep}' + output += f"[Data]{os.linesep}" datafields = [] for field in ssparser.datafields: if field not in fields_to_remove: datafields.append(field) - output += ','.join(datafields) + output += ",".join(datafields) output += os.linesep for line in ssparser.data: line_ar = [] @@ -609,79 +831,108 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None try: if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR: # Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix - value = re.sub('__qPCR_$', '', f'Sample_{line[ssparser.dfield_snm]}') + value = re.sub( + "__qPCR_$", "", f"Sample_{line[ssparser.dfield_snm]}" + ) else: # Substitute SampleID with SampleName, add Sample_ as prefix - value =f'Sample_{line[ssparser.dfield_snm]}' + value = f"Sample_{line[ssparser.dfield_snm]}" except: - # Otherwise add Sample_ as prefix - value = f'Sample_{line[ssparser.dfield_sid]}' + # Otherwise add Sample_ as prefix + value = f"Sample_{line[ssparser.dfield_sid]}" elif rename_qPCR_suffix and field in fields_qPCR: - value = re.sub('__qPCR_$', '', line[field]) + value = re.sub("__qPCR_$", "", line[field]) line_ar.append(value) - output += ','.join(line_ar) + output += ",".join(line_ar) output += os.linesep return output - def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, software, sample_type, index1_size, index2_size, base_mask, CONFIG): - output = '' + def _generate_samplesheet_subset( + self, + ssparser, + samples_to_include, + runSetup, + software, + sample_type, + index1_size, + index2_size, + base_mask, + CONFIG, + ): + output = "" # Prepare index cycles index_cycles = [0, 0] for read in runSetup: - if read['IsIndexedRead'] == 'Y': - if int(read['Number']) == 2: - index_cycles[0] = int(read['NumCycles']) + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) else: - index_cycles[1] = int(read['NumCycles']) + index_cycles[1] = int(read["NumCycles"]) # Header - output += f'[Header]{os.linesep}' + output += f"[Header]{os.linesep}" for field in sorted(ssparser.header): - output += f'{field.rstrip()},{ssparser.header[field].rstrip()}' + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" output += os.linesep # Settings for BCL Convert - if software == 'bclconvert': - output += f'[Settings]{os.linesep}' - output += 'OverrideCycles,{}{}'.format(';'.join(base_mask), os.linesep) + if software == "bclconvert": + output += f"[Settings]{os.linesep}" + output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep) - if CONFIG.get('bclconvert'): - if CONFIG['bclconvert'].get('settings'): + if CONFIG.get("bclconvert"): + if CONFIG["bclconvert"].get("settings"): # Put common settings - if CONFIG['bclconvert']['settings'].get('common'): - for setting in CONFIG['bclconvert']['settings']['common']: + if CONFIG["bclconvert"]["settings"].get("common"): + for setting in CONFIG["bclconvert"]["settings"]["common"]: for k, v in 
setting.items(): - output += f'{k},{v}{os.linesep}' + output += f"{k},{v}{os.linesep}" # Put special settings: - if sample_type in CONFIG['bclconvert']['settings'].keys(): - for setting in CONFIG['bclconvert']['settings'][sample_type]: + if sample_type in CONFIG["bclconvert"]["settings"].keys(): + for setting in CONFIG["bclconvert"]["settings"][sample_type]: for k, v in setting.items(): - if (k == 'BarcodeMismatchesIndex1' and index1_size != 0) or (k == 'BarcodeMismatchesIndex2' and index2_size != 0) or 'BarcodeMismatchesIndex' not in k: - output += f'{k},{v}{os.linesep}' + if ( + ( + k == "BarcodeMismatchesIndex1" + and index1_size != 0 + ) + or ( + k == "BarcodeMismatchesIndex2" + and index2_size != 0 + ) + or "BarcodeMismatchesIndex" not in k + ): + output += f"{k},{v}{os.linesep}" # Data - output += f'[Data]{os.linesep}' + output += f"[Data]{os.linesep}" datafields = [] for field in ssparser.datafields: datafields.append(field) - output += ','.join(datafields) + output += ",".join(datafields) output += os.linesep for line in ssparser.data: - sample_name = line.get('Sample_Name') or line.get('SampleName') - lane = line['Lane'] + sample_name = line.get("Sample_Name") or line.get("SampleName") + lane = line["Lane"] noindex_flag = False if lane in samples_to_include.keys(): if sample_name in samples_to_include.get(lane): line_ar = [] for field in datafields: # Case with NoIndex - if field == 'index' and 'NOINDEX' in line['index'].upper(): - line[field] = 'T'*index_cycles[0] if index_cycles[0] !=0 else '' + if field == "index" and "NOINDEX" in line["index"].upper(): + line[field] = ( + "T" * index_cycles[0] if index_cycles[0] != 0 else "" + ) noindex_flag = True - if field == 'index2' and noindex_flag: - line[field] = 'A'*index_cycles[1] if index_cycles[1] !=0 else '' + if field == "index2" and noindex_flag: + line[field] = ( + "A" * index_cycles[1] if index_cycles[1] != 0 else "" + ) noindex_flag = False # Case of IDT UMI - if (field == 'index' or field == 'index2') and IDT_UMI_PAT.findall(line[field]): - line[field] = line[field].replace('N', '') + if ( + field == "index" or field == "index2" + ) and IDT_UMI_PAT.findall(line[field]): + line[field] = line[field].replace("N", "") line_ar.append(line[field]) - output += ','.join(line_ar) + output += ",".join(line_ar) output += os.linesep return output diff --git a/taca/illumina/__init__.py b/taca/illumina/__init__.py index 14e36756..50a56a43 100644 --- a/taca/illumina/__init__.py +++ b/taca/illumina/__init__.py @@ -1,3 +1,3 @@ """ Runs class to parse and work with illumina flowcells -""" \ No newline at end of file +""" diff --git a/taca/log/__init__.py b/taca/log/__init__.py index 0946603e..0ce995d1 100644 --- a/taca/log/__init__.py +++ b/taca/log/__init__.py @@ -8,25 +8,28 @@ # Console logger stream_handler = logging.StreamHandler() -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") stream_handler.setFormatter(formatter) ROOT_LOG.addHandler(stream_handler) LOG_LEVELS = { - 'ERROR': logging.ERROR, - 'WARN': logging.WARN, - 'INFO': logging.INFO, - 'DEBUG': logging.DEBUG + "ERROR": logging.ERROR, + "WARN": logging.WARN, + "INFO": logging.INFO, + "DEBUG": logging.DEBUG, } -def init_logger_file(log_file, log_level='INFO'): - """ Append a FileHandler to the root logger. + +def init_logger_file(log_file, log_level="INFO"): + """Append a FileHandler to the root logger. 
:param str log_file: Path to the log file :param str log_level: Logging level """ - ROOT_LOG.handlers=[] - log_level = LOG_LEVELS[log_level] if log_level in LOG_LEVELS.keys() else logging.INFO + ROOT_LOG.handlers = [] + log_level = ( + LOG_LEVELS[log_level] if log_level in LOG_LEVELS.keys() else logging.INFO + ) ROOT_LOG.setLevel(log_level) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index dfbdf3b8..ec610ad2 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -29,7 +29,6 @@ class ONT_run: """ def __init__(self, run_abspath: str): - # Get paths and names of MinKNOW experiment, sample and run self.run_name = os.path.basename(run_abspath) self.run_abspath = run_abspath @@ -185,7 +184,6 @@ def update_db_entry(self, force_update=False): ) def parse_pore_activity(self, db_update): - logger.info(f"{self.run_name}: Parsing pore activity...") pore_activity = {} @@ -299,7 +297,6 @@ def copy_metadata(self): ) def copy_html_report(self): - logger.info(f"{self.run_name}: Transferring .html report to ngi-internal...") # Transfer the MinKNOW .html report file to ngi-internal, renaming it to the full run ID. Requires password-free SSH access. diff --git a/taca/nanopore/__init__.py b/taca/nanopore/__init__.py index c8b7802c..5063a460 100644 --- a/taca/nanopore/__init__.py +++ b/taca/nanopore/__init__.py @@ -1,3 +1,3 @@ """ Classes to parse and work with ONT data -""" \ No newline at end of file +""" diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py index a7cabdf1..728393a0 100644 --- a/taca/nanopore/instrument_transfer.py +++ b/taca/nanopore/instrument_transfer.py @@ -48,7 +48,6 @@ def main(args): # Iterate over runs for run_path in run_paths: - logging.info(f"Handling {run_path}...") if run_path.split(os.sep)[-2][0:3] == "QC_": @@ -115,7 +114,9 @@ def sync_to_storage(run_dir, destination, log): ) -def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log: list[str]): +def final_sync_to_storage( + run_dir: str, destination: str, archive_dir: str, log: list[str] +): """Do a final sync of the run to storage, then archive it. 
Skip if rsync is already running on the run.""" @@ -214,7 +215,6 @@ def parse_position_logs(minknow_logs_dir: str) -> list: entries = [] for position in positions: - log_files = glob( os.path.join(minknow_logs_dir, position, "control_server_log-*.txt") ) @@ -258,7 +258,6 @@ def get_pore_counts(position_logs: list) -> list: pore_counts = [] for entry in position_logs: - if "INFO: platform_qc.report (user_messages)" in entry["category"]: type = "qc" elif "INFO: mux_scan_result (user_messages)" in entry["category"]: @@ -267,7 +266,6 @@ def get_pore_counts(position_logs: list) -> list: type = "other" if type in ["qc", "mux"]: - new_entry = { "flow_cell_id": entry["body"]["flow_cell_id"], "timestamp": entry["timestamp"], @@ -327,6 +325,7 @@ def dump_pore_count_history(run: str, pore_counts: list) -> str: return new_file_path + # BEGIN_EXCLUDE if __name__ == "__main__": # This is clunky but should be fine since it will only ever run as a cronjob @@ -365,4 +364,4 @@ def dump_pore_count_history(run: str, pore_counts: list) -> str: args = parser.parse_args() main(args) -# END_EXCLUDE \ No newline at end of file +# END_EXCLUDE diff --git a/taca/server_status/cli.py b/taca/server_status/cli.py index 4b786fc1..1833035f 100644 --- a/taca/server_status/cli.py +++ b/taca/server_status/cli.py @@ -9,36 +9,38 @@ from taca.utils.config import CONFIG -@click.group(name='server_status') +@click.group(name="server_status") def server_status(): - """ Monitor server status """ + """Monitor server status""" + # server status subcommands @server_status.command() -@click.option('--statusdb', is_flag=True, help="Update the statusdb") +@click.option("--statusdb", is_flag=True, help="Update the statusdb") def nases(statusdb): - """ Checks the available space on all the nases - """ - if not CONFIG.get('server_status', ''): + """Checks the available space on all the nases""" + if not CONFIG.get("server_status", ""): logging.warning("Configuration missing required entries: server_status") disk_space = status.get_nases_disk_space() if statusdb: - status.update_status_db(disk_space, server_type='nas') + status.update_status_db(disk_space, server_type="nas") + @server_status.command() def cronjobs(): - """ Monitors cronjobs and updates statusdb - """ + """Monitors cronjobs and updates statusdb""" cj.update_cronjob_db() + @server_status.command() def monitor_promethion(): - """ Checks the status of PromethION and if ngi-nas is mounted - """ - if not CONFIG.get('promethion_status', ''): + """Checks the status of PromethION and if ngi-nas is mounted""" + if not CONFIG.get("promethion_status", ""): logging.warning("Configuration missing required entries: server_status") promethion_status = status.check_promethion_status() if promethion_status: logging.info("No issues encountered with the PromethION") else: - logging.warning("An issue with the PromethION was encountered. Operator has been notified by email.") \ No newline at end of file + logging.warning( + "An issue with the PromethION was encountered. Operator has been notified by email." 
+ ) diff --git a/taca/server_status/cronjobs.py b/taca/server_status/cronjobs.py index 80fd59fc..1f1605c4 100644 --- a/taca/server_status/cronjobs.py +++ b/taca/server_status/cronjobs.py @@ -12,68 +12,73 @@ def _parse_crontab(): result = {} user = getpass.getuser() - logging.info(f'Getting crontab for user {user}') + logging.info(f"Getting crontab for user {user}") try: crontab = CronTab(user=user) except Exception as e: - logging.error(f'Cannot get a crontab for user: {user}') + logging.error(f"Cannot get a crontab for user: {user}") logging.error(e.message) else: result[user] = [] for job in crontab.crons: # this is for special syntax like @monthly or @reboot - special_syntax = str(job).split()[0] if str(job).startswith('@') else '' - result[user].append({'Command': job.command, - 'Comment': job.comment, - 'Enabled': job.enabled, - 'Minute': str(job.minutes), - 'Hour': str(job.hours), - 'Day of month' : str(job.dom), - 'Month': str(job.month), - 'Day of week': str(job.dow), - 'Special syntax': special_syntax}) + special_syntax = str(job).split()[0] if str(job).startswith("@") else "" + result[user].append( + { + "Command": job.command, + "Comment": job.comment, + "Enabled": job.enabled, + "Minute": str(job.minutes), + "Hour": str(job.hours), + "Day of month": str(job.dom), + "Month": str(job.month), + "Day of week": str(job.dow), + "Special syntax": special_syntax, + } + ) return result def update_cronjob_db(): - server = platform.node().split('.')[0] + server = platform.node().split(".")[0] timestamp = datetime.datetime.now() # parse results result = _parse_crontab() # connect to db - statusdb_conf = CONFIG.get('statusdb') - logging.info('Connecting to database: {}'.format(CONFIG.get('statusdb', {}).get('url'))) + statusdb_conf = CONFIG.get("statusdb") + logging.info( + "Connecting to database: {}".format(CONFIG.get("statusdb", {}).get("url")) + ) try: couch_connection = statusdb.StatusdbSession(statusdb_conf).connection except Exception as e: logging.error(e.message) else: # update document - crontab_db = couch_connection['cronjobs'] - view = crontab_db.view('server/alias') + crontab_db = couch_connection["cronjobs"] + view = crontab_db.view("server/alias") # to be safe doc = {} # create doc if not exist if not view[server].rows: - logging.info('Creating a document') + logging.info("Creating a document") doc = { - 'users': {user: cronjobs for user, cronjobs in result.items()}, - 'Last updated': str(timestamp), - 'server': server, + "users": {user: cronjobs for user, cronjobs in result.items()}, + "Last updated": str(timestamp), + "server": server, } # else: get existing doc for row in view[server]: - logging.info('Updating the document') + logging.info("Updating the document") doc = crontab_db.get(row.value) - doc['users'].update(result) - doc['Last updated'] = str(timestamp) + doc["users"].update(result) + doc["Last updated"] = str(timestamp) if doc: try: crontab_db.save(doc) except Exception as e: logging.error(e.message) else: - logging.info(f'{server} has been successfully updated') + logging.info(f"{server} has been successfully updated") else: - logging.warning('Document has not been created/updated') - + logging.warning("Document has not been created/updated") diff --git a/taca/server_status/server_status.py b/taca/server_status/server_status.py index 36b6f27a..3431da31 100644 --- a/taca/server_status/server_status.py +++ b/taca/server_status/server_status.py @@ -9,41 +9,42 @@ def get_nases_disk_space(): result = {} - config = CONFIG['server_status'] - servers = 
config.get('servers', dict()) + config = CONFIG["server_status"] + servers = config.get("servers", dict()) for server_url, path in servers.items(): - # Get command - command = '{command} {path}'.format(command=config['command'], path=path) + command = "{command} {path}".format(command=config["command"], path=path) # If localhost, don't connect to ssh - if server_url == 'localhost': + if server_url == "localhost": command = command.split() else: - if 'promethion' in server_url: - user = 'prom' + if "promethion" in server_url: + user = "prom" else: - user = config['user'] + user = config["user"] # Connect via ssh to server and execute the command - command = ['ssh', '-t', f'{user}@{server_url}', command] + command = ["ssh", "-t", f"{user}@{server_url}", command] result[server_url] = _run_cmd(command) # Storage systems are mouted locally, e.g. ngi-nas - for storage_system, path in config.get('storage_systems', {}).items(): + for storage_system, path in config.get("storage_systems", {}).items(): # Get command - command = '{command} {path}'.format(command=config['command'], path=path) + command = "{command} {path}".format(command=config["command"], path=path) result[storage_system] = _run_cmd(command.split()) return result + def _run_cmd(command): proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = proc.stdout.read().decode("utf-8") return _parse_output(output) -def _parse_output(output): # for nases + +def _parse_output(output): # for nases # command = df -h /home # output = Filesystem Size Used Avail Use% Mounted on # /dev/mapper/VGStor-lv_illumina @@ -59,39 +60,40 @@ def _parse_output(output): # for nases disk_size = output[-5] filesystem = output[-6] - available_percentage = str(100 - int(used_percentage.replace('%',''))) + '%' + available_percentage = str(100 - int(used_percentage.replace("%", ""))) + "%" result = { - 'disk_size': disk_size, - 'space_used': space_used, - 'space_available': space_available, - 'used_percentage': used_percentage, - 'available_percentage': available_percentage, - 'mounted_on': mounted_on, - 'filesystem': filesystem + "disk_size": disk_size, + "space_used": space_used, + "space_available": space_available, + "used_percentage": used_percentage, + "available_percentage": available_percentage, + "mounted_on": mounted_on, + "filesystem": filesystem, } except: # Sometimes it fails for whatever reason as Popen returns not what it is supposed to result = { - 'disk_size': 'NaN', - 'space_used': 'NaN', - 'space_available': 'NaN', - 'used_percentage': 'NaN', - 'available_percentage': 'NaN', - 'mounted_on': 'NaN', - 'filesystem': 'NaN' + "disk_size": "NaN", + "space_used": "NaN", + "space_available": "NaN", + "used_percentage": "NaN", + "available_percentage": "NaN", + "mounted_on": "NaN", + "filesystem": "NaN", } - logging.error(f'Can not parse the output: {output}') + logging.error(f"Can not parse the output: {output}") return result + def update_status_db(data, server_type=None): - """ Pushed the data to status db. + """Pushed the data to status db. data can be from nases server_type should be 'nas'. 
""" - db_config = CONFIG.get('statusdb') + db_config = CONFIG.get("statusdb") if db_config is None: logging.error('"statusdb" must be present in the config file!') raise RuntimeError('"statusdb" must be present in the config file!') @@ -101,14 +103,14 @@ def update_status_db(data, server_type=None): logging.error(e.message) raise - db = couch_connection['server_status'] - logging.info('Connection established') - for key in data.keys(): # data is dict of dicts - server = data[key] # data[key] is dictionary (the command output) - server['name'] = key # key is nas url + db = couch_connection["server_status"] + logging.info("Connection established") + for key in data.keys(): # data is dict of dicts + server = data[key] # data[key] is dictionary (the command output) + server["name"] = key # key is nas url # datetime.datetime(2015, 11, 18, 9, 54, 33, 473189) is not JSON serializable - server['time'] = datetime.datetime.now().isoformat() - server['server_type'] = server_type or 'unknown' + server["time"] = datetime.datetime.now().isoformat() + server["server_type"] = server_type or "unknown" try: db.save(server) @@ -116,27 +118,30 @@ def update_status_db(data, server_type=None): logging.error(e.message) raise else: - logging.info(f'{key}: Server status has been updated') + logging.info(f"{key}: Server status has been updated") + def check_promethion_status(): - config = CONFIG.get('promethion_status') - server = config.get('server') - path = config.get('path') - command = config.get('command') - command_to_run = f'{command} {path}' - user = config.get('user') + config = CONFIG.get("promethion_status") + server = config.get("server") + path = config.get("path") + command = config.get("command") + command_to_run = f"{command} {path}" + user = config.get("user") try: - subprocess.run(['ssh', '-t', f'{user}@{server}', command_to_run], - check=True) + subprocess.run(["ssh", "-t", f"{user}@{server}", command_to_run], check=True) except subprocess.CalledProcessError: _send_promethion_warning_email() return False return True + def _send_promethion_warning_email(): - email_recipients = CONFIG.get('mail').get('recipients') - email_subject = ('An issue with the PromethION has been detected.') - email_message = ('An issue with the PromethION has been detected. ' - 'Please investigate and consider pausing the transfer cronjob on preproc1') - send_mail(email_subject, email_message, email_recipients) \ No newline at end of file + email_recipients = CONFIG.get("mail").get("recipients") + email_subject = "An issue with the PromethION has been detected." + email_message = ( + "An issue with the PromethION has been detected. " + "Please investigate and consider pausing the transfer cronjob on preproc1" + ) + send_mail(email_subject, email_message, email_recipients) diff --git a/taca/testing/cli.py b/taca/testing/cli.py index 4856b75d..2abcea9e 100644 --- a/taca/testing/cli.py +++ b/taca/testing/cli.py @@ -1,4 +1,3 @@ - """ CLI for the testing commands """ import os @@ -8,62 +7,87 @@ import taca.testing.create_uppmax_like_env as createupp -@click.group(name='uppmax_env') +@click.group(name="uppmax_env") def uppmax_env(): - """ Create a local set of folders that resembles the uppmax-ngi env. Creates config file for ngi_pipeline, taca, and taca ngi-pipeline. 
Only a minimal taca config is needed (statusdb and log) - The condig file (in general saved in variable NGI_CONFIG needs to looks something similar to: - - \b - environment: - project_id: ngi1234 #CAN BE ANYTHING - ngi_scripts_dir: /Users/vezzi/opt/ngi_pipeline/scripts #CAN BE ANYTHING - conda_env: TACA #CAN BE ANYTHING - flowcell_inbox: - - /Users/vezzi/opt/uppmax_env/incoming/ #NEEDS TO EXISTS - analysis: - best_practice_analysis: - whole_genome_reseq: - analysis_engine: ngi_pipeline.engines.piper_ngi - IGN: - analysis_engine: ngi_pipeline.engines.piper_ngi - - qc: - - analysis_engine: ngi_pipeline.engines.qc_ngi - - base_root: /Users/vezzi/opt/ #NEEDS TO EXISTS - sthlm_root: uppmax_env #NEEDS TO EXISTS - top_dir: nobackup/NGI #NEEDS TO EXISTS - upps_root: nothing #CAN BE ANYTHING - logging: - log_file: "/Users/vezzi/opt/log/ngi_pipeline.log" #NEEDS TO BE REAL + """Create a local set of folders that resembles the uppmax-ngi env. Creates config file for ngi_pipeline, taca, and taca ngi-pipeline. Only a minimal taca config is needed (statusdb and log) + The condig file (in general saved in variable NGI_CONFIG needs to looks something similar to: + + \b + environment: + project_id: ngi1234 #CAN BE ANYTHING + ngi_scripts_dir: /Users/vezzi/opt/ngi_pipeline/scripts #CAN BE ANYTHING + conda_env: TACA #CAN BE ANYTHING + flowcell_inbox: + - /Users/vezzi/opt/uppmax_env/incoming/ #NEEDS TO EXISTS + analysis: + best_practice_analysis: + whole_genome_reseq: + analysis_engine: ngi_pipeline.engines.piper_ngi + IGN: + analysis_engine: ngi_pipeline.engines.piper_ngi + + qc: + + analysis_engine: ngi_pipeline.engines.qc_ngi + + base_root: /Users/vezzi/opt/ #NEEDS TO EXISTS + sthlm_root: uppmax_env #NEEDS TO EXISTS + top_dir: nobackup/NGI #NEEDS TO EXISTS + upps_root: nothing #CAN BE ANYTHING + logging: + log_file: "/Users/vezzi/opt/log/ngi_pipeline.log" #NEEDS TO BE REAL + + \b + The requested project will be divided into the following sets: + - 2/3 will be selected among the projects with application equeal to 'WG re-seq'. These will be divided up in: + - 1/4: closed more than 3 months ago + - 1/4: closed more than 1 month ago, less than 3 months + - 1/4: closed less than 1 month ago + - 1/4: open + - 1/3 will be selected amonf the projects with application different from 'WG re-seq': + - 1/4: closed more than 3 months ago + - 1/4: closed more than 1 month ago, less than 3 months + - 1/4: closed less than 1 month ago + - 1/4: open - \b - The requested project will be divided into the following sets: - - 2/3 will be selected among the projects with application equeal to 'WG re-seq'. 
These will be divided up in: - - 1/4: closed more than 3 months ago - - 1/4: closed more than 1 month ago, less than 3 months - - 1/4: closed less than 1 month ago - - 1/4: open - - 1/3 will be selected amonf the projects with application different from 'WG re-seq': - - 1/4: closed more than 3 months ago - - 1/4: closed more than 1 month ago, less than 3 months - - 1/4: closed less than 1 month ago - - 1/4: open - - """ + """ pass -@uppmax_env.command() -@click.option('-p', '--projects', type=int, default=30, help='number of projects to be extracted from statusdb') -@click.option('-nc', '--ngi-config', type=str, default=os.environ.get('NGI_CONFIG') , help='path to ngi configuration file (expected in variable NGI_CONFIG)') -@click.option('-fq1', '--fastq_1', type=click.Path(exists=True, dir_okay=False), default=None , help='Path to fastq file for read 1') -@click.option('-fq2', '--fastq_2', type=click.Path(exists=True, dir_okay=False), default=None , help='Path to fastq file for read 2') +@uppmax_env.command() +@click.option( + "-p", + "--projects", + type=int, + default=30, + help="number of projects to be extracted from statusdb", +) +@click.option( + "-nc", + "--ngi-config", + type=str, + default=os.environ.get("NGI_CONFIG"), + help="path to ngi configuration file (expected in variable NGI_CONFIG)", +) +@click.option( + "-fq1", + "--fastq_1", + type=click.Path(exists=True, dir_okay=False), + default=None, + help="Path to fastq file for read 1", +) +@click.option( + "-fq2", + "--fastq_2", + type=click.Path(exists=True, dir_okay=False), + default=None, + help="Path to fastq file for read 2", +) def create(projects, ngi_config, fastq_1, fastq_2): - """creates a uppmax like env - """ - if (fastq_1 is None and fastq_2 is not None) or (fastq_1 is not None and fastq_2 is None): + """creates a uppmax like env""" + if (fastq_1 is None and fastq_2 is not None) or ( + fastq_1 is not None and fastq_2 is None + ): print("ERROR: either both fastq_1 and fastq_2 are specified or none of them") return 1 if fastq_1 is not None: @@ -73,11 +97,13 @@ def create(projects, ngi_config, fastq_1, fastq_2): if which("ngi_pipeline_start.py"): createupp.create(projects, ngi_config, fastq_1, fastq_2) else: - print("ERROR: ngi_pipeline_start.py needs to be available and properly installed") + print( + "ERROR: ngi_pipeline_start.py needs to be available and properly installed" + ) def which(file): for path in os.environ["PATH"].split(os.pathsep): if os.path.exists(os.path.join(path, file)): - return True + return True return False diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py index f0a10ea9..831646b6 100644 --- a/taca/testing/create_uppmax_like_env.py +++ b/taca/testing/create_uppmax_like_env.py @@ -18,121 +18,151 @@ def create_version_report(path): # Creates the file version_report.txt for stuff run ngi_pipeline - with open(os.path.join(path, 'version_report.txt'), 'w') as VERSION_REPORT: - VERSION_REPORT.write('******\n') - VERSION_REPORT.write('README\n') - VERSION_REPORT.write('******\n') - VERSION_REPORT.write('\n') - VERSION_REPORT.write('Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. 
The pipeline used was Piper, see below for more information.\n') - VERSION_REPORT.write('\n') - VERSION_REPORT.write('The versions of programs and references used:\n') - VERSION_REPORT.write('piper: unknown\n') - VERSION_REPORT.write('bwa: 0.7.12\n') - VERSION_REPORT.write('samtools: 0.1.19\n') - VERSION_REPORT.write('qualimap: v2.2\n') - VERSION_REPORT.write('snpEff: 4.1\n') - VERSION_REPORT.write('snpEff reference: GRCh37.75\n') - VERSION_REPORT.write('gatk: 3.3-0-geee94ec\n') - VERSION_REPORT.write('\n') - VERSION_REPORT.write('reference: human_g1k_v37.fasta\n') - VERSION_REPORT.write('db_snp: gatk-bundle/2.8\n') - VERSION_REPORT.write('hapmap: gatk-bundle/2.8\n') - VERSION_REPORT.write('omni: gatk-bundle/2.8\n') - VERSION_REPORT.write('1000G_indels: gatk-bundle/2.8\n') - VERSION_REPORT.write('Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n') - VERSION_REPORT.write('\n') - VERSION_REPORT.write('indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n') - VERSION_REPORT.write('indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n') - VERSION_REPORT.write('\n') - VERSION_REPORT.write('piper\n') - VERSION_REPORT.write('-----\n') - VERSION_REPORT.write('Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n') + with open(os.path.join(path, "version_report.txt"), "w") as VERSION_REPORT: + VERSION_REPORT.write("******\n") + VERSION_REPORT.write("README\n") + VERSION_REPORT.write("******\n") + VERSION_REPORT.write("\n") + VERSION_REPORT.write( + "Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. The pipeline used was Piper, see below for more information.\n" + ) + VERSION_REPORT.write("\n") + VERSION_REPORT.write("The versions of programs and references used:\n") + VERSION_REPORT.write("piper: unknown\n") + VERSION_REPORT.write("bwa: 0.7.12\n") + VERSION_REPORT.write("samtools: 0.1.19\n") + VERSION_REPORT.write("qualimap: v2.2\n") + VERSION_REPORT.write("snpEff: 4.1\n") + VERSION_REPORT.write("snpEff reference: GRCh37.75\n") + VERSION_REPORT.write("gatk: 3.3-0-geee94ec\n") + VERSION_REPORT.write("\n") + VERSION_REPORT.write("reference: human_g1k_v37.fasta\n") + VERSION_REPORT.write("db_snp: gatk-bundle/2.8\n") + VERSION_REPORT.write("hapmap: gatk-bundle/2.8\n") + VERSION_REPORT.write("omni: gatk-bundle/2.8\n") + VERSION_REPORT.write("1000G_indels: gatk-bundle/2.8\n") + VERSION_REPORT.write( + "Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n" + ) + VERSION_REPORT.write("\n") + VERSION_REPORT.write( + "indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n" + ) + VERSION_REPORT.write( + "indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n" + ) + VERSION_REPORT.write("\n") + VERSION_REPORT.write("piper\n") + VERSION_REPORT.write("-----\n") + VERSION_REPORT.write( + "Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. 
For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n" + ) -def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None ): + +def create_FC(incoming_dir, run_name, samplesheet, fastq_1=None, fastq_2=None): # Create something like 160217_ST-E00201_0063_AHJHNYCCXX path_to_fc = os.path.join(incoming_dir, run_name) if os.path.exists(path_to_fc): # This FC exists, skip it return fs.create_folder(path_to_fc) - fs.touch(os.path.join(path_to_fc, 'RTAComplete.txt')) + fs.touch(os.path.join(path_to_fc, "RTAComplete.txt")) # Create folder Demultiplexing - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing')) + fs.create_folder(os.path.join(path_to_fc, "Demultiplexing")) # Create folder Demultiplexing/Reports - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', 'Reports')) + fs.create_folder(os.path.join(path_to_fc, "Demultiplexing", "Reports")) # Create folder Demultiplexing/Stats - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', 'Stats')) + fs.create_folder(os.path.join(path_to_fc, "Demultiplexing", "Stats")) # Memorise SampleSheet stats header = [] for key in samplesheet[0]: header.append(key) counter = 1 - current_lane = '' + current_lane = "" for line in samplesheet: - project_name = line.get('Sample_Project', line.get('Project', '')) - lane = line['Lane'] - if current_lane == '': + project_name = line.get("Sample_Project", line.get("Project", "")) + lane = line["Lane"] + if current_lane == "": current_lane = lane elif current_lane != lane: counter = 1 current_lane = lane - sample_id = line.get('SampleID', line.get('Sample_ID', '')) - sample_name = line.get('SampleName', line.get('Sample_Name', '')) + sample_id = line.get("SampleID", line.get("Sample_ID", "")) + sample_name = line.get("SampleName", line.get("Sample_Name", "")) # Create dir structure - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', project_name, sample_id)) + fs.create_folder( + os.path.join(path_to_fc, "Demultiplexing", project_name, sample_id) + ) # Now create the data - fastq_1_dest = f'{sample_name}_S{counter}_L00{lane}_R1_001.fastq.gz' - fastq_2_dest = f'{sample_name}_S{counter}_L00{lane}_R2_001.fastq.gz' + fastq_1_dest = f"{sample_name}_S{counter}_L00{lane}_R1_001.fastq.gz" + fastq_2_dest = f"{sample_name}_S{counter}_L00{lane}_R2_001.fastq.gz" counter += 1 if fastq_1 is None: - fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name, - sample_id, fastq_1_dest)) - fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name, - sample_id, fastq_2_dest)) + fs.touch( + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_1_dest + ) + ) + fs.touch( + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_2_dest + ) + ) else: - fs.do_symlink(fastq_1, os.path.join(path_to_fc, 'Demultiplexing', - project_name, sample_id, fastq_1_dest)) - fs.do_symlink(fastq_2, os.path.join(path_to_fc, 'Demultiplexing', - project_name, sample_id, fastq_2_dest)) + fs.do_symlink( + fastq_1, + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_1_dest + ), + ) + fs.do_symlink( + fastq_2, + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_2_dest + ), + ) - with open(os.path.join(path_to_fc, 'SampleSheet.csv'), 'w') as Samplesheet_file: - Samplesheet_file.write('[Header]\n') - Samplesheet_file.write('Date,2016-03-29\n') - Samplesheet_file.write('Investigator Name,Christian Natanaelsson\n') - Samplesheet_file.write('[Data]\n') + with 
open(os.path.join(path_to_fc, "SampleSheet.csv"), "w") as Samplesheet_file: + Samplesheet_file.write("[Header]\n") + Samplesheet_file.write("Date,2016-03-29\n") + Samplesheet_file.write("Investigator Name,Christian Natanaelsson\n") + Samplesheet_file.write("[Data]\n") for key in header: - Samplesheet_file.write(f'{key},') - Samplesheet_file.write('\n') + Samplesheet_file.write(f"{key},") + Samplesheet_file.write("\n") for line in samplesheet: for key in header: - Samplesheet_file.write(f'{line[key]},') - Samplesheet_file.write('\n') + Samplesheet_file.write(f"{line[key]},") + Samplesheet_file.write("\n") + def create_uppmax_env(ngi_config): paths = {} - if 'analysis' not in ngi_config: - sys.exit('ERROR: analysis must be a field of NGI_CONFIG.') + if "analysis" not in ngi_config: + sys.exit("ERROR: analysis must be a field of NGI_CONFIG.") try: - base_root = ngi_config['analysis']['base_root'] - paths['base_root'] = base_root - sthlm_root = ngi_config['analysis']['sthlm_root'] - paths['sthlm_root'] = sthlm_root - top_dir = ngi_config['analysis']['top_dir'] - paths['top_dir'] = top_dir + base_root = ngi_config["analysis"]["base_root"] + paths["base_root"] = base_root + sthlm_root = ngi_config["analysis"]["sthlm_root"] + paths["sthlm_root"] = sthlm_root + top_dir = ngi_config["analysis"]["top_dir"] + paths["top_dir"] = top_dir except KeyError as e: - raise SystemExit(f'Config file is missing the key {str(e)}, make sure it have all required information') - if 'environment' not in ngi_config: - sys.exit('ERROR: environment must be a field of NGI_CONFIG.') + raise SystemExit( + f"Config file is missing the key {str(e)}, make sure it have all required information" + ) + if "environment" not in ngi_config: + sys.exit("ERROR: environment must be a field of NGI_CONFIG.") try: # Get base root - flowcell_inboxes = ngi_config['environment']['flowcell_inbox'] - flowcell_inbox = flowcell_inboxes[0] # I assume there is only one - paths['flowcell_inbox'] = flowcell_inbox + flowcell_inboxes = ngi_config["environment"]["flowcell_inbox"] + flowcell_inbox = flowcell_inboxes[0] # I assume there is only one + paths["flowcell_inbox"] = flowcell_inbox except ValueError as e: sys.exit(f'key error, flowcell_inbox not found in "{ngi_config}": {e}') # Now I need to create the folders for this if not os.path.exists(base_root): - sys.exit(f'base_root needs to exists: {base_root}') + sys.exit(f"base_root needs to exists: {base_root}") fs.create_folder(flowcell_inbox) if sthlm_root is None: path_to_analysis = os.path.join(base_root, top_dir) @@ -141,72 +171,89 @@ def create_uppmax_env(ngi_config): fs.create_folder(path_to_analysis) return paths + def produce_analysis_qc_ngi(ngi_config, project_id): - analysis_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'ANALYSIS', project_id) - data_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'DATA', project_id) + analysis_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "ANALYSIS", + project_id, + ) + data_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "DATA", + project_id, + ) - qc_ngi_dir = os.path.join(analysis_dir, 'qc_ngi') + qc_ngi_dir = os.path.join(analysis_dir, "qc_ngi") fs.create_folder(qc_ngi_dir) for sample_id in 
os.listdir(data_dir): sample_dir_qc = os.path.join(qc_ngi_dir, sample_id) fs.create_folder(sample_dir_qc) - fastqc_dir = os.path.join(sample_dir_qc, 'fastqc') + fastqc_dir = os.path.join(sample_dir_qc, "fastqc") fs.create_folder(fastqc_dir) - fastq_screen_dir = os.path.join(sample_dir_qc, 'fastq_screen') + fastq_screen_dir = os.path.join(sample_dir_qc, "fastq_screen") fs.create_folder(fastq_screen_dir) # Do not create more than this... + def produce_analysis_piper(ngi_config, project_id): # Create piper_ngi - analysis_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'ANALYSIS', project_id) - data_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'DATA', project_id) + analysis_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "ANALYSIS", + project_id, + ) + data_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "DATA", + project_id, + ) - piper_ngi_dir = os.path.join(analysis_dir, 'piper_ngi') + piper_ngi_dir = os.path.join(analysis_dir, "piper_ngi") fs.create_folder(piper_ngi_dir) - piper_dirs = ['01_raw_alignments', - '02_preliminary_alignment_qc', - '03_genotype_concordance', - '04_merged_aligments', - '05_processed_alignments', - '06_final_alignment_qc', - '07_variant_calls', - '08_misc'] + piper_dirs = [ + "01_raw_alignments", + "02_preliminary_alignment_qc", + "03_genotype_concordance", + "04_merged_aligments", + "05_processed_alignments", + "06_final_alignment_qc", + "07_variant_calls", + "08_misc", + ] for piper_dir in piper_dirs: - current_dir = os.path.join(piper_ngi_dir, piper_dir) + current_dir = os.path.join(piper_ngi_dir, piper_dir) fs.create_folder(current_dir) - if piper_dir == '05_processed_alignments': + if piper_dir == "05_processed_alignments": for sample_id in os.listdir(data_dir): - bam_file = f'{sample_id}.clean.dedup.bam' + bam_file = f"{sample_id}.clean.dedup.bam" fs.touch(os.path.join(current_dir, bam_file)) - if piper_dir == '07_variant_calls': + if piper_dir == "07_variant_calls": for sample_id in os.listdir(data_dir): - vcf_file = f'{sample_id}.clean.dedup.recal.bam.raw.indel.vcf.gz' + vcf_file = f"{sample_id}.clean.dedup.recal.bam.raw.indel.vcf.gz" fs.touch(os.path.join(current_dir, vcf_file)) - current_dir = os.path.join(piper_ngi_dir, 'sbatch') + current_dir = os.path.join(piper_ngi_dir, "sbatch") fs.create_folder(current_dir) - current_dir = os.path.join(piper_ngi_dir, 'setup_xml_files') + current_dir = os.path.join(piper_ngi_dir, "setup_xml_files") fs.create_folder(current_dir) - current_dir = os.path.join(piper_ngi_dir, 'logs') + current_dir = os.path.join(piper_ngi_dir, "logs") fs.create_folder(current_dir) create_version_report(current_dir) + def select_random_projects(projects_in, num_proj, application, projects_out, label): chosen_projects = 0 - iterations = 0 # Safe guard to avoid infinite loops - application_not_in_other = ['WG re-seq'] - while chosen_projects != num_proj and iterations < 4*len(projects_in): + iterations = 0 # Safe guard to avoid infinite loops + application_not_in_other = ["WG re-seq"] + while chosen_projects != num_proj and iterations < 4 * len(projects_in): iterations += 1 selected_proj = random.choice(list(projects_in.keys())) # Check if I have already picked up this element @@ -215,139 
+262,190 @@ def select_random_projects(projects_in, num_proj, application, projects_out, lab if selected_proj == project_pair[0]: already_chosen = True if already_chosen: - continue # I am reprocessing an element I already saw. I skip it. iterations will avoid infinite loops + continue # I am reprocessing an element I already saw. I skip it. iterations will avoid infinite loops proj_value = projects_in[selected_proj] - if application == 'other': + if application == "other": # In this case everything expcept - if proj_value['application'] not in application_not_in_other: + if proj_value["application"] not in application_not_in_other: # I select this one projects_out.append([selected_proj, label]) chosen_projects += 1 - elif application == proj_value['application']: + elif application == proj_value["application"]: # I select this one projects_out.append([selected_proj, label]) chosen_projects += 1 + def create(projects, ngi_config_file, fastq_1, fastq_2): - statusdb_conf = CONFIG.get('statusdb') + statusdb_conf = CONFIG.get("statusdb") if statusdb_conf is None: - logger.error('No statusdb field in taca configuration file') + logger.error("No statusdb field in taca configuration file") return 1 - if 'dev' not in statusdb_conf['url']: - logger.error('url for status db is {}, but dev must be specified in this case'.format(statusdb_conf['url'])) + if "dev" not in statusdb_conf["url"]: + logger.error( + "url for status db is {}, but dev must be specified in this case".format( + statusdb_conf["url"] + ) + ) couch_connection = statusdb.StatusdbSession(statusdb_conf).connection - projectsDB = couch_connection['projects'] - project_summary = projectsDB.view('project/summary') + projectsDB = couch_connection["projects"] + project_summary = projectsDB.view("project/summary") projects_closed_more_than_three_months = {} projects_closed_more_than_one_month_less_than_three = {} projects_closed_less_than_one_month = {} projects_opened = {} current_date = datetime.datetime.today() - date_limit_one_year = current_date - relativedelta(months=6) #yes yes I know.. but in this way i am sure all data in in xflocell_db + date_limit_one_year = current_date - relativedelta( + months=6 + ) # yes yes I know.. 
but in this way i am sure all data in in xflocell_db date_limit_one_month = current_date - relativedelta(months=1) date_limit_three_month = current_date - relativedelta(months=3) for row in project_summary: - project_id = row['key'][1] - project_status = row['key'][0] - if 'application' not in row['value']: + project_id = row["key"][1] + project_status = row["key"][0] + if "application" not in row["value"]: continue - if row['value']['no_samples'] > 50: - continue # Skip large projects - application = row['value']['application'] - if project_status == 'closed': - if 'close_date' in row['value']: - close_date = datetime.datetime.strptime(row['value']['close_date'], '%Y-%m-%d') - if close_date > date_limit_one_year: # If the project has been closed after the date limit + if row["value"]["no_samples"] > 50: + continue # Skip large projects + application = row["value"]["application"] + if project_status == "closed": + if "close_date" in row["value"]: + close_date = datetime.datetime.strptime( + row["value"]["close_date"], "%Y-%m-%d" + ) + if ( + close_date > date_limit_one_year + ): # If the project has been closed after the date limit if close_date >= date_limit_one_month: - projects_closed_less_than_one_month[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} - elif close_date < date_limit_one_month and close_date >= date_limit_three_month: - projects_closed_more_than_one_month_less_than_three[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} + projects_closed_less_than_one_month[project_id] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } + elif ( + close_date < date_limit_one_month + and close_date >= date_limit_three_month + ): + projects_closed_more_than_one_month_less_than_three[ + project_id + ] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } elif close_date < date_limit_three_month: - projects_closed_more_than_three_months[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} - elif project_status == 'open': - if 'lanes_sequenced' in row['value'] and row['value']['lanes_sequenced'] > 0: - projects_opened[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} + projects_closed_more_than_three_months[project_id] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } + elif project_status == "open": + if ( + "lanes_sequenced" in row["value"] + and row["value"]["lanes_sequenced"] > 0 + ): + projects_opened[project_id] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } else: - print(f'status {project_status}') + print(f"status {project_status}") ## Now I can parse the x_flowcell db to check what I can and cannot use - whole_genome_projects = int(2*projects/3) + whole_genome_projects = int(2 * projects / 3) projects_to_reproduce = [] - select_random_projects(projects_closed_more_than_three_months, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_tot_closed') - 
select_random_projects(projects_closed_more_than_one_month_less_than_three, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_closed_clean_no_del') - select_random_projects(projects_closed_less_than_one_month, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_closed_no_clean') - select_random_projects(projects_opened, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_open') + select_random_projects( + projects_closed_more_than_three_months, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_tot_closed", + ) + select_random_projects( + projects_closed_more_than_one_month_less_than_three, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_closed_clean_no_del", + ) + select_random_projects( + projects_closed_less_than_one_month, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_closed_no_clean", + ) + select_random_projects( + projects_opened, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_open", + ) - other_projects = int(projects/3) - select_random_projects(projects_closed_more_than_three_months, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_tot_closed') - select_random_projects(projects_closed_more_than_one_month_less_than_three, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_closed_clean_no_del') - select_random_projects(projects_closed_less_than_one_month, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_closed_no_clean') - select_random_projects(projects_opened, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_open') + other_projects = int(projects / 3) + select_random_projects( + projects_closed_more_than_three_months, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_tot_closed", + ) + select_random_projects( + projects_closed_more_than_one_month_less_than_three, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_closed_clean_no_del", + ) + select_random_projects( + projects_closed_less_than_one_month, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_closed_no_clean", + ) + select_random_projects( + projects_opened, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_open", + ) # Create ngi_pipeline enviroment - print(f'#NGI_CONFIG varaible is {ngi_config_file}. This variable needs to be in the .bashrc file') - print(f'NGI_CONFIG={ngi_config_file}') + print( + f"#NGI_CONFIG varaible is {ngi_config_file}. This variable needs to be in the .bashrc file" + ) + print(f"NGI_CONFIG={ngi_config_file}") try: ngi_config = conf.load_config(ngi_config_file) except OSError as e: - print(f'ERROR: {e.message}') + print(f"ERROR: {e.message}") # Create uppmax env paths = create_uppmax_env(ngi_config) - print(f'#Going to reproduce {len(projects_to_reproduce)} projects (if this number is different from the one you specified.... trust me... do not worry') + print( + f"#Going to reproduce {len(projects_to_reproduce)} projects (if this number is different from the one you specified.... trust me... 
do not worry" + ) # Scan over x_flowcell and reproduce FCs - flowcellDB = couch_connection['x_flowcells'] + flowcellDB = couch_connection["x_flowcells"] reproduced_projects = {} for fc_doc in flowcellDB: try: - samplesheet_csv = flowcellDB[fc_doc]['samplesheet_csv'] + samplesheet_csv = flowcellDB[fc_doc]["samplesheet_csv"] except KeyError: - continue # Parse only FC that have a samplesheet + continue # Parse only FC that have a samplesheet # Check if this FC contains one of the proejcts I need to replicate. projects_in_FC = set() - if 'SampleName' in samplesheet_csv[0]: - projects_in_FC = set([line['SampleName'].split('_')[0] for line in samplesheet_csv]) + if "SampleName" in samplesheet_csv[0]: + projects_in_FC = set( + [line["SampleName"].split("_")[0] for line in samplesheet_csv] + ) else: - projects_in_FC = set([line['Sample_Name'].split('_')[0] for line in samplesheet_csv]) + projects_in_FC = set( + [line["Sample_Name"].split("_")[0] for line in samplesheet_csv] + ) found = False for project_pair in projects_to_reproduce: project = project_pair[0] @@ -355,31 +453,46 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): # This FC needs to be created if not found: # Create the FC only the first time I see a project belonging to it - create_FC(paths['flowcell_inbox'] , flowcellDB[fc_doc]['RunInfo']['Id'], samplesheet_csv, fastq_1, fastq_2) + create_FC( + paths["flowcell_inbox"], + flowcellDB[fc_doc]["RunInfo"]["Id"], + samplesheet_csv, + fastq_1, + fastq_2, + ) found = True # But I keep track of all projects-run I need to organise if project not in reproduced_projects: reproduced_projects[project] = [] - reproduced_projects[project].append(flowcellDB[fc_doc]['RunInfo']['Id']) - print(f'#Reproduced {len(reproduced_projects)} project (if the numbers diffear do not worry, most likely we selected projects without runs)') + reproduced_projects[project].append(flowcellDB[fc_doc]["RunInfo"]["Id"]) + print( + f"#Reproduced {len(reproduced_projects)} project (if the numbers diffear do not worry, most likely we selected projects without runs)" + ) for project in projects_to_reproduce: if project[0] in reproduced_projects: - print(f'# {project[0]}: {project[1]}') + print(f"# {project[0]}: {project[1]}") # Need to output the command to organise to_be_deleted = [] for project in reproduced_projects: for FC in reproduced_projects[project]: - print(f'Running: ngi_pipeline_start.py organize flowcell {FC} -p {project}') - with open('ngi_pipeline_local.logs', 'w') as NGILOGS: - return_value = subprocess.call(['ngi_pipeline_start.py', - 'organize', - 'flowcell', - f'{FC}', - '-p', - f'{project}'], - stdout=NGILOGS, stderr=NGILOGS) + print(f"Running: ngi_pipeline_start.py organize flowcell {FC} -p {project}") + with open("ngi_pipeline_local.logs", "w") as NGILOGS: + return_value = subprocess.call( + [ + "ngi_pipeline_start.py", + "organize", + "flowcell", + f"{FC}", + "-p", + f"{project}", + ], + stdout=NGILOGS, + stderr=NGILOGS, + ) if return_value > 0: - print(f'#project {project} not organised: have a look to the logs, but most likely this projec is not in charon') + print( + f"#project {project} not organised: have a look to the logs, but most likely this projec is not in charon" + ) if project not in to_be_deleted: to_be_deleted.append(project) @@ -388,13 +501,15 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): # Create ANALYSIS -- for project in projects_to_reproduce: - if project[0] in reproduced_projects: # Only for projects that I know I have organised + if ( + project[0] in 
reproduced_projects + ): # Only for projects that I know I have organised produce_analysis_qc_ngi(ngi_config, project[0]) - if project[1].startswith('WGreseq'): + if project[1].startswith("WGreseq"): produce_analysis_piper(ngi_config, project[0]) # Store in a file the results - with open('projects.txt', 'w') as PROJECTS: + with open("projects.txt", "w") as PROJECTS: for project in projects_to_reproduce: if project[0] in reproduced_projects: - PROJECTS.write(f'{project[0]}:{project[1]}\n') + PROJECTS.write(f"{project[0]}:{project[1]}\n") diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index 022ac93e..33ea19b5 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -16,6 +16,7 @@ class Tree(defaultdict): """Constructor for a search tree.""" + def __init__(self, value=None): super().__init__(Tree) self.value = value @@ -26,31 +27,36 @@ def collect_runs(): found_runs = [] # Pattern explained: # 6-8Digits_(maybe ST-)AnythingLetterornumberNumber_Number_AorBLetterornumberordash - rundir_re = re.compile('\d{6,8}_[ST-]*\w+\d+_\d+_[AB]?[A-Z0-9\-]+') - for data_dir in CONFIG['bioinfo_tab']['data_dirs']: + rundir_re = re.compile("\d{6,8}_[ST-]*\w+\d+_\d+_[AB]?[A-Z0-9\-]+") + for data_dir in CONFIG["bioinfo_tab"]["data_dirs"]: if os.path.exists(data_dir): - potential_run_dirs = glob.glob(os.path.join(data_dir, '*')) + potential_run_dirs = glob.glob(os.path.join(data_dir, "*")) for run_dir in potential_run_dirs: - if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir): + if rundir_re.match( + os.path.basename(os.path.abspath(run_dir)) + ) and os.path.isdir(run_dir): found_runs.append(os.path.basename(run_dir)) - logger.info(f'Working on {run_dir}') + logger.info(f"Working on {run_dir}") update_statusdb(run_dir) - nosync_data_dir = os.path.join(data_dir, 'nosync') - potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, '*')) + nosync_data_dir = os.path.join(data_dir, "nosync") + potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, "*")) for run_dir in potential_nosync_run_dirs: - if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir): + if rundir_re.match( + os.path.basename(os.path.abspath(run_dir)) + ) and os.path.isdir(run_dir): update_statusdb(run_dir) + def update_statusdb(run_dir): """Gets status for a project.""" # Fetch individual fields project_info = get_ss_projects(run_dir) run_id = os.path.basename(os.path.abspath(run_dir)) - statusdb_conf = CONFIG.get('statusdb') + statusdb_conf = CONFIG.get("statusdb") couch_connection = statusdb.StatusdbSession(statusdb_conf).connection valueskey = datetime.datetime.now().isoformat() - db = couch_connection['bioinfo_analysis'] - view = db.view('latest_data/sample_id') + db = couch_connection["bioinfo_analysis"] + view = db.view("latest_data/sample_id") # Construction and sending of individual records, if samplesheet is incorrectly formatted the loop is skipped if project_info: for flowcell in project_info: @@ -59,14 +65,20 @@ def update_statusdb(run_dir): for project in project_info[flowcell][lane][sample]: project_info[flowcell][lane][sample].value = get_status(run_dir) sample_status = project_info[flowcell][lane][sample].value - obj = {'run_id': run_id, - 'project_id': project, - 'flowcell': flowcell, - 'lane': lane, - 'sample': sample, - 'status': sample_status, - 'values': {valueskey: {'user': 'taca', - 'sample_status': sample_status}}} + obj = { + "run_id": run_id, + "project_id": project, + "flowcell": flowcell, + 
"lane": lane, + "sample": sample, + "status": sample_status, + "values": { + valueskey: { + "user": "taca", + "sample_status": sample_status, + } + }, + } # If entry exists, append to existing # Special if case to handle lanes written as int, can be safely removed when old lanes # is no longer stored as int @@ -75,151 +87,193 @@ def update_statusdb(run_dir): if len(view[[project, run_id, lane, sample]].rows) >= 1: remote_id = view[[project, run_id, lane, sample]].rows[0].id lane = str(lane) - remote_doc = db[remote_id]['values'] - remote_status = db[remote_id]['status'] + remote_doc = db[remote_id]["values"] + remote_status = db[remote_id]["status"] # Only updates the listed statuses - if remote_status in ['New', 'ERROR', 'Sequencing', 'Demultiplexing'] and sample_status != remote_status: + if ( + remote_status + in ["New", "ERROR", "Sequencing", "Demultiplexing"] + and sample_status != remote_status + ): # Appends old entry to new. Essentially merges the two for k, v in remote_doc.items(): - obj['values'][k] = v - logger.info('Updating {} {} {} {} {} as {}'.format(run_id, - project, - flowcell, - lane, - sample, - sample_status)) + obj["values"][k] = v + logger.info( + "Updating {} {} {} {} {} as {}".format( + run_id, + project, + flowcell, + lane, + sample, + sample_status, + ) + ) # Sorts timestamps - obj['values'] = OrderedDict(sorted(obj['values'].items(), key=lambda k_v: k_v[0], reverse=True)) + obj["values"] = OrderedDict( + sorted( + obj["values"].items(), + key=lambda k_v: k_v[0], + reverse=True, + ) + ) # Update record cluster - obj['_rev'] = db[remote_id].rev - obj['_id'] = remote_id + obj["_rev"] = db[remote_id].rev + obj["_id"] = remote_id db.save(obj) # Creates new entry else: - logger.info('Creating {} {} {} {} {} as {}'.format(run_id, - project, - flowcell, - lane, - sample, - sample_status)) + logger.info( + "Creating {} {} {} {} {} as {}".format( + run_id, + project, + flowcell, + lane, + sample, + sample_status, + ) + ) # Creates record db.save(obj) # Sets FC error flag if project_info[flowcell].value is not None: - if (('Failed' in project_info[flowcell].value and 'Failed' not in sample_status) - or ('Failed' in sample_status and 'Failed' not in project_info[flowcell].value)): - project_info[flowcell].value = 'Ambiguous' + if ( + "Failed" in project_info[flowcell].value + and "Failed" not in sample_status + ) or ( + "Failed" in sample_status + and "Failed" not in project_info[flowcell].value + ): + project_info[flowcell].value = "Ambiguous" else: project_info[flowcell].value = sample_status # Checks if a flowcell needs partial re-doing # Email error per flowcell if project_info[flowcell].value is not None: - if 'Ambiguous' in project_info[flowcell].value: - error_emailer('failed_run', run_id) + if "Ambiguous" in project_info[flowcell].value: + error_emailer("failed_run", run_id) + def get_status(run_dir): """Gets status of a sample run, based on flowcell info (folder structure).""" # Default state, should never occur - status = 'ERROR' - xten_dmux_folder = os.path.join(run_dir, 'Demultiplexing') - unaligned_folder = glob.glob(os.path.join(run_dir, 'Unaligned_*')) - nosync_pattern = re.compile('nosync') + status = "ERROR" + xten_dmux_folder = os.path.join(run_dir, "Demultiplexing") + unaligned_folder = glob.glob(os.path.join(run_dir, "Unaligned_*")) + nosync_pattern = re.compile("nosync") # If we're in a nosync folder if nosync_pattern.search(run_dir): - status = 'New' + status = "New" # If demux folder exist (or similar) - elif (os.path.exists(xten_dmux_folder) or 
unaligned_folder): - status = 'Demultiplexing' + elif os.path.exists(xten_dmux_folder) or unaligned_folder: + status = "Demultiplexing" # If RTAcomplete doesn't exist - elif not (os.path.exists(os.path.join(run_dir, 'RTAComplete.txt'))): - status = 'Sequencing' + elif not (os.path.exists(os.path.join(run_dir, "RTAComplete.txt"))): + status = "Sequencing" return status + def get_ss_projects(run_dir): """Fetches project, FC, lane & sample (sample-run) status for a given folder.""" proj_tree = Tree() - lane_pattern = re.compile('^([1-8]{1,2})$') - sample_proj_pattern = re.compile('^((P[0-9]{3,5})_[0-9]{3,5})') + lane_pattern = re.compile("^([1-8]{1,2})$") + sample_proj_pattern = re.compile("^((P[0-9]{3,5})_[0-9]{3,5})") run_name = os.path.basename(os.path.abspath(run_dir)) - run_date = run_name.split('_')[0] + run_date = run_name.split("_")[0] if len(run_date) == 6: - current_year = '20' + run_date[0:2] - elif len(run_name.split('_')[0]) == 8: # NovaSeqXPlus case + current_year = "20" + run_date[0:2] + elif len(run_name.split("_")[0]) == 8: # NovaSeqXPlus case current_year = run_date[0:4] - run_name_components = run_name.split('_') - if 'VH' in run_name_components[1]: + run_name_components = run_name.split("_") + if "VH" in run_name_components[1]: FCID = run_name_components[3] else: FCID = run_name_components[3][1:] miseq = False # FIXME: this check breaks if the system is case insensitive - if os.path.exists(os.path.join(run_dir, 'runParameters.xml')): - run_parameters_file = 'runParameters.xml' - elif os.path.exists(os.path.join(run_dir, 'RunParameters.xml')): - run_parameters_file = 'RunParameters.xml' + if os.path.exists(os.path.join(run_dir, "runParameters.xml")): + run_parameters_file = "runParameters.xml" + elif os.path.exists(os.path.join(run_dir, "RunParameters.xml")): + run_parameters_file = "RunParameters.xml" else: - logger.error(f'Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run_dir}') + logger.error( + f"Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run_dir}" + ) return [] rp = RunParametersParser(os.path.join(run_dir, run_parameters_file)) - if 'Setup' in rp.data['RunParameters']: - runtype = rp.data['RunParameters']['Setup'].get('Flowcell', '') + if "Setup" in rp.data["RunParameters"]: + runtype = rp.data["RunParameters"]["Setup"].get("Flowcell", "") if not runtype: - logger.warn('Parsing runParameters to fetch instrument type, ' - 'not found Flowcell information in it. Using ApplicationName') - runtype = rp.data['RunParameters']['Setup'].get('ApplicationName', '') - elif 'InstrumentType' in rp.data['RunParameters']: - runtype = rp.data['RunParameters'].get('InstrumentType') + logger.warn( + "Parsing runParameters to fetch instrument type, " + "not found Flowcell information in it. Using ApplicationName" + ) + runtype = rp.data["RunParameters"]["Setup"].get("ApplicationName", "") + elif "InstrumentType" in rp.data["RunParameters"]: + runtype = rp.data["RunParameters"].get("InstrumentType") else: - runtype = rp.data['RunParameters'].get('Application') + runtype = rp.data["RunParameters"].get("Application") if not runtype: - logger.warn("Couldn't find 'Application', could be NextSeq. Trying 'ApplicationName'") - runtype = rp.data['RunParameters'].get('ApplicationName', '') + logger.warn( + "Couldn't find 'Application', could be NextSeq. 
Trying 'ApplicationName'" + ) + runtype = rp.data["RunParameters"].get("ApplicationName", "") # Miseq case - if 'MiSeq' in runtype: - if os.path.exists(os.path.join(run_dir, 'Data', 'Intensities', 'BaseCalls', 'SampleSheet.csv')): - FCID_samplesheet_origin = os.path.join(run_dir, 'Data', 'Intensities', 'BaseCalls', 'SampleSheet.csv') - elif os.path.exists(os.path.join(run_dir, 'SampleSheet.csv')): - FCID_samplesheet_origin = os.path.join(run_dir, 'SampleSheet.csv') + if "MiSeq" in runtype: + if os.path.exists( + os.path.join(run_dir, "Data", "Intensities", "BaseCalls", "SampleSheet.csv") + ): + FCID_samplesheet_origin = os.path.join( + run_dir, "Data", "Intensities", "BaseCalls", "SampleSheet.csv" + ) + elif os.path.exists(os.path.join(run_dir, "SampleSheet.csv")): + FCID_samplesheet_origin = os.path.join(run_dir, "SampleSheet.csv") else: - logger.warn(f'No samplesheet found for {run_dir}') + logger.warn(f"No samplesheet found for {run_dir}") miseq = True lanes = str(1) # Pattern is a bit more rigid since we're no longer also checking for lanes - sample_proj_pattern=re.compile('^((P[0-9]{3,5})_[0-9]{3,5})$') + sample_proj_pattern = re.compile("^((P[0-9]{3,5})_[0-9]{3,5})$") data = parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=True) # HiSeq X case - elif 'HiSeq X' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['xten_samplesheets'], - current_year, f'{FCID}.csv') + elif "HiSeq X" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["xten_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # HiSeq 2500 case - elif 'HiSeq' in runtype or 'TruSeq' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['hiseq_samplesheets'], - current_year, f'{FCID}.csv') + elif "HiSeq" in runtype or "TruSeq" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["hiseq_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) - elif 'NovaSeqXPlus' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseqxplus_samplesheets'], - current_year, f'{FCID}.csv') + elif "NovaSeqXPlus" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["novaseqxplus_samplesheets"], + current_year, + f"{FCID}.csv", + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # NovaSeq 6000 case - elif 'NovaSeq' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseq_samplesheets'], - current_year, f'{FCID}.csv') + elif "NovaSeq" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["novaseq_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # NextSeq Case - elif 'NextSeq' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['nextseq_samplesheets'], - current_year, f'{FCID}.csv') + elif "NextSeq" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["nextseq_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) else: - logger.warn(f'Cannot locate the samplesheet for run {run_dir}') + logger.warn(f"Cannot locate the samplesheet for run {run_dir}") return [] # If samplesheet is empty, don't bother going through it if data == []: - return data + return data proj_n_sample = False lane = False @@ -245,87 +299,110 @@ def get_ss_projects(run_dir): lane = False if 
list(proj_tree.keys()) == []: - logger.info(f'INCORRECTLY FORMATTED SAMPLESHEET, CHECK {run_name}') + logger.info(f"INCORRECTLY FORMATTED SAMPLESHEET, CHECK {run_name}") return proj_tree + def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False): """Parses a samplesheet with SampleSheetParser - :param FCID_samplesheet_origin sample sheet path + :param FCID_samplesheet_origin sample sheet path """ data = [] try: ss_reader = SampleSheetParser(FCID_samplesheet_origin) data = ss_reader.data except: - logger.warn(f'Cannot initialize SampleSheetParser for {run_dir}. Most likely due to poor comma separation') + logger.warn( + f"Cannot initialize SampleSheetParser for {run_dir}. Most likely due to poor comma separation" + ) return [] if is_miseq: - if 'Description' not in ss_reader.header or not \ - ('Production' in ss_reader.header['Description'] or 'Application' in ss_reader.header['Description']): - logger.warn(f'Run {run_dir} not labelled as production or application. Disregarding it.') + if "Description" not in ss_reader.header or not ( + "Production" in ss_reader.header["Description"] + or "Application" in ss_reader.header["Description"] + ): + logger.warn( + f"Run {run_dir} not labelled as production or application. Disregarding it." + ) # Skip this run return [] return data + def error_emailer(flag, info): """Sends a custom error e-mail :param flag e-mail state :param info variable that describes the record in some way """ - recipients = CONFIG['mail']['recipients'] + recipients = CONFIG["mail"]["recipients"] # Failed_run: Samplesheet for a given project couldn't be found - body = 'TACA has encountered an issue that might be worth investigating\n' - body += 'The offending entry is: ' + body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " body += info - body += '\n\nSincerely, TACA' + body += "\n\nSincerely, TACA" - if (flag == 'no_samplesheet'): - subject='ERROR, Samplesheet error' - elif (flag == "failed_run"): - subject='WARNING, Reinitialization of partially failed FC' - elif (flag == 'weird_samplesheet'): - subject='ERROR, Incorrectly formatted samplesheet' + if flag == "no_samplesheet": + subject = "ERROR, Samplesheet error" + elif flag == "failed_run": + subject = "WARNING, Reinitialization of partially failed FC" + elif flag == "weird_samplesheet": + subject = "ERROR, Incorrectly formatted samplesheet" hour_now = datetime.datetime.now().hour if hour_now == 7 or hour_now == 12 or hour_now == 16: send_mail(subject, body, recipients) + def fail_run(runid, project): """Updates status of specified run or project-run to Failed.""" - statusdb_conf = CONFIG.get('statusdb') - logger.info('Connecting to status db: {}'.format(statusdb_conf.get('url'))) + statusdb_conf = CONFIG.get("statusdb") + logger.info("Connecting to status db: {}".format(statusdb_conf.get("url"))) try: status_db = statusdb.StatusdbSession(statusdb_conf).connection except Exception as e: - logger.error('Can not connect to status_db: https://{}:*****@{}'.format( - statusdb_conf.get('username'), - statusdb_conf.get('url'))) + logger.error( + "Can not connect to status_db: https://{}:*****@{}".format( + statusdb_conf.get("username"), statusdb_conf.get("url") + ) + ) logger.error(e) raise e - bioinfo_db = status_db['bioinfo_analysis'] + bioinfo_db = status_db["bioinfo_analysis"] if project is not None: - view = bioinfo_db.view('full_doc/pj_run_to_doc') + view = bioinfo_db.view("full_doc/pj_run_to_doc") rows = view[[project, runid]].rows - 
logger.info(f'Updating status of {len(rows)} objects with flowcell_id: {runid} and project_id {project}') + logger.info( + f"Updating status of {len(rows)} objects with flowcell_id: {runid} and project_id {project}" + ) else: - view = bioinfo_db.view('full_doc/run_id_to_doc') + view = bioinfo_db.view("full_doc/run_id_to_doc") rows = view[[runid]].rows - logger.info(f'Updating status of {len(rows)} objects with flowcell_id: {runid}') + logger.info(f"Updating status of {len(rows)} objects with flowcell_id: {runid}") new_timestamp = datetime.datetime.now().isoformat() updated = 0 for row in rows: - if row.value['status'] != 'Failed': - row.value['values'][new_timestamp] = {'sample_status' : 'Failed', 'user': 'taca'} - row.value['status'] = 'Failed' + if row.value["status"] != "Failed": + row.value["values"][new_timestamp] = { + "sample_status": "Failed", + "user": "taca", + } + row.value["status"] = "Failed" try: bioinfo_db.save(row.value) updated += 1 except Exception as e: - logger.error('Cannot update object project-sample-run-lane: {}-{}-{}-{}'.format(row.value.get('project_id'), row.value.get('sample'), row.value.get('run_id'), row.value.get('lane'))) + logger.error( + "Cannot update object project-sample-run-lane: {}-{}-{}-{}".format( + row.value.get("project_id"), + row.value.get("sample"), + row.value.get("run_id"), + row.value.get("lane"), + ) + ) logger.error(e) raise e - logger.info(f'Successfully updated {updated} objects') + logger.info(f"Successfully updated {updated} objects") diff --git a/taca/utils/cli.py b/taca/utils/cli.py index 4fa3eafa..3bef6eef 100644 --- a/taca/utils/cli.py +++ b/taca/utils/cli.py @@ -4,26 +4,31 @@ import taca.utils.bioinfo_tab as bt -@click.group(name='bioinfo_deliveries') +@click.group(name="bioinfo_deliveries") def bioinfo_deliveries(): """Update statusdb with information about FC entry point.""" pass + # bioinfo subcommands @bioinfo_deliveries.command() -@click.argument('rundir') +@click.argument("rundir") def updaterun(rundir): """Saves the bioinfo data to statusdb.""" bt.update_statusdb(rundir) + @bioinfo_deliveries.command() def update(): """Saves the bioinfo data of everything that can be found to statusdb.""" bt.collect_runs() -@bioinfo_deliveries.command(name='fail_run') -@click.argument('runid') -@click.option('-p','--project', is_flag=False, help='Fail run for the specified project') + +@bioinfo_deliveries.command(name="fail_run") +@click.argument("runid") +@click.option( + "-p", "--project", is_flag=False, help="Fail run for the specified project" +) def fail_run(runid, project=None): """Updates the status of the specified run to 'Failed'. Example of RUNID: 170113_ST-E00269_0163_BHCVH7ALXX""" diff --git a/taca/utils/config.py b/taca/utils/config.py index 1a6fd6a1..e2710ba7 100644 --- a/taca/utils/config.py +++ b/taca/utils/config.py @@ -4,6 +4,7 @@ CONFIG = {} + def load_config(config_file): """Loads a configuration file.""" config = {} @@ -16,6 +17,7 @@ def load_config(config_file): e.message = f'Could not open configuration file "{config_file}".' 
raise e + def load_yaml_config(config_file): """Load YAML config file diff --git a/taca/utils/filesystem.py b/taca/utils/filesystem.py index 957bf818..a001615e 100644 --- a/taca/utils/filesystem.py +++ b/taca/utils/filesystem.py @@ -3,8 +3,9 @@ import os import shutil -RUN_RE = '^\d{6,8}_[a-zA-Z\d\-]+_\d{2,}_[AB0][A-Z\d\-]+$' -RUN_RE_ONT = '^(\d{8})_(\d{4})_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$' +RUN_RE = "^\d{6,8}_[a-zA-Z\d\-]+_\d{2,}_[AB0][A-Z\d\-]+$" +RUN_RE_ONT = "^(\d{8})_(\d{4})_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$" + @contextlib.contextmanager def chdir(new_dir): @@ -16,13 +17,14 @@ def chdir(new_dir): finally: os.chdir(cur_dir) + def create_folder(target_folder): - """ Ensure that a folder exists and create it if it doesn't, including any - parent folders, as necessary. + """Ensure that a folder exists and create it if it doesn't, including any + parent folders, as necessary. - :param target_folder: the target folder - :returns: True if the folder exists or was created, False if the folder - does not exists and could not be created + :param target_folder: the target folder + :returns: True if the folder exists or was created, False if the folder + does not exists and could not be created """ try: os.makedirs(target_folder) @@ -30,14 +32,17 @@ def create_folder(target_folder): pass return os.path.exists(target_folder) + def touch(file): - open(file, 'w').close() + open(file, "w").close() + def do_symlink(src_file, dst_file): link_f = os.symlink if not os.path.isfile(dst_file): link_f(os.path.realpath(src_file), dst_file) + def do_copy(src_path, dst_path): # copies folder structure and files (recursively) # if symlinks, will copy content, not the links diff --git a/taca/utils/misc.py b/taca/utils/misc.py index 3a4a1d68..ec05d4ff 100755 --- a/taca/utils/misc.py +++ b/taca/utils/misc.py @@ -19,17 +19,18 @@ def send_mail(subject, content, receiver): :param str receiver: Address to send the email """ if not receiver: - raise SystemExit('No receiver was given to send mail') + raise SystemExit("No receiver was given to send mail") msg = MIMEText(content) - msg['Subject'] = f'TACA - {subject}' - msg['From'] = 'TACA@scilifelab.se' - msg['to'] = receiver + msg["Subject"] = f"TACA - {subject}" + msg["From"] = "TACA@scilifelab.se" + msg["to"] = receiver - s = smtplib.SMTP('localhost') - s.sendmail('TACA', [receiver], msg.as_string()) + s = smtplib.SMTP("localhost") + s.sendmail("TACA", [receiver], msg.as_string()) s.quit() -def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''): + +def call_external_command(cl, with_log_files=False, prefix=None, log_dir=""): """Executes an external command. 
:param string cl: Command line to be executed (command + options and parameters) @@ -38,33 +39,34 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''): :param string log_dir: where to write the log file (to avoid problems with rights) """ if type(cl) == str: - cl = cl.split(' ') + cl = cl.split(" ") logFile = os.path.basename(cl[0]) stdout = sys.stdout stderr = sys.stderr if with_log_files: if prefix: - logFile = f'{prefix}_{logFile}' + logFile = f"{prefix}_{logFile}" # Create log dir if it didn't exist in CWD if log_dir and not os.path.exists(log_dir): os.mkdir(log_dir) logFile = os.path.join(log_dir, logFile) - stdout = open(logFile + '.out', 'a') - stderr = open(logFile + '.err', 'a') - started = 'Started command {} on {}'.format(' '.join(cl), datetime.now()) - stdout.write(started + '\n') - stdout.write(''.join(['=']*len(cl)) + '\n') + stdout = open(logFile + ".out", "a") + stderr = open(logFile + ".err", "a") + started = "Started command {} on {}".format(" ".join(cl), datetime.now()) + stdout.write(started + "\n") + stdout.write("".join(["="] * len(cl)) + "\n") try: subprocess.check_call(cl, stdout=stdout, stderr=stderr) except subprocess.CalledProcessError as e: - e.message = 'The command {} failed.'.format(' '.join(cl)) + e.message = "The command {} failed.".format(" ".join(cl)) raise e finally: if with_log_files: stdout.close() stderr.close() + def call_external_command_detached(cl, with_log_files=False, prefix=None): """Executes an external command. @@ -72,24 +74,24 @@ def call_external_command_detached(cl, with_log_files=False, prefix=None): :param bool with_log_files: Create log files for stdout and stderr """ if type(cl) == str: - cl = cl.split(' ') + cl = cl.split(" ") command = os.path.basename(cl[0]) stdout = sys.stdout stderr = sys.stderr if with_log_files: if prefix: - command = f'{prefix}_{command}' - stdout = open(command + '.out', 'a') - stderr = open(command + '.err', 'a') - started = 'Started command {} on {}'.format(' '.join(cl), datetime.now()) - stdout.write(started + '\n') - stdout.write(''.join(['=']*len(cl)) + '\n') + command = f"{prefix}_{command}" + stdout = open(command + ".out", "a") + stderr = open(command + ".err", "a") + started = "Started command {} on {}".format(" ".join(cl), datetime.now()) + stdout.write(started + "\n") + stdout.write("".join(["="] * len(cl)) + "\n") try: p_handle = subprocess.Popen(cl, stdout=stdout, stderr=stderr) except subprocess.CalledProcessError as e: - e.message = 'The command {} failed.'.format(' '.join(cl)) + e.message = "The command {} failed.".format(" ".join(cl)) raise e finally: if with_log_files: @@ -97,6 +99,7 @@ def call_external_command_detached(cl, with_log_files=False, prefix=None): stderr.close() return p_handle + def to_seconds(days=None, hours=None): """Convert given day/hours to seconds and return. @@ -115,7 +118,8 @@ def to_seconds(days=None, hours=None): # 1 hour == 60*60 seconds --> 3600 return 3600 * hours -def hashfile(afile, hasher='sha1', blocksize=65536): + +def hashfile(afile, hasher="sha1", blocksize=65536): """Calculate the hash digest of a file with the specified algorithm and return it. 
@@ -129,14 +133,15 @@ def hashfile(afile, hasher='sha1', blocksize=65536): if not os.path.isfile(afile): return None hashobj = hashlib.new(hasher) - with open(afile,'rb') as fh: + with open(afile, "rb") as fh: buf = fh.read(blocksize) while len(buf) > 0: hashobj.update(buf) buf = fh.read(blocksize) return hashobj.hexdigest() -def query_yes_no(question, default='yes', force=False): + +def query_yes_no(question, default="yes", force=False): """Ask a yes/no question via raw_input() and return their answer. "question" is a string that is presented to the user. "default" is the presumed answer if the user just hits . It must be @@ -149,14 +154,13 @@ def query_yes_no(question, default='yes', force=False): :param force: set answer to default :returns: yes or no """ - valid = {'yes': True, 'y': True, 'ye': True, - 'no': False, 'n': False} + valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} if default is None: - prompt = ' [y/n] ' - elif default == 'yes': - prompt = ' [Y/n] ' - elif default == 'no': - prompt = ' [y/N] ' + prompt = " [y/n] " + elif default == "yes": + prompt = " [Y/n] " + elif default == "no": + prompt = " [y/N] " else: raise ValueError('invalid default answer: "%s"' % default) @@ -165,56 +169,60 @@ def query_yes_no(question, default='yes', force=False): if not force: choice = input().lower() else: - choice = 'yes' - if default is not None and choice == '': + choice = "yes" + if default is not None and choice == "": return valid[default] elif choice in valid: return valid[choice] else: - sys.stdout.write('Please respond with "yes" or "no" '\ - '(or "y" or "n").\n') + sys.stdout.write('Please respond with "yes" or "no" ' '(or "y" or "n").\n') + def return_unique(seq): seen = set() seen_add = seen.add - return [ x for x in seq if not (x in seen or seen_add(x))] + return [x for x in seq if not (x in seen or seen_add(x))] + def run_is_demuxed(run, couch_info=None, seq_run_type=None): - """ + """ For ONT runs: check that .sync_finished exists, which is created by TACA when the sync is finalized. Since demux is done on the sequencers in parallel to sequencing, the presence of this file also implies that demux is done. 
- + For Illumina runs: Check in StatusDB 'x_flowcells' database if the given run has an entry which means it was demultiplexed (as TACA only creates a document upon successfull demultiplexing) :param dict couch_info: a dict with 'statusDB' info """ - if seq_run_type in ['promethion', 'minion']: + if seq_run_type in ["promethion", "minion"]: if os.path.exists(os.path.join(run.abs_path, ".sync_finished")): return True else: return False else: if not couch_info: - raise SystemExit('To check for demultiplexing is enabled in config file but no "statusDB" info was given') - run_terms = run.name.split('_') + raise SystemExit( + 'To check for demultiplexing is enabled in config file but no "statusDB" info was given' + ) + run_terms = run.name.split("_") run_date = run_terms[0] - if len(run_date)>6: + if len(run_date) > 6: run_date = run_date[2:] run_fc = run_terms[-1] - run_name = f'{run_date}_{run_fc}' + run_name = f"{run_date}_{run_fc}" try: couch_connection = statusdb.StatusdbSession(couch_info).connection - fc_db = couch_connection[couch_info['xten_db']] - for fc in fc_db.view('names/name', reduce=False, descending=True): + fc_db = couch_connection[couch_info["xten_db"]] + for fc in fc_db.view("names/name", reduce=False, descending=True): if fc.key != run_name: continue fc_doc = fc_db.get(fc.id) - if not fc_doc or not fc_doc.get('illumina', {}).get('Demultiplex_Stats', {}): + if not fc_doc or not fc_doc.get("illumina", {}).get( + "Demultiplex_Stats", {} + ): return False return True except Exception as e: raise e - diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 4bbb70a4..548b87b0 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -8,17 +8,19 @@ logger = logging.getLogger(__name__) + class StatusdbSession: """Wrapper class for couchdb.""" + def __init__(self, config, db=None): - user = config.get('username') - password = config.get('password') - url = config.get('url') - url_string = f'https://{user}:{password}@{url}' - display_url_string = 'https://{}:{}@{}'.format(user, '*********', url) + user = config.get("username") + password = config.get("password") + url = config.get("url") + url_string = f"https://{user}:{password}@{url}" + display_url_string = "https://{}:{}@{}".format(user, "*********", url) self.connection = couchdb.Server(url=url_string) if not self.connection: - raise Exception(f'Couchdb connection failed for url {display_url_string}') + raise Exception(f"Couchdb connection failed for url {display_url_string}") if db: self.db_connection = self.connection[db] @@ -40,9 +42,11 @@ def save_db_doc(self, doc, db=None): db = db or self.db db.save(doc) except Exception as e: - raise Exception(f'Failed saving document due to {e}') + raise Exception(f"Failed saving document due to {e}") - def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format='%Y-%m-%d'): + def get_project_flowcell( + self, project_id, open_date="2015-01-01", date_format="%Y-%m-%d" + ): """From information available in flowcell db connection, collect the flowcell this project was sequenced. 
@@ -53,64 +57,87 @@ def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format=' try: open_date = datetime.strptime(open_date, date_format) except: - open_date = datetime.strptime('2015-01-01', '%Y-%m-%d') + open_date = datetime.strptime("2015-01-01", "%Y-%m-%d") project_flowcells = {} - date_sorted_fcs = sorted(list(self.proj_list.keys()), key=lambda k: datetime.strptime(k.split('_')[0], '%y%m%d'), reverse=True) + date_sorted_fcs = sorted( + list(self.proj_list.keys()), + key=lambda k: datetime.strptime(k.split("_")[0], "%y%m%d"), + reverse=True, + ) for fc in date_sorted_fcs: - fc_date, fc_name = fc.split('_') - if datetime.strptime(fc_date,'%y%m%d') < open_date: + fc_date, fc_name = fc.split("_") + if datetime.strptime(fc_date, "%y%m%d") < open_date: break - if project_id in self.proj_list[fc] and fc_name not in project_flowcells.keys(): - project_flowcells[fc_name] = {'name':fc_name,'run_name':fc, 'date':fc_date, 'db':self.db.name} + if ( + project_id in self.proj_list[fc] + and fc_name not in project_flowcells.keys() + ): + project_flowcells[fc_name] = { + "name": fc_name, + "run_name": fc, + "date": fc_date, + "db": self.db.name, + } return project_flowcells + class ProjectSummaryConnection(StatusdbSession): - def __init__(self, config, dbname='projects'): + def __init__(self, config, dbname="projects"): super().__init__(config) self.db = self.connection[dbname] - self.name_view = {k.key: k.id for k in self.db.view('project/project_name', reduce=False)} - self.id_view = {k.key: k.id for k in self.db.view('project/project_id', reduce=False)} + self.name_view = { + k.key: k.id for k in self.db.view("project/project_name", reduce=False) + } + self.id_view = { + k.key: k.id for k in self.db.view("project/project_id", reduce=False) + } class FlowcellRunMetricsConnection(StatusdbSession): - def __init__(self, config, dbname='flowcells'): + def __init__(self, config, dbname="flowcells"): super().__init__(config) self.db = self.connection[dbname] - self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)} - self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key} + self.name_view = {k.key: k.id for k in self.db.view("names/name", reduce=False)} + self.proj_list = { + k.key: k.value + for k in self.db.view("names/project_ids_list", reduce=False) + if k.key + } class X_FlowcellRunMetricsConnection(StatusdbSession): - def __init__(self, config, dbname='x_flowcells'): + def __init__(self, config, dbname="x_flowcells"): super().__init__(config) self.db = self.connection[dbname] - self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)} - self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key} + self.name_view = {k.key: k.id for k in self.db.view("names/name", reduce=False)} + self.proj_list = { + k.key: k.value + for k in self.db.view("names/project_ids_list", reduce=False) + if k.key + } class NanoporeRunsConnection(StatusdbSession): - - def __init__(self, config, dbname='nanopore_runs'): + def __init__(self, config, dbname="nanopore_runs"): super().__init__(config) self.db = self.connection[dbname] def check_run_exists(self, ont_run) -> bool: - view_names = self.db.view('names/name') + view_names = self.db.view("names/name") if len(view_names[ont_run.run_name].rows) > 0: return True else: return False - + def check_run_status(self, ont_run) -> str: - view_all_stats = self.db.view('names/name') + view_all_stats = 
self.db.view("names/name") doc_id = view_all_stats[ont_run.run_name].rows[0].id return self.db[doc_id]["run_status"] def create_ongoing_run( self, ont_run, run_path_file: str, pore_count_history_file: str ): - run_path = open(run_path_file).read().strip() pore_counts = [] @@ -130,7 +157,7 @@ def create_ongoing_run( ) def finish_ongoing_run(self, ont_run, dict_json: dict): - view_names = self.db.view('names/name') + view_names = self.db.view("names/name") doc_id = view_names[ont_run.run_name].rows[0].id doc = self.db[doc_id] @@ -140,23 +167,23 @@ def finish_ongoing_run(self, ont_run, dict_json: dict): def update_doc(db, obj, over_write_db_entry=False): - view = db.view('info/name') - if len(view[obj['name']].rows) == 1: - remote_doc = view[obj['name']].rows[0].value - doc_id = remote_doc.pop('_id') - doc_rev = remote_doc.pop('_rev') + view = db.view("info/name") + if len(view[obj["name"]].rows) == 1: + remote_doc = view[obj["name"]].rows[0].value + doc_id = remote_doc.pop("_id") + doc_rev = remote_doc.pop("_rev") if remote_doc != obj: if not over_write_db_entry: obj = merge_dicts(obj, remote_doc) - obj['_id'] = doc_id - obj['_rev'] = doc_rev + obj["_id"] = doc_id + obj["_rev"] = doc_rev db[doc_id] = obj - logger.info('Updating {}'.format(obj['name'])) - elif len(view[obj['name']].rows) == 0: + logger.info("Updating {}".format(obj["name"])) + elif len(view[obj["name"]].rows) == 0: db.save(obj) - logger.info('Saving {}'.format(obj['name'])) + logger.info("Saving {}".format(obj["name"])) else: - logger.warn('More than one row with name {} found'.format(obj['name'])) + logger.warn("More than one row with name {} found".format(obj["name"])) def merge_dicts(d1, d2): @@ -170,8 +197,10 @@ def merge_dicts(d1, d2): elif d1[key] == d2[key]: pass # same leaf value else: - logger.debug(f'Values for key {key} in d1 and d2 differ, ' - 'using the value of d1') + logger.debug( + f"Values for key {key} in d1 and d2 differ, " + "using the value of d1" + ) else: d1[key] = d2[key] return d1 diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py index 2968d0f9..b04eace5 100644 --- a/taca/utils/transfer.py +++ b/taca/utils/transfer.py @@ -13,47 +13,43 @@ class TransferAgent: """ - (Abstract) superclass representing an Agent that performs file transfers. - Agents implementing specific methods for transferring files should extend - this and implement the transfer() method. + (Abstract) superclass representing an Agent that performs file transfers. + Agents implementing specific methods for transferring files should extend + this and implement the transfer() method. 
""" - def __init__( - self, - src_path=None, - dest_path=None, - opts={}, - **kwargs): - """ Creates an agent instance - :param string src_path: the file or folder that should be transferred - :param string dest_path: the destination file or folder - :param bool validate: whether to validate the transferred files - :param opts: options that will be passed to the transfer command + + def __init__(self, src_path=None, dest_path=None, opts={}, **kwargs): + """Creates an agent instance + :param string src_path: the file or folder that should be transferred + :param string dest_path: the destination file or folder + :param bool validate: whether to validate the transferred files + :param opts: options that will be passed to the transfer command """ self.src_path = src_path self.dest_path = dest_path - self.validate = kwargs.get('validate', False) + self.validate = kwargs.get("validate", False) self.cmdopts = opts def __str__(self): return type(self).__name__ def format_options(self): - """ Format the options dictionary stored in this instance's cmdopts - attribute and return the formatted options as a list of strings. - A key in the dictionary represents the option name. If - the corresponding value is None, the option will be assumed to - represent a flag. If the value is a list, the option will be given - multiple times. + """Format the options dictionary stored in this instance's cmdopts + attribute and return the formatted options as a list of strings. + A key in the dictionary represents the option name. If + the corresponding value is None, the option will be assumed to + represent a flag. If the value is a list, the option will be given + multiple times. - For example: + For example: - opts = {'opt1': None, 'opt2': 'val1', 'opt3': ['val2','val3']} + opts = {'opt1': None, 'opt2': 'val1', 'opt3': ['val2','val3']} - will be expanded to: + will be expanded to: - ['--opt1','--opt2=val1','--opt3=val2','--opt3=val3'] + ['--opt1','--opt2=val1','--opt3=val2','--opt3=val3'] - :returns: List of formatted options as strings + :returns: List of formatted options as strings """ cmdopts = [] for param, val in self.cmdopts.items(): @@ -63,51 +59,55 @@ def format_options(self): if type(val) == str: val = [val] for v in val: - cmdopts.append(f'{param}={v}') + cmdopts.append(f"{param}={v}") return cmdopts def transfer(self): """Abstract method, should be implemented by subclasses.""" - raise NotImplementedError('This method should be implemented by subclass') + raise NotImplementedError("This method should be implemented by subclass") def validate_src_path(self): """Validates that the src_path attribute of the Agent instance. - :raises transfer.TransferError: if src_path is not valid + :raises transfer.TransferError: if src_path is not valid """ if self.src_path is None: raise TransferError( - msg='src_path cannot be None', + msg="src_path cannot be None", src_path=self.src_path, - dest_path=self.dest_path) + dest_path=self.dest_path, + ) if not os.path.exists(self.src_path): raise TransferError( msg=f'src_path "{self.src_path}" does not exist', src_path=self.src_path, - dest_path=self.dest_path) + dest_path=self.dest_path, + ) def validate_dest_path(self): """Validates that the dest_path attribute of the Agent instance. 
- :raises transfer.TransferError: if dest_path is not valid + :raises transfer.TransferError: if dest_path is not valid """ if self.dest_path is None: raise TransferError( - msg='dest_path cannot be None', + msg="dest_path cannot be None", src_path=self.src_path, - dest_path=self.dest_path) + dest_path=self.dest_path, + ) def validate_transfer(self): """Abstract method, should be implemented by subclasses.""" - raise NotImplementedError('This method should be implemented by subclass') + raise NotImplementedError("This method should be implemented by subclass") class RsyncAgent(TransferAgent): """An agent that knows how to perform an rsync transfer locally or - between hosts. If supplied with a checksum file, the transfer can - be validated on the receiving side. + between hosts. If supplied with a checksum file, the transfer can + be validated on the receiving side. """ - CMD = 'rsync' + + CMD = "rsync" DEFAULT_OPTS = { - '-a': None, + "-a": None, } def __init__( @@ -119,68 +119,65 @@ def __init__( validate=True, digestfile=None, opts=None, - **kwargs): + **kwargs, + ): """Creates an RsyncAgent instance - :param string src_path: the file or folder that should be transferred - :param string dest_path: the destination file or folder - :param string remote_host: the remote host to transfer to. - If None, the transfer will be on the local filesystem - :param string remote_user: the remote user to connect with. - If None, the local user will be used - :param bool validate: whether to validate the transferred files - using a supplied file with checksums - :param string digestfile: a file with checksums for the files to be - transferred. Must be specified if validate is True. The checksum - algorithm will be inferred from the extension of the digest file - :param opts: options that will be passed to the rsync command + :param string src_path: the file or folder that should be transferred + :param string dest_path: the destination file or folder + :param string remote_host: the remote host to transfer to. + If None, the transfer will be on the local filesystem + :param string remote_user: the remote user to connect with. + If None, the local user will be used + :param bool validate: whether to validate the transferred files + using a supplied file with checksums + :param string digestfile: a file with checksums for the files to be + transferred. Must be specified if validate is True. The checksum + algorithm will be inferred from the extension of the digest file + :param opts: options that will be passed to the rsync command """ super().__init__( src_path=src_path, dest_path=dest_path, opts=opts or self.DEFAULT_OPTS, validate=validate, - **kwargs) + **kwargs, + ) self.remote_host = remote_host self.remote_user = remote_user self.digestfile = digestfile def transfer(self, transfer_log=None): """Execute the transfer as set up by this instance and, if requested, - validate the transfer. + validate the transfer. 
- :param string transfer_log: path prefix to log files where stderr - and stdout streams will be directed if this option is specified - :returns True on success, False if the validation failed - :raises transfer.TransferError: if src_path or dest_path were not valid - :raises transfer.RsyncError: if the rsync command did not exit successfully + :param string transfer_log: path prefix to log files where stderr + and stdout streams will be directed if this option is specified + :returns True on success, False if the validation failed + :raises transfer.TransferError: if src_path or dest_path were not valid + :raises transfer.RsyncError: if the rsync command did not exit successfully """ self.validate_src_path() self.validate_dest_path() - command = [self.CMD] + self.format_options() + [self.src_path, self.remote_path()] + command = ( + [self.CMD] + self.format_options() + [self.src_path, self.remote_path()] + ) try: call_external_command( - command, - with_log_files=(transfer_log is not None), - prefix=transfer_log) + command, with_log_files=(transfer_log is not None), prefix=transfer_log + ) except subprocess.CalledProcessError as e: raise RsyncError(e) return (not self.validate) or self.validate_transfer() def remote_path(self): """Construct the remote path according to what has been specified. - :returns: the remote path string on the form - [remote_user]@[remote_host]:[dest_path] + :returns: the remote path string on the form + [remote_user]@[remote_host]:[dest_path] """ - return '{}{}{}'.format( - f'{self.remote_user}@' \ - if self.remote_user is not None \ - else '', - f'{self.remote_host}:' \ - if self.remote_host is not None \ - else '', - self.dest_path \ - if self.dest_path is not None \ - else '' + return "{}{}{}".format( + f"{self.remote_user}@" if self.remote_user is not None else "", + f"{self.remote_host}:" if self.remote_host is not None else "", + self.dest_path if self.dest_path is not None else "", ) def validate_dest_path(self): @@ -192,63 +189,60 @@ def validate_dest_path(self): """ if self.dest_path is None and self.remote_host is None: raise TransferError( - msg='dest_path and remote_host cannot both be None', - src_path=self.src_path) + msg="dest_path and remote_host cannot both be None", + src_path=self.src_path, + ) if self.remote_user is not None and self.remote_host is None: raise TransferError( - msg='dest_path cannot be None if remote_user is not None', - src_path=self.src_path) + msg="dest_path cannot be None if remote_user is not None", + src_path=self.src_path, + ) def validate_transfer(self): """Validate the transferred files by computing checksums and comparing - to the pre-computed checksums, supplied in the digestfile attribute - of this Agent instance. The hash algorithm is inferred from the file - extension of the digestfile. The paths of the files to check are - assumed to be relative to the location of the digestfile. + to the pre-computed checksums, supplied in the digestfile attribute + of this Agent instance. The hash algorithm is inferred from the file + extension of the digestfile. The paths of the files to check are + assumed to be relative to the location of the digestfile. - Currently not implemented for remote transfers. + Currently not implemented for remote transfers. - :returns: False if any checksum does not match, or if a file does - not exist. True otherwise. - :raises transfer.RsyncValidationError: if the digestfile was not - supplied + :returns: False if any checksum does not match, or if a file does + not exist. True otherwise. 
+ :raises transfer.RsyncValidationError: if the digestfile was not + supplied """ if self.remote_host is not None: - raise NotImplementedError('Validation on remote host not implemented') + raise NotImplementedError("Validation on remote host not implemented") try: with open(self.digestfile) as fh: - hasher = self.digestfile.split('.')[-1] + hasher = self.digestfile.split(".")[-1] dpath = os.path.dirname(self.digestfile) for line in fh: - digest,fpath = line.split() - tfile = os.path.join(dpath,fpath) + digest, fpath = line.split() + tfile = os.path.join(dpath, fpath) if not os.path.exists(tfile) or digest != hashfile( - tfile, - hasher=hasher): + tfile, hasher=hasher + ): return False except TypeError: raise RsyncValidationError( - 'no digest file specified', - self.src_path, - self.dest_path) + "no digest file specified", self.src_path, self.dest_path + ) return True class SymlinkAgent(TransferAgent): - def __init__(self, src_path, dest_path, overwrite=True, relative=True, **kwargs): """Creates an SymlinkAgent instance for creating symlinks. - :param string src_path: the file or folder that should be symlinked - :param string dest_path: the destination symlink - :param bool overwrite: if true, the destination file or folder will - be overwritten if it already exists - :param bool relative: if true, the destination symlink will be relative + :param string src_path: the file or folder that should be symlinked + :param string dest_path: the destination symlink + :param bool overwrite: if true, the destination file or folder will + be overwritten if it already exists + :param bool relative: if true, the destination symlink will be relative """ - super().__init__( - src_path=src_path, - dest_path=dest_path, - **kwargs) + super().__init__(src_path=src_path, dest_path=dest_path, **kwargs) self.overwrite = overwrite self.relative = relative @@ -267,20 +261,22 @@ def transfer(self): # If the existing target is a symlink that points to the # source, we're all good if self.validate_transfer(): - logger.debug('target exists and points to the correct ' - f'source path: "{self.src_path}"') + logger.debug( + "target exists and points to the correct " + f'source path: "{self.src_path}"' + ) return True # If we are not overwriting, return False if not self.overwrite: - logger.debug(f'target "{self.dest_path}" exists and will not be ' - 'overwritten') + logger.debug( + f'target "{self.dest_path}" exists and will not be ' "overwritten" + ) return False # If the target is a mount, let's not mess with it if os.path.ismount(self.dest_path): - raise SymlinkError('target exists and is a mount') + raise SymlinkError("target exists and is a mount") # If the target is a link or a file, we remove it - if os.path.islink(self.dest_path) or \ - os.path.isfile(self.dest_path): + if os.path.islink(self.dest_path) or os.path.isfile(self.dest_path): logger.debug(f'removing existing target file "{self.dest_path}"') try: os.unlink(self.dest_path) @@ -296,28 +292,32 @@ def transfer(self): raise SymlinkError(e) # If it's something else, let's bail out else: - raise SymlinkError('target exists and will not be overwritten') + raise SymlinkError("target exists and will not be overwritten") if not create_folder(os.path.dirname(self.dest_path)): - raise SymlinkError('failed to create target folder hierarchy') + raise SymlinkError("failed to create target folder hierarchy") try: # If we should create a relative symlink, determine the relative path os.symlink( - os.path.relpath(self.src_path,os.path.dirname(self.dest_path)) \ - if 
self.relative else self.src_path, - self.dest_path) + os.path.relpath(self.src_path, os.path.dirname(self.dest_path)) + if self.relative + else self.src_path, + self.dest_path, + ) except OSError as e: raise SymlinkError(e) return (not self.validate) or self.validate_transfer() def validate_transfer(self): """Validates the symlinked files by verifying that the dest_path was - created, is a link and resolves to the same file as src_path. + created, is a link and resolves to the same file as src_path. - :returns: True if link is valid, False otherwise + :returns: True if link is valid, False otherwise """ - return os.path.exists(self.dest_path) and \ - os.path.islink(self.dest_path) and \ - os.path.samefile(self.src_path, self.dest_path) + return ( + os.path.exists(self.dest_path) + and os.path.islink(self.dest_path) + and os.path.samefile(self.src_path, self.dest_path) + ) class TransferError(Exception): @@ -326,6 +326,14 @@ def __init__(self, msg, src_path=None, dest_path=None): self.src_path = src_path self.dest_path = dest_path -class SymlinkError(TransferError): pass -class RsyncError(TransferError): pass -class RsyncValidationError(TransferError): pass + +class SymlinkError(TransferError): + pass + + +class RsyncError(TransferError): + pass + + +class RsyncValidationError(TransferError): + pass diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 173c57dd..114f8316 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -9,7 +9,8 @@ from taca.analysis import analysis as an from taca.utils import config -CONFIG = config.load_yaml_config('data/taca_test_cfg.yaml') +CONFIG = config.load_yaml_config("data/taca_test_cfg.yaml") + class TestAnalysis(unittest.TestCase): """Tests for the Analysis functions.""" @@ -28,22 +29,36 @@ def setUpClass(self): | |__ RTAComplete.txt | |__ SampleSheet.csv """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') - self.completed = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") + self.completed = os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX") # Create runs directory structure os.makedirs(self.tmp_dir) - os.makedirs(os.path.join(self.completed, 'Demultiplexing', 'Stats')) + os.makedirs(os.path.join(self.completed, "Demultiplexing", "Stats")) # Set up files - open(os.path.join(self.completed, 'RTAComplete.txt'), 'w').close() - shutil.copy('data/samplesheet.csv', os.path.join(self.completed, 'SampleSheet.csv')) - open(os.path.join(self.completed, 'Demultiplexing', 'Stats', 'DemultiplexingStats.xml'), 'w').close() - open(os.path.join(self.completed, 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz'), 'w').close() - with open(os.path.join(self.completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w') as stats_json: - json.dump({'silly': 1}, stats_json) - shutil.copy('data/RunInfo.xml', self.completed) - shutil.copy('data/runParameters.xml', self.completed) + open(os.path.join(self.completed, "RTAComplete.txt"), "w").close() + shutil.copy( + "data/samplesheet.csv", os.path.join(self.completed, "SampleSheet.csv") + ) + open( + os.path.join( + self.completed, "Demultiplexing", "Stats", "DemultiplexingStats.xml" + ), + "w", + ).close() + open( + os.path.join( + self.completed, "Demultiplexing", "Undetermined_S0_L001_R1_001.fastq.gz" + ), + "w", + ).close() + with open( + os.path.join(self.completed, "Demultiplexing", "Stats", "Stats.json"), "w" + ) as stats_json: + json.dump({"silly": 1}, stats_json) + shutil.copy("data/RunInfo.xml", 
self.completed) + shutil.copy("data/runParameters.xml", self.completed) @classmethod def tearDownClass(self): @@ -51,127 +66,159 @@ def tearDownClass(self): def test_get_runObj_miseq(self): """Return MiSeq run object.""" - miseq_run = os.path.join(self.tmp_dir, '141124_ST-MISEQ1_01_AFCIDXX') + miseq_run = os.path.join(self.tmp_dir, "141124_ST-MISEQ1_01_AFCIDXX") os.mkdir(miseq_run) - shutil.copy('data/runParameters_miseq.xml', os.path.join(miseq_run, 'runParameters.xml')) + shutil.copy( + "data/runParameters_miseq.xml", os.path.join(miseq_run, "runParameters.xml") + ) got_miseq_run = an.get_runObj(miseq_run) - self.assertEqual(got_miseq_run.sequencer_type, 'MiSeq') + self.assertEqual(got_miseq_run.sequencer_type, "MiSeq") def test_get_runObj_nextseq(self): """Return NextSeq run object.""" - nextseq_run = os.path.join(self.tmp_dir, '141124_ST-NEXTSEQ1_01_AFCIDXX') + nextseq_run = os.path.join(self.tmp_dir, "141124_ST-NEXTSEQ1_01_AFCIDXX") os.mkdir(nextseq_run) - shutil.copy('data/runParameters_nextseq.xml', os.path.join(nextseq_run, 'runParameters.xml')) + shutil.copy( + "data/runParameters_nextseq.xml", + os.path.join(nextseq_run, "runParameters.xml"), + ) got_nextseq_run = an.get_runObj(nextseq_run) - self.assertEqual(got_nextseq_run.sequencer_type, 'NextSeq') + self.assertEqual(got_nextseq_run.sequencer_type, "NextSeq") def test_get_runObj_novaseq(self): """Return NovaSeq run object.""" - novaseq_run = os.path.join(self.tmp_dir, '141124_ST-NOVASEQ1_01_AFCIDXX') + novaseq_run = os.path.join(self.tmp_dir, "141124_ST-NOVASEQ1_01_AFCIDXX") os.mkdir(novaseq_run) - shutil.copy('data/runParameters_novaseq.xml', os.path.join(novaseq_run, 'RunParameters.xml')) + shutil.copy( + "data/runParameters_novaseq.xml", + os.path.join(novaseq_run, "RunParameters.xml"), + ) got_novaseq_run = an.get_runObj(novaseq_run) - self.assertEqual(got_novaseq_run.sequencer_type, 'NovaSeq') + self.assertEqual(got_novaseq_run.sequencer_type, "NovaSeq") - @mock.patch('taca.analysis.analysis.get_runObj') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') + @mock.patch("taca.analysis.analysis.get_runObj") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") def test_upload_to_statusdb(self, mock_upload_to_statusdb, mock_get_runobj): """Get run object and initiate upload to statusdb.""" - mock_get_runobj.return_value = 'Standard_run_object' + mock_get_runobj.return_value = "Standard_run_object" an.upload_to_statusdb(self.completed) - mock_upload_to_statusdb.assert_called_once_with('Stan_run_object') + mock_upload_to_statusdb.assert_called_once_with("Stan_run_object") - @mock.patch('taca.analysis.analysis.statusdb') + @mock.patch("taca.analysis.analysis.statusdb") def test__upload_to_statusdb(self, mock_statusdb): """Upload to statusdb.""" - run = os.path.join(self.tmp_dir, '141124_ST-NOINDEX1_01_AFCIDYX') + run = os.path.join(self.tmp_dir, "141124_ST-NOINDEX1_01_AFCIDYX") os.mkdir(run) - shutil.copy('data/runParameters_minimal.xml', os.path.join(run, 'runParameters.xml')) - demux_dir = os.path.join(run, 'Demultiplexing', 'Stats') + shutil.copy( + "data/runParameters_minimal.xml", os.path.join(run, "runParameters.xml") + ) + demux_dir = os.path.join(run, "Demultiplexing", "Stats") os.makedirs(demux_dir) - shutil.copy('data/DemuxSummaryF1L1.txt', demux_dir) - reports_dir = os.path.join(run, 'Demultiplexing', 'Reports', 'html', 'FCIDYX', 'all', 'all', 'all') + shutil.copy("data/DemuxSummaryF1L1.txt", demux_dir) + reports_dir = os.path.join( + run, "Demultiplexing", "Reports", "html", "FCIDYX", "all", "all", 
"all" + ) os.makedirs(reports_dir) - shutil.copy('data/laneBarcode.html', (reports_dir)) - shutil.copy('data/lane.html', (reports_dir)) + shutil.copy("data/laneBarcode.html", (reports_dir)) + shutil.copy("data/lane.html", (reports_dir)) noindex_run = an.get_runObj(run) an._upload_to_statusdb(noindex_run) mock_statusdb.update_doc.assert_called_once() - @mock.patch('taca.analysis.analysis.Standard_Run.transfer_run') + @mock.patch("taca.analysis.analysis.Standard_Run.transfer_run") def test_transfer_run(self, mock_transfer_run): """Transfer run to Uppmax.""" - run_dir = (self.completed) + run_dir = self.completed an.transfer_run(run_dir) - mock_transfer_run.assert_called_once_with('nosync/data/transfer.tsv', 'some_user@some_email.com') - - @mock.patch('taca.analysis.analysis.RsyncAgent.transfer') - @mock.patch('taca.analysis.analysis.subprocess.call') - @mock.patch('taca.analysis.analysis.os.remove') - @mock.patch('taca.analysis.analysis.open') - def test_transfer_runfolder(self, mock_open, mock_remove, mock_subprocess_call, mock_transfer): + mock_transfer_run.assert_called_once_with( + "nosync/data/transfer.tsv", "some_user@some_email.com" + ) + + @mock.patch("taca.analysis.analysis.RsyncAgent.transfer") + @mock.patch("taca.analysis.analysis.subprocess.call") + @mock.patch("taca.analysis.analysis.os.remove") + @mock.patch("taca.analysis.analysis.open") + def test_transfer_runfolder( + self, mock_open, mock_remove, mock_subprocess_call, mock_transfer + ): """Transfer runfolder to uppmax.""" - run_dir = (self.completed) - pid = 'P1775' - exclude_lane = '' + run_dir = self.completed + pid = "P1775" + exclude_lane = "" an.transfer_runfolder(run_dir, pid, exclude_lane) mock_subprocess_call.assert_called() mock_transfer.assert_called() def test_extract_project_samplesheet(self): """Extract project specific lines from sample sheet.""" - sample_sheet = 'data/samplesheet.csv' - pid = 'P1775' + sample_sheet = "data/samplesheet.csv" + pid = "P1775" samplesheet_content = an.extract_project_samplesheet(sample_sheet, pid) expected_samplesheet_content = """Lane,SampleID,SampleName,SamplePlate,SampleWell,index,Project 1,Sample_P1775_147,P1775_147,FCB_150423,1:1,GAATTCGT,J_Lundeberg_14_24 """ self.assertEqual(samplesheet_content, expected_samplesheet_content) - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - def test_run_preprocessing_sequencing(self, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + def test_run_preprocessing_sequencing( + self, mock_upload_to_statusdb, mock_get_run_status + ): """Run preprocess run still sequencing.""" run = self.completed - mock_get_run_status.return_value = 'SEQUENCING' + mock_get_run_status.return_value = "SEQUENCING" an.run_preprocessing(run, force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.demultiplex_run') - def test_run_preprocessing_to_start(self, mock_demultiplex_run, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.demultiplex_run") + def test_run_preprocessing_to_start( + self, 
mock_demultiplex_run, mock_upload_to_statusdb, mock_get_run_status + ): """Run preprocessing start demux.""" run = self.completed - mock_get_run_status.return_value = 'TO_START' + mock_get_run_status.return_value = "TO_START" an.run_preprocessing(run, force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() mock_demultiplex_run.assert_called_once() - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.check_run_status') - def test_run_preprocessing_in_progress(self, mock_check_run_status, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.check_run_status") + def test_run_preprocessing_in_progress( + self, mock_check_run_status, mock_upload_to_statusdb, mock_get_run_status + ): """Run preprocessing demux in progress.""" run = self.completed - mock_get_run_status.return_value = 'IN_PROGRESS' + mock_get_run_status.return_value = "IN_PROGRESS" an.run_preprocessing(run, force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() mock_check_run_status.assert_called_once() - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.send_mail') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.transfer_run') - @mock.patch('taca.analysis.analysis.os.mkdir') - @mock.patch('taca.analysis.analysis.copyfile') - def test_run_preprocessing_completed(self, mock_copy, mock_mkdir, mock_transfer_run, mock_send_mail, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.send_mail") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.transfer_run") + @mock.patch("taca.analysis.analysis.os.mkdir") + @mock.patch("taca.analysis.analysis.copyfile") + def test_run_preprocessing_completed( + self, + mock_copy, + mock_mkdir, + mock_transfer_run, + mock_send_mail, + mock_upload_to_statusdb, + mock_get_run_status, + ): """Run preprocessing demux completed.""" run = self.completed - mock_get_run_status.return_value = 'COMPLETED' + mock_get_run_status.return_value = "COMPLETED" an.run_preprocessing(run, force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() - message = 'The run 141124_ST-COMPLETED1_01_AFCIDXX has been demultiplexed.\n The Run will be transferred to the analysis cluster for further analysis.\n\n \ - The run is available at : https://genomics-status.scilifelab.se/flowcells/141124_ST-COMPLETED1_01_AFCIDXX\n\n ' - mock_send_mail.assert_called_once_with(message, rcp='some_user@some_email.com') - mock_transfer_run.assert_called_once_with('data/transfer.tsv', 'some_user@some_email.com') + message = "The run 141124_ST-COMPLETED1_01_AFCIDXX has been demultiplexed.\n The Run will be transferred to the analysis cluster for further analysis.\n\n \ + The run is available at : https://genomics-status.scilifelab.se/flowcells/141124_ST-COMPLETED1_01_AFCIDXX\n\n " + mock_send_mail.assert_called_once_with(message, rcp="some_user@some_email.com") + mock_transfer_run.assert_called_once_with( + "data/transfer.tsv", "some_user@some_email.com" + ) diff --git 
a/tests/test_analysis_nanopore.py b/tests/test_analysis_nanopore.py index f1b02676..bb8ac35a 100644 --- a/tests/test_analysis_nanopore.py +++ b/tests/test_analysis_nanopore.py @@ -6,62 +6,81 @@ from taca.nanopore.minion import MinIONqc from taca.utils import config as conf -CONFIG = conf.load_yaml_config('data/taca_test_nanopore_cfg.yaml') +CONFIG = conf.load_yaml_config("data/taca_test_nanopore_cfg.yaml") + class TestNanoporeAnalysis(unittest.TestCase): def test_find_runs_to_process(self): """Find all expected nanopore runs to process.""" - expected_dirs = ["data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2", - "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2", - "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2", - "data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2", - "data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2", - "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2"] - nanopore_data_dir = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('data_dir') - skip_dirs = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('ignore_dirs') + expected_dirs = [ + "data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2", + "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2", + "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2", + "data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2", + "data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2", + "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2", + ] + nanopore_data_dir = ( + CONFIG.get("nanopore_analysis").get("minion_qc_run").get("data_dir") + ) + skip_dirs = ( + CONFIG.get("nanopore_analysis").get("minion_qc_run").get("ignore_dirs") + ) found_dirs = find_minion_runs(nanopore_data_dir, skip_dirs) self.assertEqual(sorted(found_dirs), sorted(expected_dirs)) - @mock.patch('taca.analysis.analysis_nanopore.os.path.isfile') - @mock.patch('taca.nanopore.minion.MinIONqc.start_nanoseq') + @mock.patch("taca.analysis.analysis_nanopore.os.path.isfile") + @mock.patch("taca.nanopore.minion.MinIONqc.start_nanoseq") def test_process_minion_run_start_analysis(self, mock_start, mock_isfile): """Start nanoseq analysis for minion.""" - nanoseq_sample_sheet = 'data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/SQK-LSK109_sample_sheet.csv' - anglerfish_sample_sheet = 'some/path' + nanoseq_sample_sheet = "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/SQK-LSK109_sample_sheet.csv" + anglerfish_sample_sheet = "some/path" mock_isfile.return_value = True - run_dir = 'data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2' + run_dir = "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2" minion_run = MinIONqc(run_dir, nanoseq_sample_sheet, anglerfish_sample_sheet) process_minion_qc_run(minion_run) mock_start.assert_called_once() - @mock.patch('taca.nanopore.minion.MinIONqc.copy_results_for_lims') - @mock.patch('taca.nanopore.minion.Nanopore.transfer_run') - @mock.patch('taca.nanopore.minion.Nanopore.update_transfer_log') - @mock.patch('taca.nanopore.minion.Nanopore.archive_run') - @mock.patch('taca.analysis.analysis_nanopore.send_mail') - def test_process_minion_run_transfer(self, mock_mail, mock_archive, mock_update, 
mock_transfer, mock_cp): + @mock.patch("taca.nanopore.minion.MinIONqc.copy_results_for_lims") + @mock.patch("taca.nanopore.minion.Nanopore.transfer_run") + @mock.patch("taca.nanopore.minion.Nanopore.update_transfer_log") + @mock.patch("taca.nanopore.minion.Nanopore.archive_run") + @mock.patch("taca.analysis.analysis_nanopore.send_mail") + def test_process_minion_run_transfer( + self, mock_mail, mock_archive, mock_update, mock_transfer, mock_cp + ): """Start transfer of run directory.""" mock_transfer.return_value = True mock_cp.return_value = True - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - minion_run = MinIONqc(run_dir, 'dummy/path', None) + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + minion_run = MinIONqc(run_dir, "dummy/path", None) process_minion_qc_run(minion_run) - expected_calls = [mock.call('Anglerfish successfully processed run 20200104_1412_MN19414_AAU644_68125dc2', - 'Anglerfish has successfully finished for run 20200104_1412_MN19414_AAU644_68125dc2. Please finish the QC step in lims.', - 'test@test.com'), - mock.call('Run successfully processed: 20200104_1412_MN19414_AAU644_68125dc2', - 'Run 20200104_1412_MN19414_AAU644_68125dc2 has been analysed, transferred and archived successfully.', - 'test@test.com')] + expected_calls = [ + mock.call( + "Anglerfish successfully processed run 20200104_1412_MN19414_AAU644_68125dc2", + "Anglerfish has successfully finished for run 20200104_1412_MN19414_AAU644_68125dc2. Please finish the QC step in lims.", + "test@test.com", + ), + mock.call( + "Run successfully processed: 20200104_1412_MN19414_AAU644_68125dc2", + "Run 20200104_1412_MN19414_AAU644_68125dc2 has been analysed, transferred and archived successfully.", + "test@test.com", + ), + ] mock_mail.assert_has_calls(expected_calls) - @mock.patch('taca.analysis.analysis_nanopore.send_mail') + @mock.patch("taca.analysis.analysis_nanopore.send_mail") def test_process_minion_run_fail_analysis(self, mock_mail): """Send email to operator if nanoseq analysis failed.""" - run_dir = 'data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2' + run_dir = ( + "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2" + ) minion_run = MinIONqc(run_dir, None, None) minion_run.qc_run = True process_minion_qc_run(minion_run) - email_subject = ('Analysis failed for run 20200108_1412_MN19414_AAU648_68125dc2') - email_message = f'The nanoseq analysis failed for run {minion_run.run_id}.' - email_recipients = 'test@test.com' - mock_mail.assert_called_once_with(email_subject, email_message, email_recipients) + email_subject = "Analysis failed for run 20200108_1412_MN19414_AAU648_68125dc2" + email_message = f"The nanoseq analysis failed for run {minion_run.run_id}." 
+ email_recipients = "test@test.com" + mock_mail.assert_called_once_with( + email_subject, email_message, email_recipients + ) diff --git a/tests/test_backup.py b/tests/test_backup.py index f1addeb9..204761e9 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -8,7 +8,7 @@ from taca.backup import backup from taca.utils import config as conf -CONFIG = conf.load_yaml_config('data/taca_test_cfg_backup.yaml') +CONFIG = conf.load_yaml_config("data/taca_test_cfg_backup.yaml") class TestRunVars(unittest.TestCase): @@ -16,12 +16,18 @@ class TestRunVars(unittest.TestCase): def test_backup_variables(self): """Set up backup variables.""" - run_variables = backup.run_vars('data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(run_variables.name, '190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(run_variables.zip, '190201_A00621_0032_BHHFCFDSXX.tar.gz') - self.assertEqual(run_variables.key, '190201_A00621_0032_BHHFCFDSXX.key') - self.assertEqual(run_variables.key_encrypted, '190201_A00621_0032_BHHFCFDSXX.key.gpg') - self.assertEqual(run_variables.zip_encrypted, '190201_A00621_0032_BHHFCFDSXX.tar.gz.gpg') + run_variables = backup.run_vars( + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX" + ) + self.assertEqual(run_variables.name, "190201_A00621_0032_BHHFCFDSXX") + self.assertEqual(run_variables.zip, "190201_A00621_0032_BHHFCFDSXX.tar.gz") + self.assertEqual(run_variables.key, "190201_A00621_0032_BHHFCFDSXX.key") + self.assertEqual( + run_variables.key_encrypted, "190201_A00621_0032_BHHFCFDSXX.key.gpg" + ) + self.assertEqual( + run_variables.zip_encrypted, "190201_A00621_0032_BHHFCFDSXX.tar.gz.gpg" + ) class TestBackupUtils(unittest.TestCase): @@ -29,81 +35,101 @@ class TestBackupUtils(unittest.TestCase): def test_fetch_config_info(self): """Get backup info from config.""" - config_info = backup.backup_utils('data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(config_info.data_dirs, {'miseq': 'data/nas/miseq.lab'}) - self.assertEqual(config_info.archive_dirs, {'hiseq': 'blah', 'miseq': 'data/nas/miseq.lab/nosync'}) - self.assertEqual(config_info.keys_path, 'data/nas/run_keys') - self.assertEqual(config_info.gpg_receiver, 'some.user') - self.assertEqual(config_info.mail_recipients, 'some_user@some_email.com') + config_info = backup.backup_utils( + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX" + ) + self.assertEqual(config_info.data_dirs, {"miseq": "data/nas/miseq.lab"}) + self.assertEqual( + config_info.archive_dirs, + {"hiseq": "blah", "miseq": "data/nas/miseq.lab/nosync"}, + ) + self.assertEqual(config_info.keys_path, "data/nas/run_keys") + self.assertEqual(config_info.gpg_receiver, "some.user") + self.assertEqual(config_info.mail_recipients, "some_user@some_email.com") self.assertEqual(config_info.check_demux, True) - self.assertEqual(config_info.couch_info, {'url': 'url', 'username': 'username', 'password': 'pwd', 'xten_db': 'x_flowcells'}) + self.assertEqual( + config_info.couch_info, + { + "url": "url", + "username": "username", + "password": "pwd", + "xten_db": "x_flowcells", + }, + ) def test_collect_runs(self): """Get backup runs from archive directories.""" backup_object = backup.backup_utils() - backup_object.collect_runs(ext='.tar.gz', filter_by_ext=True) + backup_object.collect_runs(ext=".tar.gz", filter_by_ext=True) run = backup_object.runs[0].name - self.assertEqual(run, '200201_A00621_0032_BHHFCFDSXY') + self.assertEqual(run, "200201_A00621_0032_BHHFCFDSXY") def test_collect_runs_specific_run(self): """Collect only specific run.""" - 
backup_object = backup.backup_utils(run='data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX') + backup_object = backup.backup_utils( + run="data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX" + ) backup_object.collect_runs() run = backup_object.runs[0].name - self.assertEqual(run, '200201_A00621_0032_BHHFCFDSXX') + self.assertEqual(run, "200201_A00621_0032_BHHFCFDSXX") - missing_object = backup.backup_utils(run='some/missing/path/run') + missing_object = backup.backup_utils(run="some/missing/path/run") with self.assertRaises(SystemExit): missing_object.collect_runs() - @mock.patch('taca.backup.backup.sp.Popen.communicate') - @mock.patch('taca.backup.backup.misc') + @mock.patch("taca.backup.backup.sp.Popen.communicate") + @mock.patch("taca.backup.backup.misc") def test_avail_disk_space(self, mock_misc, mock_sp): """Check backup disk space.""" backup_object = backup.backup_utils() - mock_sp.return_value = ['Filesystem 512-blocks Used Available Capacity iused ifree %iused Mounted on\n/dev/disk1s1 976490576 100 813074776 15% 1086272 4881366608 0% /System/Volumes/Data', None] - path = 'data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX' - run = '190201_A00621_0032_BHHFCFDSXX' + mock_sp.return_value = [ + "Filesystem 512-blocks Used Available Capacity iused ifree %iused Mounted on\n/dev/disk1s1 976490576 100 813074776 15% 1086272 4881366608 0% /System/Volumes/Data", + None, + ] + path = "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX" + run = "190201_A00621_0032_BHHFCFDSXX" with self.assertRaises(SystemExit): backup_object.avail_disk_space(path, run) - @mock.patch('taca.backup.backup.sp.check_call') + @mock.patch("taca.backup.backup.sp.check_call") def test_file_in_pdc(self, mock_call): """Check if files exist in PDC.""" - mock_call.return_value = 'Whatever' + mock_call.return_value = "Whatever" backup_object = backup.backup_utils() - src_file = 'data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt' + src_file = "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt" self.assertTrue(backup_object.file_in_pdc(src_file, silent=True)) def test_get_run_type(self): """Get run types from flowcell names.""" backup_object = backup.backup_utils() - novaseq_run = backup_object._get_run_type('190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(novaseq_run, 'novaseq') - hiseqx_run = backup_object._get_run_type('190711_ST-E00266_0356_AH2L32CCX2') - self.assertEqual(hiseqx_run, 'hiseqx') - miseq_run = backup_object._get_run_type('200604_M01320_0329_000000000-J668J') - self.assertEqual(miseq_run, 'miseq') - hiseq_run = backup_object._get_run_type('190628_D00415_0465_BH2HVYBCX3') - self.assertEqual(hiseq_run, 'hiseq') - nextseq_run = backup_object._get_run_type('200602_NS500688_0656_AHGCKWBGXF') - self.assertEqual(nextseq_run, 'nextseq') + novaseq_run = backup_object._get_run_type("190201_A00621_0032_BHHFCFDSXX") + self.assertEqual(novaseq_run, "novaseq") + hiseqx_run = backup_object._get_run_type("190711_ST-E00266_0356_AH2L32CCX2") + self.assertEqual(hiseqx_run, "hiseqx") + miseq_run = backup_object._get_run_type("200604_M01320_0329_000000000-J668J") + self.assertEqual(miseq_run, "miseq") + hiseq_run = backup_object._get_run_type("190628_D00415_0465_BH2HVYBCX3") + self.assertEqual(hiseq_run, "hiseq") + nextseq_run = backup_object._get_run_type("200602_NS500688_0656_AHGCKWBGXF") + self.assertEqual(nextseq_run, "nextseq") def test_call_commands(self): """Call expernal backup command.""" backup_object = backup.backup_utils() - got_output = backup_object._call_commands(cmd1='ls 
data/nas/miseq.lab', mail_failed=False, return_out=True) - expected_output = (True, b'190201_A00621_0032_BHHFCFDSXX\nnosync\n') + got_output = backup_object._call_commands( + cmd1="ls data/nas/miseq.lab", mail_failed=False, return_out=True + ) + expected_output = (True, b"190201_A00621_0032_BHHFCFDSXX\nnosync\n") self.assertEqual(got_output, expected_output) def test_call_commands_double(self): """Call external backup command, given two commands.""" backup_object = backup.backup_utils() - tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') - tmp_file = os.path.join(tmp_dir, 'output.out') + tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") + tmp_file = os.path.join(tmp_dir, "output.out") os.makedirs(tmp_dir) - cmd1 = 'ls data/nas/miseq.lab' - cmd2 = 'ls data/nas/miseq.lab' + cmd1 = "ls data/nas/miseq.lab" + cmd2 = "ls data/nas/miseq.lab" backup_object._call_commands(cmd1, cmd2, out_file=tmp_file, mail_failed=False) self.assertTrue(os.path.isfile(tmp_file)) shutil.rmtree(tmp_dir) @@ -111,49 +137,62 @@ def test_call_commands_double(self): def test_check_status(self): """Check subprocess status.""" backup_object = backup.backup_utils() - cmd = 'ls' + cmd = "ls" status_pass = 0 - err_msg = 'Error' - got_status_pass = backup_object._check_status(cmd, status_pass, err_msg, mail_failed=False) + err_msg = "Error" + got_status_pass = backup_object._check_status( + cmd, status_pass, err_msg, mail_failed=False + ) self.assertTrue(got_status_pass) status_fail = 1 - got_status_fail = backup_object._check_status(cmd, status_fail, err_msg, mail_failed=False) + got_status_fail = backup_object._check_status( + cmd, status_fail, err_msg, mail_failed=False + ) self.assertFalse(got_status_fail) - @mock.patch('taca.backup.backup.os.remove') + @mock.patch("taca.backup.backup.os.remove") def test_clean_tmp_files(self, mock_remove): """Remove file if it exist.""" backup_object = backup.backup_utils() - files = ['data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt', 'data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/missing_file.txt'] + files = [ + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt", + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/missing_file.txt", + ] backup_object._clean_tmp_files(files) - mock_remove.assert_called_once_with('data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt') + mock_remove.assert_called_once_with( + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt" + ) - @mock.patch('taca.backup.backup.statusdb', autospec=True) - @mock.patch('taca.backup.backup.logger') + @mock.patch("taca.backup.backup.statusdb", autospec=True) + @mock.patch("taca.backup.backup.logger") def test_log_pdc_statusdb(self, mock_logger, mock_couch): """Update statusdb if transfer was successful.""" backup_object = backup.backup_utils() - run = '190201_A00621_0032_BHHFCFDSXX' + run = "190201_A00621_0032_BHHFCFDSXX" backup_object._log_pdc_statusdb(run) mock_logger.warn.assert_called_once() - @mock.patch('taca.backup.backup.backup_utils._call_commands', return_value=True) - @mock.patch('taca.backup.backup.shutil') - @mock.patch('taca.backup.backup.backup_utils._clean_tmp_files') - @mock.patch('taca.backup.backup.backup_utils.avail_disk_space') + @mock.patch("taca.backup.backup.backup_utils._call_commands", return_value=True) + @mock.patch("taca.backup.backup.shutil") + @mock.patch("taca.backup.backup.backup_utils._clean_tmp_files") + @mock.patch("taca.backup.backup.backup_utils.avail_disk_space") def test_encrypt_runs(self, mock_space, mock_clean, 
mock_shutil, mock_command): """Encrypt found runs.""" - backup_object = backup.backup_utils(run='data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX') - run = 'data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX' + backup_object = backup.backup_utils( + run="data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX" + ) + run = "data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX" force = True backup_object.encrypt_runs(run, force) mock_clean.assert_called_once() - os.remove('data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX.encrypting') + os.remove("data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX.encrypting") - @mock.patch('taca.backup.backup.logger.error') + @mock.patch("taca.backup.backup.logger.error") def test_pdc_put(self, mock_logger): """Put runs on PDC.""" - backup_object = backup.backup_utils(run='data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX') - run = 'data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX' + backup_object = backup.backup_utils( + run="data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX" + ) + run = "data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX" backup_object.pdc_put(run) mock_logger.assert_called_once() diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index 2b4f365a..c5a0d53f 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -10,44 +10,48 @@ from taca.cleanup import cleanup from taca.utils import config as conf -CONFIG = conf.load_yaml_config('data/taca_test_cfg_cleanup.yaml') +CONFIG = conf.load_yaml_config("data/taca_test_cfg_cleanup.yaml") class TestCleanup(unittest.TestCase): """Tests for TACA Cleanup module.""" - @mock.patch('taca.cleanup.cleanup.shutil.move') - @mock.patch('taca.cleanup.cleanup.os.listdir') + @mock.patch("taca.cleanup.cleanup.shutil.move") + @mock.patch("taca.cleanup.cleanup.os.listdir") def test_cleanup_nas(self, mock_listdir, mock_move): """Locate and move old data on NAS.""" seconds = 1 - run = '190201_A00621_0032_BHHFCFDSXX' + run = "190201_A00621_0032_BHHFCFDSXX" mock_listdir.return_value = [run] cleanup.cleanup_nas(seconds) - mock_move.assert_called_once_with(run, 'nosync') + mock_move.assert_called_once_with(run, "nosync") - @mock.patch('taca.cleanup.cleanup.shutil.rmtree') - @mock.patch('taca.cleanup.cleanup.os.listdir') + @mock.patch("taca.cleanup.cleanup.shutil.rmtree") + @mock.patch("taca.cleanup.cleanup.os.listdir") def test_cleanup_processing(self, mock_listdir, mock_rmtree): """Locate and move old data on preproc.""" seconds = 1 - run = '190201_A00621_0032_BHHFCFDSXY' + run = "190201_A00621_0032_BHHFCFDSXY" mock_listdir.return_value = [run] cleanup.cleanup_processing(seconds) mock_rmtree.assert_called_once_with(run) - @mock.patch('taca.cleanup.cleanup.statusdb') - @mock.patch('taca.cleanup.cleanup.get_closed_proj_info') - @mock.patch('taca.cleanup.cleanup.misc.query_yes_no') - @mock.patch('taca.cleanup.cleanup._remove_files') - @mock.patch('taca.cleanup.cleanup._touch_cleaned') - def test_cleanup_miarka(self, mock_touch, mock_rm, mock_query, mock_info, mock_statusdb): + @mock.patch("taca.cleanup.cleanup.statusdb") + @mock.patch("taca.cleanup.cleanup.get_closed_proj_info") + @mock.patch("taca.cleanup.cleanup.misc.query_yes_no") + @mock.patch("taca.cleanup.cleanup._remove_files") + @mock.patch("taca.cleanup.cleanup._touch_cleaned") + def test_cleanup_miarka( + self, mock_touch, mock_rm, mock_query, mock_info, mock_statusdb + ): """Locate and move old data on Miarka.""" - mock_info.return_value = {'closed_date': '2019-04-07', - 'bioinfo_responsible': 
'O.B. One', - 'pid': 'P1234', - 'name': 'N.Owens_19_01', - 'closed_days': 5} + mock_info.return_value = { + "closed_date": "2019-04-07", + "bioinfo_responsible": "O.B. One", + "pid": "P1234", + "name": "N.Owens_19_01", + "closed_days": 5, + } mock_query.return_value = True mock_rm.return_value = True days_fastq = 1 @@ -55,71 +59,102 @@ def test_cleanup_miarka(self, mock_touch, mock_rm, mock_query, mock_info, mock_ only_fastq = False only_analysis = False clean_undetermined = False - status_db_config = 'data/taca_test_cfg_cleanup.yaml' + status_db_config = "data/taca_test_cfg_cleanup.yaml" exclude_projects = False list_only = False - date = '2016-01-31' - calls = [mock.call('data/miarka/incoming/190201_A00621_0032_BHHFCFDSXX/Demultiplexing/N.Owens_19_01'), - mock.call('../../nobackup/NGI/ANALYSIS/P1234')] - cleanup.cleanup_miarka(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False) + date = "2016-01-31" + calls = [ + mock.call( + "data/miarka/incoming/190201_A00621_0032_BHHFCFDSXX/Demultiplexing/N.Owens_19_01" + ), + mock.call("../../nobackup/NGI/ANALYSIS/P1234"), + ] + cleanup.cleanup_miarka( + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + status_db_config, + exclude_projects, + list_only, + date, + dry_run=False, + ) mock_touch.assert_has_calls(calls) def test_get_closed_proj_info(self): """Return a dict if project is closed.""" - pid = 'P1234' - pdoc = {'close_date': '2019-04-07', - 'project_name': 'A.Name_19_01', - 'project_id': 'P1234', - 'project_summary': {'bioinfo_responsible': 'O.B. One'}} - tdate = datetime.strptime('2019-04-08', '%Y-%m-%d') + pid = "P1234" + pdoc = { + "close_date": "2019-04-07", + "project_name": "A.Name_19_01", + "project_id": "P1234", + "project_summary": {"bioinfo_responsible": "O.B. One"}, + } + tdate = datetime.strptime("2019-04-08", "%Y-%m-%d") got_data = cleanup.get_closed_proj_info(pid, pdoc, tdate) - expected_data = {'closed_date': '2019-04-07', - 'bioinfo_responsible': b'O.B. One', - 'pid': 'P1234', - 'name': 'A.Name_19_01', - 'closed_days': 1} + expected_data = { + "closed_date": "2019-04-07", + "bioinfo_responsible": b"O.B. 
One", + "pid": "P1234", + "name": "A.Name_19_01", + "closed_days": 1, + } self.assertEqual(got_data, expected_data) def test_collect_analysis_data_miarka(self): """Get analysis data on Miarka.""" - pid = 'P1234' - analysis_root = 'data/test_data/analysis' - file_list, size = cleanup.collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}) - self.assertEqual(file_list, 'cleaned') + pid = "P1234" + analysis_root = "data/test_data/analysis" + file_list, size = cleanup.collect_analysis_data_miarka( + pid, analysis_root, files_ext_to_remove={} + ) + self.assertEqual(file_list, "cleaned") def test_collect_fastq_data_miarka(self): """Collect removed files.""" - fc_root = 'data/test_data/190201_A00621_0032_BHHFCFDSXX' - fc_proj_src = 'N.Owens_19_01' + fc_root = "data/test_data/190201_A00621_0032_BHHFCFDSXX" + fc_proj_src = "N.Owens_19_01" file_list, size = cleanup.collect_fastq_data_miarka(fc_root, fc_proj_src) - expected_data = {'flowcells': - {'190201_A00621_0032_BHHFCFDSXX': - {'proj_root': 'data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01', - 'fq_files': ['data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample1.fastq.gz', - 'data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample2.fastq.gz']}}} + expected_data = { + "flowcells": { + "190201_A00621_0032_BHHFCFDSXX": { + "proj_root": "data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01", + "fq_files": [ + "data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample1.fastq.gz", + "data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample2.fastq.gz", + ], + } + } + } self.assertEqual(file_list, expected_data) self.assertEqual(size, 0) def test_collect_files_by_ext(self): """Return found paths.""" - path = 'data/test_data' - ext = ['*.txt'] + path = "data/test_data" + ext = ["*.txt"] found_files = cleanup.collect_files_by_ext(path, ext) - expected_files = ['data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY/RTAComplete.txt', - 'data/test_data/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt'] + expected_files = [ + "data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY/RTAComplete.txt", + "data/test_data/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt", + ] self.assertEqual(found_files, expected_files) def test_get_proj_meta_info(self): """Get project metadata.""" - info = {'name': 'Nobody Owens', - 'pid': 'P1234', - 'bioinfo_responsible': 'O.B. One', - 'closed_days': 1, - 'closed_date': '2020-04-07', - 'fastq_size': 1001} - days_fastq = '' + info = { + "name": "Nobody Owens", + "pid": "P1234", + "bioinfo_responsible": "O.B. One", + "closed_days": 1, + "closed_date": "2020-04-07", + "fastq_size": 1001, + } + days_fastq = "" got_data = cleanup.get_proj_meta_info(info, days_fastq) - expected_data = ''' + expected_data = """ Project overview: Nobody Owens Project ID: P1234 Bioinfo Responsible: O.B. 
One @@ -127,42 +162,44 @@ def test_get_proj_meta_info(self): Closed from (date): 2020-04-07 Project analysis: No analysis directory Estimated data size: ~2kb -''' +""" self.assertEqual(got_data, expected_data) def test_get_files_size_text(self): """Format file size string.""" - plist = {'P1': {'fastq_size': 1001, 'analysis_size': 1000000}, - 'P2': {'fastq_size': 1001, 'analysis_size': 1000000}} + plist = { + "P1": {"fastq_size": 1001, "analysis_size": 1000000}, + "P2": {"fastq_size": 1001, "analysis_size": 1000000}, + } got_data = cleanup.get_files_size_text(plist) - expected_data = '(~~2kb fastq data and ~~2mb analysis data) ' + expected_data = "(~~2kb fastq data and ~~2mb analysis data) " self.assertEqual(got_data, expected_data) def test_def_get_size_unit(self): """Convert size.""" - #function broken if size < 1000 + # function broken if size < 1000 size = 1001 - self.assertEqual(cleanup._def_get_size_unit(size), '~1kb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1kb") size *= 1000 - self.assertEqual(cleanup._def_get_size_unit(size), '~1mb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1mb") size *= 1000 - self.assertEqual(cleanup._def_get_size_unit(size), '~1gb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1gb") size *= 1000 - self.assertEqual(cleanup._def_get_size_unit(size), '~1tb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1tb") - @mock.patch('taca.cleanup.cleanup.os.remove') + @mock.patch("taca.cleanup.cleanup.os.remove") def test_remove_files(self, mock_remove): """Remove files in given list.""" - files = ['file1', 'file2'] + files = ["file1", "file2"] cleanup._remove_files(files) - calls = [mock.call('file1'), mock.call('file2')] + calls = [mock.call("file1"), mock.call("file2")] mock_remove.assert_has_calls(calls) def test_touch_cleaned(self): """Create empty file in specified dir.""" - tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') + tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") os.makedirs(tmp_dir) cleanup._touch_cleaned(tmp_dir) - expected_file = os.path.join(tmp_dir, 'cleaned') + expected_file = os.path.join(tmp_dir, "cleaned") self.assertTrue(os.path.exists(expected_file)) shutil.rmtree(tmp_dir) diff --git a/tests/test_illumina.py b/tests/test_illumina.py index e26a48e5..d052bb89 100644 --- a/tests/test_illumina.py +++ b/tests/test_illumina.py @@ -23,14 +23,15 @@ # This is only run if TACA is called from the CLI, as this is a test, we need to # call it explicitely -CONFIG = conf.load_yaml_config('data/taca_test_cfg.yaml') +CONFIG = conf.load_yaml_config("data/taca_test_cfg.yaml") class TestRuns(unittest.TestCase): """Tests for the Run base class.""" + @classmethod def setUpClass(self): - """ Creates the following directory tree for testing purposes: + """Creates the following directory tree for testing purposes: tmp/ |__ 141124_ST-COMPLETED_01_AFCIDXX @@ -86,107 +87,277 @@ def setUpClass(self): | |__lots of files |__ archive """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') - self.transfer_file = os.path.join(self.tmp_dir, 'transfer.tsv') - - running = os.path.join(self.tmp_dir, '141124_ST-RUNNING1_03_AFCIDXX') - to_start = os.path.join(self.tmp_dir, '141124_ST-TOSTART1_04_FCIDXXX') - in_progress = os.path.join(self.tmp_dir, '141124_ST-INPROGRESS1_02_AFCIDXX') - in_progress_done = os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX') - completed = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX') - dummy = os.path.join(self.tmp_dir, '141124_ST-DUMMY1_01_AFCIDXX') - complex_run_dir = 
os.path.join(self.tmp_dir, '141124_ST-COMPLEX1_01_AFCIDXX') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") + self.transfer_file = os.path.join(self.tmp_dir, "transfer.tsv") + + running = os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX") + to_start = os.path.join(self.tmp_dir, "141124_ST-TOSTART1_04_FCIDXXX") + in_progress = os.path.join(self.tmp_dir, "141124_ST-INPROGRESS1_02_AFCIDXX") + in_progress_done = os.path.join( + self.tmp_dir, "141124_ST-INPROGRESSDONE1_02_AFCIDXX" + ) + completed = os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX") + dummy = os.path.join(self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX") + complex_run_dir = os.path.join(self.tmp_dir, "141124_ST-COMPLEX1_01_AFCIDXX") finished_runs = [to_start, in_progress, in_progress_done, completed] # Create runs directory structure os.makedirs(self.tmp_dir) os.makedirs(running) os.makedirs(to_start) - os.makedirs(os.path.join(in_progress, 'Demultiplexing')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_1')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_2')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_3')) - os.makedirs(os.path.join(in_progress_done, 'Demultiplexing')) - os.makedirs(os.path.join(in_progress_done, 'Demultiplexing_0/Stats')) - os.makedirs(os.path.join(completed, 'Demultiplexing', 'Stats')) + os.makedirs(os.path.join(in_progress, "Demultiplexing")) + os.makedirs( + os.path.join( + in_progress, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ) + ) + os.makedirs(os.path.join(in_progress, "Demultiplexing_1")) + os.makedirs(os.path.join(in_progress, "Demultiplexing_2")) + os.makedirs(os.path.join(in_progress, "Demultiplexing_3")) + os.makedirs(os.path.join(in_progress_done, "Demultiplexing")) + os.makedirs(os.path.join(in_progress_done, "Demultiplexing_0/Stats")) + os.makedirs(os.path.join(completed, "Demultiplexing", "Stats")) os.makedirs(dummy) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing')) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing_0', 'Stats')) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing_1', 'Stats')) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001')) - os.makedirs(os.path.join(complex_run_dir,'Demultiplexing_0', 'Reports', 'html','FCIDXX', 'all', 'all', 'all')) - os.makedirs(os.path.join(complex_run_dir,'Demultiplexing_1', 'Reports', 'html','FCIDXX', 'all', 'all', 'all')) + os.makedirs(os.path.join(complex_run_dir, "Demultiplexing")) + os.makedirs(os.path.join(complex_run_dir, "Demultiplexing_0", "Stats")) + os.makedirs(os.path.join(complex_run_dir, "Demultiplexing_1", "Stats")) + os.makedirs( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "N__One_20_01", + "Sample_P12345_1001", + ) + ) + os.makedirs( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ) + ) + os.makedirs( + os.path.join( + complex_run_dir, + "Demultiplexing_1", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ) + ) # Create files indicating that the run is finished for run in finished_runs: - open(os.path.join(run, 'RTAComplete.txt'), 'w').close() + open(os.path.join(run, "RTAComplete.txt"), "w").close() # Create sample sheets for running demultiplexing - open(os.path.join(in_progress, 'SampleSheet_0.csv'), 'w').close() - 
open(os.path.join(in_progress, 'SampleSheet_1.csv'), 'w').close() - open(os.path.join(in_progress, 'SampleSheet_2.csv'), 'w').close() - open(os.path.join(in_progress, 'SampleSheet_3.csv'), 'w').close() - open(os.path.join(in_progress_done, 'SampleSheet_0.csv'), 'w').close() - shutil.copy('data/samplesheet.csv', os.path.join(completed, 'SampleSheet.csv')) - shutil.copy('data/samplesheet.csv', os.path.join(complex_run_dir, 'SampleSheet_0.csv')) - shutil.copy('data/samplesheet.csv', os.path.join(complex_run_dir, 'SampleSheet_1.csv')) + open(os.path.join(in_progress, "SampleSheet_0.csv"), "w").close() + open(os.path.join(in_progress, "SampleSheet_1.csv"), "w").close() + open(os.path.join(in_progress, "SampleSheet_2.csv"), "w").close() + open(os.path.join(in_progress, "SampleSheet_3.csv"), "w").close() + open(os.path.join(in_progress_done, "SampleSheet_0.csv"), "w").close() + shutil.copy("data/samplesheet.csv", os.path.join(completed, "SampleSheet.csv")) + shutil.copy( + "data/samplesheet.csv", os.path.join(complex_run_dir, "SampleSheet_0.csv") + ) + shutil.copy( + "data/samplesheet.csv", os.path.join(complex_run_dir, "SampleSheet_1.csv") + ) # Create files indicating that demultiplexing is ongoing - open(os.path.join(in_progress_done, 'Demultiplexing_0', 'Stats', 'DemultiplexingStats.xml'), 'w').close() - open(os.path.join(in_progress_done, 'Demultiplexing_0', 'Stats', 'DemuxSummaryF1L1.txt'), 'w').close() - shutil.copy('data/lane.html', os.path.join(in_progress,'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) + open( + os.path.join( + in_progress_done, "Demultiplexing_0", "Stats", "DemultiplexingStats.xml" + ), + "w", + ).close() + open( + os.path.join( + in_progress_done, "Demultiplexing_0", "Stats", "DemuxSummaryF1L1.txt" + ), + "w", + ).close() + shutil.copy( + "data/lane.html", + os.path.join( + in_progress, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) # Create files indicating that the preprocessing is done - open(os.path.join(completed, 'Demultiplexing', 'Stats', 'DemultiplexingStats.xml'), 'w').close() - open(os.path.join(completed, 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz'), 'w').close() - open(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R1_001.fastq.gz'), 'w').close() - open(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R2_001.fastq.gz'), 'w').close() - with open(os.path.join(completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w', encoding="utf-8") as stats_json: - stats_json.write(unicode(json.dumps({'silly': 1}, ensure_ascii=False))) + open( + os.path.join( + completed, "Demultiplexing", "Stats", "DemultiplexingStats.xml" + ), + "w", + ).close() + open( + os.path.join( + completed, "Demultiplexing", "Undetermined_S0_L001_R1_001.fastq.gz" + ), + "w", + ).close() + open( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "N__One_20_01", + "Sample_P12345_1001", + "P16510_1001_S1_L001_R1_001.fastq.gz", + ), + "w", + ).close() + open( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "N__One_20_01", + "Sample_P12345_1001", + "P16510_1001_S1_L001_R2_001.fastq.gz", + ), + "w", + ).close() + with open( + os.path.join(completed, "Demultiplexing", "Stats", "Stats.json"), + "w", + encoding="utf-8", + ) as stats_json: + stats_json.write(unicode(json.dumps({"silly": 1}, ensure_ascii=False))) # Copy transfer file with the completed run - 
shutil.copy('data/test_transfer.tsv', self.transfer_file) + shutil.copy("data/test_transfer.tsv", self.transfer_file) # Move sample RunInfo.xml file to every run directory - for run in [running, to_start, in_progress, in_progress_done, completed, dummy, complex_run_dir]: - shutil.copy('data/RunInfo.xml', run) - shutil.copy('data/runParameters.xml', run) + for run in [ + running, + to_start, + in_progress, + in_progress_done, + completed, + dummy, + complex_run_dir, + ]: + shutil.copy("data/RunInfo.xml", run) + shutil.copy("data/runParameters.xml", run) # Create files for complex case - shutil.copy('data/Stats.json', os.path.join(complex_run_dir, 'Demultiplexing_0', 'Stats', 'Stats.json')) - shutil.copy('data/Stats.json', os.path.join(complex_run_dir, 'Demultiplexing_1', 'Stats', 'Stats.json')) - shutil.copy('data/lane.html', os.path.join(complex_run_dir,'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - shutil.copy('data/lane.html', os.path.join(complex_run_dir,'Demultiplexing_1', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - shutil.copy('data/laneBarcode.html', os.path.join(complex_run_dir,'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - shutil.copy('data/laneBarcode.html', os.path.join(complex_run_dir,'Demultiplexing_1', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) + shutil.copy( + "data/Stats.json", + os.path.join(complex_run_dir, "Demultiplexing_0", "Stats", "Stats.json"), + ) + shutil.copy( + "data/Stats.json", + os.path.join(complex_run_dir, "Demultiplexing_1", "Stats", "Stats.json"), + ) + shutil.copy( + "data/lane.html", + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) + shutil.copy( + "data/lane.html", + os.path.join( + complex_run_dir, + "Demultiplexing_1", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) + shutil.copy( + "data/laneBarcode.html", + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) + shutil.copy( + "data/laneBarcode.html", + os.path.join( + complex_run_dir, + "Demultiplexing_1", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) # Create archive dir - self.archive_dir = os.path.join(self.tmp_dir, 'archive') + self.archive_dir = os.path.join(self.tmp_dir, "archive") os.makedirs(self.archive_dir) # Create run objects - self.running = Standard_Run(os.path.join(self.tmp_dir, - '141124_ST-RUNNING1_03_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.to_start = Run(os.path.join(self.tmp_dir, - '141124_ST-TOSTART1_04_FCIDXXX'), - CONFIG['analysis']['NovaSeq']) - self.in_progress = Standard_Run(os.path.join(self.tmp_dir, - '141124_ST-INPROGRESS1_02_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.in_progress_done = Standard_Run(os.path.join(self.tmp_dir, - '141124_ST-INPROGRESSDONE1_02_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.completed = Run(os.path.join(self.tmp_dir, - '141124_ST-COMPLETED1_01_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.dummy_run = Run(os.path.join(self.tmp_dir, - '141124_ST-DUMMY1_01_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.running = Standard_Run( + os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.to_start = Run( + os.path.join(self.tmp_dir, "141124_ST-TOSTART1_04_FCIDXXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.in_progress = Standard_Run( + os.path.join(self.tmp_dir, "141124_ST-INPROGRESS1_02_AFCIDXX"), + 
CONFIG["analysis"]["NovaSeq"], + ) + self.in_progress_done = Standard_Run( + os.path.join(self.tmp_dir, "141124_ST-INPROGRESSDONE1_02_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.completed = Run( + os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.dummy_run = Run( + os.path.join(self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) self.finished_runs = [self.to_start, self.in_progress, self.completed] - self.complex_run = Run(os.path.join(self.tmp_dir, '141124_ST-COMPLEX1_01_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.complex_run = Run( + os.path.join(self.tmp_dir, "141124_ST-COMPLEX1_01_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) @classmethod def tearDownClass(self): @@ -196,13 +367,13 @@ def test_run_setup(self): """Raise RuntimeError if files are missing.""" # if rundir missing with self.assertRaises(RuntimeError): - Run('missing_dir', CONFIG['analysis']['NovaSeq']) + Run("missing_dir", CONFIG["analysis"]["NovaSeq"]) # if config incomplete with self.assertRaises(RuntimeError): - Run(self.tmp_dir, CONFIG['analysis']['DummySeq']) + Run(self.tmp_dir, CONFIG["analysis"]["DummySeq"]) # if runParameters.xml missing with self.assertRaises(RuntimeError): - Run(self.tmp_dir, CONFIG['analysis']['NovaSeq']) + Run(self.tmp_dir, CONFIG["analysis"]["NovaSeq"]) def test_is_sequencing_done(self): """Is finished should be True only if "RTAComplete.txt" file is present.""" @@ -211,22 +382,24 @@ def test_is_sequencing_done(self): def test_get_run_status(self): """Get the run status based on present files.""" - self.assertEqual('SEQUENCING', self.running.get_run_status()) - self.assertEqual('TO_START', self.to_start.get_run_status()) - self.assertEqual('IN_PROGRESS', self.in_progress.get_run_status()) - self.assertEqual('COMPLETED', self.completed.get_run_status()) + self.assertEqual("SEQUENCING", self.running.get_run_status()) + self.assertEqual("TO_START", self.to_start.get_run_status()) + self.assertEqual("IN_PROGRESS", self.in_progress.get_run_status()) + self.assertEqual("COMPLETED", self.completed.get_run_status()) def test_is_transferred(self): """is_transferred should rely on the info in transfer.tsv.""" - os.makedirs(os.path.join(self.tmp_dir, '141124_ST-DUMMY1_01_AFCIDXX', 'transferring')) + os.makedirs( + os.path.join(self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX", "transferring") + ) self.assertTrue(self.dummy_run.is_transferred(self.transfer_file)) self.assertTrue(self.completed.is_transferred(self.transfer_file)) self.assertFalse(self.running.is_transferred(self.transfer_file)) self.assertFalse(self.to_start.is_transferred(self.transfer_file)) - self.assertFalse(self.in_progress.is_transferred( self.transfer_file)) - self.assertFalse(self.completed.is_transferred('missing_file')) + self.assertFalse(self.in_progress.is_transferred(self.transfer_file)) + self.assertFalse(self.completed.is_transferred("missing_file")) - @mock.patch('taca.illumina.Standard_Runs.Standard_Run._aggregate_demux_results') + @mock.patch("taca.illumina.Standard_Runs.Standard_Run._aggregate_demux_results") def test_check_run_status_done(self, mock_aggregate_demux_results): """Recognize if a demultiplexing run is finished or not.""" self.in_progress.check_run_status() @@ -234,26 +407,26 @@ def test_check_run_status_done(self, mock_aggregate_demux_results): self.in_progress_done.check_run_status() mock_aggregate_demux_results.assert_called_once() - @mock.patch('taca.illumina.Runs.Run.get_run_status') + 
@mock.patch("taca.illumina.Runs.Run.get_run_status") def test_check_run_status_completed(self, mock_status): """Return None if run is finished.""" - mock_status.return_value = 'COMPLETED' + mock_status.return_value = "COMPLETED" self.assertEqual(self.in_progress.check_run_status(), None) def test_get_run_type(self): """Return runtype if set.""" - self.assertEqual('NGI-RUN', self.running.get_run_type()) + self.assertEqual("NGI-RUN", self.running.get_run_type()) self.to_start.run_type = False with self.assertRaises(RuntimeError): self.to_start.get_run_type() def test_get_demux_folder(self): """Return name of demux folder if set.""" - self.assertEqual('Demultiplexing', self.running._get_demux_folder()) + self.assertEqual("Demultiplexing", self.running._get_demux_folder()) def test_get_samplesheet(self): """Return location of sample sheet.""" - self.assertEqual('data/2014/FCIDXX.csv', self.running._get_samplesheet()) + self.assertEqual("data/2014/FCIDXX.csv", self.running._get_samplesheet()) def test_is_demultiplexing_done(self): """Return true if Stats.json exists, else false.""" @@ -270,110 +443,196 @@ def test_generate_per_lane_base_mask(self): with self.assertRaises(RuntimeError): self.dummy_run._generate_per_lane_base_mask() - shutil.copy('data/samplesheet_dummy_run.csv', os.path.join(self.tmp_dir,'141124_ST-DUMMY1_01_AFCIDXX', 'SampleSheet.csv')) - self.dummy_run._set_run_parser_obj(CONFIG['analysis']['NovaSeq']) - expected_mask = {'1': {'Y151I7N3I7N3': - {'base_mask': ['Y151', 'I7N3', 'I7N3'], - 'data': [{'index': 'CGCGCAG', - 'Lane': '1', - 'Sample_ID': 'Sample_P10000_1001', - 'Sample_Project': 'A_Test_18_01', - 'Sample_Name': 'Sample_P10000_1001', - 'index2': 'CTGCGCG'}]}, - 'Y151I7N3N10': - {'base_mask': ['Y151', 'I7N3', 'N10'], - 'data': [{'index': 'AGGTACC', - 'Lane': '1', - 'Sample_ID': 'Sample_P10000_1005', - 'Sample_Project': 'A_Test_18_01', - 'Sample_Name': 'Sample_P10000_1005', - 'index2': ''}]}}} + shutil.copy( + "data/samplesheet_dummy_run.csv", + os.path.join( + self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX", "SampleSheet.csv" + ), + ) + self.dummy_run._set_run_parser_obj(CONFIG["analysis"]["NovaSeq"]) + expected_mask = { + "1": { + "Y151I7N3I7N3": { + "base_mask": ["Y151", "I7N3", "I7N3"], + "data": [ + { + "index": "CGCGCAG", + "Lane": "1", + "Sample_ID": "Sample_P10000_1001", + "Sample_Project": "A_Test_18_01", + "Sample_Name": "Sample_P10000_1001", + "index2": "CTGCGCG", + } + ], + }, + "Y151I7N3N10": { + "base_mask": ["Y151", "I7N3", "N10"], + "data": [ + { + "index": "AGGTACC", + "Lane": "1", + "Sample_ID": "Sample_P10000_1005", + "Sample_Project": "A_Test_18_01", + "Sample_Name": "Sample_P10000_1005", + "index2": "", + } + ], + }, + } + } got_mask = self.dummy_run._generate_per_lane_base_mask() self.assertEqual(expected_mask, got_mask) def test_compute_base_mask(self): """Compute Run base mask.""" - runSetup = [{'IsIndexedRead': 'N', 'NumCycles': '151', 'Number': '1'}, - {'IsIndexedRead': 'Y', 'NumCycles': '8', 'Number': '2'}, - {'IsIndexedRead': 'Y', 'NumCycles': '8', 'Number': '3'}, - {'IsIndexedRead': 'N', 'NumCycles': '151', 'Number': '4'}] + runSetup = [ + {"IsIndexedRead": "N", "NumCycles": "151", "Number": "1"}, + {"IsIndexedRead": "Y", "NumCycles": "8", "Number": "2"}, + {"IsIndexedRead": "Y", "NumCycles": "8", "Number": "3"}, + {"IsIndexedRead": "N", "NumCycles": "151", "Number": "4"}, + ] index_size = 7 dual_index_sample = True index2_size = 7 - got_mask = self.dummy_run._compute_base_mask(runSetup, index_size, dual_index_sample, index2_size) - expected_mask = 
['Y151', 'I7N1', 'I7N1', 'Y151'] + got_mask = self.dummy_run._compute_base_mask( + runSetup, index_size, dual_index_sample, index2_size + ) + expected_mask = ["Y151", "I7N1", "I7N1", "Y151"] self.assertEqual(got_mask, expected_mask) - @mock.patch('taca.illumina.Runs.misc.call_external_command') + @mock.patch("taca.illumina.Runs.misc.call_external_command") def test_transfer_run(self, mock_call_external_command): """Call external rsync.""" self.completed.transfer_run(self.transfer_file) - command_line = ['rsync', '-LtDrv', '--chmod=g+rw', - '--exclude=Demultiplexing_*/*_*', - '--include=*/', '--include=*.file', - '--exclude=*', '--prune-empty-dirs', - os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX'), - 'None@None:None'] - mock_call_external_command.assert_called_once_with(command_line, - log_dir=os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX'), - prefix='', - with_log_files=True) - - @mock.patch('taca.illumina.Runs.misc.call_external_command') + command_line = [ + "rsync", + "-LtDrv", + "--chmod=g+rw", + "--exclude=Demultiplexing_*/*_*", + "--include=*/", + "--include=*.file", + "--exclude=*", + "--prune-empty-dirs", + os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + "None@None:None", + ] + mock_call_external_command.assert_called_once_with( + command_line, + log_dir=os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + prefix="", + with_log_files=True, + ) + + @mock.patch("taca.illumina.Runs.misc.call_external_command") def test_transfer_run_error(self, mock_call_external_command): """Handle external rsync error.""" - mock_call_external_command.side_effect = subprocess.CalledProcessError(1, 'some error') + mock_call_external_command.side_effect = subprocess.CalledProcessError( + 1, "some error" + ) with self.assertRaises(subprocess.CalledProcessError): self.completed.transfer_run(self.transfer_file) - @mock.patch('taca.illumina.Runs.shutil.move') + @mock.patch("taca.illumina.Runs.shutil.move") def test_archive_run(self, mock_move): """Move file to archive.""" self.completed.archive_run(self.archive_dir) - mock_move.assert_called_once_with(os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX'), - os.path.join(self.archive_dir, '141124_ST-COMPLETED1_01_AFCIDXX')) + mock_move.assert_called_once_with( + os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + os.path.join(self.archive_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + ) - @mock.patch('taca.illumina.Runs.misc.send_mail') + @mock.patch("taca.illumina.Runs.misc.send_mail") def test_send_mail(self, mock_send_mail): """Send mail to user.""" - self.completed.send_mail('Hello', 'user@email.com') - mock_send_mail.assert_called_once_with('141124_ST-COMPLETED1_01_AFCIDXX', 'Hello', 'user@email.com') + self.completed.send_mail("Hello", "user@email.com") + mock_send_mail.assert_called_once_with( + "141124_ST-COMPLETED1_01_AFCIDXX", "Hello", "user@email.com" + ) def test_is_unpooled_lane(self): """Check if lane is unpooled.""" - self.assertTrue(self.in_progress.is_unpooled_lane('2')) + self.assertTrue(self.in_progress.is_unpooled_lane("2")) def test_get_samples_per_lane(self): """Return samples from samplesheet.""" - expected_samples = {'1': 'P10000_1001', '2': 'P10000_1005', '3': 'P10000_1006', '4': 'P10000_1007'} - got_samples = self.in_progress.get_samples_per_lane() + expected_samples = { + "1": "P10000_1001", + "2": "P10000_1005", + "3": "P10000_1006", + "4": "P10000_1007", + } + got_samples = self.in_progress.get_samples_per_lane() self.assertEqual(expected_samples, 
got_samples) - @mock.patch('taca.illumina.Runs.os.rename') + @mock.patch("taca.illumina.Runs.os.rename") def test_rename_undet(self, mock_rename): """Prepend sample name to file name.""" - samples_per_lane = {'1': 'P10000_1001', '2': 'P10000_1005'} - lane = '1' + samples_per_lane = {"1": "P10000_1001", "2": "P10000_1005"} + lane = "1" self.completed._rename_undet(lane, samples_per_lane) - old_name = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX', 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz') - new_name = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX', 'Demultiplexing', 'P10000_1001_Undetermined_L011_R1_001.fastq.gz') + old_name = os.path.join( + self.tmp_dir, + "141124_ST-COMPLETED1_01_AFCIDXX", + "Demultiplexing", + "Undetermined_S0_L001_R1_001.fastq.gz", + ) + new_name = os.path.join( + self.tmp_dir, + "141124_ST-COMPLETED1_01_AFCIDXX", + "Demultiplexing", + "P10000_1001_Undetermined_L011_R1_001.fastq.gz", + ) mock_rename.assert_called_once_with(old_name, new_name) - @mock.patch('taca.illumina.Runs.os.symlink') + @mock.patch("taca.illumina.Runs.os.symlink") def test_aggregate_demux_results_simple_complex(self, mock_symlink): """Aggregare demux results simple case.""" self.assertTrue(self.in_progress_done._aggregate_demux_results_simple_complex()) - calls = [mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/DemultiplexingStats.xml'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/DemultiplexingStats.xml')), - mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/AdapterTrimming.txt'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/AdapterTrimming.txt')), - mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/ConversionStats.xml'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/ConversionStats.xml')), - mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/Stats.json'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/Stats.json'))] + calls = [ + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/DemultiplexingStats.xml", + ), + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/DemultiplexingStats.xml", + ), + ), + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/AdapterTrimming.txt", + ), + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/AdapterTrimming.txt", + ), + ), + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/ConversionStats.xml", + ), + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/ConversionStats.xml", + ), + ), + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/Stats.json", + ), + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/Stats.json", + ), + ), + ] mock_symlink.assert_has_calls(calls) - @mock.patch('taca.illumina.Runs.json.dump') + @mock.patch("taca.illumina.Runs.json.dump") def test_aggregate_demux_results_simple_complex_complex(self, mock_json_dump): """Aggregare demux results complex case.""" 
self.assertTrue(self.complex_run._aggregate_demux_results_simple_complex()) @@ -387,22 +646,23 @@ def test_aggregate_demux_results_simple_complex_fail(self): def test_create_folder_structure(self): """Make directory structure.""" root = self.tmp_dir - dirs = ['dir1', 'dir2'] + dirs = ["dir1", "dir2"] path = _create_folder_structure(root, dirs) - self.assertEqual(path, os.path.join(self.tmp_dir, 'dir1/dir2')) + self.assertEqual(path, os.path.join(self.tmp_dir, "dir1/dir2")) def test_generate_lane_html(self): """Generate lane HTML.""" - html_report = 'data/lane.html' + html_report = "data/lane.html" html_report_lane_parser = LaneBarcodeParser(html_report) - html_file = os.path.join(self.tmp_dir, 'generated_lane.html') - expected_file = 'data/lane_result.html' + html_file = os.path.join(self.tmp_dir, "generated_lane.html") + expected_file = "data/lane_result.html" _generate_lane_html(html_file, html_report_lane_parser) self.assertTrue(filecmp.cmp(html_file, expected_file)) class TestNovaSeqRuns(unittest.TestCase): """Tests for the NovaSeq_Run run class.""" + @classmethod def setUpClass(self): """Creates the following directory tree for testing purposes: @@ -411,23 +671,24 @@ def setUpClass(self): |__ 141124_ST-RUNNING1_03_AFCIDXX |__ RunInfo.xml """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") - running = os.path.join(self.tmp_dir, '141124_ST-RUNNING1_03_AFCIDXX') + running = os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX") os.makedirs(self.tmp_dir) os.makedirs(running) # Create files indicating that the run is finished - open(os.path.join(running, 'RTAComplete.txt'), 'w').close() + open(os.path.join(running, "RTAComplete.txt"), "w").close() # Move sample RunInfo.xml file to run directory - shutil.copy('data/RunInfo.xml', running) - shutil.copy('data/runParameters.xml', running) + shutil.copy("data/RunInfo.xml", running) + shutil.copy("data/runParameters.xml", running) # Create run objects - self.running = NovaSeq_Run(os.path.join(self.tmp_dir, - '141124_ST-RUNNING1_03_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.running = NovaSeq_Run( + os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) @classmethod def tearDownClass(self): @@ -435,12 +696,13 @@ def tearDownClass(self): def test_novaseq(self): """Set sequencer and run type NovaSeq.""" - self.assertEqual(self.running.sequencer_type, 'NovaSeq') - self.assertEqual(self.running.run_type, 'NGI-RUN') + self.assertEqual(self.running.sequencer_type, "NovaSeq") + self.assertEqual(self.running.run_type, "NGI-RUN") class TestNextSeqRuns(unittest.TestCase): """Tests for the NextSeq_Run run class.""" + @classmethod def setUpClass(self): """Creates the following directory tree for testing purposes: @@ -449,23 +711,24 @@ def setUpClass(self): |__ 141124_ST-RUNNING1_03_AFCIDXX |__ RunInfo.xml """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") - running = os.path.join(self.tmp_dir, '141124_ST-RUNNING1_03_AFCIDXX') + running = os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX") os.makedirs(self.tmp_dir) os.makedirs(running) # Create files indicating that the run is finished - open(os.path.join(running, 'RTAComplete.txt'), 'w').close() + open(os.path.join(running, "RTAComplete.txt"), "w").close() # Move sample RunInfo.xml file to run directory - shutil.copy('data/RunInfo.xml', running) - shutil.copy('data/runParameters.xml', running) + 
shutil.copy("data/RunInfo.xml", running) + shutil.copy("data/runParameters.xml", running) # Create run objects - self.running = NextSeq_Run(os.path.join(self.tmp_dir, - '141124_ST-RUNNING1_03_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.running = NextSeq_Run( + os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) @classmethod def tearDownClass(self): @@ -473,5 +736,5 @@ def tearDownClass(self): def test_nextseq(self): """Set sequencer and run type NextSeq.""" - self.assertEqual(self.running.sequencer_type, 'NextSeq') - self.assertEqual(self.running.run_type, 'NGI-RUN') + self.assertEqual(self.running.sequencer_type, "NextSeq") + self.assertEqual(self.running.run_type, "NGI-RUN") diff --git a/tests/test_instrument_transfer.py b/tests/test_instrument_transfer.py index 3d9b85fc..2b66c111 100644 --- a/tests/test_instrument_transfer.py +++ b/tests/test_instrument_transfer.py @@ -80,14 +80,15 @@ def setup_test_fixture() -> (Mock, tempfile.TemporaryDirectory, dict): def test_main_ignore_CTC(setup_test_fixture): - """Check so that runs on configuration test cells are not picked up. - """ + """Check so that runs on configuration test cells are not picked up.""" # Run fixture args, tmp, file_paths = setup_test_fixture # Setup run - run_path = f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('TEST', 'CTC')}" + run_path = ( + f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('TEST', 'CTC')}" + ) os.makedirs(run_path) with patch("taca.nanopore.instrument_transfer.dump_path") as mock_dump_path: @@ -107,7 +108,9 @@ def test_main_ignore_col3(setup_test_fixture): args, tmp, file_paths = setup_test_fixture # Setup run - run_path = f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('MN19414', '3A')}" + run_path = ( + f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('MN19414', '3A')}" + ) os.makedirs(run_path) with patch("taca.nanopore.instrument_transfer.dump_path") as mock_dump_path: @@ -115,7 +118,7 @@ def test_main_ignore_col3(setup_test_fixture): instrument_transfer.main(args) # Check dump_path was not called - mock_dump_path.assert_not_called() + mock_dump_path.assert_not_called() @pytest.mark.parametrize( @@ -158,9 +161,7 @@ def test_main(mock_sync, mock_final_sync, setup_test_fixture, finished, qc): # Check path was dumped assert os.path.exists(run_path + "/run_path.txt") - assert open(run_path + "/run_path.txt").read() == "/".join( - run_path.split("/")[-3:] - ) + assert open(run_path + "/run_path.txt").read() == "/".join(run_path.split("/")[-3:]) # Check pore count history was dumped assert os.path.exists(run_path + "/pore_count_history.csv") @@ -328,7 +329,6 @@ def test_archive_finished_run(): def test_parse_position_logs(setup_test_fixture): - # Run fixture args, tmp, file_paths = setup_test_fixture @@ -342,7 +342,6 @@ def test_parse_position_logs(setup_test_fixture): assert len(logs) == len(set(logs_as_strings)) for entry in logs: - assert re.match(r"^(MN19414)|(1A)$", entry["position"]) assert re.match(r"^2024-01-01 0\d:0\d:0\d.0\d$", entry["timestamp"]) assert re.match(r"^INFO: [a-z\._]+ \(user_messages\)$", entry["category"]) @@ -353,7 +352,6 @@ def test_parse_position_logs(setup_test_fixture): def test_get_pore_counts(setup_test_fixture): - # Run fixture args, tmp, file_paths = setup_test_fixture @@ -368,7 +366,6 @@ def test_get_pore_counts(setup_test_fixture): assert len(logs) == len(set(pore_counts_as_strings)) for entry in pore_counts: - assert re.match(r"^(TEST12345)|(PAM12345)$", 
entry["flow_cell_id"]) assert re.match(r"^(MN19414)|(1A)$", entry["position"]) assert re.match(r"^2024-01-01 0\d:0\d:0\d.0\d$", entry["timestamp"]) @@ -379,7 +376,6 @@ def test_get_pore_counts(setup_test_fixture): def test_dump_pore_count_history(setup_test_fixture): - # Run fixture args, tmp, file_paths = setup_test_fixture diff --git a/tests/test_nanopore.py b/tests/test_nanopore.py index 0220f6de..03ed4bc0 100644 --- a/tests/test_nanopore.py +++ b/tests/test_nanopore.py @@ -14,42 +14,59 @@ class TestNanopore(unittest.TestCase): """Test Nanopore class""" + def test_is_not_transferred(self): """Check if nanopore run has been transferred.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" np_run = ONT_run(run_dir) - np_run.transfer_log = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('transfer').get('transfer_file') + np_run.transfer_log = ( + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("transfer") + .get("transfer_file") + ) self.assertTrue(np_run.is_not_transferred()) - run_dir_transf = 'data/nanopore_data/run4/done_demuxing/20200105_1412_MN19414_AAU645_68125dc2' + run_dir_transf = "data/nanopore_data/run4/done_demuxing/20200105_1412_MN19414_AAU645_68125dc2" np_run_transf = ONT_run(run_dir_transf) - np_run_transf.transfer_log = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('transfer').get('transfer_file') + np_run_transf.transfer_log = ( + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("transfer") + .get("transfer_file") + ) self.assertFalse(np_run_transf.is_not_transferred()) - @mock.patch('taca.nanopore.nanopore.RsyncAgent') + @mock.patch("taca.nanopore.nanopore.RsyncAgent") def test_transfer_run(self, mock_rsync): """Start rsync of finished run.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" np_run = ONT_run(run_dir) - transfer_details = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('transfer') + transfer_details = ( + CONFIG.get("nanopore_analysis").get("minion_qc_run").get("transfer") + ) np_run.transfer_run(transfer_details) - rsync_opts = {'-LtDrv': None, - '--chown': ':ngi2016003', - '--chmod' : 'Dg+s,g+rw', - '-r' : None, - '--exclude' : 'work'} - mock_rsync.assert_called_with(run_dir, - dest_path='some_dir', - remote_host='some_host', - remote_user='some_user', - validate=False, - opts=rsync_opts) - - @mock.patch('taca.nanopore.nanopore.shutil.move') + rsync_opts = { + "-LtDrv": None, + "--chown": ":ngi2016003", + "--chmod": "Dg+s,g+rw", + "-r": None, + "--exclude": "work", + } + mock_rsync.assert_called_with( + run_dir, + dest_path="some_dir", + remote_host="some_host", + remote_user="some_user", + validate=False, + opts=rsync_opts, + ) + + @mock.patch("taca.nanopore.nanopore.shutil.move") def test_archive_run(self, mock_move): """Move directory to archive.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" np_run = ONT_run(run_dir) - np_run.archive_dir = '/some/dir' + np_run.archive_dir = "/some/dir" np_run.archive_run() mock_move.assert_called_once() @@ -59,89 +76,123 @@ class TestMinION(unittest.TestCase): def test_get_original_samplesheet(self): """Get location of lims sample sheet.""" - run_dir = 
'data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2' + run_dir = "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2" run = MinIONqc(run_dir, None, None) run._get_anglerfish_samplesheet() - expected_sample_sheet = 'data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv' + expected_sample_sheet = "data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv" self.assertEqual(run.lims_samplesheet, expected_sample_sheet) def test_parse_samplesheet(self): """Make nanoseq sample sheet from lims sample sheet.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) - run.lims_samplesheet = 'data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv' + run.lims_samplesheet = "data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv" run._parse_samplesheet() - self.assertTrue(filecmp.cmp(run.nanoseq_sample_sheet, 'data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv')) - self.assertTrue(filecmp.cmp(run.anglerfish_sample_sheet, 'data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv')) + self.assertTrue( + filecmp.cmp( + run.nanoseq_sample_sheet, + "data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv", + ) + ) + self.assertTrue( + filecmp.cmp( + run.anglerfish_sample_sheet, + "data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv", + ) + ) - @mock.patch('taca.nanopore.minion.MinIONqc._get_flowcell_product_code') - @mock.patch('taca.nanopore.minion.MinIONqc._is_multiplexed') - @mock.patch('taca.nanopore.minion.subprocess.Popen') - def test_start_analysis_pipeline_multiplexed(self, mock_popen, mock_is_multiplexed, mock_get_fc_code): + @mock.patch("taca.nanopore.minion.MinIONqc._get_flowcell_product_code") + @mock.patch("taca.nanopore.minion.MinIONqc._is_multiplexed") + @mock.patch("taca.nanopore.minion.subprocess.Popen") + def test_start_analysis_pipeline_multiplexed( + self, mock_popen, mock_is_multiplexed, mock_get_fc_code + ): """Submit detached nanoseq job for multiplexed data.""" - mock_get_fc_code.return_value = 'FLO-FLG001' + mock_get_fc_code.return_value = "FLO-FLG001" mock_is_multiplexed.return_value = True - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - sample_sheet = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + sample_sheet = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" run = MinIONqc(run_dir, sample_sheet, None) run.start_nanoseq() - expected_parameters = ('nextflow run nf-core/nanoseq' - + ' -r ' + CONFIG.get('nanopore_analysis').get('minion_qc_run').get('nanoseq_version') - + ' --input ' + sample_sheet - + ' --protocol DNA' - + ' --input_path ' + os.path.join(run_dir, 'fast5') - + ' --outdir ' + os.path.join(run_dir, 'nanoseq_output') - + ' --flowcell FLO-FLG001' - + ' --guppy_gpu' - + ' --skip_alignment' - + ' --skip_quantification' - + ' --kit SQK-LSK109' - + ' --max_cpus 6' - + ' --max_memory 20.GB' - + ' --barcode_kit EXP-NBD104' - + ' -profile singularity; echo $? 
> .exitcode_for_nanoseq') - mock_popen.assert_called_once_with(expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir) - - @mock.patch('taca.nanopore.minion.MinIONqc._get_flowcell_product_code') - @mock.patch('taca.nanopore.minion.MinIONqc._is_multiplexed') - @mock.patch('taca.nanopore.minion.subprocess.Popen') - def test_start_analysis_pipeline_not_multiplexed(self, mock_popen, mock_is_multiplexed, mock_get_fc_code): + expected_parameters = ( + "nextflow run nf-core/nanoseq" + + " -r " + + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("nanoseq_version") + + " --input " + + sample_sheet + + " --protocol DNA" + + " --input_path " + + os.path.join(run_dir, "fast5") + + " --outdir " + + os.path.join(run_dir, "nanoseq_output") + + " --flowcell FLO-FLG001" + + " --guppy_gpu" + + " --skip_alignment" + + " --skip_quantification" + + " --kit SQK-LSK109" + + " --max_cpus 6" + + " --max_memory 20.GB" + + " --barcode_kit EXP-NBD104" + + " -profile singularity; echo $? > .exitcode_for_nanoseq" + ) + mock_popen.assert_called_once_with( + expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir + ) + + @mock.patch("taca.nanopore.minion.MinIONqc._get_flowcell_product_code") + @mock.patch("taca.nanopore.minion.MinIONqc._is_multiplexed") + @mock.patch("taca.nanopore.minion.subprocess.Popen") + def test_start_analysis_pipeline_not_multiplexed( + self, mock_popen, mock_is_multiplexed, mock_get_fc_code + ): """Submit detached nanoseq job for non multiplexed data.""" - mock_get_fc_code.return_value = 'FLO-FLG001' + mock_get_fc_code.return_value = "FLO-FLG001" mock_is_multiplexed.return_value = False - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - sample_sheet = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + sample_sheet = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" run = MinIONqc(run_dir, sample_sheet, None) run.start_nanoseq() - expected_parameters = ('nextflow run nf-core/nanoseq' - + ' -r ' + CONFIG.get('nanopore_analysis').get('minion_qc_run').get('nanoseq_version') - + ' --input ' + sample_sheet - + ' --protocol DNA' - + ' --input_path ' + os.path.join(run_dir, 'fast5') - + ' --outdir ' + os.path.join(run_dir, 'nanoseq_output') - + ' --flowcell FLO-FLG001' - + ' --guppy_gpu' - + ' --skip_alignment' - + ' --skip_quantification' - + ' --kit SQK-LSK109' - + ' --max_cpus 6' - + ' --max_memory 20.GB' - + ' -profile singularity; echo $? > .exitcode_for_nanoseq') - mock_popen.assert_called_once_with(expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir) + expected_parameters = ( + "nextflow run nf-core/nanoseq" + + " -r " + + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("nanoseq_version") + + " --input " + + sample_sheet + + " --protocol DNA" + + " --input_path " + + os.path.join(run_dir, "fast5") + + " --outdir " + + os.path.join(run_dir, "nanoseq_output") + + " --flowcell FLO-FLG001" + + " --guppy_gpu" + + " --skip_alignment" + + " --skip_quantification" + + " --kit SQK-LSK109" + + " --max_cpus 6" + + " --max_memory 20.GB" + + " -profile singularity; echo $? 
> .exitcode_for_nanoseq" + ) + mock_popen.assert_called_once_with( + expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir + ) def test_get_flowcell_product_code(self): """Get flowcell product code from report.md.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) got_id = run._get_flowcell_product_code() - expected_id = 'FLO-FLG001' + expected_id = "FLO-FLG001" self.assertEqual(got_id, expected_id) def test_is_multiplexed(self): """Return True if run is multiplexed, else False.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - multiplexed_sample_sheet = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' - non_multiplexed_sample_sheet = 'data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + multiplexed_sample_sheet = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" + non_multiplexed_sample_sheet = "data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv" multiplexed_run = MinIONqc(run_dir, multiplexed_sample_sheet, None) non_multiplexed_run = MinIONqc(run_dir, non_multiplexed_sample_sheet, None) self.assertTrue(multiplexed_run._is_multiplexed()) @@ -149,60 +200,81 @@ def test_is_multiplexed(self): def test_get_barcode_kit(self): """Return EXP-NBD104 or EXP-NBD114 barcode kit based on sample sheet.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - sample_sheet_104 = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + sample_sheet_104 = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" run_104 = MinIONqc(run_dir, sample_sheet_104, None) got_kit_104 = run_104._get_barcode_kit() - sample_sheet_114 = 'data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv' + sample_sheet_114 = "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv" run_114 = MinIONqc(run_dir, sample_sheet_114, None) got_kit_114 = run_114._get_barcode_kit() - self.assertEqual(got_kit_104, 'EXP-NBD104') - self.assertEqual(got_kit_114, 'EXP-NBD114') + self.assertEqual(got_kit_104, "EXP-NBD104") + self.assertEqual(got_kit_114, "EXP-NBD114") def test_check_exit_status(self): """Check nanoseq exit status from file.""" - run_dir_success = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir_success = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" success_run = MinIONqc(run_dir_success, None, None) - self.assertTrue(success_run.check_exit_status('data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq')) - run_dir_fail = 'data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2' + self.assertTrue( + success_run.check_exit_status( + "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq" + ) + ) 
+ run_dir_fail = ( + "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2" + ) fail_run = MinIONqc(run_dir_fail, None, None) - self.assertFalse(fail_run.check_exit_status('data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq')) + self.assertFalse( + fail_run.check_exit_status( + "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq" + ) + ) - @mock.patch('taca.nanopore.minion.os.makedirs') - @mock.patch('taca.nanopore.minion.subprocess.Popen') + @mock.patch("taca.nanopore.minion.os.makedirs") + @mock.patch("taca.nanopore.minion.subprocess.Popen") def test_start_anglerfish(self, mock_popen, mock_mkdir): """Start Anglerfish.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - af_sample_sheet = 'anglerfish_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + af_sample_sheet = "anglerfish_sample_sheet.csv" run = MinIONqc(run_dir, None, af_sample_sheet) run.start_anglerfish() - expected_parameters = ('anglerfish.py' - + ' --samplesheet anglerfish_sample_sheet.csv' - + ' --out_fastq data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output' - + ' --threads 2' - + ' --skip_demux' - + ' --skip_fastqc; echo $? > .exitcode_for_anglerfish') - mock_popen.assert_called_once_with(expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir) - - @mock.patch('taca.nanopore.minion.MinIONqc._find_anglerfish_results') - @mock.patch('taca.nanopore.minion.shutil.copyfile') + expected_parameters = ( + "anglerfish.py" + + " --samplesheet anglerfish_sample_sheet.csv" + + " --out_fastq data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output" + + " --threads 2" + + " --skip_demux" + + " --skip_fastqc; echo $? 
> .exitcode_for_anglerfish" + ) + mock_popen.assert_called_once_with( + expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir + ) + + @mock.patch("taca.nanopore.minion.MinIONqc._find_anglerfish_results") + @mock.patch("taca.nanopore.minion.shutil.copyfile") def test_copy_results_for_lims(self, mock_copy, mock_results): """Copy Anglerfish results to lims.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) - anglerfish_results_path = 'anglerfish_output' - anglerfish_results_file = os.path.join(run_dir, anglerfish_results_path, 'anglerfish_2020_09_23_141922', 'anglerfish_stats.txt') - lims_results_file = 'some/dir/2020/anglerfish_stats_AAU644.txt' + anglerfish_results_path = "anglerfish_output" + anglerfish_results_file = os.path.join( + run_dir, + anglerfish_results_path, + "anglerfish_2020_09_23_141922", + "anglerfish_stats.txt", + ) + lims_results_file = "some/dir/2020/anglerfish_stats_AAU644.txt" mock_results.return_value = anglerfish_results_file run.copy_results_for_lims() mock_copy.assert_called_once_with(anglerfish_results_file, lims_results_file) def test_find_anglerfish_results(self): """Locate Anglerfish results file.""" - anglerfish_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output' - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + anglerfish_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output" + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) found_file = run._find_anglerfish_results() - expected_file = os.path.join(anglerfish_dir, 'anglerfish_2020_09_23_141922', 'anglerfish_stats.txt') + expected_file = os.path.join( + anglerfish_dir, "anglerfish_2020_09_23_141922", "anglerfish_stats.txt" + ) self.assertEqual(expected_file, found_file) diff --git a/tests/test_server_status.py b/tests/test_server_status.py index 781adbe9..44690644 100644 --- a/tests/test_server_status.py +++ b/tests/test_server_status.py @@ -7,12 +7,13 @@ from taca.server_status import cronjobs, server_status from taca.utils import config -CONFIG = config.load_yaml_config('data/taca_test_cfg.yaml') +CONFIG = config.load_yaml_config("data/taca_test_cfg.yaml") INITAL_TAB = """ # First Comment 0,30 * * * * firstcommand """ + class TestServerStatus(unittest.TestCase): def test_get_nases_disk_space(self): """Get disk space for disk specified in config file.""" @@ -21,81 +22,105 @@ def test_get_nases_disk_space(self): def test_parse_output_valid_case(self): """Parse valid disk space output.""" - valid_disk_space = 'Filesystem Size Used Avail Capacity iused ifree %iused Mounted on \ - /dev/disk1s1 466Gi 59Gi 393Gi 14% 1062712 4881390168 0% /System/Volumes/Data' - expected_result = {'disk_size': '14%', - 'mounted_on': '/System/Volumes/Data', - 'available_percentage': '100%', - 'space_used': '1062712', - 'used_percentage': '0%', - 'filesystem': '393Gi', - 'space_available': '4881390168'} + valid_disk_space = "Filesystem Size Used Avail Capacity iused ifree %iused Mounted on \ + /dev/disk1s1 466Gi 59Gi 393Gi 14% 1062712 4881390168 0% /System/Volumes/Data" + expected_result = { + "disk_size": "14%", + "mounted_on": "/System/Volumes/Data", + "available_percentage": "100%", + "space_used": "1062712", + "used_percentage": "0%", + 
"filesystem": "393Gi", + "space_available": "4881390168", + } got_result = server_status._parse_output(valid_disk_space) self.assertEqual(expected_result, got_result) def test_parse_output_invalid_case(self): """Parse invalid disk space output.""" - invalid_disk_space = '' + invalid_disk_space = "" expected_invalid_result = { - 'disk_size': 'NaN', - 'space_used': 'NaN', - 'space_available': 'NaN', - 'used_percentage': 'NaN', - 'available_percentage': 'NaN', - 'mounted_on': 'NaN', - 'filesystem': 'NaN' + "disk_size": "NaN", + "space_used": "NaN", + "space_available": "NaN", + "used_percentage": "NaN", + "available_percentage": "NaN", + "mounted_on": "NaN", + "filesystem": "NaN", } invalid_result = server_status._parse_output(invalid_disk_space) self.assertEqual(expected_invalid_result, invalid_result) - @mock.patch('taca.server_status.server_status.statusdb') + @mock.patch("taca.server_status.server_status.statusdb") def test_update_status_db(self, mock_couchdb): """Update statusdb.""" - disk_space = {'localhost': {'disk_size': '14%', 'mounted_on': '/System/Volumes/Data', 'available_percentage': '100%', 'space_used': '1061701', 'used_percentage': '0%', 'filesystem': '393Gi', 'space_available': '4881391179'}} - server_status.update_status_db(disk_space, server_type='nas') + disk_space = { + "localhost": { + "disk_size": "14%", + "mounted_on": "/System/Volumes/Data", + "available_percentage": "100%", + "space_used": "1061701", + "used_percentage": "0%", + "filesystem": "393Gi", + "space_available": "4881391179", + } + } + server_status.update_status_db(disk_space, server_type="nas") class TestCronjobs(unittest.TestCase): - @mock.patch('taca.server_status.cronjobs.CronTab') - @mock.patch('taca.server_status.cronjobs.getpass.getuser') + @mock.patch("taca.server_status.cronjobs.CronTab") + @mock.patch("taca.server_status.cronjobs.getpass.getuser") def test_parse_crontab(self, mock_getpass, mock_crontab): """Parse crontab.""" mock_crontab.return_value = crontab.CronTab(tab=INITAL_TAB) - mock_getpass.return_value = 'test_user' - expected_crontab = {'test_user': - [{'Comment': 'First Comment', - 'Day of month': '*', - 'Command': 'firstcommand', - 'Hour': '*', - 'Day of week': '*', - 'Enabled': True, - 'Special syntax': '', - 'Minute': '0,30', - 'Month': '*'}] + mock_getpass.return_value = "test_user" + expected_crontab = { + "test_user": [ + { + "Comment": "First Comment", + "Day of month": "*", + "Command": "firstcommand", + "Hour": "*", + "Day of week": "*", + "Enabled": True, + "Special syntax": "", + "Minute": "0,30", + "Month": "*", + } + ] } got_crontab = cronjobs._parse_crontab() self.assertEqual(expected_crontab, got_crontab) - @mock.patch('taca.server_status.cronjobs.statusdb') - @mock.patch('taca.server_status.cronjobs.logging') - @mock.patch('taca.server_status.cronjobs.platform') - @mock.patch('taca.server_status.cronjobs._parse_crontab') - def test_update_cronjob_db(self, mock_parser, mock_platform, mock_logging, mock_statusdb): + @mock.patch("taca.server_status.cronjobs.statusdb") + @mock.patch("taca.server_status.cronjobs.logging") + @mock.patch("taca.server_status.cronjobs.platform") + @mock.patch("taca.server_status.cronjobs._parse_crontab") + def test_update_cronjob_db( + self, mock_parser, mock_platform, mock_logging, mock_statusdb + ): """Update couchdb with cronjobs.""" - mock_parser.return_value = {'test_user': - [{'Comment': 'First Comment', - 'Day of month': '*', - 'Command': 'firstcommand', - 'Hour': '*', - 'Day of week': '*', - 'Enabled': True, - 'Special syntax': '', - 
'Minute': '0,30', - 'Month': '*'}] + mock_parser.return_value = { + "test_user": [ + { + "Comment": "First Comment", + "Day of month": "*", + "Command": "firstcommand", + "Hour": "*", + "Day of week": "*", + "Enabled": True, + "Special syntax": "", + "Minute": "0,30", + "Month": "*", + } + ] } - mock_platform.node.return_value = 'server.name' + mock_platform.node.return_value = "server.name" cronjobs.update_cronjob_db() - calls = [mock.call.info('Connecting to database: url'), - mock.call.warning('Document has not been created/updated')] + calls = [ + mock.call.info("Connecting to database: url"), + mock.call.warning("Document has not been created/updated"), + ] mock_logging.assert_has_calls(calls) diff --git a/tests/test_utils.py b/tests/test_utils.py index 2b52f37d..79e17645 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,17 +17,15 @@ class TestMisc(unittest.TestCase): @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_misc') - self.hashfile = os.path.join(self.rootdir, 'test_hashfile') - with open(self.hashfile, 'w') as fh: - fh.write('This is some contents\n') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_misc") + self.hashfile = os.path.join(self.rootdir, "test_hashfile") + with open(self.hashfile, "w") as fh: + fh.write("This is some contents\n") self.hashfile_digests = { - 'SHA256': - '4f075ae76b480bb0200dab01cd304f4045e04cd2b73e88b89549e5ac1627f222', - 'MD5': - 'c8498fc299bc3e22690045f1b62ce4e9', - 'SHA1': - '098fb272dfdae2ea1ba57c795dd325fa70e3c3fb'} + "SHA256": "4f075ae76b480bb0200dab01cd304f4045e04cd2b73e88b89549e5ac1627f222", + "MD5": "c8498fc299bc3e22690045f1b62ce4e9", + "SHA1": "098fb272dfdae2ea1ba57c795dd325fa70e3c3fb", + } @classmethod def tearDownClass(self): @@ -44,42 +42,48 @@ def test_hashfile_dir(self): def test_multiple_hashfile_calls(self): """Ensure that the hasher object is cleared between subsequent calls.""" - assert misc.hashfile(self.hashfile, hasher='sha1') == misc.hashfile(self.hashfile, 'sha1') + assert misc.hashfile(self.hashfile, hasher="sha1") == misc.hashfile( + self.hashfile, "sha1" + ) - @mock.patch('taca.utils.misc.smtplib.SMTP') + @mock.patch("taca.utils.misc.smtplib.SMTP") def test_send_mail(self, mock_smtplib): """Test send email.""" - assert misc.send_mail('subject', 'content', 'receiver') is None - mock_smtplib.assert_called_with('localhost') - mock_smtplib().sendmail.assert_called_with('TACA', ['receiver'], mock.ANY) + assert misc.send_mail("subject", "content", "receiver") is None + mock_smtplib.assert_called_with("localhost") + mock_smtplib().sendmail.assert_called_with("TACA", ["receiver"], mock.ANY) with self.assertRaises(SystemExit): - misc.send_mail('subject', 'content', None) + misc.send_mail("subject", "content", None) def test_call_external_command_pass(self): """Call external command.""" - new_file = os.path.join(self.rootdir, 'test_call_external') - command = 'touch ' + new_file - log_dir = os.path.join(self.rootdir, 'log_tests') - misc.call_external_command(command, with_log_files=True, prefix='test', log_dir=log_dir) + new_file = os.path.join(self.rootdir, "test_call_external") + command = "touch " + new_file + log_dir = os.path.join(self.rootdir, "log_tests") + misc.call_external_command( + command, with_log_files=True, prefix="test", log_dir=log_dir + ) assert os.path.isfile(new_file) - assert os.path.isfile(os.path.join(self.rootdir, 'log_tests', 'test_touch.out')) + assert os.path.isfile(os.path.join(self.rootdir, "log_tests", "test_touch.out")) def 
test_call_external_command_fail(self): """Call external command should handle error.""" - command = 'ls -E' + command = "ls -E" with self.assertRaises(subprocess.CalledProcessError): misc.call_external_command(command) def test_call_external_command_detached(self): """Call external command detached.""" - new_file = os.path.join(self.rootdir, 'test_call_external_det') - command = 'touch ' + new_file - misc.call_external_command_detached(command, with_log_files=True, prefix='test_det') + new_file = os.path.join(self.rootdir, "test_call_external_det") + command = "touch " + new_file + misc.call_external_command_detached( + command, with_log_files=True, prefix="test_det" + ) time.sleep(0.1) self.assertTrue(os.path.isfile(new_file)) - self.assertTrue(os.path.isfile('test_det_touch.out')) - os.remove('test_det_touch.out') - os.remove('test_det_touch.err') + self.assertTrue(os.path.isfile("test_det_touch.out")) + os.remove("test_det_touch.out") + os.remove("test_det_touch.err") def test_to_seconds(self): """Transform days and hours to seconds.""" @@ -90,85 +94,92 @@ def test_to_seconds(self): self.assertEqual(misc.to_seconds(days=1), 86400) self.assertEqual(misc.to_seconds(hours=1), 3600) - @mock.patch('taca.utils.misc.input', return_value='yes') + @mock.patch("taca.utils.misc.input", return_value="yes") def test_query_yes_no_true(self, mock_raw_input): """Return True from answer yes.""" - response = misc.query_yes_no('Some question') + response = misc.query_yes_no("Some question") self.assertTrue(response) - @mock.patch('taca.utils.misc.input', return_value='no') + @mock.patch("taca.utils.misc.input", return_value="no") def test_query_yes_no_false(self, mock_raw_input): """Return False from answer no.""" - response = misc.query_yes_no('Some question') + response = misc.query_yes_no("Some question") self.assertFalse(response) def test_return_unique(self): """Return unique items in a list.""" - input_list = ['a', 'b', 'a', 'c'] + input_list = ["a", "b", "a", "c"] returned_list = misc.return_unique(input_list) - expected_list = ['a', 'b', 'c'] + expected_list = ["a", "b", "c"] self.assertEqual(returned_list, expected_list) - @mock.patch('taca.utils.misc.statusdb') + @mock.patch("taca.utils.misc.statusdb") def test_run_is_demuxed(self, mock_couch): """Check in StatusDB if run was demultiplexed.""" - run = '200201_A00621_0032_BHHFCFDSXX' - couch_info = {'url': 'url', - 'username': 'username', - 'password': 'pwd', - 'db': 'db'} + run = "200201_A00621_0032_BHHFCFDSXX" + couch_info = { + "url": "url", + "username": "username", + "password": "pwd", + "db": "db", + } misc.run_is_demuxed(run, couch_info=couch_info) - #TODO: should add a check here but not sure how to mock this properly + # TODO: should add a check here but not sure how to mock this properly + class TestFilesystem(unittest.TestCase): """Test class for the filesystem functions.""" def setUp(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_filesystem') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_filesystem") def tearDown(self): shutil.rmtree(self.rootdir) def test_crete_folder_non_existing(self): """Ensure that a non-existing folder is created.""" - target_folder = os.path.join(self.rootdir,'target-non-existing') + target_folder = os.path.join(self.rootdir, "target-non-existing") self.assertTrue( filesystem.create_folder(target_folder), - 'A non-existing target folder could not be created') + "A non-existing target folder could not be created", + ) self.assertTrue( os.path.exists(target_folder), - 'A non-existing target 
folder was not created \ - but method returned True' + "A non-existing target folder was not created \ + but method returned True", ) def test_crete_folder_existing(self): """Ensure that an existing folder is detected.""" self.assertTrue( filesystem.create_folder(self.rootdir), - 'A pre-existing target folder was not detected') + "A pre-existing target folder was not detected", + ) def test_crete_folder_parent_non_existing(self): """Ensure that a non-existing parent folder is created.""" target_folder = os.path.join( - self.rootdir, - 'parent-non-existing', - 'target-non-existing') + self.rootdir, "parent-non-existing", "target-non-existing" + ) self.assertTrue( filesystem.create_folder(target_folder), - 'A non-existing parent and target folder could not be created') + "A non-existing parent and target folder could not be created", + ) self.assertTrue( os.path.exists(target_folder), - 'A non-existing parent folder was not created \ - but method returned True' + "A non-existing parent folder was not created \ + but method returned True", ) def test_crete_folder_exception(self): """Ensure that create_folder handles thrown exceptions gracefully.""" - with mock.patch.object(filesystem.os, 'makedirs', side_effect=OSError): + with mock.patch.object(filesystem.os, "makedirs", side_effect=OSError): self.assertFalse( filesystem.create_folder( - os.path.join(self.rootdir,'target-non-existing')), - 'A raised exception was not handled properly') + os.path.join(self.rootdir, "target-non-existing") + ), + "A raised exception was not handled properly", + ) def test_chdir(self): """Ensure start dir and end dir are the same.""" @@ -179,34 +190,35 @@ def test_chdir(self): def test_touch(self): """Make empty file.""" - new_file = os.path.join(self.rootdir, 'empty') + new_file = os.path.join(self.rootdir, "empty") filesystem.touch(new_file) self.assertTrue(os.path.isfile(new_file)) def test_do_symlink(self): """Make a symlink.""" - src = os.path.join(self.rootdir, 'source_file') - open(src, 'w').close() - dst = os.path.join(self.rootdir, 'dest_file') + src = os.path.join(self.rootdir, "source_file") + open(src, "w").close() + dst = os.path.join(self.rootdir, "dest_file") filesystem.do_symlink(src, dst) self.assertTrue(os.path.islink(dst)) def test_do_copy(self): """Copy files.""" - src_dir = os.path.join(self.rootdir, 'source_dir') - src = os.path.join(src_dir, 'source_file') + src_dir = os.path.join(self.rootdir, "source_dir") + src = os.path.join(src_dir, "source_file") os.mkdir(src_dir) - open(src, 'w').close() - dst_dir = os.path.join(self.rootdir, 'dest_dir') + open(src, "w").close() + dst_dir = os.path.join(self.rootdir, "dest_dir") filesystem.do_copy(src_dir, dst_dir) - self.assertTrue(os.path.isfile(os.path.join(dst_dir, 'source_file'))) + self.assertTrue(os.path.isfile(os.path.join(dst_dir, "source_file"))) + class TestTransferAgent(unittest.TestCase): """Test class for the TransferAgent class.""" @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_transfer_src') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_transfer_src") self.testfile = tempfile.mkstemp(dir=self.rootdir) @classmethod @@ -214,10 +226,10 @@ def tearDownClass(self): shutil.rmtree(self.rootdir) def setUp(self): - self.destdir = tempfile.mkdtemp(prefix='test_taca_transfer_dest') + self.destdir = tempfile.mkdtemp(prefix="test_taca_transfer_dest") self.agent = transfer.TransferAgent( - src_path=self.rootdir, - dest_path=self.destdir) + src_path=self.rootdir, dest_path=self.destdir + ) def 
tearDown(self): shutil.rmtree(self.destdir) @@ -228,9 +240,7 @@ def test_transfer_validate_src_path(self): self.agent.src_path = None with self.assertRaises(transfer.TransferError): self.agent.validate_src_path() - self.agent.src_path = os.path.join( - self.rootdir, - 'this-file-does-not-exist') + self.agent.src_path = os.path.join(self.rootdir, "this-file-does-not-exist") with self.assertRaises(transfer.TransferError): self.agent.validate_src_path() @@ -257,11 +267,11 @@ class TestSymlinkAgent(unittest.TestCase): @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_symlink_src') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_symlink_src") path = self.rootdir for n in range(3): - open(os.path.join(path, f'file{n}'), 'w').close() - path = os.path.join(path, f'folder{n}') + open(os.path.join(path, f"file{n}"), "w").close() + path = os.path.join(path, f"folder{n}") os.mkdir(path) @classmethod @@ -269,51 +279,49 @@ def tearDownClass(self): shutil.rmtree(self.rootdir) def setUp(self): - self.targetdir = tempfile.mkdtemp( - prefix='test_taca_filesystem_symlink_dest') + self.targetdir = tempfile.mkdtemp(prefix="test_taca_filesystem_symlink_dest") def tearDown(self): shutil.rmtree(self.targetdir) def test_symlink_validate_transfer(self): """Verify that the dest_path was created.""" - src = os.path.join(self.rootdir, 'file0') - dst = os.path.join(self.targetdir, 'file0') + src = os.path.join(self.rootdir, "file0") + dst = os.path.join(self.targetdir, "file0") os.symlink(src, dst) self.assertTrue(transfer.SymlinkAgent(src, dst).validate_transfer()) def test_symlink_file_top_folder(self): """Symlink a single file in the top folder.""" - src = os.path.join(self.rootdir, 'file0') - target = os.path.join(self.targetdir,os.path.basename(src)) + src = os.path.join(self.rootdir, "file0") + target = os.path.join(self.targetdir, os.path.basename(src)) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_file_make_dir(self): """Symlink a single file into a non-existing folder.""" - src = os.path.join(self.rootdir, 'folder0', 'folder1', 'file2') + src = os.path.join(self.rootdir, "folder0", "folder1", "file2") target = os.path.join( - self.targetdir, - 'these', 'folders', 'should', 'be', 'created') + self.targetdir, "these", "folders", "should", "be", "created" + ) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_file_overwrite(self): """Replace an existing file with overwrite.""" - src = os.path.join(self.rootdir, 'file0') + src = os.path.join(self.rootdir, "file0") target = os.path.join(self.targetdir, os.path.basename(src)) - open(target, 'w').close() + open(target, "w").close() self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_file_not_overwrite(self): """Don't replace an existing file without overwrite.""" - src = os.path.join(self.rootdir, 'file0') + src = os.path.join(self.rootdir, "file0") target = os.path.join(self.targetdir, os.path.basename(src)) - open(target, 'w').close() - self.assertFalse( - transfer.SymlinkAgent(src, target, overwrite=False).transfer()) + open(target, "w").close() + self.assertFalse(transfer.SymlinkAgent(src, target, overwrite=False).transfer()) def test_symlink_file_broken(self): """Don't create a broken symlink.""" - src = os.path.join(self.rootdir, 'non-existing-file') + src = os.path.join(self.rootdir, "non-existing-file") target = os.path.join(self.targetdir, os.path.basename(src)) with self.assertRaises(transfer.TransferError): 
transfer.SymlinkAgent(src, target).transfer() @@ -321,24 +329,23 @@ def test_symlink_file_broken(self): def test_symlink_file_unlink_fail(self): """Failing to remove existing file should raise SymlinkError.""" src = self.rootdir - target = os.path.join(self.targetdir, 'target-file') - open(target, 'w').close() + target = os.path.join(self.targetdir, "target-file") + open(target, "w").close() with mock.patch.object( - transfer.os, - 'unlink', - side_effect=OSError('Mocked error')): + transfer.os, "unlink", side_effect=OSError("Mocked error") + ): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() def test_symlink_folder_top_folder(self): """Symlinking a top-level folder.""" - src = os.path.join(self.rootdir, 'folder0') + src = os.path.join(self.rootdir, "folder0") target = os.path.join(self.targetdir, os.path.basename(src)) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_folder_overwrite(self): """Replace an existing folder with overwrite.""" - src = os.path.join(self.rootdir, 'folder0') + src = os.path.join(self.rootdir, "folder0") target = os.path.join(self.targetdir, os.path.basename(src)) shutil.copytree(src, target) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) @@ -347,7 +354,7 @@ def test_symlink_folder_mount_point(self): """Don't overwrite a mount point.""" src = os.path.join(self.rootdir) target = os.path.join(self.targetdir) - with mock.patch.object(transfer.os.path, 'ismount', return_value=True): + with mock.patch.object(transfer.os.path, "ismount", return_value=True): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() @@ -355,19 +362,19 @@ def test_symlink_folder_not_overwrite(self): """Don't overwrite other existing paths.""" src = os.path.join(self.rootdir) target = os.path.join(self.targetdir) - with mock.patch('taca.utils.transfer.os.path') as mockobj: + with mock.patch("taca.utils.transfer.os.path") as mockobj: mockobj.ismount.return_value = False mockobj.isfile.return_value = False mockobj.islink.return_value = False mockobj.isdir.return_value = False with self.assertRaises(transfer.SymlinkError): - transfer.SymlinkAgent(src,target).transfer() + transfer.SymlinkAgent(src, target).transfer() def test_symlink_folder_parent_error(self): """Failing to create parent folder structure should raise SymlinkError.""" src = self.rootdir - target = os.path.join(self.targetdir, 'non-existing-folder', 'target-file') - with mock.patch.object(transfer, 'create_folder', return_value=False): + target = os.path.join(self.targetdir, "non-existing-folder", "target-file") + with mock.patch.object(transfer, "create_folder", return_value=False): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() @@ -376,9 +383,8 @@ def test_symlink_folder_rmtree_fail(self): src = self.rootdir target = self.targetdir with mock.patch.object( - transfer.shutil, - 'rmtree', - side_effect=OSError('Mocked error')): + transfer.shutil, "rmtree", side_effect=OSError("Mocked error") + ): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() @@ -387,9 +393,8 @@ def test_symlink_folder_symlink_error(self): src = self.rootdir target = os.path.join(self.targetdir, os.path.basename(src)) with mock.patch.object( - transfer.os, - 'symlink', - side_effect=OSError('Mocked error')): + transfer.os, "symlink", side_effect=OSError("Mocked error") + ): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, 
target).transfer() @@ -398,11 +403,10 @@ def test_symlink_folder_unexpected(self): src = self.rootdir target = self.targetdir with mock.patch.object( - transfer.os.path, - 'exists', - side_effect=Exception('Mocked error')): + transfer.os.path, "exists", side_effect=Exception("Mocked error") + ): with self.assertRaises(Exception): - transfer.SymlinkAgent(src,target).transfer() + transfer.SymlinkAgent(src, target).transfer() class TestRsyncAgent(unittest.TestCase): @@ -410,25 +414,32 @@ class TestRsyncAgent(unittest.TestCase): @classmethod def setUpClass(cls): - cls.rootdir = tempfile.mkdtemp(prefix='test_taca_transfer_src') - (fh, cls.testfile) = tempfile.mkstemp( - prefix='test_taca_transfer_file') - os.write(fh, b'this is some content') + cls.rootdir = tempfile.mkdtemp(prefix="test_taca_transfer_src") + (fh, cls.testfile) = tempfile.mkstemp(prefix="test_taca_transfer_file") + os.write(fh, b"this is some content") os.close(fh) - open(os.path.join(cls.rootdir, 'file0'), 'w').close() - f = os.path.join(cls.rootdir, 'folder0') + open(os.path.join(cls.rootdir, "file0"), "w").close() + f = os.path.join(cls.rootdir, "folder0") os.mkdir(f) - open(os.path.join(f, 'file1'), 'w').close() + open(os.path.join(f, "file1"), "w").close() # create a digest file def _write_digest(rootdir, fhandle, fpath): - fhandle.write(f'{misc.hashfile(fpath)} {os.path.relpath(fpath, rootdir)}\n') - - cls.digestfile = os.path.join(cls.rootdir, 'digestfile.sha1') - with open(cls.digestfile, 'w') as digesth: - map(lambda x: - [_write_digest(cls.rootdir, digesth, os.path.join(x[0], y)) for y in [z for z in x[2] if os.path.join(x[0], z) != cls.digestfile]], - os.walk(cls.rootdir)) + fhandle.write( + f"{misc.hashfile(fpath)} {os.path.relpath(fpath, rootdir)}\n" + ) + + cls.digestfile = os.path.join(cls.rootdir, "digestfile.sha1") + with open(cls.digestfile, "w") as digesth: + map( + lambda x: [ + _write_digest(cls.rootdir, digesth, os.path.join(x[0], y)) + for y in [ + z for z in x[2] if os.path.join(x[0], z) != cls.digestfile + ] + ], + os.walk(cls.rootdir), + ) @classmethod def tearDownClass(cls): @@ -436,25 +447,26 @@ def tearDownClass(cls): os.unlink(cls.testfile) def setUp(self): - self.destdir = tempfile.mkdtemp(prefix='test_taca_transfer_dest') + self.destdir = tempfile.mkdtemp(prefix="test_taca_transfer_dest") self.agent = transfer.RsyncAgent( - self.rootdir, - dest_path=self.destdir, - validate=False) + self.rootdir, dest_path=self.destdir, validate=False + ) def tearDown(self): shutil.rmtree(self.destdir) def test_init(self): """Test initiation of agent instance.""" - args = ['arg1'] - kwargs = {'dest_path': 'arg2', - 'remote_host': 'arg3', - 'remote_user': 'arg4', - 'validate': True, - 'digestfile': 'arg5'} + args = ["arg1"] + kwargs = { + "dest_path": "arg2", + "remote_host": "arg3", + "remote_user": "arg4", + "validate": True, + "digestfile": "arg5", + } agent = transfer.RsyncAgent(*args, **kwargs) - self.assertEqual(getattr(agent, 'src_path'), args[0]) + self.assertEqual(getattr(agent, "src_path"), args[0]) for attribute, value in kwargs.items(): self.assertEqual(getattr(agent, attribute), value) self.assertEqual(agent.cmdopts, agent.DEFAULT_OPTS) @@ -462,7 +474,7 @@ def test_init(self): def test_rsync_validate_transfer(self): """Validate_transfer.""" # validation on remote hosts are not supported - self.agent.remote_host = 'not None' + self.agent.remote_host = "not None" with self.assertRaises(NotImplementedError): self.agent.validate_transfer() # validation without a digestfile throws an exception @@ -471,11 
+483,17 @@ def test_rsync_validate_transfer(self): self.agent.validate_transfer() # validation with a valid digestfile should return true self.agent.digestfile = self.digestfile - self.assertTrue(self.agent.validate_transfer(), 'validation with a valid digestfile should return true') + self.assertTrue( + self.agent.validate_transfer(), + "validation with a valid digestfile should return true", + ) # modifying the contents of the digestfile should make validation fail - with open(self.digestfile, 'a') as fh: - fh.write('randomdigeststring this-file-does-not-exist') - self.assertFalse(self.agent.validate_transfer(), 'validation with an invalid digestfile should return false') + with open(self.digestfile, "a") as fh: + fh.write("randomdigeststring this-file-does-not-exist") + self.assertFalse( + self.agent.validate_transfer(), + "validation with an invalid digestfile should return false", + ) def test_rsync_validate_dest_path(self): """Destination path should be properly checked.""" @@ -484,7 +502,7 @@ def test_rsync_validate_dest_path(self): self.agent.dest_path = None with self.assertRaises(transfer.TransferError): self.agent.validate_dest_path() - self.agent.remote_user = 'user' + self.agent.remote_user = "user" self.agent.dest_path = self.destdir with self.assertRaises(transfer.TransferError): self.agent.validate_dest_path() @@ -494,76 +512,79 @@ def test_rsync_agent_dest_paths_constructed(self): self.assertEqual( self.destdir, self.agent.remote_path(), - 'Destination path was not correct for empty remote user ' \ - 'and empty destination host') - self.agent.remote_host = 'localhost' + "Destination path was not correct for empty remote user " + "and empty destination host", + ) + self.agent.remote_host = "localhost" self.assertEqual( - f'localhost:{self.destdir}', + f"localhost:{self.destdir}", self.agent.remote_path(), - 'Destination path was not correct for empty remote user') - self.agent.remote_user = 'user' + "Destination path was not correct for empty remote user", + ) + self.agent.remote_user = "user" self.assertEqual( - f'user@localhost:{self.destdir}', + f"user@localhost:{self.destdir}", self.agent.remote_path(), - 'Destination path was not correct for non-empty remote user') + "Destination path was not correct for non-empty remote user", + ) self.agent.dest_path = None self.assertEqual( - 'user@localhost:', + "user@localhost:", self.agent.remote_path(), - 'Destination path was not correct for empty destination path') + "Destination path was not correct for empty destination path", + ) def test_rsync_agent_propagate_error(self): """Wrap and propagate error thrown by the rsync subprocess.""" with mock.patch.object( - transfer.subprocess, 'check_call', - side_effect=subprocess.CalledProcessError( - cmd='mocked subprocess', - returncode=-1)): + transfer.subprocess, + "check_call", + side_effect=subprocess.CalledProcessError( + cmd="mocked subprocess", returncode=-1 + ), + ): with self.assertRaises(transfer.RsyncError): self.agent.transfer() def test_rsync_agent_file(self): """Rsync transfer of a single file.""" - self.agent.src_path = os.path.join(self.rootdir, 'file0') - self.assertTrue( - self.agent.transfer(), - 'transfer a single file failed') + self.agent.src_path = os.path.join(self.rootdir, "file0") + self.assertTrue(self.agent.transfer(), "transfer a single file failed") self.assertTrue( self.validate_files( self.agent.src_path, - os.path.join( - self.destdir, - os.path.basename(self.agent.src_path))), - 'test file was not properly transferred') + os.path.join(self.destdir, 
os.path.basename(self.agent.src_path)), + ), + "test file was not properly transferred", + ) def test_rsync_agent_dir(self): """Rsync transfer of a folder.""" - self.agent.src_path = os.path.join(self.rootdir, 'folder0') - self.assertTrue( - self.agent.transfer(), - 'transfer a folder failed') + self.agent.src_path = os.path.join(self.rootdir, "folder0") + self.assertTrue(self.agent.transfer(), "transfer a folder failed") self.assertTrue( self.validate_folders( self.agent.src_path, - os.path.join( - self.destdir, - os.path.basename(self.agent.src_path))), - 'folder was not properly transferred') + os.path.join(self.destdir, os.path.basename(self.agent.src_path)), + ), + "folder was not properly transferred", + ) def test_rsync_agent_symlink(self): """Rsync should be able to resolve symlinks.""" - self.agent.src_path = os.path.join(self.rootdir, 'folder0') - os.symlink(self.testfile,os.path.join(self.agent.src_path, 'link1')) - self.agent.cmdopts = {'-a': None, '--copy-links': None} + self.agent.src_path = os.path.join(self.rootdir, "folder0") + os.symlink(self.testfile, os.path.join(self.agent.src_path, "link1")) + self.agent.cmdopts = {"-a": None, "--copy-links": None} self.assertTrue( - self.agent.transfer(), - 'transfer a folder containing a symlink failed') + self.agent.transfer(), "transfer a folder containing a symlink failed" + ) self.assertEqual( - misc.hashfile(self.testfile, hasher='sha1'), + misc.hashfile(self.testfile, hasher="sha1"), misc.hashfile( - os.path.join(self.destdir, 'folder0', 'link1'), - hasher='sha1'), - 'symlink was not properly transferred') + os.path.join(self.destdir, "folder0", "link1"), hasher="sha1" + ), + "symlink was not properly transferred", + ) def validate_folders(self, src, dst): for root, dirs, files in os.walk(src): @@ -575,215 +596,256 @@ def validate_folders(self, src, dst): return True def validate_files(self, src, dst): - return os.path.exists(src) and \ - os.path.isfile(src) and \ - os.path.exists(dst) and \ - os.path.isfile(dst) and \ - misc.hashfile(src) == misc.hashfile(dst) + return ( + os.path.exists(src) + and os.path.isfile(src) + and os.path.exists(dst) + and os.path.isfile(dst) + and misc.hashfile(src) == misc.hashfile(dst) + ) -class TestConfig(unittest.TestCase): +class TestConfig(unittest.TestCase): def test_load_yaml_config(self): """Load a yaml config file""" - got_config_data = config.load_yaml_config('data/taca_test_cfg_minimal.yaml') - expexted_config_data = {'statusdb': - {'url': 'url', - 'username': 'username', - 'password': 'pwd'}, - 'log': - {'file': 'data/taca.log'}} + got_config_data = config.load_yaml_config("data/taca_test_cfg_minimal.yaml") + expexted_config_data = { + "statusdb": {"url": "url", "username": "username", "password": "pwd"}, + "log": {"file": "data/taca.log"}, + } self.assertEqual(expexted_config_data, got_config_data) with self.assertRaises(IOError): - config.load_yaml_config('data/missing_file.yaml)') + config.load_yaml_config("data/missing_file.yaml)") def test_load_config(self): """Load a config file.""" - got_config_data = config.load_config('data/taca_test_cfg_minimal.yaml') - expexted_config_data = {'statusdb': - {'url': 'url', - 'username': 'username', - 'password': 'pwd'}, - 'log': - {'file': 'data/taca.log'}} + got_config_data = config.load_config("data/taca_test_cfg_minimal.yaml") + expexted_config_data = { + "statusdb": {"url": "url", "username": "username", "password": "pwd"}, + "log": {"file": "data/taca.log"}, + } self.assertEqual(expexted_config_data, got_config_data) with 
self.assertRaises(IOError): - config.load_config('data/missing_file.yaml)') + config.load_config("data/missing_file.yaml)") + class TestBioinfoTab(unittest.TestCase): """Test class for bioinfo_tab.""" @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_bt') - self.new_run = os.path.join(self.rootdir,'nosync', '190821_M01545_0252_000000001') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_bt") + self.new_run = os.path.join( + self.rootdir, "nosync", "190821_M01545_0252_000000001" + ) os.makedirs(self.new_run) - self.demux_run = os.path.join(self.rootdir, '190821_M01545_0252_000000002') - os.makedirs(os.path.join(self.demux_run, 'Unaligned_1')) - self.seq_run = os.path.join(self.rootdir, '190821_M01545_0252_000000003') + self.demux_run = os.path.join(self.rootdir, "190821_M01545_0252_000000002") + os.makedirs(os.path.join(self.demux_run, "Unaligned_1")) + self.seq_run = os.path.join(self.rootdir, "190821_M01545_0252_000000003") os.makedirs(self.seq_run) - self.error_run = os.path.join(self.rootdir, '190821_M01545_0252_000000004') + self.error_run = os.path.join(self.rootdir, "190821_M01545_0252_000000004") os.makedirs(self.error_run) - with open(os.path.join(self.error_run, 'RTAComplete.txt'), 'w') as fh: - fh.write('This is some contents\n') + with open(os.path.join(self.error_run, "RTAComplete.txt"), "w") as fh: + fh.write("This is some contents\n") @classmethod def tearDownClass(self): shutil.rmtree(self.rootdir) - @mock.patch('taca.utils.bioinfo_tab.update_statusdb', return_value=None) + @mock.patch("taca.utils.bioinfo_tab.update_statusdb", return_value=None) def test_collect_runs(self, mock_update_statusdb): """Find runs in specified directory.""" bioinfo_tab.collect_runs() - calls = [mock.call('data/test_data/190201_A00621_0032_BHHFCFDSXX'), mock.call('data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY')] + calls = [ + mock.call("data/test_data/190201_A00621_0032_BHHFCFDSXX"), + mock.call("data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY"), + ] mock_update_statusdb.assert_has_calls(calls) def test_get_ss_projects(self): """Get project info.""" - run_dir = 'data/test_data/190201_A00621_0032_BHHFCFDSXX' + run_dir = "data/test_data/190201_A00621_0032_BHHFCFDSXX" got_info = bioinfo_tab.get_ss_projects(run_dir) - expected_info = defaultdict(bioinfo_tab.Tree, - {'HHFCFDSXX': defaultdict(bioinfo_tab.Tree, - {'1': defaultdict(bioinfo_tab.Tree, - {'P10000_1001': defaultdict(bioinfo_tab.Tree, - {'P10000': defaultdict(bioinfo_tab.Tree, {})})}), - '2': defaultdict(bioinfo_tab.Tree, - {'P10000_1005': defaultdict(bioinfo_tab.Tree, - {'P10000': defaultdict(bioinfo_tab.Tree, {})})})})}) + expected_info = defaultdict( + bioinfo_tab.Tree, + { + "HHFCFDSXX": defaultdict( + bioinfo_tab.Tree, + { + "1": defaultdict( + bioinfo_tab.Tree, + { + "P10000_1001": defaultdict( + bioinfo_tab.Tree, + {"P10000": defaultdict(bioinfo_tab.Tree, {})}, + ) + }, + ), + "2": defaultdict( + bioinfo_tab.Tree, + { + "P10000_1005": defaultdict( + bioinfo_tab.Tree, + {"P10000": defaultdict(bioinfo_tab.Tree, {})}, + ) + }, + ), + }, + ) + }, + ) self.assertEqual(expected_info, got_info) - @mock.patch('taca.utils.bioinfo_tab.statusdb') + @mock.patch("taca.utils.bioinfo_tab.statusdb") def test_update_statusdb(self, mock_couch): """Update statusdb.""" - run_dir = 'data/test_data/190201_A00621_0032_BHHFCFDSXX' + run_dir = "data/test_data/190201_A00621_0032_BHHFCFDSXX" bioinfo_tab.update_statusdb(run_dir) - mock_couch.StatusdbSession.assert_called_with({'url': 'url', - 'username': 'username', 
- 'password': 'pwd', - 'xten_db': 'x_flowcells'}) + mock_couch.StatusdbSession.assert_called_with( + { + "url": "url", + "username": "username", + "password": "pwd", + "xten_db": "x_flowcells", + } + ) def test_get_status_new(self): """Return status New.""" got_status = bioinfo_tab.get_status(self.new_run) - self.assertEqual(got_status, 'New') + self.assertEqual(got_status, "New") def test_get_status_demultiplexing(self): """Return status Demultiplexing.""" got_status = bioinfo_tab.get_status(self.demux_run) - self.assertEqual(got_status, 'Demultiplexing') + self.assertEqual(got_status, "Demultiplexing") def test_get_status_sequencing(self): """Return status Sequencing.""" got_status = bioinfo_tab.get_status(self.seq_run) - self.assertEqual(got_status, 'Sequencing') + self.assertEqual(got_status, "Sequencing") def test_get_status_error(self): """Return status ERROR.""" got_status = bioinfo_tab.get_status(self.error_run) - self.assertEqual(got_status, 'ERROR') + self.assertEqual(got_status, "ERROR") def test_parse_sample_sheet(self): """Parse samplesheet.""" - sample_sheet = 'data/samplesheet.csv' - expected_data = [{'SampleWell': '1:1', - 'index': 'GAATTCGT', - 'Lane': '1', - 'SamplePlate': 'FCB_150423', - 'SampleName': 'P1775_147', - 'SampleID': 'Sample_P1775_147', - 'Project': 'J_Lundeberg_14_24'}] - parsed_data = bioinfo_tab.parse_samplesheet(sample_sheet, 'run_dir') + sample_sheet = "data/samplesheet.csv" + expected_data = [ + { + "SampleWell": "1:1", + "index": "GAATTCGT", + "Lane": "1", + "SamplePlate": "FCB_150423", + "SampleName": "P1775_147", + "SampleID": "Sample_P1775_147", + "Project": "J_Lundeberg_14_24", + } + ] + parsed_data = bioinfo_tab.parse_samplesheet(sample_sheet, "run_dir") self.assertEqual(expected_data, parsed_data) def test_parse_sample_sheet_is_miseq(self): """Parse MiSeq samplesheet.""" - sample_sheet = 'data/miseq_samplesheet.csv' - expected_data = [{'SampleWell': '1:1', - 'index': 'GAATTCGT', - 'Lane': '1', - 'SamplePlate': 'FCB_150423', - 'SampleName': 'P1775_147', - 'SampleID': 'Sample_P1775_147', - 'Project': 'J_Lundeberg_14_24'}] - parsed_data = bioinfo_tab.parse_samplesheet(sample_sheet, 'run_dir', is_miseq=True) + sample_sheet = "data/miseq_samplesheet.csv" + expected_data = [ + { + "SampleWell": "1:1", + "index": "GAATTCGT", + "Lane": "1", + "SamplePlate": "FCB_150423", + "SampleName": "P1775_147", + "SampleID": "Sample_P1775_147", + "Project": "J_Lundeberg_14_24", + } + ] + parsed_data = bioinfo_tab.parse_samplesheet( + sample_sheet, "run_dir", is_miseq=True + ) self.assertEqual(expected_data, parsed_data) def test_parse_sample_sheet_is_miseq_error(self): """Return empty list if not production or application in MiSeq samplesheet.""" - sample_sheet = 'data/samplesheet.csv' - parsed_data = bioinfo_tab.parse_samplesheet(sample_sheet, 'run_dir', is_miseq=True) + sample_sheet = "data/samplesheet.csv" + parsed_data = bioinfo_tab.parse_samplesheet( + sample_sheet, "run_dir", is_miseq=True + ) self.assertEqual(parsed_data, []) - @mock.patch('taca.utils.bioinfo_tab.send_mail') - @mock.patch('taca.utils.bioinfo_tab.datetime.datetime') + @mock.patch("taca.utils.bioinfo_tab.send_mail") + @mock.patch("taca.utils.bioinfo_tab.datetime.datetime") def test_error_mailer_no_samplesheet(self, mock_datetime, mock_send_mail): """Send email if no_samplesheet error.""" - body='TACA has encountered an issue that might be worth investigating\n' - body+='The offending entry is: ' - body+= 'run_missing_samplesheet' - body+='\n\nSincerely, TACA' - subject='ERROR, Samplesheet error' 
+ body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " + body += "run_missing_samplesheet" + body += "\n\nSincerely, TACA" + subject = "ERROR, Samplesheet error" mock_datetime.now() mock_datetime.now().hour = 7 - bioinfo_tab.error_emailer('no_samplesheet', 'run_missing_samplesheet') - mock_send_mail.assert_called_with(subject, body, 'some_user@some_email.com') + bioinfo_tab.error_emailer("no_samplesheet", "run_missing_samplesheet") + mock_send_mail.assert_called_with(subject, body, "some_user@some_email.com") - @mock.patch('taca.utils.bioinfo_tab.send_mail') - @mock.patch('taca.utils.bioinfo_tab.datetime.datetime') + @mock.patch("taca.utils.bioinfo_tab.send_mail") + @mock.patch("taca.utils.bioinfo_tab.datetime.datetime") def test_error_mailer_failed_run(self, mock_datetime, mock_send_mail): """Send email if failed_run error.""" - body='TACA has encountered an issue that might be worth investigating\n' - body+='The offending entry is: ' - body+= 'failed_run' - body+='\n\nSincerely, TACA' - subject='WARNING, Reinitialization of partially failed FC' + body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " + body += "failed_run" + body += "\n\nSincerely, TACA" + subject = "WARNING, Reinitialization of partially failed FC" mock_datetime.now() mock_datetime.now().hour = 7 - bioinfo_tab.error_emailer('failed_run', 'failed_run') - mock_send_mail.assert_called_with(subject, body, 'some_user@some_email.com') + bioinfo_tab.error_emailer("failed_run", "failed_run") + mock_send_mail.assert_called_with(subject, body, "some_user@some_email.com") - @mock.patch('taca.utils.bioinfo_tab.send_mail') - @mock.patch('taca.utils.bioinfo_tab.datetime.datetime') + @mock.patch("taca.utils.bioinfo_tab.send_mail") + @mock.patch("taca.utils.bioinfo_tab.datetime.datetime") def test_error_mailer_weird_samplesheet(self, mock_datetime, mock_send_mail): """Send email if weird_samplesheet error.""" - body='TACA has encountered an issue that might be worth investigating\n' - body+='The offending entry is: ' - body+= 'weird_samplesheet_run' - body+='\n\nSincerely, TACA' - subject='ERROR, Incorrectly formatted samplesheet' + body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " + body += "weird_samplesheet_run" + body += "\n\nSincerely, TACA" + subject = "ERROR, Incorrectly formatted samplesheet" mock_datetime.now() mock_datetime.now().hour = 7 - bioinfo_tab.error_emailer('weird_samplesheet', 'weird_samplesheet_run') - mock_send_mail.assert_called_with(subject, body, 'some_user@some_email.com') + bioinfo_tab.error_emailer("weird_samplesheet", "weird_samplesheet_run") + mock_send_mail.assert_called_with(subject, body, "some_user@some_email.com") - @mock.patch('taca.utils.bioinfo_tab.statusdb') + @mock.patch("taca.utils.bioinfo_tab.statusdb") def test_fail_run(self, mock_couch): """Fail run in statusdb.""" - run_id = '190201_A00621_0032_BHHFCFDSXX' - project = 'P0001' + run_id = "190201_A00621_0032_BHHFCFDSXX" + project = "P0001" bioinfo_tab.fail_run(run_id, project) - mock_couch.StatusdbSession.assert_called_with({'url': 'url', - 'username': 'username', - 'password': 'pwd', - 'xten_db': - 'x_flowcells'}) + mock_couch.StatusdbSession.assert_called_with( + { + "url": "url", + "username": "username", + "password": "pwd", + "xten_db": "x_flowcells", + } + ) class TestStatusdb(unittest.TestCase): """Tests for statusdb utils.""" - 
@mock.patch('taca.utils.statusdb.couchdb') + @mock.patch("taca.utils.statusdb.couchdb") def test_get_entry(self, mock_couch): """Get an entry from statusdb.""" - couch_config = {'user': 'username', - 'url': 'some_url', - 'password': 'some_pwd'} - entry = statusdb.ProjectSummaryConnection(couch_config).get_entry('name') + couch_config = {"user": "username", "url": "some_url", "password": "some_pwd"} + entry = statusdb.ProjectSummaryConnection(couch_config).get_entry("name") self.assertEqual(entry, None) def test_merge_dicts(self): """Merge two dicts.""" - d1 = {'a': '1', 'b': '2'} - d2 = {'a': '3', 'c': '4'} + d1 = {"a": "1", "b": "2"} + d2 = {"a": "3", "c": "4"} merged_dict = statusdb.merge_dicts(d1, d2) - expected_dict = {'a': '1', 'b': '2', 'c': '4'} + expected_dict = {"a": "1", "b": "2", "c": "4"} self.assertEqual(merged_dict, expected_dict) From 267fe251f5c5ced4d6ee6171c3ca6ef5caa5f988 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 14:30:59 +0100 Subject: [PATCH 06/44] bump gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 91b6d2ab..c1357210 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ _build __pycache__ .pytest_cache .vscode +.ruff_cache From 8ea4523b1c9789d03410178d75aea93b6b2ffa77 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 14:34:42 +0100 Subject: [PATCH 07/44] prettier --- .devcontainer/devcontainer.json | 6 +- .github/pr_labels.yml | 2 +- .travis.yml | 16 +- CONTRIBUTING.md | 5 +- README.md | 1 - VERSIONLOG.md | 3 +- tests/data/Stats.json | 140 ++++++------- tests/data/lane.html | 175 ++++++++-------- tests/data/laneBarcode.html | 191 +++++++++--------- tests/data/lane_result.html | 174 ++++++++-------- .../report.md | 66 +++--- .../report.md | 66 +++--- tests/data/taca_test_cfg.yaml | 145 +++++++------ tests/data/taca_test_cfg_backup.yaml | 19 +- tests/data/taca_test_cfg_cleanup.yaml | 29 ++- tests/data/taca_test_nanopore_cfg.yaml | 16 +- 16 files changed, 536 insertions(+), 518 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9c0c3268..2ea64cc9 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -11,9 +11,7 @@ "features": {}, "customizations": { "vscode": { - "extensions": [ - "ms-python.python", - ] + "extensions": ["ms-python.python"] } }, // Features to add to the dev container. More info: https://containers.dev/features. 
@@ -28,4 +26,4 @@ "mounts": [ "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached" ] -} \ No newline at end of file +} diff --git a/.github/pr_labels.yml b/.github/pr_labels.yml index d04f24f3..8db6d109 100644 --- a/.github/pr_labels.yml +++ b/.github/pr_labels.yml @@ -1,4 +1,4 @@ -version: '1' +version: "1" invalidStatus: "pending" labelRule: values: diff --git a/.travis.yml b/.travis.yml index b1ae1922..66d4b020 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,19 +1,19 @@ language: python python: - - "2.7" - - "3.8" + - "2.7" + - "3.8" install: - - python setup.py install - - mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml - - pip install codecov + - python setup.py install + - mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml + - pip install codecov script: - - cd tests && nosetests --with-coverage -v -s + - cd tests && nosetests --with-coverage -v -s after_success: - - codecov + - codecov notifications: - email: false + email: false diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c2345165..57b41e7d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,13 +2,14 @@ When contribution to this package please have the following things in mind: -__NOTE__: _Please make sure that there are no exisiting [issues]((https://github.com/SciLifeLab/TACA/issues)) relating to whatever you want to report._ +**NOTE**: _Please make sure that there are no exisiting [issues](<(https://github.com/SciLifeLab/TACA/issues)>) relating to whatever you want to report._ ####To contribute: + 1. Create an issue describing the bug / suggestion / improvement / ... [here](https://github.com/SciLifeLab/TACA/issues). 2. Fork this repository to your GitHub account 3. Make the necessary changes / additions to your forked TACA repository -4. Please *make sure* that you've documented your code and changes using [sphinx](http://sphinx.readthedocs.org/en/latest/tutorial.html) syntax, as the documentation will be automatically generated using this engine, and published to [ReadTheDocs](http://project-management.readthedocs.org/) +4. Please _make sure_ that you've documented your code and changes using [sphinx](http://sphinx.readthedocs.org/en/latest/tutorial.html) syntax, as the documentation will be automatically generated using this engine, and published to [ReadTheDocs](http://project-management.readthedocs.org/) 5. Update the version number in `TACA/__init__.py` 6. Pull Request and wait for the responsible reviewer to review and merge the code diff --git a/README.md b/README.md index 776051d6..ad652ede 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,6 @@ When a non-invasive tool is used to tidy up a lot of code, it is useful to supre To do this, add the hash of the commit containing the changes to `.git-blame-ignore-revs`, headed by an explanatory comment. - ### Deliver command There is also a [plugin for the deliver command](https://github.com/SciLifeLab/taca-ngi-pipeline). To install this in the same development environment: diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 0216daf7..bb274c28 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -21,9 +21,11 @@ Version 1.0.0 Fix bug with rsync permission issue cont. 
## 20231031.1 + Improve run_folder transfer ## 20231026.1 + Fix bug with rsync permission issue ## 20231024.1 @@ -46,7 +48,6 @@ Fix bug that NovaSeqXPlus date format cause error in writing pdc_archived timest Remove the temp change of creating links - ## 20230920.1 Supplement last PR, primary purpose is to differentiate user runs from QC runs in the instrument transfer script rather than the installed TACA. diff --git a/tests/data/Stats.json b/tests/data/Stats.json index 5090f4ac..28340f00 100644 --- a/tests/data/Stats.json +++ b/tests/data/Stats.json @@ -1,75 +1,75 @@ { - "RunNumber":131, - "Flowcell":"FCIDXX", - "RunId":"141124_ST-COMPLEX1_01_AFCIDXX", - "ConversionResults":[ - { - "LaneNumber":1, - "DemuxResults":[ - { - "SampleId":"Sample_P12345_1001", - "SampleName":"P12345_1001", - "NumberReads":494288265, - "Yield":58820303535, - "ReadMetrics":[ - { - "ReadNumber":1, - "Yield":13840071420, - "YieldQ30":13329609381, - "QualityScoreSum":503672520160, - "TrimmedBases":0 - } - ] + "RunNumber": 131, + "Flowcell": "FCIDXX", + "RunId": "141124_ST-COMPLEX1_01_AFCIDXX", + "ConversionResults": [ + { + "LaneNumber": 1, + "DemuxResults": [ + { + "SampleId": "Sample_P12345_1001", + "SampleName": "P12345_1001", + "NumberReads": 494288265, + "Yield": 58820303535, + "ReadMetrics": [ + { + "ReadNumber": 1, + "Yield": 13840071420, + "YieldQ30": 13329609381, + "QualityScoreSum": 503672520160, + "TrimmedBases": 0 + } + ] + } + ], + "Undetermined": { + "NumberReads": 17709745, + "Yield": 2036620675, + "ReadMetrics": [ + { + "ReadNumber": 1, + "Yield": 885487250, + "YieldQ30": 680049984, + "QualityScoreSum": 28815661398, + "TrimmedBases": 0 + }, + { + "ReadNumber": 2, + "Yield": 283355920, + "YieldQ30": 179655904, + "QualityScoreSum": 8324058259, + "TrimmedBases": 0 + } + ] } - ], - "Undetermined":{ - "NumberReads":17709745, - "Yield":2036620675, - "ReadMetrics":[ - { - "ReadNumber":1, - "Yield":885487250, - "YieldQ30":680049984, - "QualityScoreSum":28815661398, - "TrimmedBases":0 - }, - { - "ReadNumber":2, - "Yield":283355920, - "YieldQ30":179655904, - "QualityScoreSum":8324058259, - "TrimmedBases":0 - } + } + ], + "ReadInfosForLanes": [ + { + "LaneNumber": 1, + "ReadInfos": [ + { + "Number": 1, + "NumCycles": 28, + "IsIndexedRead": "false" + } ] - } - } - ], - "ReadInfosForLanes":[ - { - "LaneNumber":1, - "ReadInfos":[ - { - "Number":1, - "NumCycles":28, - "IsIndexedRead":"false" + } + ], + "UnknownBarcodes": [ + { + "Lane": 1, + "Barcodes": { + "GGGGGGGG": 3203920, + "CCCTAACA": 290420 } - ] - } - ], - "UnknownBarcodes":[ - { - "Lane":1, - "Barcodes":{ - "GGGGGGGG":3203920, - "CCCTAACA":290420 - } - }, - { - "Lane":2, - "Barcodes":{ - "GGGGGGGG":3075440, - "CCCTAACA":296260 - } - } - ] + }, + { + "Lane": 2, + "Barcodes": { + "GGGGGGGG": 3075440, + "CCCTAACA": 296260 + } + } + ] } diff --git a/tests/data/lane.html b/tests/data/lane.html index 0079244b..b02fac08 100644 --- a/tests/data/lane.html +++ b/tests/data/lane.html @@ -1,85 +1,96 @@ - - - - - -

H5YKFDSXY / - [all projects] / - [all samples] / - [all barcodes]

show barcodes

-

Flowcell Summary

- - - - - - - - - - - -
Clusters (Raw)Clusters(PF)Yield (MBases)
15,320,088,57612,662,815,7551,506,875
-

Lane Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LanePF Clusters% of the
lane
% Perfect
barcode
% One mismatch
barcode
Yield (Mbases)% PF
Clusters
% >= Q30
bases
Mean Quality
Score
13,239,634,349100.00100.00NaN385,51684.5995.4436.23
23,077,777,014100.00100.00NaN366,25580.3694.9136.13
33,171,906,422100.00100.00NaN377,45782.8294.8036.11
43,173,497,970100.00100.00NaN377,64682.8694.8836.12
-

- + + + + + + + +
+

+ H5YKFDSXY / [all projects] / [all samples] / [all + barcodes] +

+
+

+ show barcodes +

+
+

Flowcell Summary

+ + + + + + + + + + + +
Clusters (Raw)Clusters(PF)Yield (MBases)
15,320,088,57612,662,815,7551,506,875
+

Lane Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LanePF Clusters% of the
lane
% Perfect
barcode
% One mismatch
barcode
Yield (Mbases)% PF
Clusters
% >= Q30
bases
Mean Quality
Score
13,239,634,349100.00100.00NaN385,51684.5995.4436.23
23,077,777,014100.00100.00NaN366,25580.3694.9136.13
33,171,906,422100.00100.00NaN377,45782.8294.8036.11
43,173,497,970100.00100.00NaN377,64682.8694.8836.12
+

+ diff --git a/tests/data/laneBarcode.html b/tests/data/laneBarcode.html index 24d51031..a1f50376 100644 --- a/tests/data/laneBarcode.html +++ b/tests/data/laneBarcode.html @@ -1,96 +1,101 @@ - - - - - -

FCIDXX / - [all projects] / - [all samples] / - [all barcodes]

hide barcodes

-

Flowcell Summary

- - - - - - - - - - - -
Clusters (Raw)Clusters(PF)Yield (MBases)
1,276,674,048959,057,323114,128
-

Lane Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LaneProjectSampleBarcode sequencePF Clusters% of the
lane
% Perfect
barcode
% One mismatch
barcode
Yield (Mbases)% PF
Clusters
% >= Q30
bases
Mean Quality
Score
1N__One_20_01P12345_1001unknown494,288,265100.00100.00NaN58,82077.4394.3636.03
2N__One_20_01P12345_1001unknown464,769,058100.00100.00NaN55,30872.8193.2635.83
-

Top Unknown Barcodes

- - - - - - - - - - - - - - - - - - -
- Lane - CountSequence - Lane - CountSequence
1494,288,120unknown2464,768,960unknown
-

- + + + + + + + +
+

+ FCIDXX / [all projects] / [all samples] / [all barcodes] +

+
+

+ hide barcodes +

+
+

Flowcell Summary

+ + + + + + + + + + + +
Clusters (Raw)Clusters(PF)Yield (MBases)
1,276,674,048959,057,323114,128
+

Lane Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LaneProjectSampleBarcode sequencePF Clusters% of the
lane
% Perfect
barcode
% One mismatch
barcode
Yield (Mbases)% PF
Clusters
% >= Q30
bases
Mean Quality
Score
1N__One_20_01P12345_1001unknown494,288,265100.00100.00NaN58,82077.4394.3636.03
2N__One_20_01P12345_1001unknown464,769,058100.00100.00NaN55,30872.8193.2635.83
+

Top Unknown Barcodes

+ + + + + + + + + + + + + + + + + + +
LaneCountSequenceLaneCountSequence
1494,288,120unknown2464,768,960unknown
+

+ diff --git a/tests/data/lane_result.html b/tests/data/lane_result.html index e94bfef0..41ace736 100644 --- a/tests/data/lane_result.html +++ b/tests/data/lane_result.html @@ -1,85 +1,95 @@ - - - - - -

C6L1WANXX / - [all projects] / - [all samples] / - [all barcodes]

show barcodes

-

Flowcell Summary

- - - - - - - - - - - -
Clusters (Raw)Clusters(PF)Yield (MBases)
15,320,088,57612,662,815,7551,506,875
-

Lane Summary

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
% >= Q30bases% One mismatchbarcode% PFClusters% Perfectbarcode% of thelaneLaneMean QualityScorePF ClustersYield (Mbases)
95.44NaN84.59100.00100.00136.233,239,634,349385,516
94.91NaN80.36100.00100.00236.133,077,777,014366,255
94.80NaN82.82100.00100.00336.113,171,906,422377,457
94.88NaN82.86100.00100.00436.123,173,497,970377,646
-

- + + + + + + + +
+

+ C6L1WANXX / [all projects] / [all samples] / [all + barcodes] +

+
+

+ show barcodes +

+
+

Flowcell Summary

+ + + + + + + + + + + +
Clusters (Raw)Clusters(PF)Yield (MBases)
15,320,088,57612,662,815,7551,506,875
+

Lane Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
% >= Q30bases% One mismatchbarcode% PFClusters% Perfectbarcode% of thelaneLaneMean QualityScorePF ClustersYield (Mbases)
95.44NaN84.59100.00100.00136.233,239,634,349385,516
94.91NaN80.36100.00100.00236.133,077,777,014366,255
94.80NaN82.82100.00100.00336.113,171,906,422377,457
94.88NaN82.86100.00100.00436.123,173,497,970377,646
+

+ diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md index bfad9e66..4bcfcdb5 100644 --- a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md +++ b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md @@ -1,41 +1,39 @@ -Tracking ID -=========== +# Tracking ID { - "asic_id": "755228278", - "asic_id_eeprom": "2866631", - "asic_temp": "33.002907", - "asic_version": "IA02D", - "auto_update": "0", - "auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", - "bream_is_standard": "0", - "device_id": "MN19414", - "device_type": "minion", - "distribution_status": "stable", - "distribution_version": "19.10.1", - "exp_script_name": "N/A", - "exp_script_purpose": "sequencing_run", - "exp_start_time": "2020-08-03T13:05:12Z", - "flongle_adapter_id": "FA-00577", - "flow_cell_id": "ACG995", - "flow_cell_product_code": "FLO-FLG001", - "guppy_version": "3.2.6+afc8e14", - "heatsink_temp": "34.574219", - "hostname": "ngi-squiggle", - "installation_type": "nc", - "local_firmware_file": "1", - "operating_system": "ubuntu 16.04", - "protocol_group_id": "blah", - "protocol_run_id": "", - "protocols_version": "4.2.11", - "run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", - "sample_id": "blahblah", - "usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", - "version": "3.5.5" +"asic_id": "755228278", +"asic_id_eeprom": "2866631", +"asic_temp": "33.002907", +"asic_version": "IA02D", +"auto_update": "0", +"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", +"bream_is_standard": "0", +"device_id": "MN19414", +"device_type": "minion", +"distribution_status": "stable", +"distribution_version": "19.10.1", +"exp_script_name": "N/A", +"exp_script_purpose": "sequencing_run", +"exp_start_time": "2020-08-03T13:05:12Z", +"flongle_adapter_id": "FA-00577", +"flow_cell_id": "ACG995", +"flow_cell_product_code": "FLO-FLG001", +"guppy_version": "3.2.6+afc8e14", +"heatsink_temp": "34.574219", +"hostname": "ngi-squiggle", +"installation_type": "nc", +"local_firmware_file": "1", +"operating_system": "ubuntu 16.04", +"protocol_group_id": "blah", +"protocol_run_id": "", +"protocols_version": "4.2.11", +"run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", +"sample_id": "blahblah", +"usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", +"version": "3.5.5" } -Duty Time -========= +# Duty Time ID: 8b4hdksolsjdfj020kpojrn3o239834akslash23409j39ruhqw39u diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md index bfad9e66..4bcfcdb5 100644 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md +++ b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md @@ -1,41 +1,39 @@ -Tracking ID -=========== +# Tracking ID { - "asic_id": "755228278", - "asic_id_eeprom": "2866631", - "asic_temp": "33.002907", - "asic_version": "IA02D", - "auto_update": "0", - "auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", - "bream_is_standard": "0", - "device_id": "MN19414", - "device_type": "minion", - "distribution_status": "stable", - "distribution_version": "19.10.1", - "exp_script_name": "N/A", - 
"exp_script_purpose": "sequencing_run", - "exp_start_time": "2020-08-03T13:05:12Z", - "flongle_adapter_id": "FA-00577", - "flow_cell_id": "ACG995", - "flow_cell_product_code": "FLO-FLG001", - "guppy_version": "3.2.6+afc8e14", - "heatsink_temp": "34.574219", - "hostname": "ngi-squiggle", - "installation_type": "nc", - "local_firmware_file": "1", - "operating_system": "ubuntu 16.04", - "protocol_group_id": "blah", - "protocol_run_id": "", - "protocols_version": "4.2.11", - "run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", - "sample_id": "blahblah", - "usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", - "version": "3.5.5" +"asic_id": "755228278", +"asic_id_eeprom": "2866631", +"asic_temp": "33.002907", +"asic_version": "IA02D", +"auto_update": "0", +"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", +"bream_is_standard": "0", +"device_id": "MN19414", +"device_type": "minion", +"distribution_status": "stable", +"distribution_version": "19.10.1", +"exp_script_name": "N/A", +"exp_script_purpose": "sequencing_run", +"exp_start_time": "2020-08-03T13:05:12Z", +"flongle_adapter_id": "FA-00577", +"flow_cell_id": "ACG995", +"flow_cell_product_code": "FLO-FLG001", +"guppy_version": "3.2.6+afc8e14", +"heatsink_temp": "34.574219", +"hostname": "ngi-squiggle", +"installation_type": "nc", +"local_firmware_file": "1", +"operating_system": "ubuntu 16.04", +"protocol_group_id": "blah", +"protocol_run_id": "", +"protocols_version": "4.2.11", +"run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", +"sample_id": "blahblah", +"usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", +"version": "3.5.5" } -Duty Time -========= +# Duty Time ID: 8b4hdksolsjdfj020kpojrn3o239834akslash23409j39ruhqw39u diff --git a/tests/data/taca_test_cfg.yaml b/tests/data/taca_test_cfg.yaml index 34437461..6bb3e657 100644 --- a/tests/data/taca_test_cfg.yaml +++ b/tests/data/taca_test_cfg.yaml @@ -14,77 +14,77 @@ statusdb: xten_db: x_flowcells analysis: - status_dir: data/ - data_dir: data/test_data/ - deliver_runfolder: - analysis_server: - host: b5.biotech.kth.se - port: - user: sara.sjunnebo - destination: test - mfs_path: - miseq: data/ - hiseqx: data/ - novaseq: data/ - NovaSeq: - bcl2fastq: - bin: path_to_bcl_to_fastq - tenX_index_path: "data/test_10X_indexes" - smartseq_index_path: "data/test_smartseq_indexes" - options: - common: - - output-dir: Demultiplexing - - opt: b - - c - SMARTSEQ: - - d - 10X_SINGLE: - - a - 10X_DUAL: - - e - samplesheets_dir: "data" - analysis_server: - host: - port: - user: - sync: - data_archive: - include: - - "*.file" - MiSeq: - bcl2fastq: - bin: path_to_bcl_to_fastq - options: - common: - - output-dir: Demultiplexing - samplesheets_dir: "data" - analysis_server: - host: - port: - user: - sync: - data_archive: - include: - - "*.file" - NextSeq: - samplesheets_dir: "data" - bcl2fastq: - bin: path_to_bcl_to_fastq - tenX_index_path: "data/test_10X_indexes" - smartseq_index_path: "data/test_smartseq_indexes" - options: - common: - - output-dir: Demultiplexing - analysis_server: - host: - port: - user: - sync: - data_archive: - include: - - "*.file" - DummySeq: - samplesheets_dir: "data" + status_dir: data/ + data_dir: data/test_data/ + deliver_runfolder: + analysis_server: + host: b5.biotech.kth.se + port: + user: sara.sjunnebo + destination: test + mfs_path: + miseq: data/ + hiseqx: data/ + novaseq: data/ + NovaSeq: + bcl2fastq: + bin: path_to_bcl_to_fastq + tenX_index_path: "data/test_10X_indexes" + smartseq_index_path: 
"data/test_smartseq_indexes" + options: + common: + - output-dir: Demultiplexing + - opt: b + - c + SMARTSEQ: + - d + 10X_SINGLE: + - a + 10X_DUAL: + - e + samplesheets_dir: "data" + analysis_server: + host: + port: + user: + sync: + data_archive: + include: + - "*.file" + MiSeq: + bcl2fastq: + bin: path_to_bcl_to_fastq + options: + common: + - output-dir: Demultiplexing + samplesheets_dir: "data" + analysis_server: + host: + port: + user: + sync: + data_archive: + include: + - "*.file" + NextSeq: + samplesheets_dir: "data" + bcl2fastq: + bin: path_to_bcl_to_fastq + tenX_index_path: "data/test_10X_indexes" + smartseq_index_path: "data/test_smartseq_indexes" + options: + common: + - output-dir: Demultiplexing + analysis_server: + host: + port: + user: + sync: + data_archive: + include: + - "*.file" + DummySeq: + samplesheets_dir: "data" bioinfo_tab: data_dirs: @@ -92,5 +92,4 @@ bioinfo_tab: xten_samplesheets: "data" mail: - recipients: - some_user@some_email.com + recipients: some_user@some_email.com diff --git a/tests/data/taca_test_cfg_backup.yaml b/tests/data/taca_test_cfg_backup.yaml index e9c0662e..f66f5528 100644 --- a/tests/data/taca_test_cfg_backup.yaml +++ b/tests/data/taca_test_cfg_backup.yaml @@ -1,12 +1,12 @@ backup: - archive_dirs: - miseq: data/nas/miseq.lab/nosync - hiseq: blah - data_dirs: - miseq: data/nas/miseq.lab - keys_path: data/nas/run_keys - gpg_receiver: some.user - check_demux: True + archive_dirs: + miseq: data/nas/miseq.lab/nosync + hiseq: blah + data_dirs: + miseq: data/nas/miseq.lab + keys_path: data/nas/run_keys + gpg_receiver: some.user + check_demux: True statusdb: url: url @@ -15,5 +15,4 @@ statusdb: xten_db: x_flowcells mail: - recipients: - some_user@some_email.com + recipients: some_user@some_email.com diff --git a/tests/data/taca_test_cfg_cleanup.yaml b/tests/data/taca_test_cfg_cleanup.yaml index 2a7d9b92..7610a91d 100644 --- a/tests/data/taca_test_cfg_cleanup.yaml +++ b/tests/data/taca_test_cfg_cleanup.yaml @@ -14,20 +14,19 @@ storage: HiSeq: data/test_data/nosync cleanup: - miarka: - flowcell: - root: - - "data/miarka/incoming" - - "data/miarka/archive" - relative_project_source: Demultiplexing - undet_file_pattern: "*Undetermined_*.fastq.gz" - data_dir: "data/miarka/nobackup/NGI/DATA" - analysis: - root: "../../nobackup/NGI/ANALYSIS" - files_to_remove: - piper_ngi: - - "*.bam" + miarka: + flowcell: + root: + - "data/miarka/incoming" + - "data/miarka/archive" + relative_project_source: Demultiplexing + undet_file_pattern: "*Undetermined_*.fastq.gz" + data_dir: "data/miarka/nobackup/NGI/DATA" + analysis: + root: "../../nobackup/NGI/ANALYSIS" + files_to_remove: + piper_ngi: + - "*.bam" mail: - recipients: - some_user@some_email.com + recipients: some_user@some_email.com diff --git a/tests/data/taca_test_nanopore_cfg.yaml b/tests/data/taca_test_nanopore_cfg.yaml index df34c652..c5c32cc4 100644 --- a/tests/data/taca_test_nanopore_cfg.yaml +++ b/tests/data/taca_test_nanopore_cfg.yaml @@ -1,13 +1,13 @@ log: - file: "data/taca.log" + file: "data/taca.log" nanopore_analysis: minion_qc_run: nanoseq_version: 2.0.1 data_dir: data/nanopore_data/ ignore_dirs: - - 'nosync' - - '.nextflow' + - "nosync" + - ".nextflow" samplesheets_dir: data/nanopore_samplesheets lims_results_dir: some/dir transfer: @@ -18,11 +18,11 @@ nanopore_analysis: user: some_user destination: some_dir rsync_options: - '-LtDrv': None - '--chown': ':ngi2016003' - '--chmod' : 'Dg+s,g+rw' - '-r' : None - '--exclude' : 'work' + "-LtDrv": None + "--chown": ":ngi2016003" + "--chmod": "Dg+s,g+rw" + 
"-r": None + "--exclude": "work" finished_dir: data/nanopore_data/nosync mail: From fa9749ab0321bada03a7d5467ed3d48d6d7a4021 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 15:39:32 +0100 Subject: [PATCH 08/44] limit scope of editorconfig --- .github/workflows/lint-code.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index 73fb6f97..9f42f631 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -127,4 +127,4 @@ jobs: run: npm install -g editorconfig-checker - name: editorconfig --> Lint files - run: editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html') + run: editorconfig-checker $(git ls-files | grep '.txt') From 703423cc3412b496b4456fb1ec9ca4616ea68211 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 15:44:46 +0100 Subject: [PATCH 09/44] add CouchDB to reqs, fix docs --- .github/workflows/lint-code.yml | 2 +- requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index 9f42f631..a63bc5f4 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -111,7 +111,7 @@ jobs: - name: Run Prettier --check run: prettier --check . - # Use editorconfig to check all remaining file formats + # Use editorconfig to check other specified file formats editorconfig: runs-on: ubuntu-latest steps: diff --git a/requirements.txt b/requirements.txt index b2bc63b1..51592db7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ click +CouchDB requests pyyaml flowcell_parser @ git+https://github.com/SciLifeLab/flowcell_parser From 2ba0179015e380b3b7e0ce8b9b99666533a7443f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 15:45:53 +0100 Subject: [PATCH 10/44] ruff fix --- taca/analysis/analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py index c615b3f9..9773e308 100755 --- a/taca/analysis/analysis.py +++ b/taca/analysis/analysis.py @@ -84,9 +84,9 @@ def get_runObj(run, software): return NovaSeq_Run(run, software, CONFIG["analysis"]["NovaSeq"]) else: logger.warn( - "Unrecognized run type {}, cannot archive the run {}. " + f"Unrecognized run type {runtype}, cannot archive the run {run}. " "Someone as likely bought a new sequencer without telling " - "it to the bioinfo team".format(runtype, run) + "it to the bioinfo team" ) return None From ec8e9fce32cee5e6f829094c4fb1c9055e08d0eb Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 16:15:19 +0100 Subject: [PATCH 11/44] MANUAL fixes --- taca/analysis/analysis.py | 2 +- taca/cleanup/cleanup.py | 2 +- taca/illumina/NextSeq_Runs.py | 2 +- taca/illumina/Runs.py | 30 +++++++++++++------------- taca/testing/create_uppmax_like_env.py | 1 + taca/utils/misc.py | 4 ++-- taca/utils/statusdb.py | 2 +- taca/utils/transfer.py | 2 +- tests/test_illumina.py | 2 +- 9 files changed, 24 insertions(+), 23 deletions(-) diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py index 9773e308..c19f0582 100755 --- a/taca/analysis/analysis.py +++ b/taca/analysis/analysis.py @@ -160,7 +160,7 @@ def _upload_to_statusdb(run): statusdb.update_doc(db, parser.obj, over_write_db_entry=True) -def transfer_run(run_dir): +def transfer_run(run_dir, software): """Interface for click to force a transfer a run to uppmax. 
:param: string run_dir: the run to tranfer diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py index 3a58d871..df1e80ab 100644 --- a/taca/cleanup/cleanup.py +++ b/taca/cleanup/cleanup.py @@ -580,7 +580,7 @@ def _def_get_size_unit(s): elif s > kb: s = f"~{int(s/kb)}kb" elif s > 0: - s = f"~{int(s/b)}b" + s = f"~{int(s)}b" return str(s) diff --git a/taca/illumina/NextSeq_Runs.py b/taca/illumina/NextSeq_Runs.py index 6dc8cee1..5785542c 100755 --- a/taca/illumina/NextSeq_Runs.py +++ b/taca/illumina/NextSeq_Runs.py @@ -3,7 +3,7 @@ class NextSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(Standard_Runs, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() # NextSeq2000 has a different FC ID pattern that ID contains the first letter for position diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index 5fbac30a..2d582be9 100644 --- a/taca/illumina/Runs.py +++ b/taca/illumina/Runs.py @@ -180,7 +180,7 @@ def _check_demux_log(self, demux_id, demux_log): errors += 1 error_and_warning_messages.append(line) elif "WARNING" in line: - warnnings += 1 + warnings += 1 error_and_warning_messages.append(line) return errors, warnings, error_and_warning_messages else: @@ -725,14 +725,14 @@ def _fix_html_reports_for_complex_lanes( # Now all lanes have been inserted # NumberReads for total lane cluster/yields and total sample cluster/yields - NumberReads_Summary = dict() + self.NumberReads_Summary = dict() # The numbers in Flowcell Summary also need to be aggregated if multiple demultiplexing is done Clusters_Raw = 0 Clusters_PF = 0 Yield_Mbases = 0 for entry in html_report_lane_parser.sample_data: # Update NumberReads for total lane clusters - NumberReads_Summary[entry["Lane"]] = { + self.NumberReads_Summary[entry["Lane"]] = { "total_lane_cluster": int(entry["PF Clusters"].replace(",", "")), "total_lane_yield": int(entry["Yield (Mbases)"].replace(",", "")), } @@ -790,27 +790,27 @@ def _fix_html_reports_for_complex_lanes( # Update NumberReads for total sample yields for entry in html_report_laneBarcode_parser.sample_data: - if "total_sample_cluster" not in NumberReads_Summary[entry["Lane"]].keys(): - NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] = 0 - NumberReads_Summary[entry["Lane"]]["total_sample_yield"] = 0 + if "total_sample_cluster" not in self.NumberReads_Summary[entry["Lane"]].keys(): + self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] = 0 + self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] = 0 if entry["Project"] != "default": - NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( + self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( entry["PF Clusters"].replace(",", "") ) - NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( + self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( entry["Yield (Mbases)"].replace(",", "") ) else: if entry["Project"] != "default": - NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( + self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( entry["PF Clusters"].replace(",", "") ) - NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( + self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( entry["Yield (Mbases)"].replace(",", "") ) # Calculate the numbers clusters/yields of undet reads - for key, value in NumberReads_Summary.items(): + for key, value in 
self.NumberReads_Summary.items(): value["undet_cluster"] = ( value["total_lane_cluster"] - value["total_sample_cluster"] ) @@ -822,10 +822,10 @@ def _fix_html_reports_for_complex_lanes( for entry in html_report_laneBarcode_parser.sample_data: if entry["Project"] == "default" and entry["Lane"] in complex_lanes.keys(): entry["PF Clusters"] = "{:,}".format( - NumberReads_Summary[entry["Lane"]]["undet_cluster"] + self.NumberReads_Summary[entry["Lane"]]["undet_cluster"] ) entry["Yield (Mbases)"] = "{:,}".format( - NumberReads_Summary[entry["Lane"]]["undet_yield"] + self.NumberReads_Summary[entry["Lane"]]["undet_yield"] ) # Fix special case that when we assign fake indexes for NoIndex samples @@ -923,11 +923,11 @@ def _fix_demultiplexingstats_xml_dir( # For complex lanes, we set all stats to 0, except for read number and yield which will use values from NumberReads_Summary ConversionResults_lane["Undetermined"][ "NumberReads" - ] = NumberReads_Summary[ + ] = self.NumberReads_Summary[ str(ConversionResults_lane["LaneNumber"]) ]["undet_cluster"] ConversionResults_lane["Undetermined"]["Yield"] = ( - NumberReads_Summary[ + self.NumberReads_Summary[ str(ConversionResults_lane["LaneNumber"]) ]["undet_yield"] * 1000000 diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py index 831646b6..d3c4a615 100644 --- a/taca/testing/create_uppmax_like_env.py +++ b/taca/testing/create_uppmax_like_env.py @@ -5,6 +5,7 @@ import os import random import subprocess +import sys from dateutil.relativedelta import relativedelta diff --git a/taca/utils/misc.py b/taca/utils/misc.py index ec05d4ff..a180bcfd 100755 --- a/taca/utils/misc.py +++ b/taca/utils/misc.py @@ -38,7 +38,7 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=""): :param string prefix: the prefix to add to log file :param string log_dir: where to write the log file (to avoid problems with rights) """ - if type(cl) == str: + if isinstance(type(cl), str): cl = cl.split(" ") logFile = os.path.basename(cl[0]) stdout = sys.stdout @@ -73,7 +73,7 @@ def call_external_command_detached(cl, with_log_files=False, prefix=None): :param string cl: Command line to be executed (command + options and parameters) :param bool with_log_files: Create log files for stdout and stderr """ - if type(cl) == str: + if isinstance(type(cl), str): cl = cl.split(" ") command = os.path.basename(cl[0]) stdout = sys.stdout diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 548b87b0..939e0606 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -193,7 +193,7 @@ def merge_dicts(d1, d2): for key in d2: if key in d1: if isinstance(d1[key], dict) and isinstance(d2[key], dict): - merge(d1[key], d2[key]) + merge_dicts(d1[key], d2[key]) elif d1[key] == d2[key]: pass # same leaf value else: diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py index b04eace5..693f0725 100644 --- a/taca/utils/transfer.py +++ b/taca/utils/transfer.py @@ -56,7 +56,7 @@ def format_options(self): if val is None: cmdopts.append(param) else: - if type(val) == str: + if isinstance(type(val), str): val = [val] for v in val: cmdopts.append(f"{param}={v}") diff --git a/tests/test_illumina.py b/tests/test_illumina.py index d052bb89..c8c59a33 100644 --- a/tests/test_illumina.py +++ b/tests/test_illumina.py @@ -12,10 +12,10 @@ from flowcell_parser.classes import LaneBarcodeParser -from taca.analysis.analysis import * from taca.illumina.NextSeq_Runs import NextSeq_Run from taca.illumina.NovaSeq_Runs import NovaSeq_Run from 
taca.illumina.Runs import Run, _create_folder_structure, _generate_lane_html +from taca.illumina.Standard_Runs import Standard_Run from taca.utils import config as conf if sys.version_info[0] >= 3: From 75dcdf40b7fb6220b03deee26ab4878f9a73cee9 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 16:15:33 +0100 Subject: [PATCH 12/44] remove outdated nanopore tests --- .../squigglefile.fast5 | 0 .../still_running_and_should_not_be_moved | 0 .../final_summary.txt | 17 ---- .../finished_sequencing_and_ready_to_analyse | 0 .../report.md | 42 --------- .../squigglefile.fast5 | 0 .../SQK-LSK109_AAU643_sample_sheet.csv | 2 - .../final_summary.txt | 0 .../nanoseq_output/some_intermediary_results | 0 .../squigglefile.fast5 | 0 .../.exitcode_for_anglerfish | 1 - .../.exitcode_for_nanoseq | 1 - .../SQK-LSK109_sample_sheet.csv | 3 - .../anglerfish_stats.txt | 0 .../anglerfish_2020_09_23_141923/.file | 0 .../anglerfish_sample_sheet.csv | 2 - .../expected_sample_sheet.csv | 3 - .../final_summary.txt | 17 ---- .../nanoseq_output/final_results | 0 .../report.md | 42 --------- .../squigglefile.fast5 | 0 .../final_summary.txt | 0 .../squigglefile.fast5 | 0 .../.exitcode_for_nanoseq | 1 - .../SQK-LSK109_sample_sheet.csv | 2 - .../final_summary.txt | 0 .../nanoseq_output/failed_output | 0 .../squigglefile.fast5 | 0 tests/data/nanopore_data/transfer.tsv | 2 - ...QK-LSK109_AAU644_Samplesheet_24-594126.csv | 2 - ...QK-LSK109_AAU642_Samplesheet_22-594126.csv | 2 - ...QK-LSK109_AAU645_Samplesheet_24-594126.csv | 74 ---------------- .../expected/SQK-LSK109_sample_sheet.csv | 3 - .../expected/anglerfish_sample_sheet.csv | 2 - tests/test_analysis_nanopore.py | 86 ------------------- 35 files changed, 304 deletions(-) delete mode 100644 tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/squigglefile.fast5 delete mode 100644 tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/still_running_and_should_not_be_moved delete mode 100644 tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/final_summary.txt delete mode 100644 tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/finished_sequencing_and_ready_to_analyse delete mode 100644 tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md delete mode 100644 tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/squigglefile.fast5 delete mode 100644 tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv delete mode 100644 tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/final_summary.txt delete mode 100644 tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/nanoseq_output/some_intermediary_results delete mode 100644 tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/squigglefile.fast5 delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_anglerfish delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv delete mode 100644 
tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141922/anglerfish_stats.txt delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141923/.file delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_sample_sheet.csv delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/expected_sample_sheet.csv delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/final_summary.txt delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/final_results delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md delete mode 100644 tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/squigglefile.fast5 delete mode 100644 tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/final_summary.txt delete mode 100644 tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/squigglefile.fast5 delete mode 100644 tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq delete mode 100644 tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv delete mode 100644 tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/final_summary.txt delete mode 100644 tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/nanoseq_output/failed_output delete mode 100644 tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/squigglefile.fast5 delete mode 100644 tests/data/nanopore_data/transfer.tsv delete mode 100644 tests/data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv delete mode 100644 tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv delete mode 100644 tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU645_Samplesheet_24-594126.csv delete mode 100644 tests/data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv delete mode 100644 tests/data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv delete mode 100644 tests/test_analysis_nanopore.py diff --git a/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/still_running_and_should_not_be_moved b/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/still_running_and_should_not_be_moved deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/final_summary.txt b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/final_summary.txt deleted file mode 100644 index 56e06611..00000000 --- a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/final_summary.txt +++ /dev/null @@ -1,17 +0,0 @@ -instrument=MN19414 -position= 
-flow_cell_id=AAU644 -sample_id=blah -protocol_group_id=blah -protocol=4a127386521a00415c821269a588a9271276dfd0 -protocol_run_id=5db4c5bc-34c9-452b-9d10-70e013228328 -acquisition_run_id=8b4541217a81f608d0562c0a0847b703ba77d13d -started=2020-08-03T15:05:12.504585+02:00 -acquisition_stopped=2020-08-04T09:05:16.104836+02:00 -processing_stopped=2020-08-04T09:05:17.311324+02:00 -basecalling_enabled=0 -sequencing_summary_file= -fast5_files_in_final_dest=42 -fast5_files_in_fallback=0 -fastq_files_in_final_dest=0 -fastq_files_in_fallback=0 diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/finished_sequencing_and_ready_to_analyse b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/finished_sequencing_and_ready_to_analyse deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md deleted file mode 100644 index 4bcfcdb5..00000000 --- a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md +++ /dev/null @@ -1,42 +0,0 @@ -# Tracking ID - -{ -"asic_id": "755228278", -"asic_id_eeprom": "2866631", -"asic_temp": "33.002907", -"asic_version": "IA02D", -"auto_update": "0", -"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", -"bream_is_standard": "0", -"device_id": "MN19414", -"device_type": "minion", -"distribution_status": "stable", -"distribution_version": "19.10.1", -"exp_script_name": "N/A", -"exp_script_purpose": "sequencing_run", -"exp_start_time": "2020-08-03T13:05:12Z", -"flongle_adapter_id": "FA-00577", -"flow_cell_id": "ACG995", -"flow_cell_product_code": "FLO-FLG001", -"guppy_version": "3.2.6+afc8e14", -"heatsink_temp": "34.574219", -"hostname": "ngi-squiggle", -"installation_type": "nc", -"local_firmware_file": "1", -"operating_system": "ubuntu 16.04", -"protocol_group_id": "blah", -"protocol_run_id": "", -"protocols_version": "4.2.11", -"run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", -"sample_id": "blahblah", -"usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", -"version": "3.5.5" -} - -# Duty Time - -ID: 8b4hdksolsjdfj020kpojrn3o239834akslash23409j39ruhqw39u - -Channel State,Experiment Time (minutes),State Time (samples), -strand,0,477917 -strand,1,1370562 diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv deleted file mode 100644 index bc77384f..00000000 --- a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -sample,fastq,barcode,genome,transcriptome -P15608_1025,,0,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/final_summary.txt b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/final_summary.txt deleted file mode 100644 index e69de29b..00000000 diff --git 
a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/nanoseq_output/some_intermediary_results b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/nanoseq_output/some_intermediary_results deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_anglerfish b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_anglerfish deleted file mode 100644 index 573541ac..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_anglerfish +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq deleted file mode 100644 index 573541ac..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv deleted file mode 100644 index 9842c8d4..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -group,replicate,barcode,input_file,genome,transcriptome -P15608_1005,1,3,,, -P15608_1025,1,8,,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141922/anglerfish_stats.txt b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141922/anglerfish_stats.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141923/.file b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141923/.file deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_sample_sheet.csv b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_sample_sheet.csv deleted file mode 100644 index 069eff1f..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,truseq,ACAGTG,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode03.fastq.gz -P15608_1025,truseq,ACTGAT,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode08.fastq.gz diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/expected_sample_sheet.csv 
b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/expected_sample_sheet.csv deleted file mode 100644 index d5b94506..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/expected_sample_sheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq,barcode,genome,transcriptome -P15608_1005,,3,, -P15608_1025,,0,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/final_summary.txt b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/final_summary.txt deleted file mode 100644 index 56e06611..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/final_summary.txt +++ /dev/null @@ -1,17 +0,0 @@ -instrument=MN19414 -position= -flow_cell_id=AAU644 -sample_id=blah -protocol_group_id=blah -protocol=4a127386521a00415c821269a588a9271276dfd0 -protocol_run_id=5db4c5bc-34c9-452b-9d10-70e013228328 -acquisition_run_id=8b4541217a81f608d0562c0a0847b703ba77d13d -started=2020-08-03T15:05:12.504585+02:00 -acquisition_stopped=2020-08-04T09:05:16.104836+02:00 -processing_stopped=2020-08-04T09:05:17.311324+02:00 -basecalling_enabled=0 -sequencing_summary_file= -fast5_files_in_final_dest=42 -fast5_files_in_fallback=0 -fastq_files_in_final_dest=0 -fastq_files_in_fallback=0 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/final_results b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/final_results deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md deleted file mode 100644 index 4bcfcdb5..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md +++ /dev/null @@ -1,42 +0,0 @@ -# Tracking ID - -{ -"asic_id": "755228278", -"asic_id_eeprom": "2866631", -"asic_temp": "33.002907", -"asic_version": "IA02D", -"auto_update": "0", -"auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", -"bream_is_standard": "0", -"device_id": "MN19414", -"device_type": "minion", -"distribution_status": "stable", -"distribution_version": "19.10.1", -"exp_script_name": "N/A", -"exp_script_purpose": "sequencing_run", -"exp_start_time": "2020-08-03T13:05:12Z", -"flongle_adapter_id": "FA-00577", -"flow_cell_id": "ACG995", -"flow_cell_product_code": "FLO-FLG001", -"guppy_version": "3.2.6+afc8e14", -"heatsink_temp": "34.574219", -"hostname": "ngi-squiggle", -"installation_type": "nc", -"local_firmware_file": "1", -"operating_system": "ubuntu 16.04", -"protocol_group_id": "blah", -"protocol_run_id": "", -"protocols_version": "4.2.11", -"run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", -"sample_id": "blahblah", -"usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", -"version": "3.5.5" -} - -# Duty Time - -ID: 8b4hdksolsjdfj020kpojrn3o239834akslash23409j39ruhqw39u - -Channel State,Experiment Time (minutes),State Time (samples), -strand,0,477917 -strand,1,1370562 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git 
a/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/final_summary.txt b/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/final_summary.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq deleted file mode 100644 index d00491fd..00000000 --- a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq +++ /dev/null @@ -1 +0,0 @@ -1 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv deleted file mode 100644 index fd0b7b5b..00000000 --- a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -sample,fastq,barcode,genome,transcriptome -P15608_1005,,13,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/final_summary.txt b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/final_summary.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/nanoseq_output/failed_output b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/nanoseq_output/failed_output deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/transfer.tsv b/tests/data/nanopore_data/transfer.tsv deleted file mode 100644 index 429a49f1..00000000 --- a/tests/data/nanopore_data/transfer.tsv +++ /dev/null @@ -1,2 +0,0 @@ -20200105_1412_MN19414_AAU645_68125dc2 20200105 -20200106_1412_MN19414_AAU646_68125dc2 20200106 diff --git a/tests/data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv b/tests/data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv deleted file mode 100644 index 3bbf5719..00000000 --- a/tests/data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,CCTGGTAACTGGGACACAAGACTC,truseq,ACAGTG -P15608_1025,ACGTAACTTGGTTTGTTCCCTGAA,truseq,ACTGAT \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv b/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv deleted file mode 100644 index 6c6e8ad0..00000000 --- a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,CCTGGTAACTGGGACACAAGACTC,truseq,ACAGTG 
-P15608_1025,CCTGGTAACTGGGACACAAGACTC,truseq,ACTGAT \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU645_Samplesheet_24-594126.csv b/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU645_Samplesheet_24-594126.csv deleted file mode 100644 index 14cb73ae..00000000 --- a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU645_Samplesheet_24-594126.csv +++ /dev/null @@ -1,74 +0,0 @@ -P15051_103_3,ACAGACGACTACAAACGGAATCGA,truseq,AGCTGGAT -P15051_102_1,ACAGACGACTACAAACGGAATCGA,truseq,ATAGTTAC -P15051_101_4,ACAGACGACTACAAACGGAATCGA,truseq,ATTGGCGT -P15051_103_1,ACAGACGACTACAAACGGAATCGA,truseq,CATGAACA -P15051_102_3,ACAGACGACTACAAACGGAATCGA,truseq,CCTACGTA -P15051_101_3,ACAGACGACTACAAACGGAATCGA,truseq,CGCCATCG -P15051_102_4,ACAGACGACTACAAACGGAATCGA,truseq,GAGCACCG -P15051_101_1,ACAGACGACTACAAACGGAATCGA,truseq,GCAACAAA -P15051_103_4,ACAGACGACTACAAACGGAATCGA,truseq,GTGACTTG -P15051_101_2,ACAGACGACTACAAACGGAATCGA,truseq,TAGTTGTC -P15051_103_2,ACAGACGACTACAAACGGAATCGA,truseq,TCACTCGC -P15051_102_2,ACAGACGACTACAAACGGAATCGA,truseq,TGCTGAGT -P15608_1005,CCTGGTAACTGGGACACAAGACTC,truseq,ACAGTG -P15608_1025,CCTGGTAACTGGGACACAAGACTC,truseq,ACTGAT -P15608_1008,CCTGGTAACTGGGACACAAGACTC,truseq,ACTTGA -P15608_1013,CCTGGTAACTGGGACACAAGACTC,truseq,AGTCAA -P15608_1014,CCTGGTAACTGGGACACAAGACTC,truseq,AGTTCC -P15608_1001,CCTGGTAACTGGGACACAAGACTC,truseq,ATCACG -P15608_1026,CCTGGTAACTGGGACACAAGACTC,truseq,ATGAGC -P15608_1015,CCTGGTAACTGGGACACAAGACTC,truseq,ATGTCA -P15608_1027,CCTGGTAACTGGGACACAAGACTC,truseq,ATTCCT -P15608_1028,CCTGGTAACTGGGACACAAGACTC,truseq,CAAAAG -P15608_1029,CCTGGTAACTGGGACACAAGACTC,truseq,CAACTA -P15608_1030,CCTGGTAACTGGGACACAAGACTC,truseq,CACCGG -P15608_1031,CCTGGTAACTGGGACACAAGACTC,truseq,CACGAT -P15608_1032,CCTGGTAACTGGGACACAAGACTC,truseq,CACTCA -P15608_1007,CCTGGTAACTGGGACACAAGACTC,truseq,CAGATC -P15608_1033,CCTGGTAACTGGGACACAAGACTC,truseq,CAGGCG -P15608_1034,CCTGGTAACTGGGACACAAGACTC,truseq,CATGGC -P15608_1035,CCTGGTAACTGGGACACAAGACTC,truseq,CATTTT -P15608_1036,CCTGGTAACTGGGACACAAGACTC,truseq,CCAACA -P15608_1016,CCTGGTAACTGGGACACAAGACTC,truseq,CCGTCC -P15608_1002,CCTGGTAACTGGGACACAAGACTC,truseq,CGATGT -P15608_1037,CCTGGTAACTGGGACACAAGACTC,truseq,CGGAAT -P15608_1022,CCTGGTAACTGGGACACAAGACTC,truseq,CGTACG -P15608_1038,CCTGGTAACTGGGACACAAGACTC,truseq,CTAGCT -P15608_1012,CCTGGTAACTGGGACACAAGACTC,truseq,CTTGTA -P15608_1023,CCTGGTAACTGGGACACAAGACTC,truseq,GAGTGG -P15608_1009,CCTGGTAACTGGGACACAAGACTC,truseq,GATCAG -P15608_1006,CCTGGTAACTGGGACACAAGACTC,truseq,GCCAAT -P15608_1011,CCTGGTAACTGGGACACAAGACTC,truseq,GGCTAC -P15608_1024,CCTGGTAACTGGGACACAAGACTC,truseq,GGTAGC -P15608_1017,CCTGGTAACTGGGACACAAGACTC,truseq,GTAGAG -P15608_1018,CCTGGTAACTGGGACACAAGACTC,truseq,GTCCGC -P15608_1019,CCTGGTAACTGGGACACAAGACTC,truseq,GTGAAA -P15608_1020,CCTGGTAACTGGGACACAAGACTC,truseq,GTGGCC -P15608_1021,CCTGGTAACTGGGACACAAGACTC,truseq,GTTTCG -P15608_1010,CCTGGTAACTGGGACACAAGACTC,truseq,TAGCTT -P15608_1004,CCTGGTAACTGGGACACAAGACTC,truseq,TGACCA -P15608_1003,CCTGGTAACTGGGACACAAGACTC,truseq,TTAGGC -P15951_1007,AAGGTTACACAAACCCTGGACAAG,truseq_dual,AAGAGGCA-TGATGAAA -P15951_1003,AAGGTTACACAAACCCTGGACAAG,truseq_dual,AGGCAGAA-TATCCTCT -P15951_1004,AAGGTTACACAAACCCTGGACAAG,truseq_dual,CAGAGAGG-CTAAGCCT -P15951_1006,AAGGTTACACAAACCCTGGACAAG,truseq_dual,CGAGGCTG-AACATGAT -P15951_1002,AAGGTTACACAAACCCTGGACAAG,truseq_dual,CGTACTAG-CTCTCTAT -P15951_1005,AAGGTTACACAAACCCTGGACAAG,truseq_dual,GCTACGCT-TGGAAATC -P15951_1008,AAGGTTACACAAACCCTGGACAAG,truseq_dual,GTAGAGGA-GTCGGACT 
-P15951_1001,AAGGTTACACAAACCCTGGACAAG,truseq_dual,TAAGGCGA-TAGATCGC -P15951_1009,AAGGTTACACAAACCCTGGACAAG,truseq_dual,TGGATCTG-TTTCTAGC -P14604_101,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,AAGAGGCA-CTAAGCCT -P14604_106,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,AGGCAGAA-CGTCTAAT -P14604_105,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,CGTACTAG-CGTCTAAT -P14604_103,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,GCTCATGA-CTAAGCCT -P14604_108,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,GGACTCCT-CGTCTAAT -P14604_102,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,GTAGAGGA-CTAAGCCT -P14604_104,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,TAAGGCGA-CGTCTAAT -P14604_109,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,TAGGCATG-CGTCTAAT -P14604_107,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,TCCTGAGC-CGTCTAAT -P15759_1001,AAGGATTCATTCCCACGGTAACAC,truseq, -P14654_169,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-AGGCGAAG -P14654_121,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-ATAGAGGC -P14654_137,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-CCTATCCT -P14654_153,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-GGCTCTGA -P14654_185,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-TAATCTTA \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv b/tests/data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv deleted file mode 100644 index 9842c8d4..00000000 --- a/tests/data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -group,replicate,barcode,input_file,genome,transcriptome -P15608_1005,1,3,,, -P15608_1025,1,8,,, \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv b/tests/data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv deleted file mode 100644 index 069eff1f..00000000 --- a/tests/data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,truseq,ACAGTG,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode03.fastq.gz -P15608_1025,truseq,ACTGAT,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode08.fastq.gz diff --git a/tests/test_analysis_nanopore.py b/tests/test_analysis_nanopore.py deleted file mode 100644 index bb8ac35a..00000000 --- a/tests/test_analysis_nanopore.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python -import unittest -from unittest import mock - -from taca.analysis.analysis_nanopore import * -from taca.nanopore.minion import MinIONqc -from taca.utils import config as conf - -CONFIG = conf.load_yaml_config("data/taca_test_nanopore_cfg.yaml") - - -class TestNanoporeAnalysis(unittest.TestCase): - def test_find_runs_to_process(self): - """Find all expected nanopore runs to process.""" - expected_dirs = [ - "data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2", - "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2", - "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2", - "data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2", - "data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2", - "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2", - ] - nanopore_data_dir = ( - CONFIG.get("nanopore_analysis").get("minion_qc_run").get("data_dir") - ) - skip_dirs = ( - CONFIG.get("nanopore_analysis").get("minion_qc_run").get("ignore_dirs") - ) - found_dirs = 
find_minion_runs(nanopore_data_dir, skip_dirs) - self.assertEqual(sorted(found_dirs), sorted(expected_dirs)) - - @mock.patch("taca.analysis.analysis_nanopore.os.path.isfile") - @mock.patch("taca.nanopore.minion.MinIONqc.start_nanoseq") - def test_process_minion_run_start_analysis(self, mock_start, mock_isfile): - """Start nanoseq analysis for minion.""" - nanoseq_sample_sheet = "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/SQK-LSK109_sample_sheet.csv" - anglerfish_sample_sheet = "some/path" - mock_isfile.return_value = True - run_dir = "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2" - minion_run = MinIONqc(run_dir, nanoseq_sample_sheet, anglerfish_sample_sheet) - process_minion_qc_run(minion_run) - mock_start.assert_called_once() - - @mock.patch("taca.nanopore.minion.MinIONqc.copy_results_for_lims") - @mock.patch("taca.nanopore.minion.Nanopore.transfer_run") - @mock.patch("taca.nanopore.minion.Nanopore.update_transfer_log") - @mock.patch("taca.nanopore.minion.Nanopore.archive_run") - @mock.patch("taca.analysis.analysis_nanopore.send_mail") - def test_process_minion_run_transfer( - self, mock_mail, mock_archive, mock_update, mock_transfer, mock_cp - ): - """Start transfer of run directory.""" - mock_transfer.return_value = True - mock_cp.return_value = True - run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" - minion_run = MinIONqc(run_dir, "dummy/path", None) - process_minion_qc_run(minion_run) - expected_calls = [ - mock.call( - "Anglerfish successfully processed run 20200104_1412_MN19414_AAU644_68125dc2", - "Anglerfish has successfully finished for run 20200104_1412_MN19414_AAU644_68125dc2. Please finish the QC step in lims.", - "test@test.com", - ), - mock.call( - "Run successfully processed: 20200104_1412_MN19414_AAU644_68125dc2", - "Run 20200104_1412_MN19414_AAU644_68125dc2 has been analysed, transferred and archived successfully.", - "test@test.com", - ), - ] - mock_mail.assert_has_calls(expected_calls) - - @mock.patch("taca.analysis.analysis_nanopore.send_mail") - def test_process_minion_run_fail_analysis(self, mock_mail): - """Send email to operator if nanoseq analysis failed.""" - run_dir = ( - "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2" - ) - minion_run = MinIONqc(run_dir, None, None) - minion_run.qc_run = True - process_minion_qc_run(minion_run) - email_subject = "Analysis failed for run 20200108_1412_MN19414_AAU648_68125dc2" - email_message = f"The nanoseq analysis failed for run {minion_run.run_id}." 
- email_recipients = "test@test.com" - mock_mail.assert_called_once_with( - email_subject, email_message, email_recipients - ) From 14fd3835640746c3e618530515659ccddc05e5f5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 17 Jan 2024 16:53:56 +0100 Subject: [PATCH 13/44] fix mypy issues --- .gitignore | 1 + doc/conf.py | 2 +- taca/analysis/analysis_nanopore.py | 2 +- taca/illumina/Runs.py | 29 +++++++++++---------- taca/nanopore/ONT_run_classes.py | 4 +++ taca/nanopore/instrument_transfer.py | 38 +++++++++++++++------------- tests/test_instrument_transfer.py | 6 ++--- 7 files changed, 47 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index c1357210..f60e5c99 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ __pycache__ .pytest_cache .vscode .ruff_cache +.mypy_cache diff --git a/doc/conf.py b/doc/conf.py index 01abb472..6a064945 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -191,7 +191,7 @@ # -- Options for LaTeX output --------------------------------------------- -latex_elements = { +latex_elements: dict = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py index c9dea404..101ba0e8 100644 --- a/taca/analysis/analysis_nanopore.py +++ b/taca/analysis/analysis_nanopore.py @@ -240,7 +240,7 @@ def process_qc_run(ont_qc_run: ONT_qc_run): ont_qc_run.archive_run() -def ont_transfer(run_abspath: str or None, qc: bool = False): +def ont_transfer(run_abspath: str | None, qc: bool = False): """CLI entry function. Find finished ONT runs in ngi-nas and transfer to HPC cluster. diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index 2d582be9..840850fd 100644 --- a/taca/illumina/Runs.py +++ b/taca/illumina/Runs.py @@ -790,24 +790,27 @@ def _fix_html_reports_for_complex_lanes( # Update NumberReads for total sample yields for entry in html_report_laneBarcode_parser.sample_data: - if "total_sample_cluster" not in self.NumberReads_Summary[entry["Lane"]].keys(): + if ( + "total_sample_cluster" + not in self.NumberReads_Summary[entry["Lane"]].keys() + ): self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] = 0 self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] = 0 if entry["Project"] != "default": - self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( - entry["PF Clusters"].replace(",", "") - ) - self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( - entry["Yield (Mbases)"].replace(",", "") - ) + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_cluster" + ] += int(entry["PF Clusters"].replace(",", "")) + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_yield" + ] += int(entry["Yield (Mbases)"].replace(",", "")) else: if entry["Project"] != "default": - self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int( - entry["PF Clusters"].replace(",", "") - ) - self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int( - entry["Yield (Mbases)"].replace(",", "") - ) + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_cluster" + ] += int(entry["PF Clusters"].replace(",", "")) + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_yield" + ] += int(entry["Yield (Mbases)"].replace(",", "")) # Calculate the numbers clusters/yields of undet reads for key, value in self.NumberReads_Summary.items(): diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index ec610ad2..615ca114 100644 --- 
a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -33,6 +33,10 @@ def __init__(self, run_abspath: str): self.run_name = os.path.basename(run_abspath) self.run_abspath = run_abspath + self.run_type: str | None = ( + None # This will be defined upon instantiation of a child class + ) + assert re.match( ONT_RUN_PATTERN, self.run_name ), f"Run {self.run_name} doesn't look like a run dir" diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py index 728393a0..978701aa 100644 --- a/taca/nanopore/instrument_transfer.py +++ b/taca/nanopore/instrument_transfer.py @@ -95,7 +95,7 @@ def write_finished_indicator(run_path): open(new_file, "w").close() -def sync_to_storage(run_dir, destination, log): +def sync_to_storage(run_dir: str, destination: str, rsync_log: str): """Sync the run to storage using rsync. Skip if rsync is already running on the run.""" @@ -103,7 +103,7 @@ def sync_to_storage(run_dir, destination, log): "run-one", "rsync", "-rvu", - "--log-file=" + log, + "--log-file=" + rsync_log, run_dir, destination, ] @@ -115,7 +115,7 @@ def sync_to_storage(run_dir, destination, log): def final_sync_to_storage( - run_dir: str, destination: str, archive_dir: str, log: list[str] + run_dir: str, destination: str, archive_dir: str, rsync_log: str ): """Do a final sync of the run to storage, then archive it. Skip if rsync is already running on the run.""" @@ -126,7 +126,7 @@ def final_sync_to_storage( "run-one", "rsync", "-rvu", - "--log-file=" + log, + "--log-file=" + rsync_log, run_dir, destination, ] @@ -213,7 +213,8 @@ def parse_position_logs(minknow_logs_dir: str) -> list: for row in "ABCDEFGH": positions.append(col + row) - entries = [] + headers = [] + header: dict | None = None for position in positions: log_files = glob( os.path.join(minknow_logs_dir, position, "control_server_log-*.txt") @@ -225,32 +226,35 @@ def parse_position_logs(minknow_logs_dir: str) -> list: for log_file in log_files: with open(log_file) as stream: lines = stream.readlines() - for i in range(0, len(lines)): - line = lines[i] - if line[0:4] != " ": + + # Iterate across log lines + for line in lines: + if not line[0:4] == " ": # Line is log header split_header = line.split(" ") timestamp = " ".join(split_header[0:2]) category = " ".join(split_header[2:]) - entry = { + header = { "position": position, "timestamp": timestamp.strip(), "category": category.strip(), } - entries.append(entry) - else: + headers.append(header) + + elif header: # Line is log body - if "body" not in entry: - entry["body"] = {} + if "body" not in header.keys(): + body: dict = {} + header["body"] = body key = line.split(": ")[0].strip() val = ": ".join(line.split(": ")[1:]).strip() - entry["body"][key] = val + header["body"][key] = val - entries.sort(key=lambda x: x["timestamp"]) - logging.info(f"Parsed {len(entries)} log entries.") + headers.sort(key=lambda x: x["timestamp"]) + logging.info(f"Parsed {len(headers)} log entries.") - return entries + return headers def get_pore_counts(position_logs: list) -> list: diff --git a/tests/test_instrument_transfer.py b/tests/test_instrument_transfer.py index 2b66c111..81c3ae5d 100644 --- a/tests/test_instrument_transfer.py +++ b/tests/test_instrument_transfer.py @@ -15,7 +15,7 @@ @pytest.fixture -def setup_test_fixture() -> (Mock, tempfile.TemporaryDirectory, dict): +def setup_test_fixture(): """Set up tempdir to mimic an ONT instrument file system""" tmp = tempfile.TemporaryDirectory() @@ -239,7 +239,7 @@ def test_final_sync_to_storage( 
run_dir="run_dir", destination="destination", archive_dir="archive_dir", - log="log_path", + rsync_log="log_path", ) assert mock_run.call_args_list[0] == call( @@ -266,7 +266,7 @@ def test_final_sync_to_storage( run_dir="run_dir", destination="destination", archive_dir="archive_dir", - log="log_path", + rsync_log="log_path", ) assert mock_run.call_count == 3 From 7b5a3940c7e95776040f7145f9a574fae713d539 Mon Sep 17 00:00:00 2001 From: Anandashankar Anil Date: Fri, 19 Jan 2024 16:00:35 +0100 Subject: [PATCH 14/44] Remove unused travis building and badge --- .travis.yml | 19 ------------------- README.md | 6 +++--- 2 files changed, 3 insertions(+), 22 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b1ae1922..00000000 --- a/.travis.yml +++ /dev/null @@ -1,19 +0,0 @@ -language: python - -python: - - "2.7" - - "3.8" - -install: - - python setup.py install - - mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml - - pip install codecov - -script: - - cd tests && nosetests --with-coverage -v -s - -after_success: - - codecov - -notifications: - email: false diff --git a/README.md b/README.md index 50ce07c3..4d770eb5 100644 --- a/README.md +++ b/README.md @@ -7,13 +7,13 @@ ## Tool for the Automation of Cleanup and Analyses [![PyPI version](https://badge.fury.io/py/taca.svg)](http://badge.fury.io/py/taca) -[![Build Status](https://travis-ci.org/SciLifeLab/TACA.svg?branch=master)](https://travis-ci.org/SciLifeLab/TACA) [![Documentation Status](https://readthedocs.org/projects/taca/badge/?version=latest)](https://readthedocs.org/projects/taca/?badge=latest) [![codecov](https://codecov.io/gh/scilifelab/taca/branch/master/graph/badge.svg)](https://codecov.io/gh/scilifelab/taca) This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://portal.scilifelab.se/genomics/) in Stockholm, Sweden. ### Install for development + You can install your own fork of taca in for instance a local conda environment for development. Provided you have conda installed: ``` @@ -27,7 +27,7 @@ conda activate taca_dev # install TACA and dependencies for developoment cd TACA python setup.py develop -pip install -r ./requirements-dev.txt +pip install -r ./requirements-dev.txt # Check that tests pass: cd tests && nosetests -v -s @@ -43,7 +43,7 @@ python setup.py develop pip install -r ./requirements-dev.txt # add required config files and env for taca delivery plugin -echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml +echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml export CHARON_BASE_URL="http://tracking.database.org" export CHARON_API_TOKEN="charonapitokengoeshere" From 08794a39c98908cfe7abc78876704fcd555ae45c Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 22 Jan 2024 16:17:55 +0100 Subject: [PATCH 15/44] Adapt to new(?) 
STUPID json format --- taca/nanopore/ONT_run_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 5f839058..46086e74 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -257,7 +257,7 @@ def parse_minknow_json(self, db_update): # -- Run output subsection seq_metadata_trimmed["acquisition_output"] = [] for section in seq_metadata["acquisition_output"]: - if section["type"] in ["AllData", "SplitByBarcode"]: + if "type" not in section.keys() or section["type"] in ["AllData", "SplitByBarcode"]: seq_metadata_trimmed["acquisition_output"].append(section) # -- Read length subseqtion From b738cb325b94d19c3532ae583f7137ba30fd166f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 22 Jan 2024 16:25:57 +0100 Subject: [PATCH 16/44] bump versionlog --- VERSIONLOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 0216daf7..82060159 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 20240122.1 + +Adapt ONT analysis to new ONT JSON format (also backwards compatible). + ## 20231204.1 Update ONT instrument transfer script to ignore runs started in the 3rd PromethION column, which will be used by Clinical Genomics. From deebad80ba3c49b5383660bb97e999c99f484204 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 23 Jan 2024 09:44:22 +0100 Subject: [PATCH 17/44] Don't copy .pod5 into metadata dir --- taca/nanopore/ONT_run_classes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 46086e74..b9c7b513 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -282,11 +282,13 @@ def copy_metadata(self): "**/bam*/***", "**/fast5*/***", "**/fastq*/***", + "**/pod5*/***", # Any files found elsewhere "*.bam*", "*.bai*", "*.fast5*", "*.fastq*", + "*.pod5*", ] exclude_patterns_quoted = ["'" + pattern + "'" for pattern in exclude_patterns] From 8ea8640166ffbc14c9379951a2e499a6df3b80d0 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Tue, 23 Jan 2024 09:45:41 +0100 Subject: [PATCH 18/44] bump versionlog --- VERSIONLOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 82060159..14da1c35 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,5 +1,9 @@ # TACA Version Log +## 20240123.1 + +Exclude pod5 dir and files from being copied to metadata dir. + ## 20240122.1 Adapt ONT analysis to new ONT JSON format (also backwards compatible). From 9c2dd96ab87b4eca9e7c0b34251b1840c8c0a30f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 14:58:56 +0100 Subject: [PATCH 19/44] use barcoding option conditionally --- taca/nanopore/ONT_run_classes.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index b9c7b513..22447e58 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -459,6 +459,15 @@ def fetch_anglerfish_samplesheet(self) -> bool: raise RsyncError( f"{self.run_name}: Error occured when copying anglerfish samplesheet to run dir." 
) + + def run_has_barcode_output(self) -> bool: + + barcode_dir_pattern = r"barcode\d{2}" + reads_dir = os.path.join(self.run_abspath, "fastq_pass") + + for dir in os.listdir(reads_dir): + if re.search(barcode_dir_pattern, dir): + return True def run_anglerfish(self): """Run Anglerfish as subprocess within it's own Conda environment. @@ -475,10 +484,12 @@ def run_anglerfish(self): f"--run_name {anglerfish_run_name}", f"--threads {n_threads}", "--lenient", - "--ont_barcodes", "--skip_demux", ] + if self.run_has_barcode_output(): + anglerfish_command.append("--barcoding") + full_command = [ # Dump subprocess PID into 'run-ongoing'-indicator file. f"echo $$ > {self.anglerfish_ongoing_abspath}", From 32b7142cc97cb6fa1913417889227e885daaf05a Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 15:30:55 +0100 Subject: [PATCH 20/44] Add fastq output check --- taca/analysis/analysis_nanopore.py | 4 ++++ taca/nanopore/ONT_run_classes.py | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py index 74e4c3ef..d17c29bc 100644 --- a/taca/analysis/analysis_nanopore.py +++ b/taca/analysis/analysis_nanopore.py @@ -209,6 +209,10 @@ def process_qc_run(ont_qc_run: ONT_qc_run): logger.info( f"{ont_qc_run.run_name}: Could not find Anglerfish sample sheet, skipping." ) + elif not ont_qc_run.has_fastq_output(): + logger.info( + f"{ont_qc_run.run_name}: Run has no fastq output, skipping." + ) else: logger.info(f"{ont_qc_run.run_name}: Starting Anglerfish...") ont_qc_run.run_anglerfish() diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 22447e58..3c9af03c 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -460,12 +460,21 @@ def fetch_anglerfish_samplesheet(self) -> bool: f"{self.run_name}: Error occured when copying anglerfish samplesheet to run dir." ) - def run_has_barcode_output(self) -> bool: + def has_fastq_output(self) -> bool: + """Check whether run has fastq output.""" - barcode_dir_pattern = r"barcode\d{2}" reads_dir = os.path.join(self.run_abspath, "fastq_pass") - for dir in os.listdir(reads_dir): + if os.path.exists(reads_dir): + return True + else: + return False + + def has_barcode_dirs(self) -> bool: + + barcode_dir_pattern = r"barcode\d{2}" + + for dir in os.listdir(os.path.join(self.run_abspath, "fastq_pass")): if re.search(barcode_dir_pattern, dir): return True @@ -487,7 +496,9 @@ def run_anglerfish(self): "--skip_demux", ] - if self.run_has_barcode_output(): + + + if self.has_barcode_dirs(): anglerfish_command.append("--barcoding") full_command = [ From 6669bc95dc50330d8af604b01fb2cbd6c507c0a5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 16:55:32 +0100 Subject: [PATCH 21/44] implement new superdir and dump std --- taca/nanopore/ONT_run_classes.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 3c9af03c..11e7651b 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -483,6 +483,9 @@ def run_anglerfish(self): Dump files to indicate ongoing and finished processes. 
""" + timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + + taca_anglerfish_run_dir = f"taca_anglerfish_run_{timestamp}" anglerfish_run_name = "anglerfish_run" n_threads = 2 # This could possibly be changed @@ -496,7 +499,14 @@ def run_anglerfish(self): "--skip_demux", ] - + # Make dir to trace Anglerfish run + os.mkdir(taca_anglerfish_run_dir) + # Copy samplesheet used + shutil.copy(self.anglerfish_samplesheet, f"{taca_anglerfish_run_dir}/") + # Create files to dump subprocess std + stdin = f"{taca_anglerfish_run_dir}/stdin.txt" + stdout = f"{taca_anglerfish_run_dir}/stdout.txt" + stderr = f"{taca_anglerfish_run_dir}/stderr.txt" if self.has_barcode_dirs(): anglerfish_command.append("--barcoding") @@ -508,19 +518,25 @@ def run_anglerfish(self): "conda run -n anglerfish " + " ".join(anglerfish_command), # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", - # Copy the Anglerfish samplesheet used to start the run into the run dir, for traceability - # (The correct anglerfish run dir is identified by it being younger than the "run-ongoing" file) - f"new_runs=$(find . -type d -name 'anglerfish_run*' -newer {self.anglerfish_ongoing_abspath})", - f"if [[ $(echo '${{new_runs}}' | wc -l) -eq 1 ]] ; then cp {self.anglerfish_samplesheet} ${{new_runs}}/ ; fi", + # Move the Anglerfish run dir into the taca anglerfish run folder + f"mv {anglerfish_run_name} {taca_anglerfish_run_dir}/", # Regardless of exit status: Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] + with open(f"{taca_anglerfish_run_dir}/command.sh", "w") as stream: + stream.write( + "; ".join(full_command) + ) + # Start Anglerfish subprocess process = subprocess.Popen( "; ".join(full_command), shell=True, cwd=self.run_abspath, + stdin=stdin, + stdout=stdout, + stderr=stderr, ) logger.info( f"{self.run_name}: Anglerfish subprocess started with process ID {process.pid}." From 4824ee1a4ab43e411f9e798119b9262412c1230f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 17:08:37 +0100 Subject: [PATCH 22/44] apply fix --- taca/nanopore/ONT_run_classes.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 11e7651b..8937c093 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -504,9 +504,9 @@ def run_anglerfish(self): # Copy samplesheet used shutil.copy(self.anglerfish_samplesheet, f"{taca_anglerfish_run_dir}/") # Create files to dump subprocess std - stdin = f"{taca_anglerfish_run_dir}/stdin.txt" - stdout = f"{taca_anglerfish_run_dir}/stdout.txt" - stderr = f"{taca_anglerfish_run_dir}/stderr.txt" + stdin_relpath = f"{taca_anglerfish_run_dir}/stdin.txt" + stdout_relpath = f"{taca_anglerfish_run_dir}/stdout.txt" + stderr_relpath = f"{taca_anglerfish_run_dir}/stderr.txt" if self.has_barcode_dirs(): anglerfish_command.append("--barcoding") @@ -530,14 +530,15 @@ def run_anglerfish(self): ) # Start Anglerfish subprocess - process = subprocess.Popen( - "; ".join(full_command), - shell=True, - cwd=self.run_abspath, - stdin=stdin, - stdout=stdout, - stderr=stderr, - ) + with open(stdin_relpath, 'w') as stdin, open(stdout_relpath, 'w') as stdout, open(stderr_relpath, 'r') as stderr: + process = subprocess.Popen( + "; ".join(full_command), + shell=True, + cwd=self.run_abspath, + stdin=stdin, + stdout=stdout, + stderr=stderr, + ) logger.info( f"{self.run_name}: Anglerfish subprocess started with process ID {process.pid}." 
) From f05b17fc2a460a9078816bf47ba292af2fabb9cb Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 17:10:15 +0100 Subject: [PATCH 23/44] typo --- taca/nanopore/ONT_run_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 8937c093..f236d059 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -530,7 +530,7 @@ def run_anglerfish(self): ) # Start Anglerfish subprocess - with open(stdin_relpath, 'w') as stdin, open(stdout_relpath, 'w') as stdout, open(stderr_relpath, 'r') as stderr: + with open(stdin_relpath, 'w') as stdin, open(stdout_relpath, 'w') as stdout, open(stderr_relpath, 'w') as stderr: process = subprocess.Popen( "; ".join(full_command), shell=True, From 6052e71cd52b8e1713dc0e916d142bcb6ea0f9a6 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 17:30:39 +0100 Subject: [PATCH 24/44] try finding anglerfish run dir by age --- taca/nanopore/ONT_run_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index f236d059..46259175 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -519,7 +519,7 @@ def run_anglerfish(self): # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", # Move the Anglerfish run dir into the taca anglerfish run folder - f"mv {anglerfish_run_name} {taca_anglerfish_run_dir}/", + 'find . -name "anglerfish_run*" -type d -newer .anglerfish_ongoing -exec mv {} ' + f"{taca_anglerfish_run_dir}/" # Regardless of exit status: Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] From 2705c7e3995f65b1e45468156170b1fa53b79aca Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 17:36:22 +0100 Subject: [PATCH 25/44] try further fixing --- taca/nanopore/ONT_run_classes.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 46259175..275b62b8 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -504,7 +504,6 @@ def run_anglerfish(self): # Copy samplesheet used shutil.copy(self.anglerfish_samplesheet, f"{taca_anglerfish_run_dir}/") # Create files to dump subprocess std - stdin_relpath = f"{taca_anglerfish_run_dir}/stdin.txt" stdout_relpath = f"{taca_anglerfish_run_dir}/stdout.txt" stderr_relpath = f"{taca_anglerfish_run_dir}/stderr.txt" @@ -519,23 +518,22 @@ def run_anglerfish(self): # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", # Move the Anglerfish run dir into the taca anglerfish run folder - 'find . -name "anglerfish_run*" -type d -newer .anglerfish_ongoing -exec mv {} ' + f"{taca_anglerfish_run_dir}/" + 'find . -name "anglerfish_run*" -type d -newer .anglerfish_ongoing -exec mv \{\} ' + f"{taca_anglerfish_run_dir}/" # Regardless of exit status: Remove 'run-ongoing' file. 
f"rm {self.anglerfish_ongoing_abspath}", ] with open(f"{taca_anglerfish_run_dir}/command.sh", "w") as stream: stream.write( - "; ".join(full_command) + "\n".join(full_command) ) # Start Anglerfish subprocess - with open(stdin_relpath, 'w') as stdin, open(stdout_relpath, 'w') as stdout, open(stderr_relpath, 'w') as stderr: + with open(stdout_relpath, 'w') as stdout, open(stderr_relpath, 'w') as stderr: process = subprocess.Popen( "; ".join(full_command), shell=True, cwd=self.run_abspath, - stdin=stdin, stdout=stdout, stderr=stderr, ) From a83949b0f442ce4b7b3291ac1c450dddb25aa2bb Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 17:44:24 +0100 Subject: [PATCH 26/44] try simplifying --- taca/nanopore/ONT_run_classes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 275b62b8..6c4c7fb8 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -517,8 +517,6 @@ def run_anglerfish(self): "conda run -n anglerfish " + " ".join(anglerfish_command), # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", - # Move the Anglerfish run dir into the taca anglerfish run folder - 'find . -name "anglerfish_run*" -type d -newer .anglerfish_ongoing -exec mv \{\} ' + f"{taca_anglerfish_run_dir}/" # Regardless of exit status: Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] From 665652965af2f530a965766a5a3031759c860fa8 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 17:57:35 +0100 Subject: [PATCH 27/44] try running from subdir --- taca/nanopore/ONT_run_classes.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 6c4c7fb8..5f29bda4 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -483,7 +483,7 @@ def run_anglerfish(self): Dump files to indicate ongoing and finished processes. 
""" - timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S") taca_anglerfish_run_dir = f"taca_anglerfish_run_{timestamp}" anglerfish_run_name = "anglerfish_run" @@ -492,7 +492,7 @@ def run_anglerfish(self): anglerfish_command = [ self.anglerfish_path, f"--samplesheet {self.anglerfish_samplesheet}", - f"--out_fastq {self.run_abspath}", + f"--out_fastq {os.join(self.run_abspath, taca_anglerfish_run_dir)}", f"--run_name {anglerfish_run_name}", f"--threads {n_threads}", "--lenient", @@ -504,7 +504,6 @@ def run_anglerfish(self): # Copy samplesheet used shutil.copy(self.anglerfish_samplesheet, f"{taca_anglerfish_run_dir}/") # Create files to dump subprocess std - stdout_relpath = f"{taca_anglerfish_run_dir}/stdout.txt" stderr_relpath = f"{taca_anglerfish_run_dir}/stderr.txt" if self.has_barcode_dirs(): @@ -522,17 +521,14 @@ def run_anglerfish(self): ] with open(f"{taca_anglerfish_run_dir}/command.sh", "w") as stream: - stream.write( - "\n".join(full_command) - ) + stream.write("\n".join(full_command)) # Start Anglerfish subprocess - with open(stdout_relpath, 'w') as stdout, open(stderr_relpath, 'w') as stderr: + with open(stderr_relpath, 'w') as stderr: process = subprocess.Popen( "; ".join(full_command), shell=True, cwd=self.run_abspath, - stdout=stdout, stderr=stderr, ) logger.info( From 81e07e38d38cc3dc00eade3d5787b32b141921bc Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 18:13:17 +0100 Subject: [PATCH 28/44] try abspaths --- taca/nanopore/ONT_run_classes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 5f29bda4..0d76837e 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -492,7 +492,7 @@ def run_anglerfish(self): anglerfish_command = [ self.anglerfish_path, f"--samplesheet {self.anglerfish_samplesheet}", - f"--out_fastq {os.join(self.run_abspath, taca_anglerfish_run_dir)}", + f"--out_fastq {self.run_abspath}", f"--run_name {anglerfish_run_name}", f"--threads {n_threads}", "--lenient", @@ -516,6 +516,8 @@ def run_anglerfish(self): "conda run -n anglerfish " + " ".join(anglerfish_command), # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", + # Move run to subdir + f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' + '-exec mv \{\} ' + f'{self.run_abspath}/{taca_anglerfish_run_dir}/', # Regardless of exit status: Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] From 8f7124f4767203decb2caa06eb09988e0061c11e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 18:34:23 +0100 Subject: [PATCH 29/44] fix exec --- taca/nanopore/ONT_run_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 0d76837e..7f98b8b1 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -517,7 +517,7 @@ def run_anglerfish(self): # Dump Anglerfish exit code into file f"echo $? 
> {self.anglerfish_done_abspath}", # Move run to subdir - f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' + '-exec mv \{\} ' + f'{self.run_abspath}/{taca_anglerfish_run_dir}/', + f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' + '-exec mv \{\} ' + f'{self.run_abspath}/{taca_anglerfish_run_dir}/ \;', # Regardless of exit status: Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] From 5c1fc22e5724b1d46158db14c337db1a6866c19d Mon Sep 17 00:00:00 2001 From: kedhammar Date: Fri, 26 Jan 2024 18:49:11 +0100 Subject: [PATCH 30/44] more bash voodoo --- taca/nanopore/ONT_run_classes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 7f98b8b1..09deaaa3 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -517,8 +517,11 @@ def run_anglerfish(self): # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", # Move run to subdir - f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' + '-exec mv \{\} ' + f'{self.run_abspath}/{taca_anglerfish_run_dir}/ \;', - # Regardless of exit status: Remove 'run-ongoing' file. + f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' + + '-exec mv \{\} ' + + f'{self.run_abspath}/{taca_anglerfish_run_dir}/ \; ' + + '-quit', + # Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] From 2dd8b7fd79edaa135dd64b38df75d45e6e09a486 Mon Sep 17 00:00:00 2001 From: Alfred Kedhammar <89784800+kedhammar@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:28:52 +0100 Subject: [PATCH 31/44] Update taca/nanopore/ONT_run_classes.py Co-authored-by: Johannes Alneberg --- taca/nanopore/ONT_run_classes.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 09deaaa3..6f668d94 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -465,10 +465,7 @@ def has_fastq_output(self) -> bool: reads_dir = os.path.join(self.run_abspath, "fastq_pass") - if os.path.exists(reads_dir): - return True - else: - return False + return os.path.exists(reads_dir) def has_barcode_dirs(self) -> bool: From 0baa97c9898eb71061b8b2ac2b86b44e52de2c20 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Mon, 29 Jan 2024 15:41:47 +0100 Subject: [PATCH 32/44] clarify command and make subprocess run it by calling .sh file --- taca/nanopore/ONT_run_classes.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 6f668d94..2308b6ef 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -482,8 +482,9 @@ def run_anglerfish(self): timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S") - taca_anglerfish_run_dir = f"taca_anglerfish_run_{timestamp}" + # "anglerfish_run*" is the dir pattern recognized by the LIMS script parsing the results anglerfish_run_name = "anglerfish_run" + n_threads = 2 # This could possibly be changed anglerfish_command = [ @@ -495,17 +496,17 @@ def run_anglerfish(self): "--lenient", "--skip_demux", ] + if self.has_barcode_dirs(): + anglerfish_command.append("--barcoding") - # Make dir to trace Anglerfish run + # Create dir to trace TACA 
executing Anglerfish as a subprocess + taca_anglerfish_run_dir = f"taca_anglerfish_run_{timestamp}" os.mkdir(taca_anglerfish_run_dir) - # Copy samplesheet used + # Copy samplesheet used for traceability shutil.copy(self.anglerfish_samplesheet, f"{taca_anglerfish_run_dir}/") # Create files to dump subprocess std stderr_relpath = f"{taca_anglerfish_run_dir}/stderr.txt" - if self.has_barcode_dirs(): - anglerfish_command.append("--barcoding") - full_command = [ # Dump subprocess PID into 'run-ongoing'-indicator file. f"echo $$ > {self.anglerfish_ongoing_abspath}", @@ -514,9 +515,11 @@ def run_anglerfish(self): # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", # Move run to subdir + # 1) Find the latest Anglerfish run dir (younger than the 'run-ongoing' file) f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' - + '-exec mv \{\} ' - + f'{self.run_abspath}/{taca_anglerfish_run_dir}/ \; ' + # 2) Move the Anglerfish run dir into the TACA Anglerfish run dir + + '-exec mv \{\} ' + f'{self.run_abspath}/{taca_anglerfish_run_dir}/ \; ' + # 3) Only do this once + '-quit', # Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", @@ -528,7 +531,7 @@ def run_anglerfish(self): # Start Anglerfish subprocess with open(stderr_relpath, 'w') as stderr: process = subprocess.Popen( - "; ".join(full_command), + f"bash {taca_anglerfish_run_dir}/command.sh", shell=True, cwd=self.run_abspath, stderr=stderr, From 4287ae1d87d1c3b993927f8d175e6a81c1565acd Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 10:43:26 +0100 Subject: [PATCH 33/44] Update script to compare requirements --- .github/workflows/lint-code.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index a63bc5f4..4612ee1e 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -81,16 +81,28 @@ jobs: - name: Compare requirements run: | # Extract and sort package names - awk '{print $1}' $1 | sort -u > "$1".compare - awk -F'==' '{print $1}' $2 | sort -u > "$2".compare + awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' requirements.txt | sort -u > requirements.compare + awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' pipreqs.txt | sort -u > pipreqs.compare # Compare package lists - if cmp -s "$1".compare "$2".compare + if cmp -s requirements.compare pipreqs.compare then echo "Requirements are the same" + exit 0 else echo "Requirements are different" + echo "" + + echo "=== current requirements.txt ===" + echo "" + cat requirements.compare + echo "" + + echo "=== pipreqs requirements ===" + echo "" + cat pipreqs.compare + exit 1 fi From fa1dc33a150b05fe77eb532d93cf85f0f80700fa Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 10:43:46 +0100 Subject: [PATCH 34/44] remove outdated requirements (according to pipreqs) --- requirements.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 51592db7..fa2e4812 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,7 @@ click CouchDB -requests -pyyaml flowcell_parser @ git+https://github.com/SciLifeLab/flowcell_parser -soupsieve<2.0 -beautifulsoup4 -python-crontab pandas +sphinx_rtd_theme +python_crontab +pytest From 103ef786bfd2746302d9268a8fa7c0a365fc8e88 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 10:48:39 +0100 Subject: [PATCH 35/44] add newline --- Dockerfile | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3c63be1a..db00553a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,4 +24,4 @@ COPY requirements-dev.txt requirements-dev.txt RUN python -m pip install -r requirements-dev.txt RUN mkdir /root/.taca/ -COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml \ No newline at end of file +COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml From b3bc7f785da0200866cf0d0a08981dcc5dbeca88 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 11:18:13 +0100 Subject: [PATCH 36/44] remove editorconfig-checker --- .editorconfig | 12 ------------ .github/workflows/lint-code.yml | 17 ----------------- .pre-commit-config.yaml | 4 ---- 3 files changed, 33 deletions(-) delete mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 70c7a9a8..00000000 --- a/.editorconfig +++ /dev/null @@ -1,12 +0,0 @@ -root = true - -[*] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -indent_style = space - -[*.{md,yml,yaml,cff}] -indent_size = 2 diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index 4612ee1e..87b85054 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -123,20 +123,3 @@ jobs: - name: Run Prettier --check run: prettier --check . - # Use editorconfig to check other specified file formats - editorconfig: - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Setup node - uses: actions/setup-node@v4 - with: - node-version: "20" - - - name: Install editorconfig-checker - run: npm install -g editorconfig-checker - - - name: editorconfig --> Lint files - run: editorconfig-checker $(git ls-files | grep '.txt') diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c09ed2d..c30ff77b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,3 @@ repos: rev: "v4.0.0-alpha.8" hooks: - id: prettier - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "2.7.2" - hooks: - - id: editorconfig-checker From a676908bb32d60bcbc5e42ce710f889980845af4 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 11:26:55 +0100 Subject: [PATCH 37/44] non-invasive fixes after merge from master --- .devcontainer/devcontainer.json | 50 ++++---- .github/workflows/lint-code.yml | 1 - taca/illumina/Runs.py | 46 +++++++- taca/nanopore/ONT_run_classes.py | 19 +-- tests/data/Stats.json | 136 +++++++++++----------- tests/data/lane.html | 182 ++++++++++++++--------------- tests/data/laneBarcode.html | 192 +++++++++++++++---------------- tests/data/lane_result.html | 181 ++++++++++++++--------------- 8 files changed, 418 insertions(+), 389 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 2ea64cc9..4d2cc553 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,29 +1,29 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile { - "name": "TACA", - "build": { - // Sets the run context to one level up instead of the .devcontainer folder. - "context": "..", - // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. 
- "dockerfile": "../Dockerfile" - }, - "features": {}, - "customizations": { - "vscode": { - "extensions": ["ms-python.python"] - } - }, - // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - "postCreateCommand": "cd ../flowcell_parser/ && pip3 install -e . && cd ../TACA && pip3 install -e .", - // Configure tool-specific properties. - // "customizations": {}, - // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. - // "remoteUser": "devcontainer" - "mounts": [ - "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached" - ] + "name": "TACA", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "../Dockerfile" + }, + "features": {}, + "customizations": { + "vscode": { + "extensions": ["ms-python.python"] + } + }, + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + "postCreateCommand": "cd ../flowcell_parser/ && pip3 install -e . && cd ../TACA && pip3 install -e .", + // Configure tool-specific properties. + // "customizations": {}, + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "devcontainer" + "mounts": [ + "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached" + ] } diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index 87b85054..1ab877de 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -122,4 +122,3 @@ jobs: - name: Run Prettier --check run: prettier --check . 
- diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index 202f3f23..483d278b 100644 --- a/taca/illumina/Runs.py +++ b/taca/illumina/Runs.py @@ -873,7 +873,17 @@ def _fix_html_reports_for_complex_lanes( return NumberReads_Summary - def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes, NumberReads_Summary): + def _fix_demultiplexingstats_xml_dir( + self, + demux_folder, + stats_json, + samplesheets, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + NumberReads_Summary, + ): # Create the DemultiplexingStats.xml (empty it is here only to say thay demux is done) DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ["Stats"]) # For creating DemuxSummary.txt files for complex lanes @@ -1391,13 +1401,41 @@ def _aggregate_demux_results_simple_complex(self): return True # Case with multiple sub-demultiplexings - (html_reports_lane, html_reports_laneBarcode, stats_json) = self._process_demux_with_complex_lanes(demux_folder, samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes) + ( + html_reports_lane, + html_reports_laneBarcode, + stats_json, + ) = self._process_demux_with_complex_lanes( + demux_folder, + samplesheets, + legacy_path, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ) # Create the html reports - NumberReads_Summary = self._fix_html_reports_for_complex_lanes(demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode) + NumberReads_Summary = self._fix_html_reports_for_complex_lanes( + demux_folder, + index_cycles, + complex_lanes, + noindex_lanes, + html_reports_lane, + html_reports_laneBarcode, + ) # Fix contents under the DemultiplexingStats folder - self._fix_demultiplexingstats_xml_dir(demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes, NumberReads_Summary) + self._fix_demultiplexingstats_xml_dir( + demux_folder, + stats_json, + samplesheets, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + NumberReads_Summary, + ) return True diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 86b38deb..e301f6d8 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -259,7 +259,10 @@ def parse_minknow_json(self, db_update): # -- Run output subsection seq_metadata_trimmed["acquisition_output"] = [] for section in seq_metadata["acquisition_output"]: - if "type" not in section.keys() or section["type"] in ["AllData", "SplitByBarcode"]: + if "type" not in section.keys() or section["type"] in [ + "AllData", + "SplitByBarcode", + ]: seq_metadata_trimmed["acquisition_output"].append(section) # -- Read length subseqtion @@ -460,16 +463,15 @@ def fetch_anglerfish_samplesheet(self) -> bool: raise RsyncError( f"{self.run_name}: Error occured when copying anglerfish samplesheet to run dir." 
) - + def has_fastq_output(self) -> bool: """Check whether run has fastq output.""" reads_dir = os.path.join(self.run_abspath, "fastq_pass") return os.path.exists(reads_dir) - - def has_barcode_dirs(self) -> bool: + def has_barcode_dirs(self) -> bool: barcode_dir_pattern = r"barcode\d{2}" for dir in os.listdir(os.path.join(self.run_abspath, "fastq_pass")): @@ -483,7 +485,7 @@ def run_anglerfish(self): timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S") - # "anglerfish_run*" is the dir pattern recognized by the LIMS script parsing the results + # "anglerfish_run*" is the dir pattern recognized by the LIMS script parsing the results anglerfish_run_name = "anglerfish_run" n_threads = 2 # This could possibly be changed @@ -519,9 +521,10 @@ def run_anglerfish(self): # 1) Find the latest Anglerfish run dir (younger than the 'run-ongoing' file) f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' # 2) Move the Anglerfish run dir into the TACA Anglerfish run dir - + '-exec mv \{\} ' + f'{self.run_abspath}/{taca_anglerfish_run_dir}/ \; ' + + "-exec mv \{\} " + + f"{self.run_abspath}/{taca_anglerfish_run_dir}/ \; " # 3) Only do this once - + '-quit', + + "-quit", # Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] @@ -530,7 +533,7 @@ def run_anglerfish(self): stream.write("\n".join(full_command)) # Start Anglerfish subprocess - with open(stderr_relpath, 'w') as stderr: + with open(stderr_relpath, "w") as stderr: process = subprocess.Popen( f"bash {taca_anglerfish_run_dir}/command.sh", shell=True, diff --git a/tests/data/Stats.json b/tests/data/Stats.json index 28340f00..1d2e2bd1 100644 --- a/tests/data/Stats.json +++ b/tests/data/Stats.json @@ -1,75 +1,75 @@ { - "RunNumber": 131, - "Flowcell": "FCIDXX", - "RunId": "141124_ST-COMPLEX1_01_AFCIDXX", - "ConversionResults": [ + "RunNumber": 131, + "Flowcell": "FCIDXX", + "RunId": "141124_ST-COMPLEX1_01_AFCIDXX", + "ConversionResults": [ + { + "LaneNumber": 1, + "DemuxResults": [ { - "LaneNumber": 1, - "DemuxResults": [ - { - "SampleId": "Sample_P12345_1001", - "SampleName": "P12345_1001", - "NumberReads": 494288265, - "Yield": 58820303535, - "ReadMetrics": [ - { - "ReadNumber": 1, - "Yield": 13840071420, - "YieldQ30": 13329609381, - "QualityScoreSum": 503672520160, - "TrimmedBases": 0 - } - ] - } - ], - "Undetermined": { - "NumberReads": 17709745, - "Yield": 2036620675, - "ReadMetrics": [ - { - "ReadNumber": 1, - "Yield": 885487250, - "YieldQ30": 680049984, - "QualityScoreSum": 28815661398, - "TrimmedBases": 0 - }, - { - "ReadNumber": 2, - "Yield": 283355920, - "YieldQ30": 179655904, - "QualityScoreSum": 8324058259, - "TrimmedBases": 0 - } - ] + "SampleId": "Sample_P12345_1001", + "SampleName": "P12345_1001", + "NumberReads": 494288265, + "Yield": 58820303535, + "ReadMetrics": [ + { + "ReadNumber": 1, + "Yield": 13840071420, + "YieldQ30": 13329609381, + "QualityScoreSum": 503672520160, + "TrimmedBases": 0 } + ] } - ], - "ReadInfosForLanes": [ + ], + "Undetermined": { + "NumberReads": 17709745, + "Yield": 2036620675, + "ReadMetrics": [ + { + "ReadNumber": 1, + "Yield": 885487250, + "YieldQ30": 680049984, + "QualityScoreSum": 28815661398, + "TrimmedBases": 0 + }, + { + "ReadNumber": 2, + "Yield": 283355920, + "YieldQ30": 179655904, + "QualityScoreSum": 8324058259, + "TrimmedBases": 0 + } + ] + } + } + ], + "ReadInfosForLanes": [ + { + "LaneNumber": 1, + "ReadInfos": [ { - "LaneNumber": 1, - "ReadInfos": [ - { - "Number": 1, - "NumCycles": 28, - "IsIndexedRead": "false" - } - ] + 
"Number": 1, + "NumCycles": 28, + "IsIndexedRead": "false" } - ], - "UnknownBarcodes": [ - { - "Lane": 1, - "Barcodes": { - "GGGGGGGG": 3203920, - "CCCTAACA": 290420 - } - }, - { - "Lane": 2, - "Barcodes": { - "GGGGGGGG": 3075440, - "CCCTAACA": 296260 - } - } - ] + ] + } + ], + "UnknownBarcodes": [ + { + "Lane": 1, + "Barcodes": { + "GGGGGGGG": 3203920, + "CCCTAACA": 290420 + } + }, + { + "Lane": 2, + "Barcodes": { + "GGGGGGGG": 3075440, + "CCCTAACA": 296260 + } + } + ] } diff --git a/tests/data/lane.html b/tests/data/lane.html index b02fac08..435f6b29 100644 --- a/tests/data/lane.html +++ b/tests/data/lane.html @@ -1,96 +1,92 @@ - - - - - - - -
 [hunk body for tests/data/lane.html lost in extraction: the HTML markup was stripped, leaving only the visible table text for flowcell H5YKFDSXY (Flowcell Summary and Lane Summary tables). The values are identical on the removed and added sides, so the hunk appears to re-indent the fixture without changing its content.]
diff --git a/tests/data/laneBarcode.html b/tests/data/laneBarcode.html
index a1f50376..089bd90f 100644
--- a/tests/data/laneBarcode.html
+++ b/tests/data/laneBarcode.html
@@ -1,101 +1,97 @@
 [hunk body for tests/data/laneBarcode.html lost in extraction: only the visible text for flowcell FCIDXX survives (Flowcell Summary, Lane Summary and Top Unknown Barcodes tables). The values are identical on both sides, so the hunk appears to re-indent the fixture without changing its content.]
diff --git a/tests/data/lane_result.html b/tests/data/lane_result.html
index 41ace736..b187c938 100644
--- a/tests/data/lane_result.html
+++ b/tests/data/lane_result.html
@@ -1,95 +1,92 @@
 [hunk body for tests/data/lane_result.html lost in extraction: only the visible text for flowcell C6L1WANXX survives (Flowcell Summary and Lane Summary tables). The values are identical on both sides, so the hunk appears to re-indent the fixture without changing its content.]
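Since the three fixture hunks above are formatting-only, they can be regenerated rather than maintained by hand. A minimal sketch — assuming Node and pre-commit are available locally, and using the Prettier hook and check already configured earlier in this series — of how to reproduce the re-indentation and run the corresponding check:

    # Re-format the HTML fixtures in place with Prettier
    npx prettier --write tests/data/lane.html tests/data/laneBarcode.html tests/data/lane_result.html

    # Or run the repository's pre-commit hook over the whole tree
    pre-commit run prettier --all-files

    # Check-only variant, as in the lint-code workflow
    prettier --check .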
From a6c700600773fc8234df83f751dc4260b3a3cc89 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 11:35:46 +0100 Subject: [PATCH 41/44] format --- taca/nanopore/ONT_run_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index d6b45255..675edcd2 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -477,7 +477,7 @@ def has_barcode_dirs(self) -> bool: for dir in os.listdir(os.path.join(self.run_abspath, "fastq_pass")): if re.search(barcode_dir_pattern, dir): return True - + return False def run_anglerfish(self): From 32614eecc4699746ae017de89b1d5cd88c4d6e86 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 11:42:06 +0100 Subject: [PATCH 42/44] bump reqs based on GHA pipreqs linting --- requirements.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index fa2e4812..baf1d47c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,10 @@ -click CouchDB +PyYAML +click flowcell_parser @ git+https://github.com/SciLifeLab/flowcell_parser pandas -sphinx_rtd_theme -python_crontab pytest +python_crontab +python_dateutil +setuptools +sphinx_rtd_theme From 6a3edf3710b6fdd8c233662ebf900e9bd24e6bd5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 11:50:09 +0100 Subject: [PATCH 43/44] prettier --- .devcontainer/devcontainer.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 4d2cc553..f4f1fd01 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -6,13 +6,13 @@ // Sets the run context to one level up instead of the .devcontainer folder. "context": "..", // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. - "dockerfile": "../Dockerfile" + "dockerfile": "../Dockerfile", }, "features": {}, "customizations": { "vscode": { - "extensions": ["ms-python.python"] - } + "extensions": ["ms-python.python"], + }, }, // Features to add to the dev container. More info: https://containers.dev/features. // "features": {}, @@ -24,6 +24,6 @@ // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "devcontainer" "mounts": [ - "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached" - ] + "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached", + ], } From 7878b89d3c624897e3df83db2cd4404e97a409a2 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 31 Jan 2024 11:50:50 +0100 Subject: [PATCH 44/44] supress blame of last commit --- .git-blame-ignore-revs | 1 + 1 file changed, 1 insertion(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 538d5112..ffb399a1 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -6,3 +6,4 @@ a676908bb32d60bcbc5e42ce710f889980845af4 8ea4523b1c9789d03410178d75aea93b6b2ffa77 d5330f615b237beadcec22d5422dff3c02aa54ff b9ee704ad4da26790e539b8fe1d39aa71f831ef1 +6a3edf3710b6fdd8c233662ebf900e9bd24e6bd5