diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9c0c3268..f4f1fd01 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,31 +1,29 @@ // For format details, see https://aka.ms/devcontainer.json. For config options, see the // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile { - "name": "TACA", - "build": { - // Sets the run context to one level up instead of the .devcontainer folder. - "context": "..", - // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. - "dockerfile": "../Dockerfile" + "name": "TACA", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "../Dockerfile", + }, + "features": {}, + "customizations": { + "vscode": { + "extensions": ["ms-python.python"], }, - "features": {}, - "customizations": { - "vscode": { - "extensions": [ - "ms-python.python", - ] - } - }, - // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - "postCreateCommand": "cd ../flowcell_parser/ && pip3 install -e . && cd ../TACA && pip3 install -e .", - // Configure tool-specific properties. - // "customizations": {}, - // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. - // "remoteUser": "devcontainer" - "mounts": [ - "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached" - ] -} \ No newline at end of file + }, + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + "postCreateCommand": "cd ../flowcell_parser/ && pip3 install -e . && cd ../TACA && pip3 install -e .", + // Configure tool-specific properties. + // "customizations": {}, + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. 
+ // "remoteUser": "devcontainer" + "mounts": [ + "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached", + ], +} diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..ffb399a1 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,9 @@ +# Start adding here + +# 2024-01-31, Non-invasive fixes after merge with master //AKe +a676908bb32d60bcbc5e42ce710f889980845af4 +2ba0179015e380b3b7e0ce8b9b99666533a7443f +8ea4523b1c9789d03410178d75aea93b6b2ffa77 +d5330f615b237beadcec22d5422dff3c02aa54ff +b9ee704ad4da26790e539b8fe1d39aa71f831ef1 +6a3edf3710b6fdd8c233662ebf900e9bd24e6bd5 diff --git a/.github/pr_labels.yml b/.github/pr_labels.yml index d04f24f3..8db6d109 100644 --- a/.github/pr_labels.yml +++ b/.github/pr_labels.yml @@ -1,4 +1,4 @@ -version: '1' +version: "1" invalidStatus: "pending" labelRule: values: diff --git a/.github/workflows/check-log.yml b/.github/workflows/check-log.yml new file mode 100644 index 00000000..1447daba --- /dev/null +++ b/.github/workflows/check-log.yml @@ -0,0 +1,26 @@ +name: Check VERSIONLOG.MD has been updated +on: [pull_request] + +jobs: + check-versionlog: + runs-on: ubuntu-latest + steps: + - name: Checkout PR + uses: actions/checkout@v3 + with: + fetch-depth: 0 # Fetch all history for all branches and tags + + - name: Check for VERSIONLOG.MD changes + id: versionlog_check + # 1) Find the common ancestor between the current HEAD and the base branch + # 2) Then see if the versionlog has been updated in the PR since it diverged + # from the common ancestor + run: | + PR_BASE_SHA=$(git merge-base HEAD ${{ github.event.pull_request.base.sha }}) + FILE_CHANGED=$(git diff --name-only $PR_BASE_SHA HEAD | grep 'VERSIONLOG.md' || true) + if [ -n "$FILE_CHANGED" ]; then + echo "VERSIONLOG.MD has been changed." + else + echo "VERSIONLOG.MD has NOT been changed." + exit 1 # Fail the workflow if no changes in VERSIONLOG.MD + fi diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml new file mode 100644 index 00000000..1ab877de --- /dev/null +++ b/.github/workflows/lint-code.yml @@ -0,0 +1,124 @@ +name: Lint code +on: [push, pull_request] + +jobs: + # Use ruff to check for code style violations + ruff-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + - name: ruff --> Check for style violations + # Configured in pyproject.toml + run: ruff check . + + # Use ruff to check code formatting + ruff-format: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + - name: ruff --> Check code formatting + run: ruff format --check . 
+
+  # Use mypy for static type checking
+  mypy-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install mypy
+      # Start by installing type stubs
+      - name: mypy --> Install stubs
+        run: echo -e "y" | mypy --install-types **/*.py || exit 0
+      - name: mypy --> Static type checking
+        # Configured in pyproject.toml
+        run: mypy **/*.py
+
+  # Use pipreqs to check for missing dependencies
+  pipreqs-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install pipreqs
+        run: pip install pipreqs
+
+      - name: Install requirements
+        run: pip install -r requirements.txt
+
+      - name: Run pipreqs
+        run: pipreqs --savepath pipreqs.txt
+
+      - name: Compare requirements
+        run: |
+          # Extract and sort package names
+          awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' requirements.txt | sort -u > requirements.compare
+          awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' pipreqs.txt | sort -u > pipreqs.compare
+
+          # Compare package lists
+          if cmp -s requirements.compare pipreqs.compare
+          then
+            echo "Requirements are the same"
+
+            exit 0
+          else
+            echo "Requirements are different"
+            echo ""
+
+            echo "=== current requirements.txt ==="
+            echo ""
+            cat requirements.compare
+            echo ""
+
+            echo "=== pipreqs requirements ==="
+            echo ""
+            cat pipreqs.compare
+
+            exit 1
+          fi
+
+  # Use Prettier to check various file formats
+  prettier:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Setup node
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install Prettier
+        run: npm install -g prettier
+
+      - name: Run Prettier --check
+        run: prettier --check .
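The checks run by these CI jobs can also be reproduced locally before pushing. A minimal sketch, assuming ruff, mypy and pipreqs are installed from requirements-dev.txt and prettier is installed globally via npm as in the workflow above:

# Mirror the lint-code.yml jobs locally (sketch)
ruff check .                     # style violations, configured in pyproject.toml
ruff format --check .            # formatting check without rewriting files
mypy **/*.py                     # static type checking
pipreqs --savepath pipreqs.txt   # regenerate import-based requirements to compare against requirements.txt
prettier --check .               # check formatting of YAML, Markdown, JSON, etc.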
diff --git a/.gitignore b/.gitignore index eb7ce2ba..f60e5c99 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,7 @@ _build .benchmarks .coverage __pycache__ +.pytest_cache +.vscode +.ruff_cache +.mypy_cache diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..c30ff77b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +# .pre-commit-config.yaml +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.6 + hooks: + - id: ruff + - id: ruff-format + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v1.7.1" + hooks: + - id: mypy + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v4.0.0-alpha.8" + hooks: + - id: prettier diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b1ae1922..00000000 --- a/.travis.yml +++ /dev/null @@ -1,19 +0,0 @@ -language: python - -python: - - "2.7" - - "3.8" - -install: - - python setup.py install - - mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml - - pip install codecov - -script: - - cd tests && nosetests --with-coverage -v -s - -after_success: - - codecov - -notifications: - email: false diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c2345165..57b41e7d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,13 +2,14 @@ When contribution to this package please have the following things in mind: -__NOTE__: _Please make sure that there are no exisiting [issues]((https://github.com/SciLifeLab/TACA/issues)) relating to whatever you want to report._ +**NOTE**: _Please make sure that there are no exisiting [issues](<(https://github.com/SciLifeLab/TACA/issues)>) relating to whatever you want to report._ ####To contribute: + 1. Create an issue describing the bug / suggestion / improvement / ... [here](https://github.com/SciLifeLab/TACA/issues). 2. Fork this repository to your GitHub account 3. Make the necessary changes / additions to your forked TACA repository -4. Please *make sure* that you've documented your code and changes using [sphinx](http://sphinx.readthedocs.org/en/latest/tutorial.html) syntax, as the documentation will be automatically generated using this engine, and published to [ReadTheDocs](http://project-management.readthedocs.org/) +4. Please _make sure_ that you've documented your code and changes using [sphinx](http://sphinx.readthedocs.org/en/latest/tutorial.html) syntax, as the documentation will be automatically generated using this engine, and published to [ReadTheDocs](http://project-management.readthedocs.org/) 5. Update the version number in `TACA/__init__.py` 6. Pull Request and wait for the responsible reviewer to review and merge the code diff --git a/Dockerfile b/Dockerfile index 3c63be1a..db00553a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,4 +24,4 @@ COPY requirements-dev.txt requirements-dev.txt RUN python -m pip install -r requirements-dev.txt RUN mkdir /root/.taca/ -COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml \ No newline at end of file +COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml diff --git a/README.md b/README.md index 50ce07c3..e38d2fee 100644 --- a/README.md +++ b/README.md @@ -4,34 +4,103 @@

-## Tool for the Automation of Cleanup and Analyses
+# Tool for the Automation of Cleanup and Analyses

 [![PyPI version](https://badge.fury.io/py/taca.svg)](http://badge.fury.io/py/taca)
-[![Build Status](https://travis-ci.org/SciLifeLab/TACA.svg?branch=master)](https://travis-ci.org/SciLifeLab/TACA)
 [![Documentation Status](https://readthedocs.org/projects/taca/badge/?version=latest)](https://readthedocs.org/projects/taca/?badge=latest)
 [![codecov](https://codecov.io/gh/scilifelab/taca/branch/master/graph/badge.svg)](https://codecov.io/gh/scilifelab/taca)

-This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://portal.scilifelab.se/genomics/) in Stockholm, Sweden.
+This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://ngisweden.scilifelab.se/) in Stockholm, Sweden.

-### Install for development
-You can install your own fork of taca in for instance a local conda environment for development. Provided you have conda installed:
+## Installation
+
+Inside the repo, run `pip install .`
+
+## Development
+
+Run `pip install -r requirements-dev.txt` to install packages used for development and `pip install -e .` to make the installation editable.
+
+### Automated linting
+
+This repo is configured for automated linting. Linter parameters are defined in `pyproject.toml`.
+
+As of now, we use:
+
+- [ruff](https://docs.astral.sh/ruff/) to perform automated formatting and a variety of lint checks.
+  - Run with `ruff check .` and `ruff format .`
+- [mypy](https://mypy.readthedocs.io/en/stable/) for static type checking and to prevent contradictory type annotations.
+  - Run with `mypy **/*.py`
+- [pipreqs](https://github.com/bndr/pipreqs) to check that the requirement files are up-to-date with the code.
+
+  - This is run with a custom Bash script in GitHub Actions which will only compare the list of package names.
+
+    ```
+    # Extract and sort package names
+    awk '{print $1}' $1 | sort -u > "$1".compare
+    awk -F'==' '{print $1}' $2 | sort -u > "$2".compare
+
+    # Compare package lists
+    if cmp -s "$1".compare "$2".compare
+    then
+      echo "Requirements are the same"
+      exit 0
+    else
+      echo "Requirements are different"
+      exit 1
+    fi
+    ```
+
+- [prettier](https://prettier.io/) to format common languages.
+  - Run with `prettier .`
+- [editorconfig-checker](https://github.com/editorconfig-checker/editorconfig-checker) to enforce `.editorconfig` rules for all files not covered by the tools above.
+  - Run with
+    ```
+    editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html')
+    ```
+
+#### [GitHub Actions](https://docs.github.com/en/actions)
+
+Configured in `.github/workflows/lint-code.yml`. Will test all commits in pushes or pull requests, but not change code or prevent merges.
+
+#### [Pre-commit](https://pre-commit.com/)
+
+Will prevent local commits that fail linting checks. Configured in `.pre-commit-config.yaml`.
+
+To set up pre-commit checking:
+
+1. Run `pip install pre-commit`
+2. Navigate to the repo root
+3. Run `pre-commit install`
+
+This can be disabled with `pre-commit uninstall`
+
+#### VS Code automation
+
+To enable automated linting in VS Code, go to the user `settings.json` and include the following lines:

 ```
-# clone the repo
-git clone https://github.com//TACA.git
+"[python]": {
+  "editor.defaultFormatter": "charliermarsh.ruff",
+}
+```

-# create an environment
-conda create -n taca_dev python=2.7
-conda activate taca_dev
+This will run the `ruff`-mediated linting with the same parameters as the `GitHub Actions` and `pre-commit` every time VS Code is used to format the code in the repository.

-# install TACA and dependencies for developoment
-cd TACA
-python setup.py develop
-pip install -r ./requirements-dev.txt
+To run formatting on save, include the lines:

-# Check that tests pass:
-cd tests && nosetests -v -s
 ```
+"[python]": {
+  "editor.formatOnSave": true,
+}
+```
+
+### Git blame suppression
+
+When a non-invasive tool is used to tidy up a lot of code, it is useful to suppress the Git blame for that particular commit, so the original author can still be traced.
+
+To do this, add the hash of the commit containing the changes to `.git-blame-ignore-revs`, headed by an explanatory comment.
+
+### Deliver command

 There is also a [plugin for the deliver command](https://github.com/SciLifeLab/taca-ngi-pipeline). To install this in the same development environment:
@@ -43,7 +112,8 @@ python setup.py develop
 pip install -r ./requirements-dev.txt

 # add required config files and env for taca delivery plugin
-echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
+echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
+echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
 mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml
 export CHARON_BASE_URL="http://tracking.database.org"
 export CHARON_API_TOKEN="charonapitokengoeshere"
diff --git a/VERSIONLOG.md b/VERSIONLOG.md
index 0216daf7..5b805399 100644
--- a/VERSIONLOG.md
+++ b/VERSIONLOG.md
@@ -1,5 +1,13 @@
 # TACA Version Log

+## 20240123.1
+
+Exclude pod5 dir and files from being copied to metadata dir.
+
+## 20240122.1
+
+Adapt ONT analysis to new ONT JSON format (also backwards compatible).
+
 ## 20231204.1

 Update ONT instrument transfer script to ignore runs started in the 3rd PromethION column, which will be used by Clinical Genomics.
@@ -21,9 +29,11 @@ Version 1.0.0
 Fix bug with rsync permission issue cont.

 ## 20231031.1
+
 Improve run_folder transfer

 ## 20231026.1
+
 Fix bug with rsync permission issue

 ## 20231024.1
@@ -46,7 +56,6 @@ Fix bug that NovaSeqXPlus date format cause error in writing pdc_archived timest

 Remove the temp change of creating links

-
 ## 20230920.1

 Supplement last PR, primary purpose is to differentiate user runs from QC runs in the instrument transfer script rather than the installed TACA.
diff --git a/doc/conf.py b/doc/conf.py
index cb58a377..6a064945 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 #
 # TACA documentation build configuration file, created by
 # sphinx-quickstart on Wed Sep 17 12:39:41 2014.
@@ -12,168 +11,168 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.

-import sys
 import os

 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.todo', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.todo", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -#templates_path = ['_templates'] +# templates_path = ['_templates'] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'TACA' -copyright = u'2014, Guillermo Carrasco' +project = "TACA" +copyright = "2014, Guillermo Carrasco" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '1.0' +version = "1.0" # The full version, including alpha/beta/rc tags. -release = '1.0' +release = "1.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-html_theme = 'default' -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' +html_theme = "default" +on_rtd = os.environ.get("READTHEDOCS", None) == "True" if not on_rtd: import sphinx_rtd_theme - html_theme = 'sphinx_rtd_theme' + + html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. html_show_copyright = False @@ -181,68 +180,62 @@ # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. 
-htmlhelp_basename = 'TACAdoc' +htmlhelp_basename = "TACAdoc" # -- Options for LaTeX output --------------------------------------------- -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', +latex_elements: dict = { + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'TACA.tex', u'TACA Documentation', - u'Guillermo Carrasco', 'manual'), + ("index", "TACA.tex", "TACA Documentation", "Guillermo Carrasco", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'taca', u'TACA Documentation', - [u'Guillermo Carrasco'], 1) -] +man_pages = [("index", "taca", "TACA Documentation", ["Guillermo Carrasco"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -251,89 +244,95 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'TACA', u'TACA Documentation', - u'Guillermo Carrasco', 'TACA', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "TACA", + "TACA Documentation", + "Guillermo Carrasco", + "TACA", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. -epub_title = u'TACA' -epub_author = u'Guillermo Carrasco' -epub_publisher = u'Guillermo Carrasco' -epub_copyright = u'2014, Guillermo Carrasco' +epub_title = "TACA" +epub_author = "Guillermo Carrasco" +epub_publisher = "Guillermo Carrasco" +epub_copyright = "2014, Guillermo Carrasco" # The basename for the epub file. It defaults to the project name. 
-#epub_basename = u'TACA' +# epub_basename = u'TACA' # The HTML theme for the epub output. Since the default themes are not optimized # for small screen space, using the same theme for HTML and epub output is # usually not wise. This defaults to 'epub', a theme designed to save visual # space. -#epub_theme = 'epub' +# epub_theme = 'epub' # The language of the text. It defaults to the language option # or en if the language is not set. -#epub_language = '' +# epub_language = '' # The scheme of the identifier. Typical schemes are ISBN or URL. -#epub_scheme = '' +# epub_scheme = '' # The unique identifier of the text. This can be a ISBN number # or the project homepage. -#epub_identifier = '' +# epub_identifier = '' # A unique identification for the text. -#epub_uid = '' +# epub_uid = '' # A tuple containing the cover image and cover page html template filenames. -#epub_cover = () +# epub_cover = () # A sequence of (type, uri, title) tuples for the guide element of content.opf. -#epub_guide = () +# epub_guide = () # HTML files that should be inserted before the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_pre_files = [] +# epub_pre_files = [] # HTML files shat should be inserted after the pages created by sphinx. # The format is a list of tuples containing the path and title. -#epub_post_files = [] +# epub_post_files = [] # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # The depth of the table of contents in toc.ncx. -#epub_tocdepth = 3 +# epub_tocdepth = 3 # Allow duplicate toc entries. -#epub_tocdup = True +# epub_tocdup = True # Choose between 'default' and 'includehidden'. -#epub_tocscope = 'default' +# epub_tocscope = 'default' # Fix unsupported image types using the PIL. -#epub_fix_images = False +# epub_fix_images = False # Scale large images. -#epub_max_image_width = 0 +# epub_max_image_width = 0 # How to display URL addresses: 'footnote', 'no', or 'inline'. -#epub_show_urls = 'inline' +# epub_show_urls = 'inline' # If false, no index is generated. 
-#epub_use_index = True +# epub_use_index = True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..17ba1fbc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +title = "taca" + + +[tool.ruff.lint] +select =[ + # Ruff default rules + # ------------------------------ + "E4", # pycodestyle Imports + "E7", # pycodestyle Statements + "E9", # pycodestyle Runtime + "F", # Pyflakes + + # Additional Comment + # ------------------------------------------------------ + "I", # isort Best-practice sorting of imports + "UP", # pyupgrade Make sure syntax is up-to-date +] +ignore = [ + "E402", # Module level import not at top of file + "E722", # Do not use bare 'except' + "E741", # Ambiguous variable name +] + + +[tool.mypy] +ignore_missing_imports = true +follow_imports = 'skip' diff --git a/requirements-dev.txt b/requirements-dev.txt index af58407f..9118bd64 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,3 +5,8 @@ mock sphinx sphinx-rtd-theme pytest +ipython +ipdb +ruff +mypy +pipreqs diff --git a/requirements.txt b/requirements.txt index b2bc63b1..baf1d47c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ +CouchDB +PyYAML click -requests -pyyaml flowcell_parser @ git+https://github.com/SciLifeLab/flowcell_parser -soupsieve<2.0 -beautifulsoup4 -python-crontab pandas +pytest +python_crontab +python_dateutil +setuptools +sphinx_rtd_theme diff --git a/setup.py b/setup.py index cc05b49c..e278a522 100644 --- a/setup.py +++ b/setup.py @@ -1,50 +1,49 @@ -from setuptools import setup, find_packages import glob -import os -import sys + +from setuptools import find_packages, setup from taca import __version__ -from io import open try: - with open("requirements.txt", "r") as f: + with open("requirements.txt") as f: install_requires = [x.strip() for x in f.readlines()] -except IOError: +except OSError: install_requires = [] try: - with open("dependency_links.txt", "r") as f: + with open("dependency_links.txt") as f: dependency_links = [x.strip() for x in f.readlines()] -except IOError: +except OSError: dependency_links = [] -setup(name='taca', +setup( + name="taca", version=__version__, description="Tool for the Automation of Cleanup and Analyses", - long_description='This package contains a set of functionalities that are ' - 'useful in the day-to-day tasks of bioinformatitians in ' - 'National Genomics Infrastructure in Stockholm, Sweden.', - keywords='bioinformatics', - author='NGI-stockholm', - author_email='ngi_pipeline_operators@scilifelab.se', - url='http://taca.readthedocs.org/en/latest/', - license='MIT', - packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), - scripts=glob.glob('scripts/*.py'), + long_description="This package contains a set of functionalities that are " + "useful in the day-to-day tasks of bioinformatitians in " + "National Genomics Infrastructure in Stockholm, Sweden.", + keywords="bioinformatics", + author="NGI-stockholm", + author_email="ngi_pipeline_operators@scilifelab.se", + url="http://taca.readthedocs.org/en/latest/", + license="MIT", + packages=find_packages(exclude=["ez_setup", "examples", "tests"]), + scripts=glob.glob("scripts/*.py"), include_package_data=True, zip_safe=False, entry_points={ - 'console_scripts': ['taca = taca.cli:cli'], - 'taca.subcommands': [ - 'cleanup = taca.cleanup.cli:cleanup', - 'analysis = taca.analysis.cli:analysis', - 'bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries', - 'server_status = taca.server_status.cli:server_status', - 'backup = 
taca.backup.cli:backup', - 'create_env = taca.testing.cli:uppmax_env' - ] + "console_scripts": ["taca = taca.cli:cli"], + "taca.subcommands": [ + "cleanup = taca.cleanup.cli:cleanup", + "analysis = taca.analysis.cli:analysis", + "bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries", + "server_status = taca.server_status.cli:server_status", + "backup = taca.backup.cli:backup", + "create_env = taca.testing.cli:uppmax_env", + ], }, install_requires=install_requires, - dependency_links=dependency_links + dependency_links=dependency_links, ) diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py index c817b064..c19f0582 100755 --- a/taca/analysis/analysis.py +++ b/taca/analysis/analysis.py @@ -2,22 +2,19 @@ import glob import logging import os -import sys import subprocess +import sys +from shutil import copyfile, copytree + +from flowcell_parser.classes import RunParametersParser -from shutil import copyfile -from shutil import copytree -from taca.illumina.Standard_Runs import Standard_Run from taca.illumina.MiSeq_Runs import MiSeq_Run from taca.illumina.NextSeq_Runs import NextSeq_Run from taca.illumina.NovaSeq_Runs import NovaSeq_Run from taca.illumina.NovaSeqXPlus_Runs import NovaSeqXPlus_Run +from taca.utils import statusdb from taca.utils.config import CONFIG from taca.utils.transfer import RsyncAgent -from taca.utils import statusdb - -from flowcell_parser.classes import RunParametersParser -from io import open logger = logging.getLogger(__name__) @@ -32,54 +29,68 @@ def get_runObj(run, software): None if the sequencer type is unknown of there was an error """ - if os.path.exists(os.path.join(run, 'runParameters.xml')): - run_parameters_file = 'runParameters.xml' - elif os.path.exists(os.path.join(run, 'RunParameters.xml')): - run_parameters_file = 'RunParameters.xml' + if os.path.exists(os.path.join(run, "runParameters.xml")): + run_parameters_file = "runParameters.xml" + elif os.path.exists(os.path.join(run, "RunParameters.xml")): + run_parameters_file = "RunParameters.xml" else: - logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run)) + logger.error( + f"Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}" + ) return run_parameters_path = os.path.join(run, run_parameters_file) try: run_parameters = RunParametersParser(run_parameters_path) except OSError: - logger.warn('Problems parsing the runParameters.xml file at {}. ' - 'This is quite unexpected. please archive the run {} manually'.format(run_parameters_path, run)) + logger.warn( + f"Problems parsing the runParameters.xml file at {run_parameters_path}. " + f"This is quite unexpected. 
please archive the run {run} manually" + ) else: # Do a case by case test because there are so many version of RunParameters that there is no real other way - runtype = run_parameters.data['RunParameters'].get('InstrumentType', - run_parameters.data['RunParameters'].get('ApplicationName', - run_parameters.data['RunParameters'].get('Application', - ''))) - if 'Setup' in run_parameters.data['RunParameters']: + runtype = run_parameters.data["RunParameters"].get( + "InstrumentType", + run_parameters.data["RunParameters"].get( + "ApplicationName", + run_parameters.data["RunParameters"].get("Application", ""), + ), + ) + if "Setup" in run_parameters.data["RunParameters"]: # This is the HiSeq2500, MiSeq, and HiSeqX case try: # Works for recent control software - runtype = run_parameters.data['RunParameters']['Setup']['Flowcell'] + runtype = run_parameters.data["RunParameters"]["Setup"]["Flowcell"] except KeyError: # Use this as second resource but print a warning in the logs - logger.warn('Parsing runParameters to fecth instrument type, ' - 'not found Flowcell information in it. Using ApplicationName') + logger.warn( + "Parsing runParameters to fecth instrument type, " + "not found Flowcell information in it. Using ApplicationName" + ) # Here makes sense to use get with default value '' -> # so that it doesn't raise an exception in the next lines # (in case ApplicationName is not found, get returns None) - runtype = run_parameters.data['RunParameters']['Setup'].get('ApplicationName', '') - - if 'MiSeq' in runtype: - return MiSeq_Run(run, software, CONFIG['analysis']['MiSeq']) - elif 'NextSeq' in runtype: - return NextSeq_Run(run, software, CONFIG['analysis']['NextSeq']) - elif 'NovaSeqXPlus' in runtype: - return NovaSeqXPlus_Run(run, software, CONFIG['analysis']['NovaSeqXPlus']) - elif 'NovaSeq' in runtype: - return NovaSeq_Run(run, software, CONFIG['analysis']['NovaSeq']) + runtype = run_parameters.data["RunParameters"]["Setup"].get( + "ApplicationName", "" + ) + + if "MiSeq" in runtype: + return MiSeq_Run(run, software, CONFIG["analysis"]["MiSeq"]) + elif "NextSeq" in runtype: + return NextSeq_Run(run, software, CONFIG["analysis"]["NextSeq"]) + elif "NovaSeqXPlus" in runtype: + return NovaSeqXPlus_Run(run, software, CONFIG["analysis"]["NovaSeqXPlus"]) + elif "NovaSeq" in runtype: + return NovaSeq_Run(run, software, CONFIG["analysis"]["NovaSeq"]) else: - logger.warn('Unrecognized run type {}, cannot archive the run {}. ' - 'Someone as likely bought a new sequencer without telling ' - 'it to the bioinfo team'.format(runtype, run)) + logger.warn( + f"Unrecognized run type {runtype}, cannot archive the run {run}. " + "Someone as likely bought a new sequencer without telling " + "it to the bioinfo team" + ) return None + def upload_to_statusdb(run_dir, software): """Function to upload run_dir informations to statusDB directly from click interface. @@ -93,60 +104,80 @@ def upload_to_statusdb(run_dir, software): # Make the actual upload _upload_to_statusdb(runObj) + def _upload_to_statusdb(run): """Triggers the upload to statusdb using the dependency flowcell_parser. 
:param Run run: the object run """ - couch_conf = CONFIG['statusdb'] + couch_conf = CONFIG["statusdb"] couch_connection = statusdb.StatusdbSession(couch_conf).connection - db = couch_connection[couch_conf['xten_db']] + db = couch_connection[couch_conf["xten_db"]] parser = run.runParserObj # Check if I have NoIndex lanes - for element in parser.obj['samplesheet_csv']: - if 'NoIndex' in element['index'] or not element['index']: # NoIndex in the case of HiSeq, empty in the case of HiSeqX - lane = element['Lane'] # This is a lane with NoIndex + for element in parser.obj["samplesheet_csv"]: + if ( + "NoIndex" in element["index"] or not element["index"] + ): # NoIndex in the case of HiSeq, empty in the case of HiSeqX + lane = element["Lane"] # This is a lane with NoIndex # In this case PF Cluster is the number of undetermined reads try: - PFclusters = parser.obj['Undetermined'][lane]['unknown'] + PFclusters = parser.obj["Undetermined"][lane]["unknown"] except KeyError: - logger.error('While taking extra care of lane {} of NoIndex type ' \ - 'I found out that not all values were available'.format(lane)) + logger.error( + f"While taking extra care of lane {lane} of NoIndex type " + "I found out that not all values were available" + ) continue # In Lanes_stats fix the lane yield - parser.obj['illumina']['Demultiplex_Stats']['Lanes_stats'][int(lane) - 1]['PF Clusters'] = str(PFclusters) + parser.obj["illumina"]["Demultiplex_Stats"]["Lanes_stats"][int(lane) - 1][ + "PF Clusters" + ] = str(PFclusters) # Now fix Barcode lane stats - updated = 0 # Check that only one update is made - for sample in parser.obj['illumina']['Demultiplex_Stats']['Barcode_lane_statistics']: - if lane in sample['Lane']: + updated = 0 # Check that only one update is made + for sample in parser.obj["illumina"]["Demultiplex_Stats"][ + "Barcode_lane_statistics" + ]: + if lane in sample["Lane"]: updated += 1 - sample['PF Clusters'] = str(PFclusters) + sample["PF Clusters"] = str(PFclusters) if updated != 1: - logger.error('While taking extra care of lane {} of NoIndex type ' - 'I updated more than once the barcode_lane. ' - 'This is too much to continue so I will fail.'.format(lane)) + logger.error( + f"While taking extra care of lane {lane} of NoIndex type " + "I updated more than once the barcode_lane. " + "This is too much to continue so I will fail." + ) os.sys.exit() # If I am here it means I changed the HTML representation to something # else to accomodate the wired things we do # someone told me that in such cases it is better to put a place holder for this - parser.obj['illumina']['Demultiplex_Stats']['NotOriginal'] = 'True' + parser.obj["illumina"]["Demultiplex_Stats"]["NotOriginal"] = "True" # Update info about bcl2fastq tool - if not parser.obj.get('DemultiplexConfig'): - parser.obj['DemultiplexConfig'] = {'Setup': {'Software': run.CONFIG.get('bcl2fastq', {})}} + if not parser.obj.get("DemultiplexConfig"): + parser.obj["DemultiplexConfig"] = { + "Setup": {"Software": run.CONFIG.get("bcl2fastq", {})} + } statusdb.update_doc(db, parser.obj, over_write_db_entry=True) -def transfer_run(run_dir): + +def transfer_run(run_dir, software): """Interface for click to force a transfer a run to uppmax. 
:param: string run_dir: the run to tranfer """ runObj = get_runObj(run_dir, software) - mail_recipients = CONFIG.get('mail', {}).get('recipients') + mail_recipients = CONFIG.get("mail", {}).get("recipients") if runObj is None: - mail_recipients = CONFIG.get('mail', {}).get('recipients') - logger.error('Trying to force a transfer of run {} but the sequencer was not recognized.'.format(run_dir)) + mail_recipients = CONFIG.get("mail", {}).get("recipients") + logger.error( + f"Trying to force a transfer of run {run_dir} but the sequencer was not recognized." + ) else: - runObj.transfer_run(os.path.join('nosync', CONFIG['analysis']['status_dir'], 'transfer.tsv'), mail_recipients) + runObj.transfer_run( + os.path.join("nosync", CONFIG["analysis"]["status_dir"], "transfer.tsv"), + mail_recipients, + ) + def transfer_runfolder(run_dir, pid, exclude_lane): """Transfer the entire run folder for a specified project and run to uppmax. @@ -159,82 +190,105 @@ def transfer_runfolder(run_dir, pid, exclude_lane): # Validate whether run_dir exists or is valid run_dir = os.path.abspath(run_dir) if not os.path.exists(run_dir) or not os.path.isdir(run_dir): - logger.error('Unable to locate the specified run directory for transfer.') + logger.error("Unable to locate the specified run directory for transfer.") sys.exit() - original_sample_sheet = os.path.join(run_dir, 'SampleSheet.csv') - pid_list = list(set([x.strip() for x in pid.split(',')])) - new_sample_sheet = os.path.join(run_dir, '_'.join(pid_list) + '_SampleSheet.txt') + original_sample_sheet = os.path.join(run_dir, "SampleSheet.csv") + pid_list = list(set([x.strip() for x in pid.split(",")])) + new_sample_sheet = os.path.join(run_dir, "_".join(pid_list) + "_SampleSheet.txt") # Write new sample sheet including only rows for the specified project try: - with open(new_sample_sheet, 'w') as nss: + with open(new_sample_sheet, "w") as nss: nss.write(extract_project_samplesheet(original_sample_sheet, pid_list)) - except IOError as e: - logger.error('An error occured while parsing the samplesheet. ' - 'Please check the sample sheet and try again.') + except OSError as e: + logger.error( + "An error occured while parsing the samplesheet. " + "Please check the sample sheet and try again." 
+ ) raise e # Create a tar archive of the runfolder dir_name = os.path.basename(run_dir) - archive = run_dir + '.tar.gz' + archive = run_dir + ".tar.gz" run_dir_path = os.path.dirname(run_dir) # Prepare the options for excluding lanes - if exclude_lane != '': + if exclude_lane != "": dir_for_excluding_lane = [] - lane_to_exclude = exclude_lane.split(',') + lane_to_exclude = exclude_lane.split(",") for lane in lane_to_exclude: - if os.path.isdir('{}/{}/Thumbnail_Images/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Thumbnail_Images/L00{}'.format(lane)]) - if os.path.isdir('{}/{}/Images/Focus/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Images/Focus/L00{}'.format(lane)]) - if os.path.isdir('{}/{}/Data/Intensities/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/L00{}'.format(lane)]) - if os.path.isdir('{}/{}/Data/Intensities/BaseCalls/L00{}'.format(run_dir_path, dir_name, lane)): - dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/BaseCalls/L00{}'.format(lane)]) + if os.path.isdir(f"{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}"): + dir_for_excluding_lane.extend( + ["--exclude", f"Thumbnail_Images/L00{lane}"] + ) + if os.path.isdir(f"{run_dir_path}/{dir_name}/Images/Focus/L00{lane}"): + dir_for_excluding_lane.extend(["--exclude", f"Images/Focus/L00{lane}"]) + if os.path.isdir(f"{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}"): + dir_for_excluding_lane.extend( + ["--exclude", f"Data/Intensities/L00{lane}"] + ) + if os.path.isdir( + f"{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}" + ): + dir_for_excluding_lane.extend( + ["--exclude", f"Data/Intensities/BaseCalls/L00{lane}"] + ) try: - exclude_options_for_tar = ['--exclude', 'Demultiplexing*', - '--exclude', 'demux_*', - '--exclude', 'rsync*', - '--exclude', '*.csv'] - if exclude_lane != '': + exclude_options_for_tar = [ + "--exclude", + "Demultiplexing*", + "--exclude", + "demux_*", + "--exclude", + "rsync*", + "--exclude", + "*.csv", + ] + if exclude_lane != "": exclude_options_for_tar += dir_for_excluding_lane - subprocess.call(['tar'] + exclude_options_for_tar + ['-cvzf', archive, '-C', run_dir_path, dir_name]) + subprocess.call( + ["tar"] + + exclude_options_for_tar + + ["-cvzf", archive, "-C", run_dir_path, dir_name] + ) except subprocess.CalledProcessError as e: - logger.error('Error creating tar archive') + logger.error("Error creating tar archive") raise e # Generate the md5sum under the same folder as run_dir - md5file = archive + '.md5' + md5file = archive + ".md5" try: - f = open(md5file, 'w') + f = open(md5file, "w") os.chdir(run_dir_path) - subprocess.call(['md5sum', os.path.basename(archive)], stdout=f) + subprocess.call(["md5sum", os.path.basename(archive)], stdout=f) f.close() except subprocess.CalledProcessError as e: - logger.error('Error creating md5 file') + logger.error("Error creating md5 file") raise e # Rsync the files to the analysis cluster - destination = CONFIG['analysis']['deliver_runfolder'].get('destination') - rsync_opts = {'-LtDrv': None, - '--chmod': 'g+rw'} - connection_details = CONFIG['analysis']['deliver_runfolder'].get('analysis_server') - archive_transfer = RsyncAgent(archive, - dest_path=destination, - remote_host=connection_details['host'], - remote_user=connection_details['user'], - validate=False, - opts=rsync_opts) - md5_transfer = RsyncAgent(md5file, - dest_path=destination, - 
remote_host=connection_details['host'], - remote_user=connection_details['user'], - validate=False, - opts=rsync_opts) + destination = CONFIG["analysis"]["deliver_runfolder"].get("destination") + rsync_opts = {"-LtDrv": None, "--chmod": "g+rw"} + connection_details = CONFIG["analysis"]["deliver_runfolder"].get("analysis_server") + archive_transfer = RsyncAgent( + archive, + dest_path=destination, + remote_host=connection_details["host"], + remote_user=connection_details["user"], + validate=False, + opts=rsync_opts, + ) + md5_transfer = RsyncAgent( + md5file, + dest_path=destination, + remote_host=connection_details["host"], + remote_user=connection_details["user"], + validate=False, + opts=rsync_opts, + ) archive_transfer.transfer() md5_transfer.transfer() @@ -244,83 +298,108 @@ def transfer_runfolder(run_dir, pid, exclude_lane): os.remove(new_sample_sheet) os.remove(archive) os.remove(md5file) - except IOError as e: - logger.error('Was not able to delete all temporary files') + except OSError as e: + logger.error("Was not able to delete all temporary files") raise e return + def extract_project_samplesheet(sample_sheet, pid_list): - header_line = '' - project_entries = '' + header_line = "" + project_entries = "" with open(sample_sheet) as f: for line in f: - if line.split(',')[0] in ('Lane', 'FCID'): # include the header + if line.split(",")[0] in ("Lane", "FCID"): # include the header header_line += line elif any(pid in line for pid in pid_list): - project_entries += line # include only lines related to the specified project + project_entries += ( + line # include only lines related to the specified project + ) new_samplesheet_content = header_line + project_entries return new_samplesheet_content + def run_preprocessing(run, software): """Run demultiplexing in all data directories. :param str run: Process a particular run instead of looking for runs """ + def _process(run): """Process a run/flowcell and transfer to analysis server. :param taca.illumina.Run run: Run to be processed and transferred """ - logger.info('Checking run {}'.format(run.id)) - transfer_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv') - if run.is_transferred(transfer_file): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot - logger.info('Run {} already transferred to analysis server, skipping it'.format(run.id)) + logger.info(f"Checking run {run.id}") + transfer_file = os.path.join(CONFIG["analysis"]["status_dir"], "transfer.tsv") + if run.is_transferred( + transfer_file + ): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot + logger.info( + f"Run {run.id} already transferred to analysis server, skipping it" + ) return - if run.get_run_status() == 'SEQUENCING': - logger.info('Run {} is not finished yet'.format(run.id)) - if 'statusdb' in CONFIG: + if run.get_run_status() == "SEQUENCING": + logger.info(f"Run {run.id} is not finished yet") + if "statusdb" in CONFIG: _upload_to_statusdb(run) - elif run.get_run_status() == 'TO_START': - if run.get_run_type() == 'NON-NGI-RUN': + elif run.get_run_status() == "TO_START": + if run.get_run_type() == "NON-NGI-RUN": # For now MiSeq specific case. 
Process only NGI-run, skip all the others (PhD student runs) - logger.warn('Run {} marked as {}, ' - 'TACA will skip this and move the run to ' - 'no-sync directory'.format(run.id, run.get_run_type())) - if 'storage' in CONFIG: - run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type]) + logger.warn( + f"Run {run.id} marked as {run.get_run_type()}, " + "TACA will skip this and move the run to " + "no-sync directory" + ) + if "storage" in CONFIG: + run.archive_run( + CONFIG["storage"]["archive_dirs"][run.sequencer_type] + ) return - logger.info(('Starting BCL to FASTQ conversion and demultiplexing for run {}'.format(run.id))) - if 'statusdb' in CONFIG: + logger.info( + f"Starting BCL to FASTQ conversion and demultiplexing for run {run.id}" + ) + if "statusdb" in CONFIG: _upload_to_statusdb(run) run.demultiplex_run() - elif run.get_run_status() == 'IN_PROGRESS': - logger.info(('BCL conversion and demultiplexing process in ' - 'progress for run {}, skipping it'.format(run.id))) + elif run.get_run_status() == "IN_PROGRESS": + logger.info( + "BCL conversion and demultiplexing process in " + f"progress for run {run.id}, skipping it" + ) # Upload to statusDB if applies - if 'statusdb' in CONFIG: + if "statusdb" in CONFIG: _upload_to_statusdb(run) # This function checks if demux is done run.check_run_status() # Previous elif might change the status to COMPLETED, therefore to avoid skipping # a cycle take the last if out of the elif - if run.get_run_status() == 'COMPLETED': + if run.get_run_status() == "COMPLETED": run.check_run_status() - logger.info(('Preprocessing of run {} is finished, transferring it'.format(run.id))) + logger.info(f"Preprocessing of run {run.id} is finished, transferring it") # Upload to statusDB if applies - if 'statusdb' in CONFIG: + if "statusdb" in CONFIG: _upload_to_statusdb(run) demux_summary_message = [] for demux_id, demux_log in run.demux_summary.items(): - if demux_log['errors'] or demux_log['warnings']: - demux_summary_message.append("Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(demux_id, demux_log['errors'], demux_log['warnings'])) - demux_summary_message.append("\n".join(demux_log['error_and_warning_messages'][:5])) - if len(demux_log['error_and_warning_messages'])>5: - demux_summary_message.append("...... Only the first 5 errors or warnings are displayed for Demultiplexing_{}.".format(demux_id)) + if demux_log["errors"] or demux_log["warnings"]: + demux_summary_message.append( + "Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format( + demux_id, demux_log["errors"], demux_log["warnings"] + ) + ) + demux_summary_message.append( + "\n".join(demux_log["error_and_warning_messages"][:5]) + ) + if len(demux_log["error_and_warning_messages"]) > 5: + demux_summary_message.append( + f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}." + ) # Notify with a mail run completion and stats uploaded if demux_summary_message: - sbt = ("{} Demultiplexing Completed with ERRORs or WARNINGS!".format(run.id)) + sbt = f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!" msg = """The run {run} has been demultiplexed with errors or warnings! 
{errors_warnings} @@ -329,9 +408,11 @@ def _process(run): The run is available at : https://genomics-status.scilifelab.se/flowcells/{run} - """.format(errors_warnings='\n'.join(demux_summary_message), run=run.id) + """.format( + errors_warnings="\n".join(demux_summary_message), run=run.id + ) else: - sbt = ("{} Demultiplexing Completed!".format(run.id)) + sbt = f"{run.id} Demultiplexing Completed!" msg = """The run {run} has been demultiplexed without any error or warning. The Run will be transferred to the analysis cluster for further analysis. @@ -339,67 +420,103 @@ def _process(run): The run is available at : https://genomics-status.scilifelab.se/flowcells/{run} """.format(run=run.id) - run.send_mail(sbt, msg, rcp=CONFIG['mail']['recipients']) + run.send_mail(sbt, msg, rcp=CONFIG["mail"]["recipients"]) # Copy demultiplex stats file, InterOp meta data and run xml files to shared file system for LIMS purpose - if 'mfs_path' in CONFIG['analysis']: + if "mfs_path" in CONFIG["analysis"]: try: - mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id) - logger.info('Copying demultiplex stats, InterOp metadata and XML files for run {} to {}'.format(run.id, mfs_dest)) + mfs_dest = os.path.join( + CONFIG["analysis"]["mfs_path"][run.sequencer_type.lower()], + run.id, + ) + logger.info( + f"Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}" + ) if not os.path.exists(mfs_dest): os.mkdir(mfs_dest) - demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports', - 'html', run.flowcell_id, 'all', 'all', 'all', 'laneBarcode.html') - copyfile(demulti_stat_src, os.path.join(mfs_dest, 'laneBarcode.html')) + demulti_stat_src = os.path.join( + run.run_dir, + run.demux_dir, + "Reports", + "html", + run.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) + copyfile( + demulti_stat_src, os.path.join(mfs_dest, "laneBarcode.html") + ) # Copy RunInfo.xml - run_info_xml_src = os.path.join(run.run_dir, 'RunInfo.xml') + run_info_xml_src = os.path.join(run.run_dir, "RunInfo.xml") if os.path.isfile(run_info_xml_src): - copyfile(run_info_xml_src, os.path.join(mfs_dest, 'RunInfo.xml')) + copyfile( + run_info_xml_src, os.path.join(mfs_dest, "RunInfo.xml") + ) # Copy RunParameters.xml - run_parameters_xml_src = os.path.join(run.run_dir, 'RunParameters.xml') + run_parameters_xml_src = os.path.join( + run.run_dir, "RunParameters.xml" + ) if os.path.isfile(run_info_xml_src): - copyfile(run_parameters_xml_src, os.path.join(mfs_dest, 'RunParameters.xml')) + copyfile( + run_parameters_xml_src, + os.path.join(mfs_dest, "RunParameters.xml"), + ) # Copy InterOp - interop_src = os.path.join(run.run_dir, 'InterOp') + interop_src = os.path.join(run.run_dir, "InterOp") if os.path.exists(interop_src): - copytree(interop_src, os.path.join(mfs_dest, 'InterOp'), dirs_exist_ok=True) + copytree( + interop_src, + os.path.join(mfs_dest, "InterOp"), + dirs_exist_ok=True, + ) except: - logger.warn('Could not copy demultiplex stats, InterOp metadata or XML files for run {}'.format(run.id)) + logger.warn( + f"Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}" + ) # Transfer to analysis server if flag is True if run.transfer_to_analysis_server: - mail_recipients = CONFIG.get('mail', {}).get('recipients') - logger.info('Transferring run {} to {} into {}' - .format(run.id, - run.CONFIG['analysis_server']['host'], - run.CONFIG['analysis_server']['sync']['data_archive'])) + mail_recipients = CONFIG.get("mail", 
{}).get("recipients") + logger.info( + "Transferring run {} to {} into {}".format( + run.id, + run.CONFIG["analysis_server"]["host"], + run.CONFIG["analysis_server"]["sync"]["data_archive"], + ) + ) run.transfer_run(transfer_file, mail_recipients) # Archive the run if indicated in the config file - if 'storage' in CONFIG: #TODO: make sure archiving to PDC is not ongoing - run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type]) + if "storage" in CONFIG: # TODO: make sure archiving to PDC is not ongoing + run.archive_run(CONFIG["storage"]["archive_dirs"][run.sequencer_type]) if run: # Determine the run type runObj = get_runObj(run, software) if not runObj: - raise RuntimeError("Unrecognized instrument type or incorrect run folder {}".format(run)) + raise RuntimeError( + f"Unrecognized instrument type or incorrect run folder {run}" + ) else: _process(runObj) else: - data_dirs = CONFIG.get('analysis').get('data_dirs') + data_dirs = CONFIG.get("analysis").get("data_dirs") for data_dir in data_dirs: # Run folder looks like DATE_*_*_*, the last section is the FC name. - runs = glob.glob(os.path.join(data_dir, '[1-9]*_*_*_*')) + runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*")) for _run in runs: runObj = get_runObj(_run, software) if not runObj: - logger.warning('Unrecognized instrument type or incorrect run folder {}'.format(run)) + logger.warning( + f"Unrecognized instrument type or incorrect run folder {run}" + ) else: try: _process(runObj) except: # This function might throw and exception, # it is better to continue processing other runs - logger.warning('There was an error processing the run {}'.format(run)) + logger.warning(f"There was an error processing the run {run}") pass diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py index 74e4c3ef..4f8f9345 100644 --- a/taca/analysis/analysis_nanopore.py +++ b/taca/analysis/analysis_nanopore.py @@ -1,17 +1,17 @@ """Nanopore analysis methods for TACA.""" -import os import logging +import os import re import traceback -from taca.utils.config import CONFIG -from taca.utils.misc import send_mail from taca.nanopore.ONT_run_classes import ( + ONT_RUN_PATTERN, + ONT_qc_run, ONT_run, ONT_user_run, - ONT_qc_run, - ONT_RUN_PATTERN, ) +from taca.utils.config import CONFIG +from taca.utils.misc import send_mail logger = logging.getLogger(__name__) @@ -36,7 +36,6 @@ def find_run_dirs(dir_to_search: str, skip_dirs: list): def send_error_mail(run_name, error: BaseException): - email_subject = f"Run processed with errors: {run_name}" email_message = f"{str(error)}\n\n{traceback.format_exc()}" email_recipients = CONFIG["mail"]["recipients"] @@ -75,7 +74,6 @@ def process_user_run(ont_user_run: ONT_user_run): if not ont_user_run.is_synced(): logger.info(f"{ont_user_run.run_name}: Run is not fully synced, skipping.") else: - if ont_user_run.is_transferred(): logger.warning( f"{ont_user_run.run_name}: Run is already logged as transferred, sending mail." @@ -157,7 +155,6 @@ def process_qc_run(ont_qc_run: ONT_qc_run): if not ont_qc_run.is_synced(): logger.info(f"{ont_qc_run.run_name}: Run is not fully synced, skipping.") else: - # Assert all files are in place logger.info(f"{ont_qc_run.run_name}: Asserting run contents...") ont_qc_run.assert_contents() @@ -209,6 +206,10 @@ def process_qc_run(ont_qc_run: ONT_qc_run): logger.info( f"{ont_qc_run.run_name}: Could not find Anglerfish sample sheet, skipping." 
) + elif not ont_qc_run.has_fastq_output(): + logger.info( + f"{ont_qc_run.run_name}: Run has no fastq output, skipping." + ) else: logger.info(f"{ont_qc_run.run_name}: Starting Anglerfish...") ont_qc_run.run_anglerfish() @@ -243,7 +244,7 @@ def process_qc_run(ont_qc_run: ONT_qc_run): ont_qc_run.archive_run() -def ont_transfer(run_abspath: str or None, qc: bool = False): +def ont_transfer(run_abspath: str | None, qc: bool = False): """CLI entry function. Find finished ONT runs in ngi-nas and transfer to HPC cluster. @@ -257,7 +258,6 @@ def ont_transfer(run_abspath: str or None, qc: bool = False): # If no run is specified, locate all runs else: - for run_type in ["user_run", "qc_run"]: logger.info(f"Looking for runs of type '{run_type}'...") diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py index ba101d66..52b6423b 100644 --- a/taca/analysis/cli.py +++ b/taca/analysis/cli.py @@ -13,21 +13,42 @@ def analysis(): # Illumina analysis subcommands + @analysis.command() -@click.option('-r', '--run', type=click.Path(exists=True), default=None, - help='Demultiplex only a particular run') -@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq', - help='Available software for demultiplexing: bcl2fastq (default), bclconvert') +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + default=None, + help="Demultiplex only a particular run", +) +@click.option( + "-s", + "--software", + type=click.Choice(["bcl2fastq", "bclconvert"]), + default="bcl2fastq", + help="Available software for demultiplexing: bcl2fastq (default), bclconvert", +) def demultiplex(run, software): - """Demultiplex and transfer all runs present in the data directories.""" - an.run_preprocessing(run, software) + """Demultiplex and transfer all runs present in the data directories.""" + an.run_preprocessing(run, software) + @analysis.command() -@click.option('--runfolder-project', is_flag=False, help='Project IDs for runfolder transfer separated by comma') -@click.option('--exclude-lane', default='', help='Lanes to exclude separated by comma') -@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq', - help='Available software for demultiplexing: bcl2fastq (default), bclconvert') -@click.argument('rundir') +@click.option( + "--runfolder-project", + is_flag=False, + help="Project IDs for runfolder transfer separated by comma", +) +@click.option("--exclude-lane", default="", help="Lanes to exclude separated by comma") +@click.option( + "-s", + "--software", + type=click.Choice(["bcl2fastq", "bclconvert"]), + default="bcl2fastq", + help="Available software for demultiplexing: bcl2fastq (default), bclconvert", +) +@click.argument("rundir") def transfer(rundir, runfolder_project, exclude_lane, software): """Transfers the run without qc.""" if not runfolder_project: @@ -35,10 +56,16 @@ def transfer(rundir, runfolder_project, exclude_lane, software): else: an.transfer_runfolder(rundir, pid=runfolder_project, exclude_lane=exclude_lane) + @analysis.command() -@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq', - help='Available software for demultiplexing: bcl2fastq (default), bclconvert') -@click.argument('rundir') +@click.option( + "-s", + "--software", + type=click.Choice(["bcl2fastq", "bclconvert"]), + default="bcl2fastq", + help="Available software for demultiplexing: bcl2fastq (default), bclconvert", +) +@click.argument("rundir") def updatedb(rundir, software): """Save the run to 
statusdb.""" an.upload_to_statusdb(rundir, software) @@ -46,6 +73,7 @@ def updatedb(rundir, software): # Nanopore analysis subcommands + @analysis.command() @click.option( "-r", @@ -65,6 +93,7 @@ def ont_transfer(run, qc): """Find and process all runs""" analysis_nanopore.ont_transfer(run, qc) + @analysis.command() @click.argument("run") def ont_updatedb(run): diff --git a/taca/backup/backup.py b/taca/backup/backup.py index 037b1ea6..8d43a558 100644 --- a/taca/backup/backup.py +++ b/taca/backup/backup.py @@ -1,55 +1,63 @@ """Backup methods and utilities.""" +import csv import logging import os import re import shutil import subprocess as sp import time -import csv - from datetime import datetime + +from taca.utils import filesystem, misc, statusdb from taca.utils.config import CONFIG -from taca.utils import statusdb, filesystem, misc -from io import open logger = logging.getLogger(__name__) -class run_vars(object): + +class run_vars: """A simple variable storage class.""" + def __init__(self, run, archive_path): self.abs_path = os.path.abspath(run) self.path, self.name = os.path.split(self.abs_path) - self.name = self.name.split('.', 1)[0] - self.zip = os.path.join(archive_path, f'{self.name}.tar.gz') - self.key = '{}.key'.format(self.name) - self.key_encrypted = '{}.key.gpg'.format(self.name) - self.zip_encrypted = os.path.join(archive_path, f'{self.name}.tar.gz.gpg') + self.name = self.name.split(".", 1)[0] + self.zip = os.path.join(archive_path, f"{self.name}.tar.gz") + self.key = f"{self.name}.key" + self.key_encrypted = f"{self.name}.key.gpg" + self.zip_encrypted = os.path.join(archive_path, f"{self.name}.tar.gz.gpg") + -class backup_utils(object): +class backup_utils: """A class object with main utility methods related to backing up.""" def __init__(self, run=None): self.run = run self.fetch_config_info() - self.host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0] + self.host_name = os.getenv("HOSTNAME", os.uname()[1]).split(".", 1)[0] def fetch_config_info(self): """Try to fecth required info from the config file. 
Log and exit if any neccesary info is missing.""" try: - self.data_dirs = CONFIG['backup']['data_dirs'] - self.archive_dirs = CONFIG['backup']['archive_dirs'] - self.archived_dirs = CONFIG['backup']['archived_dirs'] - self.exclude_list = CONFIG['backup']['exclude_list'] - self.keys_path = CONFIG['backup']['keys_path'] - self.gpg_receiver = CONFIG['backup']['gpg_receiver'] - self.mail_recipients = CONFIG['mail']['recipients'] - self.check_demux = CONFIG.get('backup', {}).get('check_demux', False) - self.couch_info = CONFIG.get('statusdb') - self.finished_run_indicator = CONFIG.get('storage', {}).get('finished_run_indicator', 'RTAComplete.txt') - self.copy_complete_indicator = CONFIG.get('storage', {}).get('copy_complete_indicator', 'CopyComplete.txt') - self.archive_log_location = CONFIG['backup']['archive_log'] + self.data_dirs = CONFIG["backup"]["data_dirs"] + self.archive_dirs = CONFIG["backup"]["archive_dirs"] + self.archived_dirs = CONFIG["backup"]["archived_dirs"] + self.exclude_list = CONFIG["backup"]["exclude_list"] + self.keys_path = CONFIG["backup"]["keys_path"] + self.gpg_receiver = CONFIG["backup"]["gpg_receiver"] + self.mail_recipients = CONFIG["mail"]["recipients"] + self.check_demux = CONFIG.get("backup", {}).get("check_demux", False) + self.couch_info = CONFIG.get("statusdb") + self.finished_run_indicator = CONFIG.get("storage", {}).get( + "finished_run_indicator", "RTAComplete.txt" + ) + self.copy_complete_indicator = CONFIG.get("storage", {}).get( + "copy_complete_indicator", "CopyComplete.txt" + ) + self.archive_log_location = CONFIG["backup"]["archive_log"] except KeyError as e: - logger.error('Config file is missing the key {}, make sure it have all required information'.format(str(e))) + logger.error( + f"Config file is missing the key {str(e)}, make sure it have all required information" + ) raise SystemExit def collect_runs(self, ext=None, filter_by_ext=False): @@ -59,24 +67,30 @@ def collect_runs(self, ext=None, filter_by_ext=False): run_type = self._get_run_type(self.run) archive_path = self.archive_dirs[run_type] run = run_vars(self.run, archive_path) - if not (re.match(filesystem.RUN_RE, run.name) or re.match(filesystem.RUN_RE_ONT, run.name)): - logger.error('Given run {} did not match a FC pattern'.format(self.run)) + if not ( + re.match(filesystem.RUN_RE, run.name) + or re.match(filesystem.RUN_RE_ONT, run.name) + ): + logger.error(f"Given run {self.run} did not match a FC pattern") raise SystemExit if self._is_ready_to_archive(run, ext): self.runs.append(run) else: for adir in self.archive_dirs.values(): if not os.path.isdir(adir): - logger.warn('Path {} does not exist or it is not a directory'.format(adir)) + logger.warn(f"Path {adir} does not exist or it is not a directory") continue for item in os.listdir(adir): if filter_by_ext and not item.endswith(ext): continue elif item.endswith(ext): - item = item.replace(ext, '') + item = item.replace(ext, "") elif not os.path.isdir(os.path.join(adir, item)): continue - if (re.match(filesystem.RUN_RE, item) or re.match(filesystem.RUN_RE_ONT, item)) and item not in self.runs: + if ( + re.match(filesystem.RUN_RE, item) + or re.match(filesystem.RUN_RE_ONT, item) + ) and item not in self.runs: run_type = self._get_run_type(item) archive_path = self.archive_dirs[run_type] run = run_vars(os.path.join(adir, item), archive_path) @@ -86,7 +100,14 @@ def collect_runs(self, ext=None, filter_by_ext=False): def avail_disk_space(self, path, run): """Check the space on file system based on parent directory of the run.""" # not able 
to fetch runtype use the max size as precaution, size units in GB - illumina_run_sizes = {'novaseq': 1800, 'miseq': 20, 'nextseq': 250, 'NovaSeqXPlus': 3600, 'promethion': 3000, 'minion': 1000} + illumina_run_sizes = { + "novaseq": 1800, + "miseq": 20, + "nextseq": 250, + "NovaSeqXPlus": 3600, + "promethion": 3000, + "minion": 1000, + } required_size = illumina_run_sizes.get(self._get_run_type(run), 900) * 2 # check for any ongoing runs and add up the required size accrdingly for ddir in self.data_dirs.values(): @@ -95,19 +116,25 @@ def avail_disk_space(self, path, run): for item in os.listdir(ddir): if not re.match(filesystem.RUN_RE, item): continue - if not os.path.exists(os.path.join(ddir, item, 'RTAComplete.txt')): - required_size += illumina_run_sizes.get(self._get_run_type(run), 900) + if not os.path.exists(os.path.join(ddir, item, "RTAComplete.txt")): + required_size += illumina_run_sizes.get( + self._get_run_type(run), 900 + ) # get available free space from the file system try: - df_proc = sp.Popen(['df', path], stdout=sp.PIPE, stderr=sp.PIPE) + df_proc = sp.Popen(["df", path], stdout=sp.PIPE, stderr=sp.PIPE) df_out, df_err = df_proc.communicate() - available_size = int(df_out.strip().decode("utf-8").split('\n')[-1].strip().split()[3])/1024/1024 + available_size = ( + int(df_out.strip().decode("utf-8").split("\n")[-1].strip().split()[3]) + / 1024 + / 1024 + ) except Exception as e: - logger.error('Evaluation of disk space failed with error {}'.format(e)) + logger.error(f"Evaluation of disk space failed with error {e}") raise SystemExit if available_size < required_size: - e_msg = 'Required space for encryption is {}GB, but only {}GB available'.format(required_size, available_size) - subjt = 'Low space for encryption - {}'.format(self.host_name) + e_msg = f"Required space for encryption is {required_size}GB, but only {available_size}GB available" + subjt = f"Low space for encryption - {self.host_name}" logger.error(e_msg) misc.send_mail(subjt, e_msg, self.mail_recipients) raise SystemExit @@ -118,47 +145,63 @@ def file_in_pdc(self, src_file, silent=True): # non-zero/False though cmd is execudted but file not found src_file_abs = os.path.abspath(src_file) try: - sp.check_call(['dsmc', 'query', 'archive', src_file_abs], stdout=sp.PIPE, stderr=sp.PIPE) + sp.check_call( + ["dsmc", "query", "archive", src_file_abs], + stdout=sp.PIPE, + stderr=sp.PIPE, + ) value = True except sp.CalledProcessError: value = False if not silent: - msg = 'File {} {} in PDC'.format(src_file_abs, 'exist' if value else 'do not exist') + msg = "File {} {} in PDC".format( + src_file_abs, "exist" if value else "do not exist" + ) logger.info(msg) return value def _get_run_type(self, run): """Returns run type based on the flowcell name.""" - run_type = '' + run_type = "" try: - if '_A0' in run: - run_type = 'novaseq' - elif '-' in run.split('_')[-1]: - run_type = 'miseq' - elif '_NS' in run or '_VH' in run: - run_type = 'nextseq' - elif '_LH' in run: - run_type = 'NovaSeqXPlus' - elif '_MN' in run: - run_type = 'minion' - elif re.match("^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$",run): - run_type = 'promethion' + if "_A0" in run: + run_type = "novaseq" + elif "-" in run.split("_")[-1]: + run_type = "miseq" + elif "_NS" in run or "_VH" in run: + run_type = "nextseq" + elif "_LH" in run: + run_type = "NovaSeqXPlus" + elif "_MN" in run: + run_type = "minion" + elif re.match( + "^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$", run + ): + run_type = "promethion" else: - run_type = '' + 
run_type = "" except: - logger.warn('Could not fetch run type for run {}'.format(run)) + logger.warn(f"Could not fetch run type for run {run}") return run_type - def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_failed=False, tmp_files=[]): + def _call_commands( + self, + cmd1, + cmd2=None, + out_file=None, + return_out=False, + mail_failed=False, + tmp_files=[], + ): """Call an external command(s) with atmost two commands per function call. Given 'out_file' is always used for the later cmd and also stdout can be return for the later cmd. In case of failure, the 'tmp_files' are removed""" if out_file: if not cmd2: - stdout1 = open(out_file, 'w') + stdout1 = open(out_file, "w") else: stdout1 = sp.PIPE - stdout2 = open(out_file, 'w') + stdout2 = open(out_file, "w") else: stdout1 = sp.PIPE stdout2 = sp.PIPE @@ -171,7 +214,9 @@ def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_ p2 = sp.Popen(cmd2, stdin=p1.stdout, stdout=stdout2, stderr=sp.PIPE) p2_stat = p2.wait() p2_out, p2_err = p2.communicate() - if not self._check_status(cmd2, p2_stat, p2_err, mail_failed, tmp_files): + if not self._check_status( + cmd2, p2_stat, p2_err, mail_failed, tmp_files + ): return (False, p2_err) if return_out else False p1_stat = p1.wait() p1_out, p1_err = p1.communicate() @@ -194,10 +239,12 @@ def _check_status(self, cmd, status, err_msg, mail_failed, files_to_remove=[]): if status != 0: self._clean_tmp_files(files_to_remove) if mail_failed: - subjt = 'Command call failed - {}'.format(self.host_name) - e_msg = 'Called cmd: {}\n\nError msg: {}'.format(' '.join(cmd), err_msg) + subjt = f"Command call failed - {self.host_name}" + e_msg = "Called cmd: {}\n\nError msg: {}".format(" ".join(cmd), err_msg) misc.send_mail(subjt, e_msg, self.mail_recipients) - logger.error('Command "{}" failed with the error "{}"'.format(' '.join(cmd),err_msg)) + logger.error( + 'Command "{}" failed with the error "{}"'.format(" ".join(cmd), err_msg) + ) return False return True @@ -210,22 +257,24 @@ def _clean_tmp_files(self, files): def _log_pdc_statusdb(self, run): """Log the time stamp in statusDB if a file is succussfully sent to PDC.""" try: - run_vals = run.split('_') + run_vals = run.split("_") if len(run_vals[0]) == 8: run_date = run_vals[0][2:] else: run_date = run_vals[0] - run_fc = '{}_{}'.format(run_date, run_vals[-1]) + run_fc = f"{run_date}_{run_vals[-1]}" couch_connection = statusdb.StatusdbSession(self.couch_info).connection - db = couch_connection[self.couch_info['db']] - fc_names = {e.key:e.id for e in db.view('names/name', reduce=False)} + db = couch_connection[self.couch_info["db"]] + fc_names = {e.key: e.id for e in db.view("names/name", reduce=False)} d_id = fc_names[run_fc] doc = db.get(d_id) - doc['pdc_archived'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + doc["pdc_archived"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") db.save(doc) - logger.info('Logged "pdc_archived" timestamp for fc {} in statusdb doc "{}"'.format(run, d_id)) + logger.info( + f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"' + ) except: - logger.warn('Not able to log "pdc_archived" timestamp for run {}'.format(run)) + logger.warn(f'Not able to log "pdc_archived" timestamp for run {run}') def _is_ready_to_archive(self, run, ext): """Check if the run to be encrypted has finished sequencing and has been copied completely to nas""" @@ -233,24 +282,35 @@ def _is_ready_to_archive(self, run, ext): run_path = run.abs_path rta_file = os.path.join(run_path, 
self.finished_run_indicator) cp_file = os.path.join(run_path, self.copy_complete_indicator) - if (os.path.exists(rta_file) and os.path.exists(cp_file) and (not self.file_in_pdc(run.zip_encrypted))) or (self._get_run_type(run.name) in ['promethion', 'minion'] and os.path.exists(os.path.join(run_path, ".sync_finished"))): + if ( + os.path.exists(rta_file) + and os.path.exists(cp_file) + and (not self.file_in_pdc(run.zip_encrypted)) + ) or ( + self._get_run_type(run.name) in ["promethion", "minion"] + and os.path.exists(os.path.join(run_path, ".sync_finished")) + ): # Case for encrypting # Run has NOT been encrypted (run.tar.gz.gpg not exists) - if ext == '.tar.gz' and (not os.path.exists(run.zip_encrypted)): - logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for archiving') + if ext == ".tar.gz" and (not os.path.exists(run.zip_encrypted)): + logger.info( + f"Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for archiving" + ) archive_ready = True # Case for putting data to PDC # Run has already been encrypted (run.tar.gz.gpg exists) - elif ext == '.tar.gz.gpg' and os.path.exists(run.zip_encrypted): - logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for sending to PDC') + elif ext == ".tar.gz.gpg" and os.path.exists(run.zip_encrypted): + logger.info( + f"Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for sending to PDC" + ) archive_ready = True return archive_ready def log_archived_run(self, file_name): """Write files archived to PDC to log file""" - with open(self.archive_log_location, 'a') as archive_file: - tsv_writer = csv.writer(archive_file, delimiter='\t') + with open(self.archive_log_location, "a") as archive_file: + tsv_writer = csv.writer(archive_file, delimiter="\t") tsv_writer.writerow([file_name, str(datetime.now())]) def _move_run_to_archived(self, run): @@ -258,7 +318,7 @@ def _move_run_to_archived(self, run): run_type = self._get_run_type(run.name) archived_path = self.archived_dirs[run_type] if os.path.isdir(archived_path): - logger.info('Moving run {} to the archived folder'.format(run.name)) + logger.info(f"Moving run {run.name} to the archived folder") shutil.move(run.name, archived_path) else: logger.warning("Cannot move run to archived, destination does not exist") @@ -267,130 +327,207 @@ def _move_run_to_archived(self, run): def encrypt_runs(cls, run, force): """Encrypt the runs that have been collected.""" bk = cls(run) - bk.collect_runs(ext='.tar.gz') - logger.info(f'In total, found {len(bk.runs)} run(s) to be encrypted') + bk.collect_runs(ext=".tar.gz") + logger.info(f"In total, found {len(bk.runs)} run(s) to be encrypted") for run in bk.runs: - run.flag = f'{run.name}.encrypting' + run.flag = f"{run.name}.encrypting" run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) tmp_files = [run.zip_encrypted, run.key_encrypted, run.key, run.flag] - logger.info(f'Encryption of run {run.name} is now started') + logger.info(f"Encryption of run {run.name} is now started") # Check if there is enough space and exit if not bk.avail_disk_space(run.path, run.name) # Check if the run in demultiplexed if not force and bk.check_demux: - if not misc.run_is_demuxed(run, bk.couch_info, bk._get_run_type(run.name)): - logger.warn(f'Run {run.name} is not demultiplexed yet, so skipping it') + if not misc.run_is_demuxed( + run, bk.couch_info, 
bk._get_run_type(run.name) + ): + logger.warn( + f"Run {run.name} is not demultiplexed yet, so skipping it" + ) continue - logger.info(f'Run {run.name} is demultiplexed and proceeding with encryption') + logger.info( + f"Run {run.name} is demultiplexed and proceeding with encryption" + ) with filesystem.chdir(run.path): # skip run if already ongoing if os.path.exists(run.flag): - logger.warn(f'Run {run.name} is already being encrypted, so skipping now') + logger.warn( + f"Run {run.name} is already being encrypted, so skipping now" + ) continue - flag = open(run.flag, 'w').close() + open(run.flag, "w").close() # zip the run directory if os.path.exists(run.zip): if os.path.isdir(run.name): - logger.warn(f'Both run source and zipped archive exist for run {run.name}, skipping run as precaution') + logger.warn( + f"Both run source and zipped archive exist for run {run.name}, skipping run as precaution" + ) bk._clean_tmp_files([run.flag]) continue - logger.info(f'Zipped archive already exist for run {run.name}, so using it for encryption') + logger.info( + f"Zipped archive already exist for run {run.name}, so using it for encryption" + ) else: - exclude_files = " ".join([f'--exclude {x}' for x in bk.exclude_list]) - logger.info(f'Creating zipped archive for run {run.name}') - if bk._call_commands(cmd1=f'tar {exclude_files} -cf - {run.name}', cmd2='pigz --fast -c -', - out_file=run.zip, mail_failed=True, tmp_files=[run.zip, run.flag]): - logger.info(f'Run {run.name} was successfully compressed and transferred to {run.zip}') + exclude_files = " ".join( + [f"--exclude {x}" for x in bk.exclude_list] + ) + logger.info(f"Creating zipped archive for run {run.name}") + if bk._call_commands( + cmd1=f"tar {exclude_files} -cf - {run.name}", + cmd2="pigz --fast -c -", + out_file=run.zip, + mail_failed=True, + tmp_files=[run.zip, run.flag], + ): + logger.info( + f"Run {run.name} was successfully compressed and transferred to {run.zip}" + ) else: - logger.warn(f'Skipping run {run.name} and moving on') + logger.warn(f"Skipping run {run.name} and moving on") continue # Remove encrypted file if already exists if os.path.exists(run.zip_encrypted): - logger.warn((f'Removing already existing encrypted file for run {run.name}, this is a precaution ' - 'to make sure the file was encrypted with correct key file')) - bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted]) + logger.warn( + f"Removing already existing encrypted file for run {run.name}, this is a precaution " + "to make sure the file was encrypted with correct key file" + ) + bk._clean_tmp_files( + [ + run.zip_encrypted, + run.key, + run.key_encrypted, + run.dst_key_encrypted, + ] + ) # Generate random key to use as pasphrase - if not bk._call_commands(cmd1='gpg --gen-random 1 256', out_file=run.key, tmp_files=tmp_files): - logger.warn(f'Skipping run {run.name} and moving on') + if not bk._call_commands( + cmd1="gpg --gen-random 1 256", out_file=run.key, tmp_files=tmp_files + ): + logger.warn(f"Skipping run {run.name} and moving on") continue - logger.info(f'Generated random phrase key for run {run.name}') + logger.info(f"Generated random phrase key for run {run.name}") # Calculate md5 sum pre encryption if not force: - logger.info('Calculating md5sum before encryption') - md5_call, md5_out = bk._call_commands(cmd1=f'md5sum {run.zip}', return_out=True, tmp_files=tmp_files) + logger.info("Calculating md5sum before encryption") + md5_call, md5_out = bk._call_commands( + cmd1=f"md5sum {run.zip}", return_out=True, 
tmp_files=tmp_files + ) if not md5_call: - logger.warn(f'Skipping run {run.name} and moving on') + logger.warn(f"Skipping run {run.name} and moving on") continue md5_pre_encrypt = md5_out.split()[0] # Encrypt the zipped run file - logger.info('Encrypting the zipped run file') - if not bk._call_commands(cmd1=(f'gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo ' - f'none -o {run.zip_encrypted} {run.zip}'), tmp_files=tmp_files): - logger.warn(f'Skipping run {run.name} and moving on') + logger.info("Encrypting the zipped run file") + if not bk._call_commands( + cmd1=( + f"gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo " + f"none -o {run.zip_encrypted} {run.zip}" + ), + tmp_files=tmp_files, + ): + logger.warn(f"Skipping run {run.name} and moving on") continue # Decrypt and check for md5 if not force: - logger.info('Calculating md5sum after encryption') - md5_call, md5_out = bk._call_commands(cmd1=f'gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}', - cmd2='md5sum', return_out=True, tmp_files=tmp_files) + logger.info("Calculating md5sum after encryption") + md5_call, md5_out = bk._call_commands( + cmd1=f"gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}", + cmd2="md5sum", + return_out=True, + tmp_files=tmp_files, + ) if not md5_call: - logger.warn(f'Skipping run {run.name} and moving on') + logger.warn(f"Skipping run {run.name} and moving on") continue md5_post_encrypt = md5_out.split()[0] if md5_pre_encrypt != md5_post_encrypt: - logger.error(f'md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on') + logger.error( + f"md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on" + ) bk._clean_tmp_files(tmp_files) continue - logger.info('Md5sum matches before and after encryption') + logger.info("Md5sum matches before and after encryption") # Encrypt and move the key file - if bk._call_commands(cmd1=f'gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}', tmp_files=tmp_files): + if bk._call_commands( + cmd1=f"gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}", + tmp_files=tmp_files, + ): shutil.move(run.key_encrypted, run.dst_key_encrypted) else: - logger.error('Encryption of key file failed, skipping run') + logger.error("Encryption of key file failed, skipping run") continue bk._clean_tmp_files([run.zip, run.key, run.flag]) - logger.info(f'Encryption of run {run.name} is successfully done, removing zipped run file') + logger.info( + f"Encryption of run {run.name} is successfully done, removing zipped run file" + ) @classmethod def pdc_put(cls, run): """Archive the collected runs to PDC.""" bk = cls(run) - bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True) - logger.info('In total, found {} run(s) to send PDC'.format(len(bk.runs))) + bk.collect_runs(ext=".tar.gz.gpg", filter_by_ext=True) + logger.info(f"In total, found {len(bk.runs)} run(s) to send PDC") for run in bk.runs: - run.flag = '{}.archiving'.format(run.name) + run.flag = f"{run.name}.archiving" run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted) if run.path not in bk.archive_dirs.values(): - logger.error(('Given run is not in one of the archive directories {}. 
Kindly move the run {} to appropriate ' - 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name))) + logger.error( + "Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate " + "archive dir before sending it to PDC".format( + ",".join(list(bk.archive_dirs.values())), run.name + ) + ) continue if not os.path.exists(run.dst_key_encrypted): - logger.error('Encrypted key file {} is not found for file {}, skipping it'.format(run.dst_key_encrypted, run.zip_encrypted)) + logger.error( + f"Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it" + ) continue with filesystem.chdir(run.path): - #skip run if being encrypted - if os.path.exists('{}.encrypting'.format(run.name)): - logger.warn('Run {} is currently being encrypted, so skipping now'.format(run.name)) + # skip run if being encrypted + if os.path.exists(f"{run.name}.encrypting"): + logger.warn( + f"Run {run.name} is currently being encrypted, so skipping now" + ) continue # skip run if already ongoing if os.path.exists(run.flag): - logger.warn('Run {} is already being archived, so skipping now'.format(run.name)) + logger.warn( + f"Run {run.name} is already being archived, so skipping now" + ) continue - if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False): - logger.warn('Seems like files related to run {} already exist in PDC, check and cleanup'.format(run.name)) + if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc( + run.dst_key_encrypted, silent=False + ): + logger.warn( + f"Seems like files related to run {run.name} already exist in PDC, check and cleanup" + ) continue - flag = open(run.flag, 'w').close() - logger.info('Sending file {} to PDC'.format(run.zip_encrypted)) - if bk._call_commands(cmd1='dsmc archive {}'.format(run.zip_encrypted), tmp_files=[run.flag]): - time.sleep(15) # give some time just in case 'dsmc' needs to settle - if bk._call_commands(cmd1='dsmc archive {}'.format(run.dst_key_encrypted), tmp_files=[run.flag]): - time.sleep(5) # give some time just in case 'dsmc' needs to settle - if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted): - logger.info('Successfully sent file {} to PDC, moving file locally from {} to archived folder'.format(run.zip_encrypted, run.path)) + open(run.flag, "w").close() + logger.info(f"Sending file {run.zip_encrypted} to PDC") + if bk._call_commands( + cmd1=f"dsmc archive {run.zip_encrypted}", tmp_files=[run.flag] + ): + time.sleep(15) # give some time just in case 'dsmc' needs to settle + if bk._call_commands( + cmd1=f"dsmc archive {run.dst_key_encrypted}", + tmp_files=[run.flag], + ): + time.sleep( + 5 + ) # give some time just in case 'dsmc' needs to settle + if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc( + run.dst_key_encrypted + ): + logger.info( + f"Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to archived folder" + ) bk.log_archived_run(run.zip_encrypted) if bk.couch_info: bk._log_pdc_statusdb(run.name) - bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag]) + bk._clean_tmp_files( + [run.zip_encrypted, run.dst_key_encrypted, run.flag] + ) bk._move_run_to_archived(run) continue - logger.warn('Sending file {} to PDC failed'.format(run.zip_encrypted)) + logger.warn(f"Sending file {run.zip_encrypted} to PDC failed") diff --git a/taca/backup/cli.py b/taca/backup/cli.py index 07cce810..60d8c442 100644 --- 
a/taca/backup/cli.py +++ b/taca/backup/cli.py @@ -1,39 +1,75 @@ """CLI for the backup subcommand.""" import click + from taca.backup.backup import backup_utils as bkut + @click.group() @click.pass_context def backup(ctx): - """ Backup management methods and utilities """ + """Backup management methods and utilities""" pass + @backup.command() -@click.option('-r', '--run', type=click.Path(exists=True), help="A run (directory or a zipped archive) to be encrypted") -@click.option('-f', '--force', is_flag=True, help="Ignore the checks and just try encryption. USE IT WITH CAUTION.") +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + help="A run (directory or a zipped archive) to be encrypted", +) +@click.option( + "-f", + "--force", + is_flag=True, + help="Ignore the checks and just try encryption. USE IT WITH CAUTION.", +) @click.pass_context def encrypt(ctx, run, force): bkut.encrypt_runs(run, force) -@backup.command(name='put_data') -@click.option('-r', '--run', type=click.Path(exists=True), help="A run name (without extension) to be sent to PDC") + +@backup.command(name="put_data") +@click.option( + "-r", + "--run", + type=click.Path(exists=True), + help="A run name (without extension) to be sent to PDC", +) @click.pass_context def put_data(ctx, run): bkut.pdc_put(run) -@backup.command(name='get_data') -@click.option('-r', '--run', required=True, help="A run name (without extension) to download from PDC") -@click.option('-o', '--outdir', type=click.Path(exists=True, file_okay=False, writable=True), - help="Optional directory name to save the downloaded file. Directory should exist") + +@backup.command(name="get_data") +@click.option( + "-r", + "--run", + required=True, + help="A run name (without extension) to download from PDC", +) +@click.option( + "-o", + "--outdir", + type=click.Path(exists=True, file_okay=False, writable=True), + help="Optional directory name to save the downloaded file. 
Directory should exist", +) @click.pass_context def get_data(ctx, run, outdir): ## W I P ## raise NotImplementedError + @backup.command() -@click.option('-r', '--run', required=True, type=click.Path(exists=True, dir_okay=False), help="A encripted run file") -@click.option('-k', '--key', required=True, help="Key file to be used for decryption") -@click.option('-p', '--password', help="To pass decryption passphrase via command line") +@click.option( + "-r", + "--run", + required=True, + type=click.Path(exists=True, dir_okay=False), + help="A encripted run file", +) +@click.option("-k", "--key", required=True, help="Key file to be used for decryption") +@click.option("-p", "--password", help="To pass decryption passphrase via command line") @click.pass_context def decrypt(ctx, run, key, password): ## W I P ## diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py index 07600870..df1e80ab 100644 --- a/taca/cleanup/cleanup.py +++ b/taca/cleanup/cleanup.py @@ -2,23 +2,28 @@ import logging import os import re - from collections import defaultdict from datetime import datetime from glob import glob -from taca.utils.config import CONFIG, load_config from taca.utils import filesystem, misc, statusdb -from io import open -from six.moves import map +from taca.utils.config import CONFIG, load_config logger = logging.getLogger(__name__) -def cleanup_miarka(days_fastq, days_analysis, - only_fastq, only_analysis, - clean_undetermined, status_db_config, - exclude_projects, list_only, - date, dry_run=False): + +def cleanup_miarka( + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + status_db_config, + exclude_projects, + list_only, + date, + dry_run=False, +): """Remove fastq/analysis data for projects that have been closed more than given days (as days_fastq/days_analysis) from the given 'miarka' cluster. 
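The reworked `cleanup_miarka` in the next hunk keys its deletions off how long a project has been closed: `get_closed_proj_info` parses the statusdb `close_date` (YYYY-MM-DD) and the resulting day count is compared against the `days_fastq` / `days_analysis` thresholds. A minimal standalone sketch of that age check, assuming the same date format; the helper name `project_older_than` is hypothetical and not part of this PR:

from datetime import datetime

def project_older_than(close_date: str, days_threshold: int, today: datetime | None = None) -> bool:
    """Return True if a project closed on `close_date` (YYYY-MM-DD) has been closed for at least `days_threshold` days."""
    today = today or datetime.today()
    # Same arithmetic as get_closed_proj_info: difference in whole days between today and the close date.
    closed_days = (today - datetime.strptime(close_date, "%Y-%m-%d")).days
    return closed_days >= days_threshold

# Example (hypothetical values): with a 90-day fastq retention window,
# project_older_than("2023-12-01", 90) becomes True once 90 days have passed since closure.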
@@ -49,217 +54,334 @@ def cleanup_miarka(days_fastq, days_analysis, - "*.bam" """ try: - config = CONFIG['cleanup']['miarka'] - flowcell_dir_root = config['flowcell']['root'] - flowcell_project_source = config['flowcell']['relative_project_source'] - flowcell_undet_files = config['flowcell']['undet_file_pattern'] - data_dir = config['data_dir'] - analysis_dir = config['analysis']['root'] - analysis_data_to_remove = config['analysis']['files_to_remove'] + config = CONFIG["cleanup"]["miarka"] + flowcell_dir_root = config["flowcell"]["root"] + flowcell_project_source = config["flowcell"]["relative_project_source"] + flowcell_undet_files = config["flowcell"]["undet_file_pattern"] + data_dir = config["data_dir"] + analysis_dir = config["analysis"]["root"] + analysis_data_to_remove = config["analysis"]["files_to_remove"] if date: - date = datetime.strptime(date, '%Y-%m-%d') + date = datetime.strptime(date, "%Y-%m-%d") except KeyError as e: - logger.error('Config file is missing the key {}, make sure it has all required information'.format(str(e))) + logger.error( + f"Config file is missing the key {str(e)}, make sure it has all required information" + ) raise SystemExit - except ValueError as e: - logger.error('Date given with "--date" option is not in required format, see help for more info') + except ValueError: + logger.error( + 'Date given with "--date" option is not in required format, see help for more info' + ) raise SystemExit # make a connection for project db db_config = load_config(status_db_config) - pcon = statusdb.ProjectSummaryConnection(db_config.get('statusdb')) - assert pcon, 'Could not connect to project database in StatusDB' + pcon = statusdb.ProjectSummaryConnection(db_config.get("statusdb")) + assert pcon, "Could not connect to project database in StatusDB" # make exclude project list if provided exclude_list = [] if exclude_projects: if os.path.isfile(exclude_projects): - with open(exclude_projects, 'r') as in_file: + with open(exclude_projects) as in_file: exclude_list.extend([p.strip() for p in in_file.readlines()]) else: - exclude_list.extend(exclude_projects.split(',')) + exclude_list.extend(exclude_projects.split(",")) # sanity check for mentioned project to exculde or valid - invalid_projects = [p for p in exclude_list if p not in pcon.id_view.keys() and p not in pcon.name_view.keys()] + invalid_projects = [ + p + for p in exclude_list + if p not in pcon.id_view.keys() and p not in pcon.name_view.keys() + ] if invalid_projects: - logger.error('"--exclude_projects" was called with some invalid projects "{}", ' - 'provide valid project name/id'.format(','.join(invalid_projects))) + logger.error( + '"--exclude_projects" was called with some invalid projects "{}", ' + "provide valid project name/id".format(",".join(invalid_projects)) + ) raise SystemExit - #compile list for project to delete + # compile list for project to delete project_clean_list, project_processed_list = ({}, []) if not list_only and not clean_undetermined: - logger.info('Building initial project list for removing data...') + logger.info("Building initial project list for removing data...") if only_fastq: - logger.info('Option "--only_fastq" is given, so will not look for analysis data') + logger.info( + 'Option "--only_fastq" is given, so will not look for analysis data' + ) elif only_analysis: - logger.info('Option "--only_analysis" is given, so will not look for fastq data') + logger.info( + 'Option "--only_analysis" is given, so will not look for fastq data' + ) if clean_undetermined: 
all_undet_files = [] for flowcell_dir in flowcell_dir_root: - for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]: + for fc in [ + d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d) + ]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): - logger.warn('Flowcell {} does not contain a "{}" directory'.format(fc, flowcell_project_source)) + logger.warn( + f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory' + ) continue - projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ - if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \ - not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))] + projects_in_fc = [ + d + for d in os.listdir(flowcell_project_source) + if re.match(r"^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$", d) + and not os.path.exists( + os.path.join(flowcell_project_source, d, "cleaned") + ) + ] # the above check looked for project directories and also that are not cleaned # so if it could not find any project, means there is no project diretory at all # or all the project directory is already cleaned. Then we can remove the undet if len(projects_in_fc) > 0: continue - fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files)) + fc_undet_files = glob( + os.path.join(flowcell_project_source, flowcell_undet_files) + ) if fc_undet_files: - logger.info('All projects was cleaned for FC {}, found {} undeterminded files'.format(fc, len(fc_undet_files))) - all_undet_files.extend(list(map(os.path.abspath, fc_undet_files))) + logger.info( + f"All projects was cleaned for FC {fc}, found {len(fc_undet_files)} undeterminded files" + ) + all_undet_files.extend( + list(map(os.path.abspath, fc_undet_files)) + ) if all_undet_files: undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files))) - if misc.query_yes_no('In total found {} undetermined files which are {} in size, delete now ?'.format(len(all_undet_files), - undet_size), default='no'): - removed = _remove_files(all_undet_files) + if misc.query_yes_no( + "In total found {} undetermined files which are {} in size, delete now ?".format( + len(all_undet_files), undet_size + ), + default="no", + ): + _remove_files(all_undet_files) return elif only_analysis: - for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \ - not os.path.exists(os.path.join(analysis_dir, d, 'cleaned'))]: - proj_abs_path = os.path.join(analysis_dir, pid) - proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date) - if proj_info and proj_info['closed_days'] >= days_analysis: + for pid in [ + d + for d in os.listdir(analysis_dir) + if re.match(r"^P\d+$", d) + and not os.path.exists(os.path.join(analysis_dir, d, "cleaned")) + ]: + os.path.join(analysis_dir, pid) + proj_info = get_closed_proj_info( + pid, pcon.get_entry(pid, use_id_view=True), date + ) + if proj_info and proj_info["closed_days"] >= days_analysis: # move on if this project has to be excluded - if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list: + if ( + proj_info["name"] in exclude_list + or proj_info["pid"] in exclude_list + ): continue - analysis_data, analysis_size = collect_analysis_data_miarka(pid, analysis_dir, analysis_data_to_remove) - proj_info['analysis_to_remove'] = analysis_data - proj_info['analysis_size'] = analysis_size - proj_info['fastq_to_remove'] = 'not_selected' - proj_info['fastq_size'] = 0 - 
project_clean_list[proj_info['name']] = proj_info + analysis_data, analysis_size = collect_analysis_data_miarka( + pid, analysis_dir, analysis_data_to_remove + ) + proj_info["analysis_to_remove"] = analysis_data + proj_info["analysis_size"] = analysis_size + proj_info["fastq_to_remove"] = "not_selected" + proj_info["fastq_size"] = 0 + project_clean_list[proj_info["name"]] = proj_info else: for flowcell_dir in flowcell_dir_root: - for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]: + for fc in [ + d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d) + ]: fc_abs_path = os.path.join(flowcell_dir, fc) with filesystem.chdir(fc_abs_path): if not os.path.exists(flowcell_project_source): - logger.warn('Flowcell {} do not contain a "{}" direcotry'.format(fc, flowcell_project_source)) + logger.warn( + f'Flowcell {fc} do not contain a "{flowcell_project_source}" direcotry' + ) continue - projects_in_fc = [d for d in os.listdir(flowcell_project_source) \ - if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \ - not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))] + projects_in_fc = [ + d + for d in os.listdir(flowcell_project_source) + if re.match(r"^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$", d) + and not os.path.exists( + os.path.join(flowcell_project_source, d, "cleaned") + ) + ] for _proj in projects_in_fc: - proj = re.sub(r'_+', '.', _proj, 1) + proj = re.sub(r"_+", ".", _proj, 1) # if a project is already processed no need of fetching it again from status db if proj in project_processed_list: # if the project is closed more than threshold days collect the fastq files from FC # no need of looking for analysis data as they would have been collected in the first time - if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq: - fc_fq_files, fq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj)) - project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc] - project_clean_list[proj]['fastq_size'] += fq_size + if ( + proj in project_clean_list + and project_clean_list[proj]["closed_days"] + >= days_fastq + ): + fc_fq_files, fq_size = collect_fastq_data_miarka( + fc_abs_path, + os.path.join(flowcell_project_source, _proj), + ) + project_clean_list[proj]["fastq_to_remove"][ + "flowcells" + ][fc] = fc_fq_files["flowcells"][fc] + project_clean_list[proj]["fastq_size"] += fq_size continue project_processed_list.append(proj) - #by default assume all projects are not old enough for delete - fastq_data, analysis_data = ('young', 'young') + # by default assume all projects are not old enough for delete + fastq_data, analysis_data = ("young", "young") fastq_size, analysis_size = (0, 0) - proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date) + proj_info = get_closed_proj_info( + proj, pcon.get_entry(proj), date + ) if proj_info: # move on if this project has to be excluded - if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list: + if ( + proj_info["name"] in exclude_list + or proj_info["pid"] in exclude_list + ): continue # if project not old enough for fastq files and only fastq files selected move on to next project - if proj_info['closed_days'] >= days_fastq: - fastq_data, fastq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj), - data_dir, proj_info['pid']) + if proj_info["closed_days"] >= days_fastq: + fastq_data, fastq_size = collect_fastq_data_miarka( + 
fc_abs_path, + os.path.join(flowcell_project_source, _proj), + data_dir, + proj_info["pid"], + ) if not only_fastq: # if project is old enough for fastq files and not 'only_fastq' try collect analysis files - if proj_info['closed_days'] >= days_analysis: - analysis_data, analysis_size = collect_analysis_data_miarka(proj_info['pid'], analysis_dir, analysis_data_to_remove) + if proj_info["closed_days"] >= days_analysis: + ( + analysis_data, + analysis_size, + ) = collect_analysis_data_miarka( + proj_info["pid"], + analysis_dir, + analysis_data_to_remove, + ) # if both fastq and analysis files are not old enough move on - if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == 'cleaned') and fastq_data == 'young'): + if (analysis_data == fastq_data) or ( + (not analysis_data or analysis_data == "cleaned") + and fastq_data == "young" + ): continue - elif fastq_data == 'young': + elif fastq_data == "young": continue else: - analysis_data = 'not_selected' - proj_info['fastq_to_remove'] = fastq_data - proj_info['fastq_size'] = fastq_size - proj_info['analysis_to_remove'] = analysis_data - proj_info['analysis_size'] = analysis_size + analysis_data = "not_selected" + proj_info["fastq_to_remove"] = fastq_data + proj_info["fastq_size"] = fastq_size + proj_info["analysis_to_remove"] = analysis_data + proj_info["analysis_size"] = analysis_size project_clean_list[proj] = proj_info if not project_clean_list: - logger.info('There are no projects to clean') + logger.info("There are no projects to clean") return # list only the project and exit if 'list_only' option is selected if list_only: - print('Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size') - for p_info in sorted(list(project_clean_list.values()), key=lambda d: d['closed_days'], reverse=True): - print('\t'.join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'], - str(p_info['closed_days']), p_info['closed_date'], - _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])])) + print( + "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size" + ) + for p_info in sorted( + list(project_clean_list.values()), + key=lambda d: d["closed_days"], + reverse=True, + ): + print( + "\t".join( + [ + p_info["name"], + p_info["pid"], + p_info["bioinfo_responsible"], + str(p_info["closed_days"]), + p_info["closed_date"], + _def_get_size_unit(p_info["fastq_size"]), + _def_get_size_unit(p_info["analysis_size"]), + ] + ) + ) raise SystemExit - logger.info('Initial list is built with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list))) - if misc.query_yes_no('Interactively filter projects for cleanup ?', default='yes'): + logger.info( + f"Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}" + ) + if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"): filtered_project, proj_count = ([], 0) - #go through complied project list and remove files + # go through complied project list and remove files for proj, info in project_clean_list.items(): proj_count += 1 - if not misc.query_yes_no('{}Delete files for this project ({}/{})'.format(get_proj_meta_info(info, days_fastq), - proj_count, len(project_clean_list)), default='no'): - logger.info('Will not remove files for project {}'.format(proj)) + if not misc.query_yes_no( + "{}Delete files for this project ({}/{})".format( + get_proj_meta_info(info, days_fastq), + 
proj_count, + len(project_clean_list), + ), + default="no", + ): + logger.info(f"Will not remove files for project {proj}") filtered_project.append(proj) # remove projects that were decided not to delete map(project_clean_list.pop, filtered_project) - logger.info('Removed {}/{} projects from initial list'.format(len(filtered_project), proj_count)) + logger.info( + f"Removed {len(filtered_project)}/{proj_count} projects from initial list" + ) if not project_clean_list: - logger.info('There are no projects to clean after filtering') + logger.info("There are no projects to clean after filtering") return - logger.info('Final list is created with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list))) - if not misc.query_yes_no('Proceed with cleanup ?', default='no'): - logger.info('Aborting cleanup') + logger.info( + f"Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}" + ) + if not misc.query_yes_no("Proceed with cleanup ?", default="no"): + logger.info("Aborting cleanup") return - logger.info('Will start cleaning up project now') + logger.info("Will start cleaning up project now") for proj, info in project_clean_list.items(): - fastq_info = info.get('fastq_to_remove') + fastq_info = info.get("fastq_to_remove") if fastq_info and isinstance(fastq_info, dict): - logger.info('Cleaning fastq files for project {}'.format(proj)) - fastq_fc = fastq_info.get('flowcells', {}) + logger.info(f"Cleaning fastq files for project {proj}") + fastq_fc = fastq_info.get("flowcells", {}) removed_fc = [] for fc, fc_info in fastq_fc.items(): - proj_fc_root = fc_info['proj_root'] - logger.info('Removing fastq files from {}'.format(proj_fc_root)) + proj_fc_root = fc_info["proj_root"] + logger.info(f"Removing fastq files from {proj_fc_root}") if not dry_run: - if _remove_files(fc_info['fq_files']): - logger.info('Removed fastq files from FC {} for project {}, marking it as cleaned'.format(fc, proj)) + if _remove_files(fc_info["fq_files"]): + logger.info( + f"Removed fastq files from FC {fc} for project {proj}, marking it as cleaned" + ) _touch_cleaned(proj_fc_root) removed_fc.append(fc) if len(fastq_fc) == len(removed_fc): try: - proj_data_root = fastq_info['proj_data']['proj_data_root'] - logger.info('All flowcells cleaned for this project, marking it as cleaned in {}'.format(proj_data_root)) + proj_data_root = fastq_info["proj_data"]["proj_data_root"] + logger.info( + f"All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}" + ) _touch_cleaned(proj_data_root) except: pass - analysis_info = info.get('analysis_to_remove') + analysis_info = info.get("analysis_to_remove") if analysis_info and isinstance(analysis_info, dict): - proj_analysis_root = analysis_info['proj_analysis_root'] - logger.info('cleaning analysis data for project {}'.format(proj)) + proj_analysis_root = analysis_info["proj_analysis_root"] + logger.info(f"cleaning analysis data for project {proj}") removed_qc = [] - for qc, files in analysis_info['analysis_files'].items(): - logger.info('Removing files of "{}" from {}'.format(qc, proj_analysis_root)) + for qc, files in analysis_info["analysis_files"].items(): + logger.info(f'Removing files of "{qc}" from {proj_analysis_root}') if not dry_run: if _remove_files(files): removed_qc.append(qc) else: - logger.warn('Could not remove some files in qc directory "{}"'.format(qc)) - map(analysis_info['analysis_files'].pop, removed_qc) - if len(analysis_info['analysis_files']) == 0: - 
logger.info('Removed analysis data for project {}, marking it cleaned'.format(proj)) + logger.warn( + f'Could not remove some files in qc directory "{qc}"' + ) + map(analysis_info["analysis_files"].pop, removed_qc) + if len(analysis_info["analysis_files"]) == 0: + logger.info( + f"Removed analysis data for project {proj}, marking it cleaned" + ) _touch_cleaned(proj_analysis_root) @@ -267,27 +389,38 @@ def cleanup_miarka(days_fastq, days_analysis, # Class helper methods, not exposed as commands/subcommands # ############################################################# + def get_closed_proj_info(prj, pdoc, tdate=None): """Check and return a dict if project is closed.""" pdict = None if not tdate: tdate = datetime.today() if not pdoc: - logger.warn('Seems like project {} does not have a proper statusdb document, skipping it'.format(prj)) - elif 'close_date' in pdoc: - closed_date = pdoc['close_date'] + logger.warn( + f"Seems like project {prj} does not have a proper statusdb document, skipping it" + ) + elif "close_date" in pdoc: + closed_date = pdoc["close_date"] try: - closed_days = tdate - datetime.strptime(closed_date, '%Y-%m-%d') - pdict = {'name' : pdoc.get('project_name'), - 'pid' : pdoc.get('project_id'), - 'closed_date' : closed_date, - 'closed_days' : closed_days.days, - 'bioinfo_responsible' : pdoc.get('project_summary',{}).get('bioinfo_responsible','')} + closed_days = tdate - datetime.strptime(closed_date, "%Y-%m-%d") + pdict = { + "name": pdoc.get("project_name"), + "pid": pdoc.get("project_id"), + "closed_date": closed_date, + "closed_days": closed_days.days, + "bioinfo_responsible": pdoc.get("project_summary", {}).get( + "bioinfo_responsible", "" + ), + } except: - logger.warn('Problem calculating closed days for project {} with close date {}. Skipping it'.format( - pdoc.get('project_name'), closed_date)) + logger.warn( + "Problem calculating closed days for project {} with close date {}. Skipping it".format( + pdoc.get("project_name"), closed_date + ) + ) return pdict + def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}): """Collect the analysis files that have to be removed from Miarka return a tuple with files and total size of collected files.""" @@ -295,42 +428,57 @@ def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}): proj_abs_path = os.path.join(analysis_root, pid) if not os.path.exists(proj_abs_path): file_list = None - elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')): - file_list = 'cleaned' + elif os.path.exists(os.path.join(proj_abs_path, "cleaned")): + file_list = "cleaned" else: - file_list = {'proj_analysis_root':proj_abs_path, - 'analysis_files': defaultdict(list)} - for qc_type,ext in files_ext_to_remove.items(): + file_list = { + "proj_analysis_root": proj_abs_path, + "analysis_files": defaultdict(list), + } + for qc_type, ext in files_ext_to_remove.items(): qc_path = os.path.join(proj_abs_path, qc_type) if os.path.exists(qc_path): - file_list['analysis_files'][qc_type].extend(collect_files_by_ext(qc_path, ext)) + file_list["analysis_files"][qc_type].extend( + collect_files_by_ext(qc_path, ext) + ) try: - size += sum([sum(map(os.path.getsize, fls)) for fls in file_list['analysis_files'].values()]) + size += sum( + [ + sum(map(os.path.getsize, fls)) + for fls in file_list["analysis_files"].values() + ] + ) except: pass return (file_list, size) + def collect_fastq_data_miarka(fc_root, fc_proj_src, proj_root=None, pid=None): """Collect the fastq files that have to be removed from Miarka. 
Return a tuple with files and total size of collected files.""" size = 0 - file_list = {'flowcells': defaultdict(dict)} + file_list = {"flowcells": defaultdict(dict)} fc_proj_path = os.path.join(fc_root, fc_proj_src) fc_id = os.path.basename(fc_root) - file_list['flowcells'][fc_id] = {'proj_root': fc_proj_path, - 'fq_files': collect_files_by_ext(fc_proj_path, '*.fastq.gz')} + file_list["flowcells"][fc_id] = { + "proj_root": fc_proj_path, + "fq_files": collect_files_by_ext(fc_proj_path, "*.fastq.gz"), + } if proj_root and pid: proj_abs_path = os.path.join(proj_root, pid) if not os.path.exists(proj_abs_path): - file_list['proj_data'] = None - elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')): - file_list['proj_data'] = 'cleaned' + file_list["proj_data"] = None + elif os.path.exists(os.path.join(proj_abs_path, "cleaned")): + file_list["proj_data"] = "cleaned" else: - file_list['proj_data'] = {'proj_data_root': proj_abs_path, - 'fastq_files' : collect_files_by_ext(proj_abs_path, '*.fastq.gz')} - size += sum(map(os.path.getsize, file_list['flowcells'][fc_id]['fq_files'])) + file_list["proj_data"] = { + "proj_data_root": proj_abs_path, + "fastq_files": collect_files_by_ext(proj_abs_path, "*.fastq.gz"), + } + size += sum(map(os.path.getsize, file_list["flowcells"][fc_id]["fq_files"])) return (file_list, size) + def collect_files_by_ext(path, ext=[]): """Collect files with a given extension from a given path.""" if isinstance(ext, str): @@ -343,60 +491,79 @@ def collect_files_by_ext(path, ext=[]): collected_files.extend(collect_files_by_ext(d, ext)) return collected_files + def get_proj_meta_info(info, days_fastq): """From given info collect meta info for a project.""" - template = '\n' + template = "\n" + def _get_template_string(h, v): try: - v = '{}: {}\n'.format(h, v) + v = f"{h}: {v}\n" except: - v = '{}: Problem getting this'.format(h) + v = f"{h}: Problem getting this" return v - template += _get_template_string('Project overview', info.get('name')) - template += _get_template_string('Project ID', info.get('pid')) - template += _get_template_string('Bioinfo Responsible', info.get('bioinfo_responsible','')) - template += _get_template_string('Closed for (days)', info.get('closed_days')) - template += _get_template_string('Closed from (date)', info.get('closed_date')) + + template += _get_template_string("Project overview", info.get("name")) + template += _get_template_string("Project ID", info.get("pid")) + template += _get_template_string( + "Bioinfo Responsible", info.get("bioinfo_responsible", "") + ) + template += _get_template_string("Closed for (days)", info.get("closed_days")) + template += _get_template_string("Closed from (date)", info.get("closed_date")) # set analysis info based upon what we have - analysis_info = info.get('analysis_to_remove') + analysis_info = info.get("analysis_to_remove") if not analysis_info: - template += 'Project analysis: No analysis directory\n' - elif isinstance(analysis_info, str) and analysis_info == 'cleaned': - template += 'Project analysis: Analysis directory already cleaned\n' + template += "Project analysis: No analysis directory\n" + elif isinstance(analysis_info, str) and analysis_info == "cleaned": + template += "Project analysis: Analysis directory already cleaned\n" elif isinstance(analysis_info, dict): f_stat = [] - for qc_type, files in analysis_info['analysis_files'].items(): - f_stat.append('{} ({} files)'.format(qc_type, len(files))) - template += 'Project analyzed: {}\n'.format(', '.join(f_stat)) + for qc_type, files in 
analysis_info["analysis_files"].items(): + f_stat.append(f"{qc_type} ({len(files)} files)") + template += "Project analyzed: {}\n".format(", ".join(f_stat)) # set fastq info based upon what we have - fq_info = info.get('fastq_to_remove') + fq_info = info.get("fastq_to_remove") if isinstance(fq_info, str) and fq_info == "young": - template += 'Project been closed less than {} days, so will not remove any fastq files\n'.format(days_fastq) + template += f"Project been closed less than {days_fastq} days, so will not remove any fastq files\n" elif isinstance(fq_info, dict): - proj_fq_info = fq_info.get('proj_data') + proj_fq_info = fq_info.get("proj_data") if not proj_fq_info: - template += 'Project organized: No organized directory for project\n' + template += "Project organized: No organized directory for project\n" elif isinstance(proj_fq_info, str) and proj_fq_info == "cleaned": - template += 'Project organized: Project directory is already cleaned\n' + template += "Project organized: Project directory is already cleaned\n" elif isinstance(proj_fq_info, dict): - template += 'Project organized: Project is organized with {} fastq files\n'.format(len(proj_fq_info['fastq_files'])) - fc_fq_info = fq_info.get('flowcells', {}) + template += ( + "Project organized: Project is organized with {} fastq files\n".format( + len(proj_fq_info["fastq_files"]) + ) + ) + fc_fq_info = fq_info.get("flowcells", {}) fc_num = len(fc_fq_info.keys()) - fc_files = sum(map(len, [fc_info.get('fq_files', [])for fc_info in fc_fq_info.values()])) - template += 'Flowcells: There are {} FC with total {} fastq files\n'.format(fc_num, fc_files) - template += 'Estimated data size: {}\n'.format(_def_get_size_unit(info.get('fastq_size',0) + info.get('fastq_size', 0))) + fc_files = sum( + map(len, [fc_info.get("fq_files", []) for fc_info in fc_fq_info.values()]) + ) + template += ( + f"Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n" + ) + template += "Estimated data size: {}\n".format( + _def_get_size_unit(info.get("fastq_size", 0) + info.get("fastq_size", 0)) + ) return template + def get_files_size_text(plist): """Get project list dict and give back string with overll sizes.""" - fsize = _def_get_size_unit(sum([i.get('fastq_size',0) for i in plist.values()])) - asize = _def_get_size_unit(sum([i.get('analysis_size',0) for i in plist.values()])) - return '({f}{s}{a}) '.format(f = '~{} fastq data'.format(fsize) if fsize else '', - a = '~{} analysis data'.format(asize) if asize else '', - s = ' and ' if fsize and asize else '') + fsize = _def_get_size_unit(sum([i.get("fastq_size", 0) for i in plist.values()])) + asize = _def_get_size_unit(sum([i.get("analysis_size", 0) for i in plist.values()])) + return "({f}{s}{a}) ".format( + f=f"~{fsize} fastq data" if fsize else "", + a=f"~{asize} analysis data" if asize else "", + s=" and " if fsize and asize else "", + ) + def _def_get_size_unit(s): """Change the given size to appropriate unit measurement for better readability.""" @@ -405,17 +572,18 @@ def _def_get_size_unit(s): gb = mb * 1000 tb = gb * 1000 if s > tb: - s = '~{}tb'.format(int(s/tb)) + s = f"~{int(s/tb)}tb" elif s > gb: - s = '~{}gb'.format(int(s/gb)) + s = f"~{int(s/gb)}gb" elif s > mb: - s = '~{}mb'.format(int(s/mb)) + s = f"~{int(s/mb)}mb" elif s > kb: - s = '~{}kb'.format(int(s/kb)) + s = f"~{int(s/kb)}kb" elif s > 0: - s = '~{}b'.format(int(s/b)) + s = f"~{int(s)}b" return str(s) + def _remove_files(files): """Remove files from given list.""" status = True @@ -423,13 +591,16 @@ def 
_remove_files(files): try: os.remove(fl) except Exception as e: - logger.warn('Could not remove file {} due to "{}"'.format(fl, e.message)) + logger.warn(f'Could not remove file {fl} due to "{e.message}"') status = False return status + def _touch_cleaned(path): """Touch a 'cleaned' file in a given path.""" try: - open(os.path.join(path, 'cleaned'), 'w').close() + open(os.path.join(path, "cleaned"), "w").close() except Exception as e: - logger.warn('Could not create "cleaned" file in path {} due to "{}"'.format(path, e.message)) + logger.warn( + f'Could not create "cleaned" file in path {path} due to "{e.message}"' + ) diff --git a/taca/cleanup/cli.py b/taca/cleanup/cli.py index 65abaf50..fe7e11ba 100644 --- a/taca/cleanup/cli.py +++ b/taca/cleanup/cli.py @@ -1,67 +1,125 @@ """CLI for the storage subcommand.""" import click + from taca.cleanup import cleanup as cln from taca.utils import misc + @click.group() @click.pass_context -@click.option('--status_db_config', - type=click.Path(exists=True, dir_okay=False), - envvar='STATUS_DB_CONFIG', - help='Path to statusdb-configuration.') +@click.option( + "--status_db_config", + type=click.Path(exists=True, dir_okay=False), + envvar="STATUS_DB_CONFIG", + help="Path to statusdb-configuration.", +) def cleanup(ctx, status_db_config): """Cleaning up servers - management methods and utilities.""" pass + # cleanup subcommands @cleanup.command() -@click.option('-d', '--days', type=click.IntRange(min=1), - help='Days to consider as thershold, should not be combined with option "--hours"') -@click.option('-h', '--hours', type=click.IntRange(min=1), - help='Hours to consider as thershold, should not be combined with option "--days"') +@click.option( + "-d", + "--days", + type=click.IntRange(min=1), + help='Days to consider as thershold, should not be combined with option "--hours"', +) +@click.option( + "-h", + "--hours", + type=click.IntRange(min=1), + help='Hours to consider as thershold, should not be combined with option "--days"', +) @click.pass_context def preproc(ctx, days, hours): """Do appropriate cleanup on preproc.""" seconds = misc.to_seconds(days, hours) cln.cleanup_processing(seconds) + @cleanup.command() -@click.option('--days_fastq', type=click.IntRange(min=1), - help='Days to consider as thershold for removing "fastq" files') -@click.option('--days_analysis', type=click.IntRange(min=1), - help='Days to consider as thershold for removing analysis data') -@click.option('--only_fastq', is_flag=True, - help='Clean only fastq data in "miarka"') -@click.option('--only_analysis', is_flag=True, - help='Clean only analysis data in "miarka"') -@click.option('--date', type=click.STRING, - help='Consider the given date instead of today while collecting closed projects. ' - 'Date format should be "YYYY-MM-DD", ex: "2016-01-31"') -@click.option('--exclude_projects', type=click.STRING, - help='A project or a file with a list of projects to exclude from deleting. ' - 'Either name or id can be given. Examples: --exclude_projects P1234 or ' - '--exclude_projects P1234,P5678 or ' - '--exclude_projects file_with_projects_id.txt') -@click.option('--clean_undetermined', is_flag=True, - help='Remove only the undetermined reads for a flowcell that have ' - 'all project cleaned. All other parameters are ignored if this ' - 'flag is called.') -@click.option('-l', '--list_only', is_flag=True, - help='Only build the project list that will be cleaned') -@click.option('-n', '--dry_run', is_flag=True, - help='Perform dry run i.e. 
execute nothing but log') +@click.option( + "--days_fastq", + type=click.IntRange(min=1), + help='Days to consider as thershold for removing "fastq" files', +) +@click.option( + "--days_analysis", + type=click.IntRange(min=1), + help="Days to consider as thershold for removing analysis data", +) +@click.option("--only_fastq", is_flag=True, help='Clean only fastq data in "miarka"') +@click.option( + "--only_analysis", is_flag=True, help='Clean only analysis data in "miarka"' +) +@click.option( + "--date", + type=click.STRING, + help="Consider the given date instead of today while collecting closed projects. " + 'Date format should be "YYYY-MM-DD", ex: "2016-01-31"', +) +@click.option( + "--exclude_projects", + type=click.STRING, + help="A project or a file with a list of projects to exclude from deleting. " + "Either name or id can be given. Examples: --exclude_projects P1234 or " + "--exclude_projects P1234,P5678 or " + "--exclude_projects file_with_projects_id.txt", +) +@click.option( + "--clean_undetermined", + is_flag=True, + help="Remove only the undetermined reads for a flowcell that have " + "all project cleaned. All other parameters are ignored if this " + "flag is called.", +) +@click.option( + "-l", + "--list_only", + is_flag=True, + help="Only build the project list that will be cleaned", +) +@click.option( + "-n", "--dry_run", is_flag=True, help="Perform dry run i.e. execute nothing but log" +) @click.pass_context -def miarka(ctx, days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, date, exclude_projects, list_only, dry_run): +def miarka( + ctx, + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + date, + exclude_projects, + list_only, + dry_run, +): """Do appropriate cleanup on Miarka.""" - status_db_config = ctx.parent.params['status_db_config'] + status_db_config = ctx.parent.params["status_db_config"] if only_fastq and only_analysis: - raise SystemExit('ERROR: Both option "only_fastq" and "only_analysis" is given, should only give either one') + raise SystemExit( + 'ERROR: Both option "only_fastq" and "only_analysis" is given, should only give either one' + ) if not days_fastq and not only_analysis and not clean_undetermined: - raise SystemExit('ERROR: "days_fastq" is not given while not selecting "only_analysis" option') + raise SystemExit( + 'ERROR: "days_fastq" is not given while not selecting "only_analysis" option' + ) if not days_analysis and not only_fastq and not clean_undetermined: - raise SystemExit('ERROR: "days_analysis" is not given while not selecting "only_fastq" option') - cln.cleanup_miarka(days_fastq, days_analysis, - only_fastq, only_analysis, - clean_undetermined, status_db_config, - exclude_projects, list_only, - date, dry_run) + raise SystemExit( + 'ERROR: "days_analysis" is not given while not selecting "only_fastq" option' + ) + cln.cleanup_miarka( + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + status_db_config, + exclude_projects, + list_only, + date, + dry_run, + ) diff --git a/taca/cli.py b/taca/cli.py index 1c78dabc..d777884a 100644 --- a/taca/cli.py +++ b/taca/cli.py @@ -1,35 +1,39 @@ -# -*- coding: utf-8 -*- import logging import os -from pkg_resources import iter_entry_points + import click -import taca.log +from pkg_resources import iter_entry_points +import taca.log from taca import __version__ from taca.utils import config as conf logger = logging.getLogger(__name__) + @click.group() @click.version_option(__version__) # Priority for the 
configuration file is: environment variable > -c option > default -@click.option('-c', '--config-file', - default=os.path.join(os.environ['HOME'], '.taca/taca.yaml'), - envvar='TACA_CONFIG', - type=click.File('r'), - help='Path to TACA configuration file') - +@click.option( + "-c", + "--config-file", + default=os.path.join(os.environ["HOME"], ".taca/taca.yaml"), + envvar="TACA_CONFIG", + type=click.File("r"), + help="Path to TACA configuration file", +) @click.pass_context def cli(ctx, config_file): - """ Tool for the Automation of Storage and Analyses """ + """Tool for the Automation of Storage and Analyses""" ctx.obj = {} config = conf.load_yaml_config(config_file.name) - log_file = config.get('log', {}).get('file', None) + log_file = config.get("log", {}).get("file", None) if log_file: - level = config.get('log').get('log_level', 'INFO') + level = config.get("log").get("log_level", "INFO") taca.log.init_logger_file(log_file, level) - logger.debug('starting up CLI') + logger.debug("starting up CLI") + -#Add subcommands dynamically to the CLI -for entry_point in iter_entry_points('taca.subcommands'): +# Add subcommands dynamically to the CLI +for entry_point in iter_entry_points("taca.subcommands"): cli.add_command(entry_point.load()) diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py index fd3d3b16..d6483823 100644 --- a/taca/illumina/MiSeq_Runs.py +++ b/taca/illumina/MiSeq_Runs.py @@ -1,21 +1,24 @@ +import logging import os import re import shutil -import logging + from flowcell_parser.classes import SampleSheetParser + from taca.illumina.Standard_Runs import Standard_Run logger = logging.getLogger(__name__) -TENX_SINGLE_PAT = re.compile('SI-(?:GA|NA)-[A-H][1-9][0-2]?') -TENX_DUAL_PAT = re.compile('SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?') -SMARTSEQ_PAT = re.compile('SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]') -IDT_UMI_PAT = re.compile('([ATCG]{4,}N+$)') -RECIPE_PAT = re.compile('[0-9]+-[0-9]+') +TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?") +TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?") +SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]") +IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)") +RECIPE_PAT = re.compile("[0-9]+-[0-9]+") + class MiSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(MiSeq_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() self._copy_samplesheet() @@ -30,8 +33,7 @@ def _get_samplesheet(self): """Locate and parse the samplesheet for a run. 
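# Illustrative, standalone check (not part of the diff) of how the module-level
# index patterns defined above in MiSeq_Runs.py classify sample indexes; the
# index names used here are made-up examples that follow the naming scheme the
# regexes target.
import re

TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?")
TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?")

for idx in ["SI-GA-A3", "SI-TT-H9", "ACGTACGT"]:
    if TENX_DUAL_PAT.findall(idx):
        kind = "10X dual index (index and index2 both replaced)"
    elif TENX_SINGLE_PAT.findall(idx):
        kind = "10X single index (expanded to four actual indexes)"
    else:
        kind = "plain sequence index (left untouched)"
    print(f"{idx} -> {kind}")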
In MiSeq case this is located in FC_DIR/SampleSheet.csv """ - ssname = os.path.join(self.run_dir, - 'SampleSheet.csv') + ssname = os.path.join(self.run_dir, "SampleSheet.csv") if os.path.exists(ssname): # If exists parse the SampleSheet return ssname @@ -46,14 +48,14 @@ def _copy_samplesheet(self): # Load index files indexfile = dict() try: - indexfile['tenX'] = self.CONFIG[self.software]['tenX_index_path'] + indexfile["tenX"] = self.CONFIG[self.software]["tenX_index_path"] except KeyError: - logger.error('Path to index file (10X) not found in the config file') + logger.error("Path to index file (10X) not found in the config file") raise RuntimeError try: - indexfile['smartseq'] = self.CONFIG[self.software]['smartseq_index_path'] + indexfile["smartseq"] = self.CONFIG[self.software]["smartseq_index_path"] except KeyError: - logger.error('Path to index file (Smart-seq) not found in the config file') + logger.error("Path to index file (Smart-seq) not found in the config file") raise RuntimeError if ssname is None: return None @@ -62,97 +64,144 @@ def _copy_samplesheet(self): # Copy the original samplesheet locally. # Copy again if already done as there might have been changes to the samplesheet try: - shutil.copy(ssname, os.path.join(self.run_dir, '{}.csv'.format(self.flowcell_id))) + shutil.copy(ssname, os.path.join(self.run_dir, f"{self.flowcell_id}.csv")) ssname = os.path.join(self.run_dir, os.path.split(ssname)[1]) except: - raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir)) + raise RuntimeError( + f"unable to copy file {ssname} to destination {self.run_dir}" + ) # This sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready # to be used it needs some editing. # This will contain the samplesheet with all the renaiming to be used with bcl2fastq - samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet_copy.csv') + samplesheet_dest = os.path.join(self.run_dir, "SampleSheet_copy.csv") # Check that the samplesheet is not already present. In this case go the next step if os.path.exists(samplesheet_dest): - logger.info('SampleSheet_copy.csv found ... overwriting it') + logger.info("SampleSheet_copy.csv found ... 
overwriting it") try: - with open(samplesheet_dest, 'w') as fcd: - fcd.write(self._generate_clean_samplesheet(ssparser, - indexfile, - fields_to_remove=None, - rename_samples=True, - rename_qPCR_suffix = True, - fields_qPCR=[ssparser.dfield_snm])) + with open(samplesheet_dest, "w") as fcd: + fcd.write( + self._generate_clean_samplesheet( + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=True, + fields_qPCR=[ssparser.dfield_snm], + ) + ) except Exception as e: logger.error(e) return False - logger.info(('Created SampleSheet_copy.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest))) + logger.info( + f"Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} " + ) # SampleSheet.csv generated # When demultiplexing SampleSheet.csv is the one I need to use - self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet_copy.csv')) - if not self.runParserObj.obj.get('samplesheet_csv'): - self.runParserObj.obj['samplesheet_csv'] = self.runParserObj.samplesheet.data + self.runParserObj.samplesheet = SampleSheetParser( + os.path.join(self.run_dir, "SampleSheet_copy.csv") + ) + if not self.runParserObj.obj.get("samplesheet_csv"): + self.runParserObj.obj[ + "samplesheet_csv" + ] = self.runParserObj.samplesheet.data - def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None, rename_samples=True, rename_qPCR_suffix = False, fields_qPCR= None): + def _generate_clean_samplesheet( + self, + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=False, + fields_qPCR=None, + ): """Generate a 'clean' samplesheet, the given fields will be removed. If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name Will also replace 10X or Smart-seq indicies (e.g. 
SI-GA-A3 into TGTGCGGG) Note that the index 2 of 10X or Smart-seq dual indexes will be converted to RC """ - output = u'' - compl = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + output = "" + compl = {"A": "T", "C": "G", "G": "C", "T": "A"} # Expand the ssparser if there are lanes with 10X or Smart-seq samples - index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) - index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq']) + index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"]) + index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"]) # Replace 10X or Smart-seq indices for sample in ssparser.data: - if sample['index'] in index_dict_tenX.keys(): - tenX_index = sample['index'] + if sample["index"] in index_dict_tenX.keys(): + tenX_index = sample["index"] # In the case of 10X dual indexes, replace index and index2 if TENX_DUAL_PAT.findall(tenX_index): - sample['index'] = index_dict_tenX[tenX_index][0] - sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_tenX[tenX_index][1].replace(',','').upper() ] ) ) + sample["index"] = index_dict_tenX[tenX_index][0] + sample["index2"] = "".join( + reversed( + [ + compl.get(b, b) + for b in index_dict_tenX[tenX_index][1] + .replace(",", "") + .upper() + ] + ) + ) # In the case of 10X single indexes, replace the index name with the 4 actual indicies else: x = 0 indices_number = len(index_dict_tenX[tenX_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_tenX[tenX_index][x] + new_sample["index"] = index_dict_tenX[tenX_index][x] ssparser.data.append(new_sample) x += 1 # Set the original 10X index to the 4th correct index - sample['index'] = index_dict_tenX[tenX_index][x] - elif SMARTSEQ_PAT.findall(sample['index']): + sample["index"] = index_dict_tenX[tenX_index][x] + elif SMARTSEQ_PAT.findall(sample["index"]): x = 0 - smartseq_index = sample['index'].split('-')[1] + smartseq_index = sample["index"].split("-")[1] indices_number = len(index_dict_smartseq[smartseq_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_smartseq[smartseq_index][x][0] - new_sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) ) + new_sample["index"] = index_dict_smartseq[smartseq_index][x][0] + new_sample["index2"] = "".join( + reversed( + [ + compl.get(b, b) + for b in index_dict_smartseq[smartseq_index][x][1] + .replace(",", "") + .upper() + ] + ) + ) ssparser.data.append(new_sample) x += 1 - sample['index'] = index_dict_smartseq[smartseq_index][x][0] - sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) ) + sample["index"] = index_dict_smartseq[smartseq_index][x][0] + sample["index2"] = "".join( + reversed( + [ + compl.get(b, b) + for b in index_dict_smartseq[smartseq_index][x][1] + .replace(",", "") + .upper() + ] + ) + ) # Sort to get the added indicies from 10x in the right place # Python 3 doesn't support sorting a list of dicts implicitly. 
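# Small standalone illustration (not from the diff) of the index2
# reverse-complement transform used above when expanding 10X dual and
# Smart-seq indexes; the input sequence is an arbitrary example.
compl = {"A": "T", "C": "G", "G": "C", "T": "A"}

def reverse_complement(seq):
    """Reverse-complement an index sequence, leaving unknown characters as-is."""
    return "".join(reversed([compl.get(b, b) for b in seq.replace(",", "").upper()]))

assert reverse_complement("tgtgcggg") == "CCCGCACA"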
Sort by lane and then Sample_ID - ssparser.data.sort(key=lambda item: (item.get('Lane'), item.get('Sample_ID'))) + ssparser.data.sort(key=lambda item: (item.get("Lane"), item.get("Sample_ID"))) if not fields_to_remove: fields_to_remove = [] # Header - output += '[Header]{}'.format(os.linesep) + output += f"[Header]{os.linesep}" for field in sorted(ssparser.header): - output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip()) + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" output += os.linesep # Data - output += '[Data]{}'.format(os.linesep) + output += f"[Data]{os.linesep}" datafields = [] for field in ssparser.datafields: if field not in fields_to_remove: datafields.append(field) - output += ','.join(datafields) + output += ",".join(datafields) output += os.linesep for line in ssparser.data: line_ar = [] @@ -162,16 +211,18 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None try: if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR: # Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix - value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm])) + value = re.sub( + "__qPCR_$", "", f"Sample_{line[ssparser.dfield_snm]}" + ) else: # Substitute SampleID with SampleName, add Sample_ as prefix - value ='Sample_{}'.format(line[ssparser.dfield_snm]) + value = f"Sample_{line[ssparser.dfield_snm]}" except: - # Otherwise add Sample_ as prefix - value = 'Sample_{}'.format(line[ssparser.dfield_sid]) + # Otherwise add Sample_ as prefix + value = f"Sample_{line[ssparser.dfield_sid]}" elif rename_qPCR_suffix and field in fields_qPCR: - value = re.sub('__qPCR_$', '', line[field]) + value = re.sub("__qPCR_$", "", line[field]) line_ar.append(value) - output += ','.join(line_ar) + output += ",".join(line_ar) output += os.linesep return output diff --git a/taca/illumina/NextSeq_Runs.py b/taca/illumina/NextSeq_Runs.py index bcdf34ff..5785542c 100755 --- a/taca/illumina/NextSeq_Runs.py +++ b/taca/illumina/NextSeq_Runs.py @@ -3,7 +3,7 @@ class NextSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(NextSeq_Run, self).__init__( run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() # NextSeq2000 has a different FC ID pattern that ID contains the first letter for position diff --git a/taca/illumina/NovaSeqXPlus_Runs.py b/taca/illumina/NovaSeqXPlus_Runs.py index 58b384af..116d7c1a 100644 --- a/taca/illumina/NovaSeqXPlus_Runs.py +++ b/taca/illumina/NovaSeqXPlus_Runs.py @@ -3,7 +3,7 @@ class NovaSeqXPlus_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(NovaSeqXPlus_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() self._copy_samplesheet() diff --git a/taca/illumina/NovaSeq_Runs.py b/taca/illumina/NovaSeq_Runs.py index 670b1fa9..52a7e162 100644 --- a/taca/illumina/NovaSeq_Runs.py +++ b/taca/illumina/NovaSeq_Runs.py @@ -3,7 +3,7 @@ class NovaSeq_Run(Standard_Run): def __init__(self, run_dir, software, configuration): - super(NovaSeq_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) self._set_sequencer_type() self._set_run_type() self._copy_samplesheet() diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py index f5ab583b..c562fc2f 100644 --- a/taca/illumina/Runs.py +++ 
b/taca/illumina/Runs.py @@ -1,46 +1,53 @@ -import os -import re import csv -import logging -import subprocess -import shutil import glob import json - +import logging +import os +import re +import shutil +import subprocess from datetime import datetime +from flowcell_parser.classes import LaneBarcodeParser, RunParser, SampleSheetParser + from taca.utils import misc from taca.utils.misc import send_mail -from flowcell_parser.classes import RunParser, LaneBarcodeParser, SampleSheetParser logger = logging.getLogger(__name__) -class Run(object): - """ Defines an Illumina run - """ + +class Run: + """Defines an Illumina run""" def __init__(self, run_dir, software, configuration): if not os.path.exists(run_dir): - raise RuntimeError("Could not locate run directory {}".format(run_dir)) - - if 'analysis_server' not in configuration or \ - 'bcl2fastq' not in configuration or \ - 'bclconvert' not in configuration or \ - 'samplesheets_dir' not in configuration: - raise RuntimeError("configuration missing required entries " - "(analysis_server, bcl2fastq, bclconvert, samplesheets_dir)") - if not os.path.exists(os.path.join(run_dir, 'runParameters.xml')) \ - and os.path.exists(os.path.join(run_dir, 'RunParameters.xml')): + raise RuntimeError(f"Could not locate run directory {run_dir}") + + if ( + "analysis_server" not in configuration + or "bcl2fastq" not in configuration + or "bclconvert" not in configuration + or "samplesheets_dir" not in configuration + ): + raise RuntimeError( + "configuration missing required entries " + "(analysis_server, bcl2fastq, bclconvert, samplesheets_dir)" + ) + if not os.path.exists( + os.path.join(run_dir, "runParameters.xml") + ) and os.path.exists(os.path.join(run_dir, "RunParameters.xml")): # In NextSeq runParameters is named RunParameters logger.warning("Creating link from runParameters.xml to RunParameters.xml") - os.symlink('RunParameters.xml', os.path.join(run_dir, 'runParameters.xml')) - elif not os.path.exists(os.path.join(run_dir, 'runParameters.xml')): - raise RuntimeError("Could not locate runParameters.xml in run directory {}".format(run_dir)) + os.symlink("RunParameters.xml", os.path.join(run_dir, "runParameters.xml")) + elif not os.path.exists(os.path.join(run_dir, "runParameters.xml")): + raise RuntimeError( + f"Could not locate runParameters.xml in run directory {run_dir}" + ) self.run_dir = os.path.abspath(run_dir) self.software = software self.id = os.path.basename(os.path.normpath(run_dir)) - pattern = r'(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)' + pattern = r"(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)" m = re.match(pattern, self.id) self.date = m.group(1) self.instrument = m.group(2) @@ -63,51 +70,78 @@ def check_run_status(self): This function checks the status of a run while in progress. 
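# Standalone sketch (not part of the diff) of how Run.__init__ above parses the
# run directory name into date, instrument, position and flowcell id; the run
# id used here is a hypothetical example following the usual
# <date>_<instrument>_<counter>_<position><flowcell> layout.
import re

pattern = r"(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)"
run_id = "230412_A01901_0123_BHXXXXXXX"  # hypothetical run folder name
date, instrument, position, flowcell_id = re.match(pattern, run_id).groups()
print(date, instrument, position, flowcell_id)  # 230412 A01901 B HXXXXXXX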
In the case of HiSeq check that all demux have been done and in that case perform aggregation """ - dex_status = self.get_run_status() - if self.software == 'bcl2fastq': - legacy_path = '' - elif self.software == 'bclconvert': - legacy_path = "Reports/{}".format(self.legacy_dir) + dex_status = self.get_run_status() + if self.software == "bcl2fastq": + legacy_path = "" + elif self.software == "bclconvert": + legacy_path = f"Reports/{self.legacy_dir}" # Check the status of running demux # Collect all samplesheets generated before - samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) # A single digit, this hypothesis should hold for a while + samplesheets = glob.glob( + os.path.join(self.run_dir, "*_[0-9].csv") + ) # A single digit, this hypothesis should hold for a while all_demux_done = True for samplesheet in samplesheets: demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] - demux_folder = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id)) + demux_folder = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}") # Check if this job is done - if os.path.exists(os.path.join(self.run_dir, demux_folder, legacy_path, 'Stats', 'DemultiplexingStats.xml')): + if os.path.exists( + os.path.join( + self.run_dir, + demux_folder, + legacy_path, + "Stats", + "DemultiplexingStats.xml", + ) + ): all_demux_done = all_demux_done and True - if self.software == 'bcl2fastq': - demux_log = os.path.join(self.run_dir, "demux_{}_bcl2fastq.err".format(demux_id)) - elif self.software == 'bclconvert': - demux_log = os.path.join(self.run_dir, "demux_{}_bcl-convert.err".format(demux_id)) + if self.software == "bcl2fastq": + demux_log = os.path.join( + self.run_dir, f"demux_{demux_id}_bcl2fastq.err" + ) + elif self.software == "bclconvert": + demux_log = os.path.join( + self.run_dir, f"demux_{demux_id}_bcl-convert.err" + ) else: raise RuntimeError("Unrecognized software!") if os.path.isfile(demux_log): - errors, warnings, error_and_warning_messages = self._check_demux_log(demux_id, demux_log) + ( + errors, + warnings, + error_and_warning_messages, + ) = self._check_demux_log(demux_id, demux_log) else: - raise RuntimeError("No demux log file found for sub-demultiplexing {}!".format(demux_id)) - self.demux_summary[demux_id] = {'errors' : errors, - 'warnings' : warnings, - 'error_and_warning_messages' : error_and_warning_messages - } + raise RuntimeError( + f"No demux log file found for sub-demultiplexing {demux_id}!" + ) + self.demux_summary[demux_id] = { + "errors": errors, + "warnings": warnings, + "error_and_warning_messages": error_and_warning_messages, + } if errors or warnings: - logger.info("Sub-Demultiplexing in {} completed with {} errors and {} warnings!".format(demux_folder, errors, warnings)) + logger.info( + f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!" + ) else: - logger.info("Sub-Demultiplexing in {} completed without any error or warning.".format(demux_folder)) + logger.info( + f"Sub-Demultiplexing in {demux_folder} completed without any error or warning." 
+ ) else: all_demux_done = all_demux_done and False - logger.info("Sub-Demultiplexing in {} not completed yet.".format(demux_folder)) + logger.info(f"Sub-Demultiplexing in {demux_folder} not completed yet.") # All demux jobs finished and all stats aggregated under Demultiplexing # Aggreate all the results in the Demultiplexing folder - if all_demux_done and dex_status!='COMPLETED': - dex_status = 'COMPLETED' + if all_demux_done and dex_status != "COMPLETED": + dex_status = "COMPLETED" self._aggregate_demux_results() self.runParserObj = RunParser(self.run_dir) # Rename undetermined if needed - lanes = misc.return_unique([lanes['Lane'] for lanes in self.runParserObj.samplesheet.data]) + lanes = misc.return_unique( + [lanes["Lane"] for lanes in self.runParserObj.samplesheet.data] + ) samples_per_lane = self.get_samples_per_lane() for lane in lanes: if self.is_unpooled_lane(lane): @@ -119,10 +153,10 @@ def _check_demux_log(self, demux_id, demux_log): This function checks the log files of bcl2fastq/bclconvert Errors or warnings will be captured and email notifications will be sent """ - with open(demux_log, 'r') as demux_log_file: + with open(demux_log) as demux_log_file: demux_log_content = demux_log_file.readlines() - if self.software == 'bcl2fastq': - pattern = r'Processing completed with (\d+) errors and (\d+) warnings' + if self.software == "bcl2fastq": + pattern = r"Processing completed with (\d+) errors and (\d+) warnings" match = re.search(pattern, demux_log_content[-1]) if match: errors = int(match.group(1)) @@ -130,21 +164,23 @@ def _check_demux_log(self, demux_id, demux_log): error_and_warning_messages = [] if errors or warnings: for line in demux_log_content: - if 'ERROR' in line or 'WARN' in line: + if "ERROR" in line or "WARN" in line: error_and_warning_messages.append(line) return errors, warnings, error_and_warning_messages else: - raise RuntimeError("Bad format with log file demux_{}_bcl2fastq.err".format(demux_id)) - elif self.software == 'bclconvert': + raise RuntimeError( + f"Bad format with log file demux_{demux_id}_bcl2fastq.err" + ) + elif self.software == "bclconvert": errors = 0 warnings = 0 error_and_warning_messages = [] for line in demux_log_content: - if 'ERROR' in line: + if "ERROR" in line: errors += 1 error_and_warning_messages.append(line) - elif 'WARNING' in line: - warnnings += 1 + elif "WARNING" in line: + warnings += 1 error_and_warning_messages.append(line) return errors, warnings, error_and_warning_messages else: @@ -170,50 +206,53 @@ def _get_demux_folder(self): def _get_samplesheet(self): """ - Locate and parse the samplesheet for a run. The idea is that there is a folder in - samplesheet_folders that contains a samplesheet named flowecell_id.csv. + Locate and parse the samplesheet for a run. The idea is that there is a folder in + samplesheet_folders that contains a samplesheet named flowecell_id.csv. """ try: # Only implemented for some, (e.g. NovaSeqXPlus) # Will raise AttributeError if not implemented. 
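# Illustrative, standalone version (not part of the diff) of the fallback year
# derivation and samplesheet lookup in _get_samplesheet here: take "20" plus
# the first two digits of the run id as the year, then look for
# <flowcell_id>.csv under a per-year samplesheets directory. All paths and ids
# below are hypothetical placeholders.
import os

run_id = "230412_A01901_0123_BHXXXXXXX"     # hypothetical
flowcell_id = "HXXXXXXX"                    # hypothetical
samplesheets_dir = "/config/samplesheets"   # hypothetical CONFIG["samplesheets_dir"]

current_year = "20" + run_id[0:2]           # -> "2023"
ssname = os.path.join(samplesheets_dir, current_year, f"{flowcell_id}.csv")
print(ssname)                               # /config/samplesheets/2023/HXXXXXXX.csv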
current_year = self._current_year() except AttributeError: - current_year = '20' + self.id[0:2] + current_year = "20" + self.id[0:2] - samplesheets_dir = os.path.join(self.CONFIG['samplesheets_dir'], - current_year) - ssname = os.path.join(samplesheets_dir, '{}.csv'.format(self.flowcell_id)) + samplesheets_dir = os.path.join(self.CONFIG["samplesheets_dir"], current_year) + ssname = os.path.join(samplesheets_dir, f"{self.flowcell_id}.csv") if os.path.exists(ssname): return ssname else: - raise RuntimeError("not able to find samplesheet {}.csv in {}".format(self.flowcell_id, self.CONFIG['samplesheets_dir'])) + raise RuntimeError( + "not able to find samplesheet {}.csv in {}".format( + self.flowcell_id, self.CONFIG["samplesheets_dir"] + ) + ) def _is_demultiplexing_done(self): - return os.path.exists(os.path.join(self.run_dir, - self._get_demux_folder(), - 'Stats', - 'Stats.json')) + return os.path.exists( + os.path.join(self.run_dir, self._get_demux_folder(), "Stats", "Stats.json") + ) def _is_demultiplexing_started(self): return os.path.exists(os.path.join(self.run_dir, self._get_demux_folder())) def _is_sequencing_done(self): - return os.path.exists(os.path.join(self.run_dir, 'RTAComplete.txt')) and os.path.exists(os.path.join(self.run_dir, 'CopyComplete.txt')) + return os.path.exists( + os.path.join(self.run_dir, "RTAComplete.txt") + ) and os.path.exists(os.path.join(self.run_dir, "CopyComplete.txt")) def get_run_status(self): - """ Return the current status of the run. - """ + """Return the current status of the run.""" demux_started = self._is_demultiplexing_started() demux_done = self._is_demultiplexing_done() sequencing_done = self._is_sequencing_done() if sequencing_done and demux_done: - return 'COMPLETED' # run is done, transfer might be ongoing. + return "COMPLETED" # run is done, transfer might be ongoing. elif sequencing_done and demux_started and not demux_done: - return 'IN_PROGRESS' + return "IN_PROGRESS" elif sequencing_done and not demux_started: - return 'TO_START' + return "TO_START" elif not sequencing_done: - return 'SEQUENCING' + return "SEQUENCING" else: raise RuntimeError("Unexpected status in get_run_status") @@ -249,66 +288,69 @@ def _compute_base_mask(self): raise NotImplementedError("Please Implement this method") def transfer_run(self, t_file, mail_recipients=None): - """ Transfer a run to the analysis server. Will add group R/W permissions to - the run directory in the destination server so that the run can be processed - by any user/account in that group (i.e a functional account...). - :param str t_file: File where to put the transfer information + """Transfer a run to the analysis server. Will add group R/W permissions to + the run directory in the destination server so that the run can be processed + by any user/account in that group (i.e a functional account...). 
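# Standalone sketch (not part of the diff) of how transfer_run assembles its
# rsync command line before passing it to misc.call_external_command; the user,
# host, paths and the single include pattern are hypothetical stand-ins for the
# values read from CONFIG["analysis_server"].
run_dir = "/data/230412_A01901_0123_BHXXXXXXX"  # hypothetical
r_user, r_host = "funcacct", "analysis.example.org"
r_dir = "/proj/data_archive"

command_line = ["rsync", "-LtDrv", "--chmod=g+rw"]
command_line.append("--exclude=Demultiplexing_*/*_*")
command_line.append("--include=*/")
command_line.append("--include=*.fastq.gz")     # stand-in for the configured include list
command_line.extend(["--exclude=*", "--prune-empty-dirs"])
command_line.extend([run_dir, f"{r_user}@{r_host}:{r_dir}"])
print(" ".join(command_line))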
+ :param str t_file: File where to put the transfer information """ # The option -a implies -o and -g which is not the desired behaviour - command_line = ['rsync', '-LtDrv'] + command_line = ["rsync", "-LtDrv"] # Add R/W permissions to the group - command_line.append('--chmod=g+rw') + command_line.append("--chmod=g+rw") # This horrible thing here avoids data dup when we use multiple indexes in a lane/FC command_line.append("--exclude=Demultiplexing_*/*_*") command_line.append("--include=*/") - for to_include in self.CONFIG['analysis_server']['sync']['include']: - command_line.append("--include={}".format(to_include)) + for to_include in self.CONFIG["analysis_server"]["sync"]["include"]: + command_line.append(f"--include={to_include}") command_line.extend(["--exclude=*", "--prune-empty-dirs"]) - r_user = self.CONFIG['analysis_server']['user'] - r_host = self.CONFIG['analysis_server']['host'] - r_dir = self.CONFIG['analysis_server']['sync']['data_archive'] - remote = "{}@{}:{}".format(r_user, r_host, r_dir) + r_user = self.CONFIG["analysis_server"]["user"] + r_host = self.CONFIG["analysis_server"]["host"] + r_dir = self.CONFIG["analysis_server"]["sync"]["data_archive"] + remote = f"{r_user}@{r_host}:{r_dir}" command_line.extend([self.run_dir, remote]) # Create temp file indicating that the run is being transferred try: - open(os.path.join(self.run_dir, 'transferring'), 'w').close() - except IOError as e: - logger.error("Cannot create a file in {}. " - "Check the run name, and the permissions.".format(self.id)) + open(os.path.join(self.run_dir, "transferring"), "w").close() + except OSError as e: + logger.error( + f"Cannot create a file in {self.id}. " + "Check the run name, and the permissions." + ) raise e - started = ("Started transfer of run {} on {}".format(self.id, datetime.now())) + started = f"Started transfer of run {self.id} on {datetime.now()}" logger.info(started) # In this particular case we want to capture the exception because we want # to delete the transfer file try: - msge_text="I am about to transfer with this command \n{}".format(command_line) - logger.info(msge_text) - misc.call_external_command(command_line, with_log_files=True, - prefix="", log_dir=self.run_dir) + msge_text = f"I am about to transfer with this command \n{command_line}" + logger.info(msge_text) + misc.call_external_command( + command_line, with_log_files=True, prefix="", log_dir=self.run_dir + ) except subprocess.CalledProcessError as exception: - os.remove(os.path.join(self.run_dir, 'transferring')) - #Send an email notifying that the transfer failed + os.remove(os.path.join(self.run_dir, "transferring")) + # Send an email notifying that the transfer failed runname = self.id - sbt = ("Rsync of run {} failed".format(runname)) - msg= """ Rsync of data for run {run} has failed! - Raised the following exception: {e} - """.format(run=runname, e=exception) + sbt = f"Rsync of run {runname} failed" + msg = f""" Rsync of data for run {runname} has failed! 
+ Raised the following exception: {exception} + """ if mail_recipients: send_mail(sbt, msg, mail_recipients) raise exception - logger.info('Adding run {} to {}'.format(self.id, t_file)) - with open(t_file, 'a') as tranfer_file: - tsv_writer = csv.writer(tranfer_file, delimiter='\t') + logger.info(f"Adding run {self.id} to {t_file}") + with open(t_file, "a") as tranfer_file: + tsv_writer = csv.writer(tranfer_file, delimiter="\t") tsv_writer.writerow([self.id, str(datetime.now())]) - os.remove(os.path.join(self.run_dir, 'transferring')) + os.remove(os.path.join(self.run_dir, "transferring")) - #Send an email notifying that the transfer was successful + # Send an email notifying that the transfer was successful runname = self.id - sbt = ("Rsync of data for run {} to the analysis cluster has finished".format(runname)) - msg= """ Rsync of data for run {run} to the analysis cluster has finished! + sbt = f"Rsync of data for run {runname} to the analysis cluster has finished" + msg = """ Rsync of data for run {run} to the analysis cluster has finished! The run is available at : https://genomics-status.scilifelab.se/flowcells/{run} """.format(run=runname) @@ -316,52 +358,50 @@ def transfer_run(self, t_file, mail_recipients=None): send_mail(sbt, msg, mail_recipients) def archive_run(self, destination): - """ Move run to the archive folder - :param str destination: the destination folder + """Move run to the archive folder + :param str destination: the destination folder """ if destination and os.path.isdir(destination): - logger.info('archiving run {}'.format(self.id)) + logger.info(f"archiving run {self.id}") shutil.move(self.run_dir, os.path.join(destination, self.id)) else: logger.warning("Cannot move run to archive, destination does not exist") def send_mail(self, sbt, msg, rcp): - """ Sends mail about run completion - """ - already_seen = False + """Sends mail about run completion""" runname = self.id if not sbt: - sbt = "{}".format(runname) + sbt = f"{runname}" misc.send_mail(sbt, msg, rcp) def is_transferred(self, transfer_file): - """ Checks wether a run has been transferred to the analysis server or not. - Returns true in the case in which the tranfer is finished or ongoing. - :param str transfer_file: Path to file with information about transferred runs + """Checks wether a run has been transferred to the analysis server or not. + Returns true in the case in which the tranfer is finished or ongoing. 
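# Minimal standalone sketch (not from the diff) of the transfer-file check done
# by is_transferred here: the file is a two-column tab-separated list of
# <run id>\t<transfer date>, and a run counts as transferred when its id
# appears in the first column. The file name in the usage comment is
# hypothetical.
import csv
import os

def run_in_transfer_file(transfer_file, run_id):
    try:
        with open(transfer_file) as file_handle:
            for row in csv.reader(file_handle, delimiter="\t"):
                if row and row[0] == os.path.basename(run_id):
                    return True
        return False
    except OSError:
        return False

# Usage: run_in_transfer_file("transferred.tsv", "230412_A01901_0123_BHXXXXXXX")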
+ :param str transfer_file: Path to file with information about transferred runs """ try: - with open(transfer_file, 'r') as file_handle: - transfer_file_contents = csv.reader(file_handle, delimiter='\t') + with open(transfer_file) as file_handle: + transfer_file_contents = csv.reader(file_handle, delimiter="\t") for row in transfer_file_contents: # Rows have two columns: run and transfer date if row[0] == os.path.basename(self.id): return True - if os.path.exists(os.path.join(self.run_dir, 'transferring')): + if os.path.exists(os.path.join(self.run_dir, "transferring")): return True return False - except IOError: + except OSError: return False def is_unpooled_lane(self, lane): """ - :param lane: lane identifier - :type lane: string - :rtype: boolean - :returns: True if the samplesheet has one entry for that lane, False otherwise + :param lane: lane identifier + :type lane: string + :rtype: boolean + :returns: True if the samplesheet has one entry for that lane, False otherwise """ count = 0 for l in self.runParserObj.samplesheet.data: - if l['Lane'] == lane: + if l["Lane"] == lane: count += 1 return count == 1 @@ -375,7 +415,7 @@ def get_samples_per_lane(self): ss = self.runParserObj.samplesheet d = {} for l in ss.data: - d[l['Lane']] = l[ss.dfield_snm] + d[l["Lane"]] = l[ss.dfield_snm] return d def _rename_undet(self, lane, samples_per_lane): @@ -388,25 +428,35 @@ def _rename_undet(self, lane, samples_per_lane): :param samples_per_lane: lane:sample dict :type status: dict """ - for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, "Undetermined*L0?{}*".format(lane))): - old_name=os.path.basename(file) - old_name_comps=old_name.split("_") - old_name_comps[1]=old_name_comps[0]# replace S0 with Undetermined - old_name_comps[0]=samples_per_lane[lane]#replace Undetermined with samplename + for file in glob.glob( + os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*") + ): + old_name = os.path.basename(file) + old_name_comps = old_name.split("_") + old_name_comps[1] = old_name_comps[0] # replace S0 with Undetermined + old_name_comps[0] = samples_per_lane[ + lane + ] # replace Undetermined with samplename for index, comp in enumerate(old_name_comps): - if comp.startswith('L00'): - old_name_comps[index]=comp.replace('L00','L01')#adds a 1 as the second lane number in order to differentiate undetermined from normal in piper - - new_name="_".join(old_name_comps) - logger.info("Renaming {} to {}".format(file, os.path.join(os.path.dirname(file), new_name))) + if comp.startswith("L00"): + old_name_comps[index] = comp.replace( + "L00", "L01" + ) # adds a 1 as the second lane number in order to differentiate undetermined from normal in piper + + new_name = "_".join(old_name_comps) + logger.info( + f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}" + ) os.rename(file, os.path.join(os.path.dirname(file), new_name)) def _classify_lanes(self, samplesheets): # Prepare a list for lanes with NoIndex samples noindex_lanes = [] for entry in self.runParserObj.samplesheet.data: - if entry['index'].upper() == 'NOINDEX' or (entry['index'] == '' and entry['index2'] == ''): - noindex_lanes.append(entry['Lane']) + if entry["index"].upper() == "NOINDEX" or ( + entry["index"] == "" and entry["index2"] == "" + ): + noindex_lanes.append(entry["Lane"]) # Prepare a dict with the lane, demux_id and index_length info based on the sub-samplesheets # This is for the purpose of deciding simple_lanes and complex_lanes, plus we should start with the Stats.json file from which demux_id 
for each lane lane_demuxid_indexlength = dict() @@ -414,10 +464,18 @@ def _classify_lanes(self, samplesheets): demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] ssparser = SampleSheetParser(samplesheet) for row in ssparser.data: - if row['Lane'] not in lane_demuxid_indexlength.keys(): - lane_demuxid_indexlength[row['Lane']] = {demux_id: [len(row.get('index','')), len(row.get('index2',''))]} - elif demux_id not in lane_demuxid_indexlength[row['Lane']].keys(): - lane_demuxid_indexlength[row['Lane']][demux_id] = [len(row.get('index','')), len(row.get('index2',''))] + if row["Lane"] not in lane_demuxid_indexlength.keys(): + lane_demuxid_indexlength[row["Lane"]] = { + demux_id: [ + len(row.get("index", "")), + len(row.get("index2", "")), + ] + } + elif demux_id not in lane_demuxid_indexlength[row["Lane"]].keys(): + lane_demuxid_indexlength[row["Lane"]][demux_id] = [ + len(row.get("index", "")), + len(row.get("index2", "")), + ] else: pass @@ -434,7 +492,12 @@ def _classify_lanes(self, samplesheets): # Dual and longer indexes have higher priority if 0 in list(complex_lanes[key].values())[0] and 0 not in vv: complex_lanes[key] = {vk: vv} - elif (0 in list(complex_lanes[key].values())[0] and 0 in vv) or (0 not in list(complex_lanes[key].values())[0] and 0 not in vv): + elif ( + 0 in list(complex_lanes[key].values())[0] and 0 in vv + ) or ( + 0 not in list(complex_lanes[key].values())[0] + and 0 not in vv + ): if sum(vv) > sum(list(complex_lanes[key].values())[0]): complex_lanes[key] = {vk: vv} else: @@ -442,333 +505,622 @@ def _classify_lanes(self, samplesheets): return noindex_lanes, simple_lanes, complex_lanes - def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, legacy_path): + def _process_noindex_sample_with_fake_index_with_single_demux( + self, demux_id, legacy_path + ): demux_folder = os.path.join(self.run_dir, self.demux_dir) sample_counter = 1 - for entry in sorted(self.runParserObj.samplesheet.data, key=lambda k: k['Lane']): - lane = entry['Lane'] - project = entry['Sample_Project'] - sample = entry['Sample_ID'] + for entry in sorted( + self.runParserObj.samplesheet.data, key=lambda k: k["Lane"] + ): + lane = entry["Lane"] + project = entry["Sample_Project"] + sample = entry["Sample_ID"] project_dest = os.path.join(demux_folder, project) if not os.path.exists(project_dest): os.makedirs(project_dest) sample_dest = os.path.join(project_dest, sample) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))): + for file in glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + f"Undetermined*L0?{lane}*", + ) + ): old_name = os.path.basename(file) old_name_comps = old_name.split("_") - new_name_comps = [sample.replace('Sample_',''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:] + new_name_comps = [ + sample.replace("Sample_", ""), + f"S{str(sample_counter)}", + ] + old_name_comps[2:] new_name = "_".join(new_name_comps) os.symlink(file, os.path.join(sample_dest, new_name)) - logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_',''), old_name, new_name)) + logger.info( + "For undet sample {}, renaming {} to {}".format( + sample.replace("Sample_", ""), old_name, new_name + ) + ) sample_counter += 1 # Make a softlink of lane.html - html_report_lane_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports", "html", 
self.flowcell_id, "all", "all", "all", "lane.html") - html_report_lane_dest = os.path.join(demux_folder, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html") + html_report_lane_source = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "lane.html", + ) + html_report_lane_dest = os.path.join( + demux_folder, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "lane.html", + ) if not os.path.isdir(os.path.dirname(html_report_lane_dest)): os.makedirs(os.path.dirname(html_report_lane_dest)) os.symlink(html_report_lane_source, html_report_lane_dest) # Modify the laneBarcode.html file - html_report_laneBarcode = os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), - legacy_path, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "laneBarcode.html" - ) + html_report_laneBarcode = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) html_report_laneBarcode_parser = LaneBarcodeParser(html_report_laneBarcode) lane_project_sample = dict() for entry in html_report_laneBarcode_parser.sample_data: - if entry['Sample'] != 'Undetermined': - lane_project_sample[entry['Lane']] = {'Project': entry['Project'], - 'Sample': entry['Sample'] - } + if entry["Sample"] != "Undetermined": + lane_project_sample[entry["Lane"]] = { + "Project": entry["Project"], + "Sample": entry["Sample"], + } for entry in html_report_laneBarcode_parser.sample_data[:]: - if entry['Sample'] == 'Undetermined': - entry['Project'] = lane_project_sample[entry['Lane']]['Project'] - entry['Sample'] = lane_project_sample[entry['Lane']]['Sample'] + if entry["Sample"] == "Undetermined": + entry["Project"] = lane_project_sample[entry["Lane"]]["Project"] + entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"] else: html_report_laneBarcode_parser.sample_data.remove(entry) - html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data, - key=lambda k: (k['Lane'].lower(), k['Sample'])) - new_html_report_laneBarcode = os.path.join(demux_folder, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "laneBarcode.html" - ) + html_report_laneBarcode_parser.sample_data = sorted( + html_report_laneBarcode_parser.sample_data, + key=lambda k: (k["Lane"].lower(), k["Sample"]), + ) + new_html_report_laneBarcode = os.path.join( + demux_folder, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) _generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser) if not os.path.exists(os.path.join(demux_folder, "Stats")): os.makedirs(os.path.join(demux_folder, "Stats")) # Modify the Stats.json file - stat_json_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json") + stat_json_source = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Stats", + "Stats.json", + ) stat_json_new = os.path.join(demux_folder, "Stats", "Stats.json") with open(stat_json_source) as json_data: data = json.load(json_data) # Fix the sample stats per lane - for entry in data['ConversionResults'][:]: - del entry['DemuxResults'][0]['IndexMetrics'] - entry['DemuxResults'][0].update(entry['Undetermined']) - del entry['Undetermined'] + for entry in data["ConversionResults"][:]: + del 
entry["DemuxResults"][0]["IndexMetrics"] + entry["DemuxResults"][0].update(entry["Undetermined"]) + del entry["Undetermined"] # Reset unknown barcodes list - for entry in data['UnknownBarcodes'][:]: - entry['Barcodes'] = {'unknown': 1} + for entry in data["UnknownBarcodes"][:]: + entry["Barcodes"] = {"unknown": 1} # Write to a new Stats.json file - with open(stat_json_new, 'w') as stat_json_new_file: + with open(stat_json_new, "w") as stat_json_new_file: json.dump(data, stat_json_new_file) - def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_lanes): - elements = [element for element in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) ] + def _process_simple_lane_with_single_demux( + self, demux_id, legacy_path, noindex_lanes + ): + elements = [ + element + for element in os.listdir( + os.path.join(self.run_dir, f"Demultiplexing_{demux_id}") + ) + ] for element in elements: - if "Stats" not in element and "Reports" not in element: #skip this folder and treat it differently to take into account the NoIndex case - source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), element) + if ( + "Stats" not in element and "Reports" not in element + ): # skip this folder and treat it differently to take into account the NoIndex case + source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", element + ) dest = os.path.join(self.run_dir, self.demux_dir, element) os.symlink(source, dest) os.makedirs(os.path.join(self.run_dir, self.demux_dir, "Stats")) # Fetch the lanes that have NoIndex - statsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "*" )) + statsFiles = glob.glob( + os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*" + ) + ) for source in statsFiles: source_name = os.path.split(source)[1] - if source_name not in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]: - lane = os.path.splitext(os.path.split(source)[1])[0][-1] #lane + if source_name not in [ + "DemultiplexingStats.xml", + "AdapterTrimming.txt", + "ConversionStats.xml", + "Stats.json", + ]: + lane = os.path.splitext(os.path.split(source)[1])[0][-1] # lane if lane not in noindex_lanes: - dest = os.path.join(self.run_dir, self.demux_dir, "Stats", source_name) + dest = os.path.join( + self.run_dir, self.demux_dir, "Stats", source_name + ) os.symlink(source, dest) - for file in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]: - source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", file) + for file in [ + "DemultiplexingStats.xml", + "AdapterTrimming.txt", + "ConversionStats.xml", + "Stats.json", + ]: + source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file + ) dest = os.path.join(self.run_dir, self.demux_dir, "Stats", file) os.symlink(source, dest) - source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports") + source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports" + ) dest = os.path.join(self.run_dir, self.demux_dir, "Reports") if os.path.exists(dest): try: os.rmdir(dest) - except NotADirectoryError as e: + except NotADirectoryError: os.unlink(dest) os.symlink(source, dest) - def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode): + def 
_fix_html_reports_for_complex_lanes( + self, + demux_folder, + index_cycles, + complex_lanes, + noindex_lanes, + html_reports_lane, + html_reports_laneBarcode, + ): # Start with the lane html_report_lane_parser = None for next_html_report_lane in html_reports_lane: if html_report_lane_parser is None: html_report_lane_parser = LaneBarcodeParser(next_html_report_lane) else: - lanesInReport = [Lane['Lane'] for Lane in html_report_lane_parser.sample_data] + lanesInReport = [ + Lane["Lane"] for Lane in html_report_lane_parser.sample_data + ] next_html_report_lane_parser = LaneBarcodeParser(next_html_report_lane) for entry in next_html_report_lane_parser.sample_data: - if not entry['Lane'] in lanesInReport: + if entry["Lane"] not in lanesInReport: # If this is a new lane not included before html_report_lane_parser.sample_data.append(entry) # Now all lanes have been inserted # NumberReads for total lane cluster/yields and total sample cluster/yields - NumberReads_Summary = dict() + self.NumberReads_Summary = dict() # The numbers in Flowcell Summary also need to be aggregated if multiple demultiplexing is done Clusters_Raw = 0 Clusters_PF = 0 Yield_Mbases = 0 for entry in html_report_lane_parser.sample_data: # Update NumberReads for total lane clusters - NumberReads_Summary[entry['Lane']] = {'total_lane_cluster': int(entry['PF Clusters'].replace(',', '')), - 'total_lane_yield': int(entry['Yield (Mbases)'].replace(',', ''))} - Clusters_Raw += int(int(entry['PF Clusters'].replace(',', '')) / float(entry['% PFClusters']) * 100) - Clusters_PF += int(entry['PF Clusters'].replace(',', '')) - Yield_Mbases += int(entry['Yield (Mbases)'].replace(',', '')) - if entry['Lane'] in complex_lanes.keys(): - entry['% Perfectbarcode'] = None - entry['% One mismatchbarcode'] = None + self.NumberReads_Summary[entry["Lane"]] = { + "total_lane_cluster": int(entry["PF Clusters"].replace(",", "")), + "total_lane_yield": int(entry["Yield (Mbases)"].replace(",", "")), + } + Clusters_Raw += int( + int(entry["PF Clusters"].replace(",", "")) + / float(entry["% PFClusters"]) + * 100 + ) + Clusters_PF += int(entry["PF Clusters"].replace(",", "")) + Yield_Mbases += int(entry["Yield (Mbases)"].replace(",", "")) + if entry["Lane"] in complex_lanes.keys(): + entry["% Perfectbarcode"] = None + entry["% One mismatchbarcode"] = None # Update the values in Flowcell Summary - html_report_lane_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw) - html_report_lane_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF) - html_report_lane_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases) + html_report_lane_parser.flowcell_data["Clusters (Raw)"] = f"{Clusters_Raw:,}" + html_report_lane_parser.flowcell_data["Clusters(PF)"] = f"{Clusters_PF:,}" + html_report_lane_parser.flowcell_data["Yield (MBases)"] = f"{Yield_Mbases:,}" # Add lanes not present in this demux # Create the new lane.html - new_html_report_lane_dir = _create_folder_structure(demux_folder, ['Reports', 'html', self.flowcell_id, 'all', 'all', 'all']) - new_html_report_lane = os.path.join(new_html_report_lane_dir, 'lane.html') + new_html_report_lane_dir = _create_folder_structure( + demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"] + ) + new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html") _generate_lane_html(new_html_report_lane, html_report_lane_parser) # Generate the laneBarcode html_report_laneBarcode_parser = None for next_html_report_laneBarcode in html_reports_laneBarcode: if 
html_report_laneBarcode_parser is None: - html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode) + html_report_laneBarcode_parser = LaneBarcodeParser( + next_html_report_laneBarcode + ) else: # No need to check samples occuring in more than one file as it would be spotted while softlinking - next_html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode) + next_html_report_laneBarcode_parser = LaneBarcodeParser( + next_html_report_laneBarcode + ) for entry in next_html_report_laneBarcode_parser.sample_data: html_report_laneBarcode_parser.sample_data.append(entry) # For complex lanes, set all numbers of undetermined to 0. And only keep one such entry - constant_keys = ['Lane', 'Barcode sequence', 'Project', 'Sample'] + constant_keys = ["Lane", "Barcode sequence", "Project", "Sample"] modified_complex_lanes = [] for entry in html_report_laneBarcode_parser.sample_data: - if entry['Lane'] in list(complex_lanes.keys()) and entry['Project'] in 'default': - if entry['Lane'] not in modified_complex_lanes: + if ( + entry["Lane"] in list(complex_lanes.keys()) + and entry["Project"] in "default" + ): + if entry["Lane"] not in modified_complex_lanes: for key in entry.keys(): if key not in constant_keys: - entry[key] = '0' - modified_complex_lanes.append(entry['Lane']) + entry[key] = "0" + modified_complex_lanes.append(entry["Lane"]) else: html_report_laneBarcode_parser.sample_data.remove(entry) # Update NumberReads for total sample yields for entry in html_report_laneBarcode_parser.sample_data: - if 'total_sample_cluster' not in NumberReads_Summary[entry['Lane']].keys(): - NumberReads_Summary[entry['Lane']]['total_sample_cluster'] = 0 - NumberReads_Summary[entry['Lane']]['total_sample_yield'] = 0 - if entry['Project'] != 'default': - NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', '')) - NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', '')) + if ( + "total_sample_cluster" + not in self.NumberReads_Summary[entry["Lane"]].keys() + ): + self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] = 0 + self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] = 0 + if entry["Project"] != "default": + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_cluster" + ] += int(entry["PF Clusters"].replace(",", "")) + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_yield" + ] += int(entry["Yield (Mbases)"].replace(",", "")) else: - if entry['Project'] != 'default': - NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', '')) - NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', '')) + if entry["Project"] != "default": + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_cluster" + ] += int(entry["PF Clusters"].replace(",", "")) + self.NumberReads_Summary[entry["Lane"]][ + "total_sample_yield" + ] += int(entry["Yield (Mbases)"].replace(",", "")) # Calculate the numbers clusters/yields of undet reads - for key, value in NumberReads_Summary.items(): - value['undet_cluster'] = value['total_lane_cluster'] - value['total_sample_cluster'] - value['undet_yield'] = value['total_lane_yield'] - value['total_sample_yield'] + for key, value in self.NumberReads_Summary.items(): + value["undet_cluster"] = ( + value["total_lane_cluster"] - value["total_sample_cluster"] + ) + value["undet_yield"] = ( + value["total_lane_yield"] - value["total_sample_yield"] + 
) # Update the cluster/yield info of undet for complex lanes for entry in html_report_laneBarcode_parser.sample_data: - if entry['Project'] == 'default' and entry['Lane'] in complex_lanes.keys(): - entry['PF Clusters'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_cluster']) - entry['Yield (Mbases)'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_yield']) + if entry["Project"] == "default" and entry["Lane"] in complex_lanes.keys(): + entry["PF Clusters"] = "{:,}".format( + self.NumberReads_Summary[entry["Lane"]]["undet_cluster"] + ) + entry["Yield (Mbases)"] = "{:,}".format( + self.NumberReads_Summary[entry["Lane"]]["undet_yield"] + ) # Fix special case that when we assign fake indexes for NoIndex samples if noindex_lanes and index_cycles != [0, 0]: lane_project_sample = dict() for entry in html_report_laneBarcode_parser.sample_data: - if entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined': - lane_project_sample[entry['Lane']] = {'Project': entry['Project'], - 'Sample': entry['Sample']} + if entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined": + lane_project_sample[entry["Lane"]] = { + "Project": entry["Project"], + "Sample": entry["Sample"], + } for entry in html_report_laneBarcode_parser.sample_data[:]: - if entry['Lane'] in noindex_lanes and entry['Sample'] == 'Undetermined': - entry['Project'] = lane_project_sample[entry['Lane']]['Project'] - entry['Sample'] = lane_project_sample[entry['Lane']]['Sample'] - elif entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined': + if entry["Lane"] in noindex_lanes and entry["Sample"] == "Undetermined": + entry["Project"] = lane_project_sample[entry["Lane"]]["Project"] + entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"] + elif ( + entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined" + ): html_report_laneBarcode_parser.sample_data.remove(entry) # Sort sample_data: first by lane then by sample ID - html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data, - key=lambda k: (k['Lane'].lower(), k['Sample'])) + html_report_laneBarcode_parser.sample_data = sorted( + html_report_laneBarcode_parser.sample_data, + key=lambda k: (k["Lane"].lower(), k["Sample"]), + ) # Update the values in Flowcell Summary - html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw) - html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF) - html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases) + html_report_laneBarcode_parser.flowcell_data[ + "Clusters (Raw)" + ] = f"{Clusters_Raw:,}" + html_report_laneBarcode_parser.flowcell_data[ + "Clusters(PF)" + ] = f"{Clusters_PF:,}" + html_report_laneBarcode_parser.flowcell_data[ + "Yield (MBases)" + ] = f"{Yield_Mbases:,}" # Generate the new report for laneBarcode.html - new_html_report_laneBarcode = os.path.join(new_html_report_lane_dir, 'laneBarcode.html') + new_html_report_laneBarcode = os.path.join( + new_html_report_lane_dir, "laneBarcode.html" + ) _generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser) - return NumberReads_Summary - - def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes, NumberReads_Summary): + def _fix_demultiplexingstats_xml_dir( + self, + demux_folder, + stats_json, + samplesheets, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ): # Create 
the DemultiplexingStats.xml (empty it is here only to say thay demux is done) - DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ['Stats']) + DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ["Stats"]) # For creating DemuxSummary.txt files for complex lanes DemuxSummaryFiles_complex_lanes = dict() # Generate the Stats.json - with open(os.path.join(DemultiplexingStats_xml_dir, 'Stats.json'), 'w') as json_data_cumulative: + with open( + os.path.join(DemultiplexingStats_xml_dir, "Stats.json"), "w" + ) as json_data_cumulative: stats_list = {} for stat_json in stats_json: - demux_id = re.findall('Demultiplexing_([0-9])', stat_json)[0] + demux_id = re.findall("Demultiplexing_([0-9])", stat_json)[0] with open(stat_json) as json_data_partial: data = json.load(json_data_partial) if len(stats_list) == 0: # First time I do this - stats_list['RunNumber'] = data['RunNumber'] - stats_list['Flowcell'] = data['Flowcell'] - stats_list['RunId'] = data['RunId'] - stats_list['ConversionResults'] = data['ConversionResults'] - stats_list['ReadInfosForLanes'] = data['ReadInfosForLanes'] - stats_list['UnknownBarcodes'] = [] + stats_list["RunNumber"] = data["RunNumber"] + stats_list["Flowcell"] = data["Flowcell"] + stats_list["RunId"] = data["RunId"] + stats_list["ConversionResults"] = data["ConversionResults"] + stats_list["ReadInfosForLanes"] = data["ReadInfosForLanes"] + stats_list["UnknownBarcodes"] = [] else: # Update only the importat fields - lanes_present_in_stats_json = [entry['LaneNumber'] for entry in stats_list['ConversionResults']] - for ReadInfosForLanes_lane in data['ReadInfosForLanes']: - if ReadInfosForLanes_lane['LaneNumber'] not in lanes_present_in_stats_json: - stats_list['ReadInfosForLanes'].extend([ReadInfosForLanes_lane]) - for ConversionResults_lane in data['ConversionResults']: - if ConversionResults_lane['LaneNumber'] in lanes_present_in_stats_json and str(ConversionResults_lane['LaneNumber']) in complex_lanes.keys(): + lanes_present_in_stats_json = [ + entry["LaneNumber"] + for entry in stats_list["ConversionResults"] + ] + for ReadInfosForLanes_lane in data["ReadInfosForLanes"]: + if ( + ReadInfosForLanes_lane["LaneNumber"] + not in lanes_present_in_stats_json + ): + stats_list["ReadInfosForLanes"].extend( + [ReadInfosForLanes_lane] + ) + for ConversionResults_lane in data["ConversionResults"]: + if ( + ConversionResults_lane["LaneNumber"] + in lanes_present_in_stats_json + and str(ConversionResults_lane["LaneNumber"]) + in complex_lanes.keys() + ): # For complex lanes, we set all stats to 0, except for read number and yield which will use values from NumberReads_Summary - ConversionResults_lane['Undetermined']['NumberReads'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_cluster'] - ConversionResults_lane['Undetermined']['Yield'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_yield']*1000000 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['QualityScoreSum'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['TrimmedBases'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['Yield'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][0]['YieldQ30'] = 0 - if len([r for r in self.runParserObj.runinfo.data['Reads'] if r['IsIndexedRead'] == 'N']) == 2: - ConversionResults_lane['Undetermined']['ReadMetrics'][1]['QualityScoreSum'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][1]['TrimmedBases'] = 0 - 
ConversionResults_lane['Undetermined']['ReadMetrics'][1]['Yield'] = 0 - ConversionResults_lane['Undetermined']['ReadMetrics'][1]['YieldQ30'] = 0 + ConversionResults_lane["Undetermined"][ + "NumberReads" + ] = self.NumberReads_Summary[ + str(ConversionResults_lane["LaneNumber"]) + ]["undet_cluster"] + ConversionResults_lane["Undetermined"]["Yield"] = ( + self.NumberReads_Summary[ + str(ConversionResults_lane["LaneNumber"]) + ]["undet_yield"] + * 1000000 + ) + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + ]["QualityScoreSum"] = 0 + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + ]["TrimmedBases"] = 0 + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + ]["Yield"] = 0 + ConversionResults_lane["Undetermined"]["ReadMetrics"][ + 0 + ]["YieldQ30"] = 0 + if ( + len( + [ + r + for r in self.runParserObj.runinfo.data[ + "Reads" + ] + if r["IsIndexedRead"] == "N" + ] + ) + == 2 + ): + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["QualityScoreSum"] = 0 + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["TrimmedBases"] = 0 + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["Yield"] = 0 + ConversionResults_lane["Undetermined"][ + "ReadMetrics" + ][1]["YieldQ30"] = 0 # Find the list containing info for this lane #TODO: can lane_to_update be removed? - lane_to_update = [entry for entry in stats_list['ConversionResults'] if entry['LaneNumber'] == ConversionResults_lane['LaneNumber']][0] - lane_to_update['DemuxResults'].extend(ConversionResults_lane['DemuxResults']) - lane_to_update['Undetermined'] = ConversionResults_lane['Undetermined'] + lane_to_update = [ + entry + for entry in stats_list["ConversionResults"] + if entry["LaneNumber"] + == ConversionResults_lane["LaneNumber"] + ][0] + lane_to_update["DemuxResults"].extend( + ConversionResults_lane["DemuxResults"] + ) + lane_to_update["Undetermined"] = ConversionResults_lane[ + "Undetermined" + ] else: - stats_list['ConversionResults'].extend([ConversionResults_lane]) - - for unknown_barcode_lane in data['UnknownBarcodes']: - if str(unknown_barcode_lane['Lane']) in simple_lanes.keys(): - stats_list['UnknownBarcodes'].extend([unknown_barcode_lane]) - elif str(unknown_barcode_lane['Lane']) in complex_lanes.keys(): - if list(complex_lanes[str(unknown_barcode_lane['Lane'])].keys())[0] == demux_id: + stats_list["ConversionResults"].extend( + [ConversionResults_lane] + ) + + for unknown_barcode_lane in data["UnknownBarcodes"]: + if str(unknown_barcode_lane["Lane"]) in simple_lanes.keys(): + stats_list["UnknownBarcodes"].extend([unknown_barcode_lane]) + elif str(unknown_barcode_lane["Lane"]) in complex_lanes.keys(): + if ( + list( + complex_lanes[ + str(unknown_barcode_lane["Lane"]) + ].keys() + )[0] + == demux_id + ): # First have the list of unknown indexes from the top priority demux run full_list_unknownbarcodes = unknown_barcode_lane # Remove the samples involved in the other samplesheets for samplesheet in samplesheets: - demux_id_ss = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] + demux_id_ss = os.path.splitext( + os.path.split(samplesheet)[1] + )[0].split("_")[1] if demux_id_ss != demux_id: ssparser = SampleSheetParser(samplesheet) - ssparser_data_lane = [row for row in ssparser.data if row['Lane'] == str(unknown_barcode_lane['Lane'])] + ssparser_data_lane = [ + row + for row in ssparser.data + if row["Lane"] + == str(unknown_barcode_lane["Lane"]) + ] for row in ssparser_data_lane: - sample_idx1 = row.get('index','') - sample_idx2 = 
row.get('index2','') - idx_copy = tuple(full_list_unknownbarcodes['Barcodes'].keys()) + sample_idx1 = row.get("index", "") + sample_idx2 = row.get("index2", "") + idx_copy = tuple( + full_list_unknownbarcodes[ + "Barcodes" + ].keys() + ) for idx in idx_copy: - unknownbarcode_idx1 = idx.split('+')[0] if '+' in idx else idx - unknownbarcode_idx2 = idx.split('+')[1] if '+' in idx else '' + unknownbarcode_idx1 = ( + idx.split("+")[0] + if "+" in idx + else idx + ) + unknownbarcode_idx2 = ( + idx.split("+")[1] + if "+" in idx + else "" + ) if sample_idx1 and sample_idx2: - comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)] - comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx2) else sample_idx2[:len(unknownbarcode_idx2)] - if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)] and comparepart_idx2 == unknownbarcode_idx2[:len(comparepart_idx2)]: - del full_list_unknownbarcodes['Barcodes'][idx] + comparepart_idx1 = ( + sample_idx1 + if len(sample_idx1) + <= len(unknownbarcode_idx1) + else sample_idx1[ + : len(unknownbarcode_idx1) + ] + ) + comparepart_idx2 = ( + sample_idx2 + if len(sample_idx2) + <= len(unknownbarcode_idx2) + else sample_idx2[ + : len(unknownbarcode_idx2) + ] + ) + if ( + comparepart_idx1 + == unknownbarcode_idx1[ + : len(comparepart_idx1) + ] + and comparepart_idx2 + == unknownbarcode_idx2[ + : len(comparepart_idx2) + ] + ): + del full_list_unknownbarcodes[ + "Barcodes" + ][idx] elif sample_idx1 and not sample_idx2: - comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)] - if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)]: - del full_list_unknownbarcodes['Barcodes'][idx] + comparepart_idx1 = ( + sample_idx1 + if len(sample_idx1) + <= len(unknownbarcode_idx1) + else sample_idx1[ + : len(unknownbarcode_idx1) + ] + ) + if ( + comparepart_idx1 + == unknownbarcode_idx1[ + : len(comparepart_idx1) + ] + ): + del full_list_unknownbarcodes[ + "Barcodes" + ][idx] elif not sample_idx1 and sample_idx2: - comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx1) else sample_idx2[:len(unknownbarcode_idx1)] - if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx2)]: - del full_list_unknownbarcodes['Barcodes'][idx] - stats_list['UnknownBarcodes'].extend([full_list_unknownbarcodes]) - DemuxSummaryFiles_complex_lanes[str(unknown_barcode_lane['Lane'])] = full_list_unknownbarcodes + comparepart_idx2 = ( + sample_idx2 + if len(sample_idx2) + <= len(unknownbarcode_idx1) + else sample_idx2[ + : len(unknownbarcode_idx1) + ] + ) + if ( + comparepart_idx1 + == unknownbarcode_idx1[ + : len(comparepart_idx2) + ] + ): + del full_list_unknownbarcodes[ + "Barcodes" + ][idx] + stats_list["UnknownBarcodes"].extend( + [full_list_unknownbarcodes] + ) + DemuxSummaryFiles_complex_lanes[ + str(unknown_barcode_lane["Lane"]) + ] = full_list_unknownbarcodes else: pass # Fix special case that when we assign fake indexes for NoIndex samples if noindex_lanes and index_cycles != [0, 0]: - for entry in stats_list['ConversionResults'][:]: - if str(entry['LaneNumber']) in noindex_lanes: - del entry['DemuxResults'][0]['IndexMetrics'] - entry['DemuxResults'][0].update(entry['Undetermined']) - del entry['Undetermined'] + for entry in stats_list["ConversionResults"][:]: + if str(entry["LaneNumber"]) in noindex_lanes: + del entry["DemuxResults"][0]["IndexMetrics"] + 
entry["DemuxResults"][0].update(entry["Undetermined"]) + del entry["Undetermined"] # Reset unknown barcodes list - for entry in stats_list['UnknownBarcodes'][:]: - if str(entry['Lane']) in noindex_lanes: - entry['Barcodes'] = {'unknown': 1} + for entry in stats_list["UnknownBarcodes"][:]: + if str(entry["Lane"]) in noindex_lanes: + entry["Barcodes"] = {"unknown": 1} # Write the final version of Stats.json file json.dump(stats_list, json_data_cumulative) @@ -776,100 +1128,161 @@ def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheet # Create DemuxSummary.txt files for complex lanes if len(DemuxSummaryFiles_complex_lanes) > 0: for key, value in DemuxSummaryFiles_complex_lanes.items(): - with open(os.path.join(DemultiplexingStats_xml_dir, 'DemuxSummaryF1L{}.txt'.format(key)), 'w') as DemuxSummaryFile: - DemuxSummaryFile.write('### Most Popular Unknown Index Sequences\n') - DemuxSummaryFile.write('### Columns: Index_Sequence Hit_Count\n') - for idx, count in value['Barcodes'].items(): - DemuxSummaryFile.write('{}\t{}\n'.format(idx, count)) - - open(os.path.join(DemultiplexingStats_xml_dir, 'DemultiplexingStats.xml'), 'a').close() - - def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes): + with open( + os.path.join( + DemultiplexingStats_xml_dir, f"DemuxSummaryF1L{key}.txt" + ), + "w", + ) as DemuxSummaryFile: + DemuxSummaryFile.write("### Most Popular Unknown Index Sequences\n") + DemuxSummaryFile.write("### Columns: Index_Sequence Hit_Count\n") + for idx, count in value["Barcodes"].items(): + DemuxSummaryFile.write(f"{idx}\t{count}\n") + + open( + os.path.join(DemultiplexingStats_xml_dir, "DemultiplexingStats.xml"), "a" + ).close() + + def _process_demux_with_complex_lanes( + self, + demux_folder, + samplesheets, + legacy_path, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ): html_reports_lane = [] html_reports_laneBarcode = [] stats_json = [] for samplesheet in samplesheets: ssparser = SampleSheetParser(samplesheet) demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1] - html_report_lane = os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), - legacy_path, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "lane.html" - ) + html_report_lane = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "lane.html", + ) if os.path.exists(html_report_lane): html_reports_lane.append(html_report_lane) else: - raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_lane)) - - html_report_laneBarcode = os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), - legacy_path, - "Reports", - "html", - self.flowcell_id, - "all", - "all", - "all", - "laneBarcode.html" - ) + raise RuntimeError( + f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing" + ) + + html_report_laneBarcode = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Reports", + "html", + self.flowcell_id, + "all", + "all", + "all", + "laneBarcode.html", + ) if os.path.exists(html_report_laneBarcode): html_reports_laneBarcode.append(html_report_laneBarcode) else: - raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_laneBarcode)) - - stat_json = 
os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json") + raise RuntimeError( + f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing" + ) + + stat_json = os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Stats", + "Stats.json", + ) if os.path.exists(stat_json): stats_json.append(stat_json) else: - raise RuntimeError("Not able to find Stats.json report {}: possible cause is problem in demultiplexing".format(stat_json)) + raise RuntimeError( + f"Not able to find Stats.json report {stat_json}: possible cause is problem in demultiplexing" + ) # Aggregate fastq lanes_samples = dict() for row in ssparser.data: - if row['Lane'] not in lanes_samples.keys(): - lanes_samples[row['Lane']] = [row['Sample_Name']] + if row["Lane"] not in lanes_samples.keys(): + lanes_samples[row["Lane"]] = [row["Sample_Name"]] else: - lanes_samples[row['Lane']].append(row['Sample_Name']) + lanes_samples[row["Lane"]].append(row["Sample_Name"]) # Special case that when we assign fake indexes for NoIndex samples - if (set(list(lanes_samples.keys())) & set(noindex_lanes)) and index_cycles != [0, 0]: + if ( + set(list(lanes_samples.keys())) & set(noindex_lanes) + ) and index_cycles != [0, 0]: sample_counter = 1 - for entry in sorted(ssparser.data, key=lambda k: k['Lane']): - lane = entry['Lane'] - project = entry['Sample_Project'] - sample = entry['Sample_ID'] + for entry in sorted(ssparser.data, key=lambda k: k["Lane"]): + lane = entry["Lane"] + project = entry["Sample_Project"] + sample = entry["Sample_ID"] project_dest = os.path.join(demux_folder, project) if not os.path.exists(project_dest): os.makedirs(project_dest) sample_dest = os.path.join(project_dest, sample) if not os.path.exists(sample_dest): os.makedirs(sample_dest) - for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))): + for file in glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + f"Undetermined*L0?{lane}*", + ) + ): old_name = os.path.basename(file) old_name_comps = old_name.split("_") - new_name_comps = [sample.replace('Sample_', ''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:] + new_name_comps = [ + sample.replace("Sample_", ""), + f"S{str(sample_counter)}", + ] + old_name_comps[2:] new_name = "_".join(new_name_comps) os.symlink(file, os.path.join(sample_dest, new_name)) - logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_', ''), old_name, new_name)) + logger.info( + "For undet sample {}, renaming {} to {}".format( + sample.replace("Sample_", ""), old_name, new_name + ) + ) sample_counter += 1 # Ordinary cases else: - projects = [project for project in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) if os.path.isdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project))] + projects = [ + project + for project in os.listdir( + os.path.join(self.run_dir, f"Demultiplexing_{demux_id}") + ) + if os.path.isdir( + os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", project + ) + ) + ] for project in projects: if project in "Reports" or project in "Stats": continue - project_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project) + project_source = os.path.join( + self.run_dir, f"Demultiplexing_{demux_id}", project + ) project_dest = os.path.join(demux_folder, project) if not os.path.exists(project_dest): # There might 
be project seqeunced with multiple index lengths os.makedirs(project_dest) - samples = [sample for sample in os.listdir(project_source) if os.path.isdir(os.path.join(project_source, sample))] + samples = [ + sample + for sample in os.listdir(project_source) + if os.path.isdir(os.path.join(project_source, sample)) + ] for sample in samples: sample_source = os.path.join(project_source, sample) sample_dest = os.path.join(project_dest, sample) @@ -877,13 +1290,31 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p # There should never be the same sample sequenced with different index length, # however a sample might be pooled in several lanes and therefore sequenced using different samplesheets os.makedirs(sample_dest) - fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*")) + fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*")) for fastqfile in fastqfiles: - os.symlink(fastqfile, os.path.join(sample_dest, os.path.split(fastqfile)[1])) + os.symlink( + fastqfile, + os.path.join(sample_dest, os.path.split(fastqfile)[1]), + ) # Copy fastq files for undetermined and the undetermined stats for simple lanes only lanes_in_sub_samplesheet = [] - header = ['[Header]','[Data]','FCID','Lane', 'Sample_ID', 'Sample_Name', 'Sample_Ref', 'index', 'index2', 'Description', 'Control', 'Recipe', 'Operator', 'Sample_Project'] - with open(samplesheet, mode='r') as sub_samplesheet_file: + header = [ + "[Header]", + "[Data]", + "FCID", + "Lane", + "Sample_ID", + "Sample_Name", + "Sample_Ref", + "index", + "index2", + "Description", + "Control", + "Recipe", + "Operator", + "Sample_Project", + ] + with open(samplesheet) as sub_samplesheet_file: sub_samplesheet_reader = csv.reader(sub_samplesheet_file) for row in sub_samplesheet_reader: if row[0] not in header: @@ -891,68 +1322,120 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p lanes_in_sub_samplesheet = list(set(lanes_in_sub_samplesheet)) for lane in lanes_in_sub_samplesheet: if lane in simple_lanes.keys(): - undetermined_fastq_files = glob.glob(os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), - "Undetermined_S0_L00{}*.fastq*".format(lane))) # Contains only simple lanes undetermined + undetermined_fastq_files = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + f"Undetermined_S0_L00{lane}*.fastq*", + ) + ) # Contains only simple lanes undetermined for fastqfile in undetermined_fastq_files: - os.symlink(fastqfile, os.path.join(demux_folder, os.path.split(fastqfile)[1])) - DemuxSummaryFiles = glob.glob(os.path.join(self.run_dir, - "Demultiplexing_{}".format(demux_id), - legacy_path, - "Stats", - "*L{}*txt".format(lane))) + os.symlink( + fastqfile, + os.path.join(demux_folder, os.path.split(fastqfile)[1]), + ) + DemuxSummaryFiles = glob.glob( + os.path.join( + self.run_dir, + f"Demultiplexing_{demux_id}", + legacy_path, + "Stats", + f"*L{lane}*txt", + ) + ) if not os.path.exists(os.path.join(demux_folder, "Stats")): os.makedirs(os.path.join(demux_folder, "Stats")) for DemuxSummaryFile in DemuxSummaryFiles: - os.symlink(DemuxSummaryFile, os.path.join(demux_folder, "Stats", os.path.split(DemuxSummaryFile)[1])) + os.symlink( + DemuxSummaryFile, + os.path.join( + demux_folder, + "Stats", + os.path.split(DemuxSummaryFile)[1], + ), + ) return html_reports_lane, html_reports_laneBarcode, stats_json def _aggregate_demux_results_simple_complex(self): runSetup = self.runParserObj.runinfo.get_read_configuration() - demux_folder = 
os.path.join(self.run_dir , self.demux_dir) + demux_folder = os.path.join(self.run_dir, self.demux_dir) samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) - if self.software == 'bcl2fastq': - legacy_path = '' - elif self.software == 'bclconvert': - legacy_path = "Reports/{}".format(self.legacy_dir) + if self.software == "bcl2fastq": + legacy_path = "" + elif self.software == "bclconvert": + legacy_path = f"Reports/{self.legacy_dir}" else: raise RuntimeError("Unrecognized software!") index_cycles = [0, 0] for read in runSetup: - if read['IsIndexedRead'] == 'Y': - if int(read['Number']) == 2: - index_cycles[0] = int(read['NumCycles']) + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) else: - index_cycles[1] = int(read['NumCycles']) + index_cycles[1] = int(read["NumCycles"]) # Classify lanes in samplesheets - (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes(samplesheets) + (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes( + samplesheets + ) # Case with only one sub-demultiplexing if len(complex_lanes) == 0 and len(samplesheets) == 1: - demux_id = "0" # in this case this is the only demux dir + demux_id = "0" # in this case this is the only demux dir # Special case that when we assign fake indexes for NoIndex samples if noindex_lanes and index_cycles != [0, 0]: # We first softlink the FastQ files of undet as the FastQ files of samples - self._process_noindex_sample_with_fake_index_with_single_demux(demux_id, legacy_path) + self._process_noindex_sample_with_fake_index_with_single_demux( + demux_id, legacy_path + ) # This is the simple case, Demultiplexing dir is simply a symlink to the only sub-demultiplexing dir else: - self._process_simple_lane_with_single_demux(demux_id, legacy_path, noindex_lanes) + self._process_simple_lane_with_single_demux( + demux_id, legacy_path, noindex_lanes + ) return True # Case with multiple sub-demultiplexings - (html_reports_lane, html_reports_laneBarcode, stats_json) = self._process_demux_with_complex_lanes(demux_folder, samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes) + ( + html_reports_lane, + html_reports_laneBarcode, + stats_json, + ) = self._process_demux_with_complex_lanes( + demux_folder, + samplesheets, + legacy_path, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ) # Create the html reports - NumberReads_Summary = self._fix_html_reports_for_complex_lanes(demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode) + self._fix_html_reports_for_complex_lanes( + demux_folder, + index_cycles, + complex_lanes, + noindex_lanes, + html_reports_lane, + html_reports_laneBarcode, + ) # Fix contents under the DemultiplexingStats folder - self._fix_demultiplexingstats_xml_dir(demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes, NumberReads_Summary) + self._fix_demultiplexingstats_xml_dir( + demux_folder, + stats_json, + samplesheets, + index_cycles, + simple_lanes, + complex_lanes, + noindex_lanes, + ) return True + def _create_folder_structure(root, dirs): """Creates a fodler stucture rooted in root usinf all dirs listed in dirs (a list) returns the path to the deepest directory @@ -964,49 +1447,56 @@ def _create_folder_structure(root, dirs): os.makedirs(path) return path + def _generate_lane_html(html_file, html_report_lane_parser): - with open(html_file, 'w') as html: + with open(html_file, "w") as html: # 
HEADER
-        html.write('<html>\n')
-        html.write('<body>\n')
+        html.write("<html>\n")
+        html.write("<body>\n")
         html.write('<table width="100%"><tr>\n')
-        html.write('<td><p>C6L1WANXX /\n')
-        html.write('        [all projects] /\n')
-        html.write('        [all samples] /\n')
-        html.write('        [all barcodes]</p></td>\n')
-        html.write('<td><a href="laneBarcode.html">show barcodes</a></td>\n')
-        html.write('</tr></table>\n')
+        html.write("<td><p>C6L1WANXX /\n")
+        html.write("        [all projects] /\n")
+        html.write("        [all samples] /\n")
+        html.write("        [all barcodes]</p></td>\n")
+        html.write(
+            '<td><a href="laneBarcode.html">show barcodes</a></td>\n'
+        )
+        html.write("</tr></table>\n")
         # FLOWCELL SUMMARY TABLE
-        html.write('<h2>Flowcell Summary</h2>\n')
+        html.write("<h2>Flowcell Summary</h2>\n")
         html.write('<table border="1">\n')
-        html.write('<tr>\n')
+        html.write("<tr>\n")
         fc_keys = sorted(list(html_report_lane_parser.flowcell_data.keys()))
         for key in fc_keys:
-            html.write('<th>{}</th>\n'.format(key))
-        html.write('</tr>\n')
-        html.write('<tr>\n')
+            html.write(f"<th>{key}</th>\n")
+        html.write("</tr>\n")
+        html.write("<tr>\n")
         for key in fc_keys:
-            html.write('<td>{}</td>\n'.format(html_report_lane_parser.flowcell_data[key]))
-        html.write('</tr>\n')
-        html.write('</table>\n')
+            html.write(f"<td>{html_report_lane_parser.flowcell_data[key]}</td>\n")
+        html.write("</tr>\n")
+        html.write("</table>\n")
         # LANE SUMMARY TABLE
-        html.write('<h2>Lane Summary</h2>\n')
+        html.write("<h2>Lane Summary</h2>\n")
         html.write('<table border="1">\n')
-        html.write('<tr>\n')
+        html.write("<tr>\n")
         lane_keys = sorted(list(html_report_lane_parser.sample_data[0].keys()))
         for key in lane_keys:
-            html.write('<th>{}</th>\n'.format(key))
-        html.write('</tr>\n')
+            html.write(f"<th>{key}</th>\n")
+        html.write("</tr>\n")
         for sample in html_report_lane_parser.sample_data:
-            html.write('<tr>\n')
+            html.write("<tr>\n")
             for key in lane_keys:
-                html.write('<td>{}</td>\n'.format(sample[key]))
-            html.write('</tr>\n')
-        html.write('</table>\n')
+                html.write(f"<td>{sample[key]}</td>\n")
+            html.write("</tr>\n")
+        html.write("</table>\n")
         # FOOTER
-        html.write('<p></p>\n')
-        html.write('</body>\n')
-        html.write('</html>\n')
+        html.write("<p></p>\n")
+        html.write("</body>\n")
+        html.write("</html>\n")
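# Usage sketch (illustrative, hedged): how the report helpers above fit together when
# aggregating per-demux reports. It assumes LaneBarcodeParser comes from
# flowcell_parser.classes (the same module SampleSheetParser is imported from elsewhere
# in this diff) and exposes .flowcell_data / .sample_data as used above; the report
# paths and the "FCID" flowcell ID are hypothetical placeholders.
from flowcell_parser.classes import LaneBarcodeParser

merged = None
for report in [
    "Demultiplexing_0/Reports/html/FCID/all/all/all/lane.html",
    "Demultiplexing_1/Reports/html/FCID/all/all/all/lane.html",
]:
    parser = LaneBarcodeParser(report)
    if merged is None:
        merged = parser
    else:
        # Only append lanes not already present, mirroring the merge logic in
        # _fix_html_reports_for_complex_lanes above.
        seen_lanes = {row["Lane"] for row in merged.sample_data}
        merged.sample_data += [
            row for row in parser.sample_data if row["Lane"] not in seen_lanes
        ]
_generate_lane_html("Demultiplexing/Reports/html/FCID/all/all/all/lane.html", merged)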
\n") + html.write("\n") + html.write("\n") diff --git a/taca/illumina/Standard_Runs.py b/taca/illumina/Standard_Runs.py index bb718787..7f051d66 100755 --- a/taca/illumina/Standard_Runs.py +++ b/taca/illumina/Standard_Runs.py @@ -1,76 +1,88 @@ +import logging import os import re -import logging from datetime import datetime -from taca.utils.filesystem import chdir +from flowcell_parser.classes import SampleSheetParser + from taca.illumina.Runs import Run from taca.utils import misc -from flowcell_parser.classes import SampleSheetParser -from io import open +from taca.utils.filesystem import chdir logger = logging.getLogger(__name__) -TENX_SINGLE_PAT = re.compile('SI-(?:GA|NA)-[A-H][1-9][0-2]?') -TENX_DUAL_PAT = re.compile('SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?') -SMARTSEQ_PAT = re.compile('SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]') -IDT_UMI_PAT = re.compile('([ATCG]{4,}N+$)') -RECIPE_PAT = re.compile('[0-9]+-[0-9]+') +TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?") +TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?") +SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]") +IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)") +RECIPE_PAT = re.compile("[0-9]+-[0-9]+") class Standard_Run(Run): - def __init__(self, run_dir, software, configuration): - super(Standard_Run, self).__init__(run_dir, software, configuration) + super().__init__(run_dir, software, configuration) + self._set_sequencer_type() + self._set_run_type() + self._copy_samplesheet() def _set_sequencer_type(self): - self.sequencer_type = '' + self.sequencer_type = "" def _set_run_type(self): - self.run_type = 'NGI-RUN' + self.run_type = "NGI-RUN" def _copy_samplesheet(self): - ssname = self._get_samplesheet() + ssname = self._get_samplesheet() ssparser = SampleSheetParser(ssname) indexfile = dict() runSetup = self.runParserObj.runinfo.get_read_configuration() # Loading index files try: - indexfile['tenX'] = self.CONFIG[self.software]['tenX_index_path'] + indexfile["tenX"] = self.CONFIG[self.software]["tenX_index_path"] except KeyError: - logger.error('Path to index file (10X) not found in the config file') + logger.error("Path to index file (10X) not found in the config file") raise RuntimeError try: - indexfile['smartseq'] = self.CONFIG[self.software]['smartseq_index_path'] + indexfile["smartseq"] = self.CONFIG[self.software]["smartseq_index_path"] except KeyError: - logger.error('Path to index file (Smart-seq) not found in the config file') + logger.error("Path to index file (Smart-seq) not found in the config file") raise RuntimeError # Samplesheet need to be positioned in the FC directory with name SampleSheet.csv (Illumina default) # If this is not the case then create it and take special care of modification to be done on the SampleSheet - samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet.csv') + samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv") # Function that goes through the original sample sheet and check for sample types self.sample_table = self._classify_samples(indexfile, ssparser, runSetup) # Check that the samplesheet is not already present. 
In this case go the next step if not os.path.exists(samplesheet_dest): try: - with open(samplesheet_dest, 'w') as fcd: - fcd.write(self._generate_clean_samplesheet(ssparser, - indexfile, - fields_to_remove=None, - rename_samples=True, - rename_qPCR_suffix = True, - fields_qPCR=[ssparser.dfield_snm])) + with open(samplesheet_dest, "w") as fcd: + fcd.write( + self._generate_clean_samplesheet( + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=True, + fields_qPCR=[ssparser.dfield_snm], + ) + ) except Exception as e: - logger.error('Encountered the following exception {}'.format(e)) + logger.error(f"Encountered the following exception {e}") return False - logger.info(('Created SampleSheet.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest))) + logger.info( + f"Created SampleSheet.csv for Flowcell {self.id} in {samplesheet_dest} " + ) # SampleSheet.csv generated # When demultiplexing SampleSheet.csv is the one I need to use # Need to rewrite so that SampleSheet_0.csv is always used. - self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet.csv')) - if not self.runParserObj.obj.get('samplesheet_csv'): - self.runParserObj.obj['samplesheet_csv'] = self.runParserObj.samplesheet.data + self.runParserObj.samplesheet = SampleSheetParser( + os.path.join(self.run_dir, "SampleSheet.csv") + ) + if not self.runParserObj.obj.get("samplesheet_csv"): + self.runParserObj.obj[ + "samplesheet_csv" + ] = self.runParserObj.samplesheet.data def _parse_10X_indexes(self, indexfile): """ @@ -78,9 +90,9 @@ def _parse_10X_indexes(self, indexfile): Todo: Set it up to take the file from config instead """ index_dict = {} - with open(indexfile, 'r') as f: + with open(indexfile) as f: for line in f: - line_ = line.rstrip().split(',') + line_ = line.rstrip().split(",") index_dict[line_[0]] = line_[1:5] return index_dict @@ -90,107 +102,140 @@ def _parse_smartseq_indexes(self, indexfile): Todo: Set it up to take the file from config instead """ index_dict = {} - with open(indexfile, 'r') as f: + with open(indexfile) as f: for line in f: - line_ = line.rstrip().split(',') + line_ = line.rstrip().split(",") if index_dict.get(line_[0]): - index_dict[line_[0]].append((line_[1],line_[2])) + index_dict[line_[0]].append((line_[1], line_[2])) else: - index_dict.update({line_[0]:[(line_[1],line_[2])]}) + index_dict.update({line_[0]: [(line_[1], line_[2])]}) return index_dict def _classify_samples(self, indexfile, ssparser, runSetup): """Given an ssparser object, go through all samples and decide sample types.""" sample_table = dict() - index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) - index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq']) + index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"]) + index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"]) index_cycles = [0, 0] read_cycles = [0, 0] for read in runSetup: - if read['IsIndexedRead'] == 'Y': - if int(read['Number']) == 2: - index_cycles[0] = int(read['NumCycles']) + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) else: - index_cycles[1] = int(read['NumCycles']) - elif read['IsIndexedRead'] == 'N': - if int(read['Number']) == 1: - read_cycles[0] = int(read['NumCycles']) + index_cycles[1] = int(read["NumCycles"]) + elif read["IsIndexedRead"] == "N": + if int(read["Number"]) == 1: + read_cycles[0] = int(read["NumCycles"]) else: - read_cycles[1] = int(read['NumCycles']) + 
read_cycles[1] = int(read["NumCycles"]) for sample in ssparser.data: - lane = sample['Lane'] - sample_name = sample.get('Sample_Name') or sample.get('SampleName') + lane = sample["Lane"] + sample_name = sample.get("Sample_Name") or sample.get("SampleName") umi_length = [0, 0] read_length = read_cycles # Read the length of read 1 and read 2 from the field Recipe - if sample.get('Recipe') and RECIPE_PAT.findall(sample.get('Recipe')): - ss_read_length = [int(sample.get('Recipe').split('-')[0]), int(sample.get('Recipe').split('-')[1])] + if sample.get("Recipe") and RECIPE_PAT.findall(sample.get("Recipe")): + ss_read_length = [ + int(sample.get("Recipe").split("-")[0]), + int(sample.get("Recipe").split("-")[1]), + ] else: ss_read_length = [0, 0] # By default use the read cycles from the sequncing setup. Otherwise use the shorter read length if ss_read_length != [0, 0]: read_length = [min(rd) for rd in zip(ss_read_length, read_length)] # 10X single index - if TENX_SINGLE_PAT.findall(sample['index']): - index_length = [len(index_dict_tenX[sample['index']][0]),0] - sample_type = '10X_SINGLE' + if TENX_SINGLE_PAT.findall(sample["index"]): + index_length = [len(index_dict_tenX[sample["index"]][0]), 0] + sample_type = "10X_SINGLE" # 10X dual index - elif TENX_DUAL_PAT.findall(sample['index']): - index_length = [len(index_dict_tenX[sample['index']][0]),len(index_dict_tenX[sample['index']][1])] - sample_type = '10X_DUAL' + elif TENX_DUAL_PAT.findall(sample["index"]): + index_length = [ + len(index_dict_tenX[sample["index"]][0]), + len(index_dict_tenX[sample["index"]][1]), + ] + sample_type = "10X_DUAL" # IDT UMI samples - elif IDT_UMI_PAT.findall(sample['index']) or IDT_UMI_PAT.findall(sample['index2']): + elif IDT_UMI_PAT.findall(sample["index"]) or IDT_UMI_PAT.findall( + sample["index2"] + ): # Index length after removing "N" part - index_length = [len(sample['index'].replace('N', '')), - len(sample['index2'].replace('N', ''))] - sample_type = 'IDT_UMI' - umi_length = [sample['index'].upper().count('N'), sample['index2'].upper().count('N')] + index_length = [ + len(sample["index"].replace("N", "")), + len(sample["index2"].replace("N", "")), + ] + sample_type = "IDT_UMI" + umi_length = [ + sample["index"].upper().count("N"), + sample["index2"].upper().count("N"), + ] # Smart-seq - elif SMARTSEQ_PAT.findall(sample['index']): - smartseq_index = sample['index'].split('-')[1] - index_length = [len(index_dict_smartseq[smartseq_index][0][0]),len(index_dict_smartseq[smartseq_index][0][1])] - sample_type = 'SMARTSEQ' + elif SMARTSEQ_PAT.findall(sample["index"]): + smartseq_index = sample["index"].split("-")[1] + index_length = [ + len(index_dict_smartseq[smartseq_index][0][0]), + len(index_dict_smartseq[smartseq_index][0][1]), + ] + sample_type = "SMARTSEQ" # No Index case 1. We will write indexes to separate FastQ files - elif sample['index'].upper() == 'NOINDEX' and index_cycles != [0, 0]: + elif sample["index"].upper() == "NOINDEX" and index_cycles != [0, 0]: index_length = index_cycles - sample_type = 'NOINDEX' + sample_type = "NOINDEX" # No Index case 2. 
Both index 1 and 2 are empty, it will be the same index type but will be handled in the next case - elif sample['index'].upper() == 'NOINDEX' and index_cycles == [0, 0]: + elif sample["index"].upper() == "NOINDEX" and index_cycles == [0, 0]: index_length = [0, 0] - sample_type = 'ordinary' + sample_type = "ordinary" # Ordinary samples else: - index_length = [len(sample['index']),len(sample['index2'])] + index_length = [len(sample["index"]), len(sample["index2"])] # Short single index (<=6nt) - if (index_length[0] <= 8 and index_length[1] == 0) or (index_length[0] == 0 and index_length[1] <= 8): - sample_type = 'short_single_index' + if (index_length[0] <= 8 and index_length[1] == 0) or ( + index_length[0] == 0 and index_length[1] <= 8 + ): + sample_type = "short_single_index" else: - sample_type = 'ordinary' + sample_type = "ordinary" # Write in sample table # {'1': [('101', {'sample_type': 'ordinary', 'index_length': [8, 8]}), ('102', {'sample_type': 'ordinary', 'index_length': [8, 8]})]} if sample_table.get(lane): - sample_table[lane].append((sample_name, - {'sample_type': sample_type, - 'index_length': index_length, - 'umi_length': umi_length, - 'read_length': read_length})) + sample_table[lane].append( + ( + sample_name, + { + "sample_type": sample_type, + "index_length": index_length, + "umi_length": umi_length, + "read_length": read_length, + }, + ) + ) else: - sample_table.update({lane:[(sample_name, - {'sample_type': sample_type, - 'index_length': index_length, - 'umi_length': umi_length, - 'read_length': read_length})]}) + sample_table.update( + { + lane: [ + ( + sample_name, + { + "sample_type": sample_type, + "index_length": index_length, + "umi_length": umi_length, + "read_length": read_length, + }, + ) + ] + } + ) return sample_table - def demultiplex_run(self): """ - Demultiplex a run: - - Make sub-samplesheet based on sample classes - - Decide correct bcl2fastq/bclconvert command parameters based on sample classes - - run bcl2fastq/bclconvert conversion + Demultiplex a run: + - Make sub-samplesheet based on sample classes + - Decide correct bcl2fastq/bclconvert command parameters based on sample classes + - run bcl2fastq/bclconvert conversion """ runSetup = self.runParserObj.runinfo.get_read_configuration() # Check sample types @@ -198,7 +243,7 @@ def demultiplex_run(self): for lane, lane_contents in self.sample_table.items(): for sample in lane_contents: sample_detail = sample[1] - sample_type = sample_detail['sample_type'] + sample_type = sample_detail["sample_type"] if sample_type not in sample_type_list: sample_type_list.append(sample_type) @@ -210,21 +255,43 @@ def demultiplex_run(self): for lane, lane_contents in self.sample_table.items(): for sample in lane_contents: sample_detail = sample[1] - sample_type_t = sample_detail['sample_type'] - sample_index_length = sample_detail['index_length'] - sample_umi_length = sample_detail['umi_length'] - sample_read_length = sample_detail['read_length'] + sample_type_t = sample_detail["sample_type"] + sample_index_length = sample_detail["index_length"] + sample_umi_length = sample_detail["umi_length"] + sample_read_length = sample_detail["read_length"] if sample_type_t == sample_type: if lane_table.get(lane): - if (sample_index_length, sample_umi_length, sample_read_length) not in lane_table[lane]: - lane_table[lane].append((sample_index_length, sample_umi_length, sample_read_length)) + if ( + sample_index_length, + sample_umi_length, + sample_read_length, + ) not in lane_table[lane]: + lane_table[lane].append( + ( + 
sample_index_length, + sample_umi_length, + sample_read_length, + ) + ) else: - lane_table.update({lane:[(sample_index_length, sample_umi_length, sample_read_length)]}) + lane_table.update( + { + lane: [ + ( + sample_index_length, + sample_umi_length, + sample_read_length, + ) + ] + } + ) # Determine the number of demux needed for the same sample type - if self.software == 'bcl2fastq': - demux_number_with_the_same_sample_type = len(max([v for k, v in lane_table.items()],key=len)) - elif self.software == 'bclconvert': + if self.software == "bcl2fastq": + demux_number_with_the_same_sample_type = len( + max([v for k, v in lane_table.items()], key=len) + ) + elif self.software == "bclconvert": unique_masks = [] for masks in lane_table.values(): for mask in masks: @@ -232,33 +299,44 @@ def demultiplex_run(self): unique_masks.append(mask) demux_number_with_the_same_sample_type = len(unique_masks) # Prepare sub-samplesheets, masks and commands - for i in range(0,demux_number_with_the_same_sample_type): + for i in range(0, demux_number_with_the_same_sample_type): # Prepare sub-samplesheet # A dictionary with lane and sample IDs to include samples_to_include = dict() # A dictionary with lane and index length for generating masks mask_table = dict() - if self.software == 'bcl2fastq': + if self.software == "bcl2fastq": for lane, lane_contents in self.sample_table.items(): try: - (index_length, umi_length, read_length) = lane_table[lane][i] - mask_table.update({lane: (index_length, umi_length, read_length)}) + (index_length, umi_length, read_length) = lane_table[lane][ + i + ] + mask_table.update( + {lane: (index_length, umi_length, read_length)} + ) for sample in lane_contents: sample_name = sample[0] sample_detail = sample[1] - sample_type_t = sample_detail['sample_type'] - sample_index_length = sample_detail['index_length'] - sample_umi_length = sample_detail['umi_length'] - sample_read_length = sample_detail['read_length'] - if sample_type_t == sample_type and sample_index_length == index_length and sample_umi_length == umi_length and sample_read_length == read_length: + sample_type_t = sample_detail["sample_type"] + sample_index_length = sample_detail["index_length"] + sample_umi_length = sample_detail["umi_length"] + sample_read_length = sample_detail["read_length"] + if ( + sample_type_t == sample_type + and sample_index_length == index_length + and sample_umi_length == umi_length + and sample_read_length == read_length + ): if samples_to_include.get(lane): samples_to_include[lane].append(sample_name) else: - samples_to_include.update({lane:[sample_name]}) - except (KeyError, IndexError) as err: - logger.info(('No corresponding mask in lane {}. Skip it.'.format(lane))) + samples_to_include.update({lane: [sample_name]}) + except (KeyError, IndexError): + logger.info( + f"No corresponding mask in lane {lane}. Skip it." 
+ ) continue - elif self.software == 'bclconvert': + elif self.software == "bclconvert": mask = unique_masks[i] for lane, lane_contents in self.sample_table.items(): if lane_table.get(lane): @@ -267,17 +345,24 @@ def demultiplex_run(self): for sample in lane_contents: sample_name = sample[0] sample_detail = sample[1] - sample_type_t = sample_detail['sample_type'] - sample_index_length = sample_detail['index_length'] - sample_umi_length = sample_detail['umi_length'] - sample_read_length = sample_detail['read_length'] - if sample_type_t == sample_type and sample_index_length == mask[0] and sample_umi_length == mask[1] and sample_read_length == mask[2]: + sample_type_t = sample_detail["sample_type"] + sample_index_length = sample_detail["index_length"] + sample_umi_length = sample_detail["umi_length"] + sample_read_length = sample_detail["read_length"] + if ( + sample_type_t == sample_type + and sample_index_length == mask[0] + and sample_umi_length == mask[1] + and sample_read_length == mask[2] + ): if samples_to_include.get(lane): samples_to_include[lane].append(sample_name) else: - samples_to_include.update({lane:[sample_name]}) + samples_to_include.update( + {lane: [sample_name]} + ) - if self.software == 'bclconvert': + if self.software == "bclconvert": runSetup = self.runParserObj.runinfo.get_read_configuration() (index_length, umi_length, read_length) = mask index1_size = int(index_length[0]) @@ -287,37 +372,61 @@ def demultiplex_run(self): read1_size = int(read_length[0]) read2_size = int(read_length[1]) is_dual_index = False - if (index1_size != 0 and index2_size != 0) or (index1_size == 0 and index2_size != 0): + if (index1_size != 0 and index2_size != 0) or ( + index1_size == 0 and index2_size != 0 + ): is_dual_index = True - base_mask = self._compute_base_mask(runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size) + base_mask = self._compute_base_mask( + runSetup, + sample_type, + index1_size, + is_dual_index, + index2_size, + umi1_size, + umi2_size, + read1_size, + read2_size, + ) else: index1_size = 0 index2_size = 0 base_mask = [] # Make sub-samplesheet with chdir(self.run_dir): - samplesheet_dest='SampleSheet_{}.csv'.format(bcl_cmd_counter) - with open(samplesheet_dest, 'w') as fcd: - fcd.write(self._generate_samplesheet_subset(self.runParserObj.samplesheet, - samples_to_include, runSetup, self.software, sample_type, index1_size, index2_size, base_mask, self.CONFIG)) + samplesheet_dest = f"SampleSheet_{bcl_cmd_counter}.csv" + with open(samplesheet_dest, "w") as fcd: + fcd.write( + self._generate_samplesheet_subset( + self.runParserObj.samplesheet, + samples_to_include, + runSetup, + self.software, + sample_type, + index1_size, + index2_size, + base_mask, + self.CONFIG, + ) + ) # Prepare demultiplexing dir with chdir(self.run_dir): # Create Demultiplexing dir, this changes the status to IN_PROGRESS - if not os.path.exists('Demultiplexing'): - os.makedirs('Demultiplexing') + if not os.path.exists("Demultiplexing"): + os.makedirs("Demultiplexing") # Prepare demultiplexing command with chdir(self.run_dir): - cmd = self.generate_bcl_command(sample_type, - mask_table, - bcl_cmd_counter) - misc.call_external_command_detached(cmd, - with_log_files = True, - prefix='demux_{}'.format(bcl_cmd_counter)) - logger.info(('BCL to FASTQ conversion and demultiplexing ' \ - 'started for run {} on {}'.format(os.path.basename(self.id), - datetime.now()))) + cmd = self.generate_bcl_command( + sample_type, mask_table, bcl_cmd_counter + ) + 
misc.call_external_command_detached( + cmd, with_log_files=True, prefix=f"demux_{bcl_cmd_counter}" + ) + logger.info( + "BCL to FASTQ conversion and demultiplexing " + f"started for run {os.path.basename(self.id)} on {datetime.now()}" + ) # Demutiplexing done for one mask type and scripts will continue # Working with the next type. Command counter should increase by 1 @@ -333,47 +442,59 @@ def _aggregate_demux_results(self): def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter): with chdir(self.run_dir): # Software - cl = [self.CONFIG.get(self.software)['bin']] + cl = [self.CONFIG.get(self.software)["bin"]] # Case with bcl2fastq - if self.software == 'bcl2fastq': - logger.info('Building a bcl2fastq command') - per_lane_base_masks = self._generate_per_lane_base_mask(sample_type, mask_table) + if self.software == "bcl2fastq": + logger.info("Building a bcl2fastq command") + per_lane_base_masks = self._generate_per_lane_base_mask( + sample_type, mask_table + ) # Add the base_mask for each lane lanes = list(mask_table.keys()) for lane in sorted(lanes): # Iterate thorugh each lane and add the correct --use-bases-mask for that lane - base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # Get the base_mask - base_mask_expr = '{}:'.format(lane) + ','.join(base_mask) - cl.extend(['--use-bases-mask', base_mask_expr]) + base_mask = [ + per_lane_base_masks[lane][bm]["base_mask"] + for bm in per_lane_base_masks[lane] + ][0] # Get the base_mask + base_mask_expr = f"{lane}:" + ",".join(base_mask) + cl.extend(["--use-bases-mask", base_mask_expr]) # Case with bclconvert - elif self.software == 'bclconvert': - logger.info('Building a bclconvert command') - cl.extend(['--bcl-input-directory', self.run_dir]) + elif self.software == "bclconvert": + logger.info("Building a bclconvert command") + cl.extend(["--bcl-input-directory", self.run_dir]) else: raise RuntimeError("Unrecognized software!") # Output dir - output_dir = os.path.join(self.run_dir, 'Demultiplexing_{}'.format(bcl_cmd_counter)) + output_dir = os.path.join(self.run_dir, f"Demultiplexing_{bcl_cmd_counter}") if not os.path.exists(output_dir): os.makedirs(output_dir) - cl.extend(['--output-dir', output_dir]) + cl.extend(["--output-dir", output_dir]) # Samplesheet - cl.extend(['--sample-sheet', os.path.join(os.path.join(self.run_dir, 'SampleSheet_{}.csv'.format(bcl_cmd_counter)))]) + cl.extend( + [ + "--sample-sheet", + os.path.join( + os.path.join(self.run_dir, f"SampleSheet_{bcl_cmd_counter}.csv") + ), + ] + ) # Demux options cl_options = [] - if 'options' in self.CONFIG.get(self.software): - if self.CONFIG[self.software]['options'].get('common'): - for option in self.CONFIG[self.software]['options']['common']: + if "options" in self.CONFIG.get(self.software): + if self.CONFIG[self.software]["options"].get("common"): + for option in self.CONFIG[self.software]["options"]["common"]: cl_options.extend([option]) - if self.CONFIG[self.software]['options'].get(sample_type): - for option in self.CONFIG[self.software]['options'][sample_type]: + if self.CONFIG[self.software]["options"].get(sample_type): + for option in self.CONFIG[self.software]["options"][sample_type]: cl_options.extend([option]) for option in cl_options: if isinstance(option, dict): opt, val = list(option.items())[0] - if 'output-dir' not in opt: - cl.extend(['--{}'.format(opt), str(val).lower()]) + if "output-dir" not in opt: + cl.extend([f"--{opt}", str(val).lower()]) else: - cl.append('--{}'.format(option)) + 
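# --- Illustrative aside (not part of the patch) ---------------------------------
# Hypothetical sketch of the config layout this options loop expects. Keys and
# values below are made up; only the structure (a 'common' list plus optional
# per-sample-type lists, mixing bare flags and {option: value} dicts) is taken from
# the code above. A bare string becomes "--<option>", a one-item dict becomes
# "--<key> <value>".
example_config = {
    "bcl2fastq": {
        "bin": "/usr/local/bin/bcl2fastq",
        "options": {
            "common": [
                "no-lane-splitting",        # appended as: --no-lane-splitting
                {"loading-threads": 4},     # appended as: --loading-threads 4
            ],
            "10X_SINGLE": [
                {"minimum-trimmed-read-length": 8},
            ],
        },
    },
}
# ---------------------------------------------------------------------------------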
cl.append(f"--{option}") return cl def _generate_per_lane_base_mask(self, sample_type, mask_table): @@ -405,200 +526,302 @@ def _generate_per_lane_base_mask(self, sample_type, mask_table): read1_size = lane_contents[2][0] read2_size = lane_contents[2][1] is_dual_index = False - if (index1_size != 0 and index2_size != 0) or (index1_size == 0 and index2_size != 0): + if (index1_size != 0 and index2_size != 0) or ( + index1_size == 0 and index2_size != 0 + ): is_dual_index = True # Compute the basemask - base_mask = self._compute_base_mask(runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size) - base_mask_string = ''.join(base_mask) - - base_masks[lane][base_mask_string] = {'base_mask':base_mask} + base_mask = self._compute_base_mask( + runSetup, + sample_type, + index1_size, + is_dual_index, + index2_size, + umi1_size, + umi2_size, + read1_size, + read2_size, + ) + base_mask_string = "".join(base_mask) + + base_masks[lane][base_mask_string] = {"base_mask": base_mask} return base_masks - def _compute_base_mask(self, runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size): + def _compute_base_mask( + self, + runSetup, + sample_type, + index1_size, + is_dual_index, + index2_size, + umi1_size, + umi2_size, + read1_size, + read2_size, + ): """ - Assumptions: - - if runSetup is of size 3, then single index run - - if runSetup is of size 4, then dual index run + Assumptions: + - if runSetup is of size 3, then single index run + - if runSetup is of size 4, then dual index run """ bm = [] - dual_index_run = False if len(runSetup) > 4: - raise RuntimeError("when generating base_masks looks like there are" \ - " more than 4 reads in the RunSetup.xml") + raise RuntimeError( + "when generating base_masks looks like there are" + " more than 4 reads in the RunSetup.xml" + ) for read in runSetup: - cycles = int(read['NumCycles']) - if read['IsIndexedRead'] == 'N': + cycles = int(read["NumCycles"]) + if read["IsIndexedRead"] == "N": # Prepare the base mask for the 1st read - is_first_read = int(read['Number']) == 1 + is_first_read = int(read["Number"]) == 1 if is_first_read: if cycles > read1_size: r_remainder = cycles - read1_size if read1_size != 0: - bm.append('Y' + str(read1_size) + 'N' + str(r_remainder)) + bm.append("Y" + str(read1_size) + "N" + str(r_remainder)) else: - bm.append('N' + str(cycles)) + bm.append("N" + str(cycles)) else: - bm.append('Y' + str(cycles)) + bm.append("Y" + str(cycles)) else: if cycles > read2_size: r_remainder = cycles - read2_size if read2_size != 0: - bm.append('Y' + str(read2_size) + 'N' + str(r_remainder)) + bm.append("Y" + str(read2_size) + "N" + str(r_remainder)) else: - bm.append('N' + str(cycles)) + bm.append("N" + str(cycles)) else: - bm.append('Y' + str(cycles)) + bm.append("Y" + str(cycles)) else: - is_first_index_read = int(read['Number']) == 2 + is_first_index_read = int(read["Number"]) == 2 # Prepare the base mask for the 1st index read if is_first_index_read: # The size of the index of the sample sheet is larger than the # one specified by RunInfo.xml, somethig must be wrong if index1_size > cycles: - raise RuntimeError("when generating base_masks found index 1 in" \ - " samplesheet larger than the index specifed in RunInfo.xml") + raise RuntimeError( + "when generating base_masks found index 1 in" + " samplesheet larger than the index specifed in RunInfo.xml" + ) i_remainder = cycles - index1_size if i_remainder > 0: - if sample_type == 'IDT_UMI': # 
Case of IDT UMI + if sample_type == "IDT_UMI": # Case of IDT UMI if umi1_size != 0: if i_remainder - umi1_size > 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index1_size) + 'Y' + str(umi1_size) + 'N' + str(i_remainder - umi1_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index1_size) + 'U' + str(umi1_size) + 'N' + str(i_remainder - umi1_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index1_size) + + "Y" + + str(umi1_size) + + "N" + + str(i_remainder - umi1_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index1_size) + + "U" + + str(umi1_size) + + "N" + + str(i_remainder - umi1_size) + ) else: raise RuntimeError("Unrecognized software!") elif i_remainder - umi1_size == 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index1_size) + 'Y' + str(umi1_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index1_size) + 'U' + str(umi1_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index1_size) + + "Y" + + str(umi1_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index1_size) + + "U" + + str(umi1_size) + ) else: raise RuntimeError("Unrecognized software!") else: - raise RuntimeError("when generating base_masks for UMI samples" \ - " some UMI1 length is longer than specified in RunInfo.xml") + raise RuntimeError( + "when generating base_masks for UMI samples" + " some UMI1 length is longer than specified in RunInfo.xml" + ) else: - bm.append('I' + str(index1_size) + 'N' + str(i_remainder)) + bm.append( + "I" + str(index1_size) + "N" + str(i_remainder) + ) elif index1_size == 0: - bm.append('N' + str(cycles)) # Case of NoIndex + bm.append("N" + str(cycles)) # Case of NoIndex else: - bm.append('I' + str(index1_size) + 'N' + str(i_remainder)) + bm.append("I" + str(index1_size) + "N" + str(i_remainder)) else: - bm.append('I' + str(cycles)) + bm.append("I" + str(cycles)) else: # The size of the index of the sample sheet is larger than the # one specified by RunInfo.xml, somethig must be wrong if index2_size > cycles: - raise RuntimeError("when generating base_masks found index 2 in" \ - " samplesheet larger than the index specifed in RunInfo.xml") + raise RuntimeError( + "when generating base_masks found index 2 in" + " samplesheet larger than the index specifed in RunInfo.xml" + ) # When working on the second read index I need to know if the sample is dual index or not - if is_dual_index or sample_type == '10X_SINGLE': - if sample_type == '10X_SINGLE': # Case of 10X single indexes, demultiplex the whole index 2 cycles as FastQ for bcl2fastq. But this has to be ignored for bclconvert - if self.software == 'bcl2fastq': - bm.append('Y' + str(cycles)) - elif self.software == 'bclconvert': - bm.append('N' + str(cycles)) + if is_dual_index or sample_type == "10X_SINGLE": + if ( + sample_type == "10X_SINGLE" + ): # Case of 10X single indexes, demultiplex the whole index 2 cycles as FastQ for bcl2fastq. 
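# --- Illustrative aside (not part of the patch) ---------------------------------
# Hedged worked example of what _compute_base_mask() produces. Assume a
# hypothetical RunInfo.xml setup of 151-10-10-151 cycles (read 1, index 1, index 2,
# read 2) and a plain dual-index sample with index1_size=8, index2_size=8, no UMIs
# and full-length reads. Walking the branches gives:
#
#   read 1 : 151 cycles, read1_size=151          -> "Y151"
#   index 1: 10 cycles, 8 bp index, 2 left over  -> "I8N2"
#   index 2: 10 cycles, 8 bp index, 2 left over  -> "I8N2"
#   read 2 : 151 cycles, read2_size=151          -> "Y151"
expected_base_mask = ["Y151", "I8N2", "I8N2", "Y151"]
# which bcl2fastq receives as "--use-bases-mask <lane>:Y151,I8N2,I8N2,Y151" and
# BCL Convert as the samplesheet setting "OverrideCycles,Y151;I8N2;I8N2;Y151".
# ---------------------------------------------------------------------------------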
But this has to be ignored for bclconvert + if self.software == "bcl2fastq": + bm.append("Y" + str(cycles)) + elif self.software == "bclconvert": + bm.append("N" + str(cycles)) else: raise RuntimeError("Unrecognized software!") else: i_remainder = cycles - index2_size if i_remainder > 0: - if sample_type == 'IDT_UMI': # Case of IDT UMI + if sample_type == "IDT_UMI": # Case of IDT UMI if umi2_size != 0: if i_remainder - umi2_size > 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index2_size) + 'Y' + str(umi2_size) + 'N' + str(i_remainder - umi2_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index2_size) + 'U' + str(umi2_size) + 'N' + str(i_remainder - umi2_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index2_size) + + "Y" + + str(umi2_size) + + "N" + + str(i_remainder - umi2_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index2_size) + + "U" + + str(umi2_size) + + "N" + + str(i_remainder - umi2_size) + ) else: - raise RuntimeError("Unrecognized software!") + raise RuntimeError( + "Unrecognized software!" + ) elif i_remainder - umi2_size == 0: - if self.software == 'bcl2fastq': - bm.append('I' + str(index2_size) + 'Y' + str(umi2_size)) - elif self.software == 'bclconvert': - bm.append('I' + str(index2_size) + 'U' + str(umi2_size)) + if self.software == "bcl2fastq": + bm.append( + "I" + + str(index2_size) + + "Y" + + str(umi2_size) + ) + elif self.software == "bclconvert": + bm.append( + "I" + + str(index2_size) + + "U" + + str(umi2_size) + ) else: - raise RuntimeError("Unrecognized software!") + raise RuntimeError( + "Unrecognized software!" + ) else: - raise RuntimeError("when generating base_masks for UMI samples" \ - " some UMI2 length is longer than specified in RunInfo.xml") + raise RuntimeError( + "when generating base_masks for UMI samples" + " some UMI2 length is longer than specified in RunInfo.xml" + ) else: - bm.append('I' + str(index2_size) + 'N' + str(i_remainder)) + bm.append( + "I" + + str(index2_size) + + "N" + + str(i_remainder) + ) elif index2_size == 0: - bm.append('N' + str(cycles)) + bm.append("N" + str(cycles)) else: - bm.append('I' + str(index2_size) + 'N' + str(i_remainder)) + bm.append( + "I" + str(index2_size) + "N" + str(i_remainder) + ) else: - bm.append('I' + str(cycles)) + bm.append("I" + str(cycles)) else: - # If this sample is not dual index but the run is, - # then I need to ignore the second index completely - bm.append('N' + str(cycles)) + # If this sample is not dual index but the run is, + # then I need to ignore the second index completely + bm.append("N" + str(cycles)) return bm - - def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None, rename_samples=True, rename_qPCR_suffix = False, fields_qPCR= None): + def _generate_clean_samplesheet( + self, + ssparser, + indexfile, + fields_to_remove=None, + rename_samples=True, + rename_qPCR_suffix=False, + fields_qPCR=None, + ): """Generate a 'clean' samplesheet, the given fields will be removed. If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name Will also replace 10X or Smart-seq indicies (e.g. 
SI-GA-A3 into TGTGCGGG) """ - output = u'' + output = "" # Expand the ssparser if there are lanes with 10X or Smart-seq samples - index_dict_tenX = self._parse_10X_indexes(indexfile['tenX']) - index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq']) + index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"]) + index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"]) # Replace 10X or Smart-seq indices for sample in ssparser.data: - if sample['index'] in index_dict_tenX.keys(): - tenX_index = sample['index'] + if sample["index"] in index_dict_tenX.keys(): + tenX_index = sample["index"] # In the case of 10X dual indexes, replace index and index2 if TENX_DUAL_PAT.findall(tenX_index): - sample['index'] = index_dict_tenX[tenX_index][0] - sample['index2'] = index_dict_tenX[tenX_index][1] + sample["index"] = index_dict_tenX[tenX_index][0] + sample["index2"] = index_dict_tenX[tenX_index][1] # In the case of 10X single indexes, replace the index name with the 4 actual indicies else: x = 0 indices_number = len(index_dict_tenX[tenX_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_tenX[tenX_index][x] + new_sample["index"] = index_dict_tenX[tenX_index][x] ssparser.data.append(new_sample) x += 1 # Set the original 10X index to the 4th correct index - sample['index'] = index_dict_tenX[tenX_index][x] - elif SMARTSEQ_PAT.findall(sample['index']): + sample["index"] = index_dict_tenX[tenX_index][x] + elif SMARTSEQ_PAT.findall(sample["index"]): x = 0 - smartseq_index = sample['index'].split('-')[1] + smartseq_index = sample["index"].split("-")[1] indices_number = len(index_dict_smartseq[smartseq_index]) while x < indices_number - 1: new_sample = dict(sample) - new_sample['index'] = index_dict_smartseq[smartseq_index][x][0] - new_sample['index2'] = index_dict_smartseq[smartseq_index][x][1] + new_sample["index"] = index_dict_smartseq[smartseq_index][x][0] + new_sample["index2"] = index_dict_smartseq[smartseq_index][x][1] ssparser.data.append(new_sample) x += 1 - sample['index'] = index_dict_smartseq[smartseq_index][x][0] - sample['index2'] = index_dict_smartseq[smartseq_index][x][1] + sample["index"] = index_dict_smartseq[smartseq_index][x][0] + sample["index2"] = index_dict_smartseq[smartseq_index][x][1] # Sort to get the added indicies from 10x in the right place # Python 3 doesn't support sorting a list of dicts implicitly. 
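# --- Illustrative aside (not part of the patch) ---------------------------------
# Hedged sketch of the 10X single-index expansion performed above: one samplesheet
# row whose index is a kit name becomes four rows, one per oligo listed in
# indexfile['tenX']. The kit name and sequences below are made up for illustration.
row = {"Lane": "1", "Sample_ID": "Sample_P1_101", "index": "SI-GA-A1"}
index_dict_tenX = {"SI-GA-A1": ["GGTTTACT", "CTAAACGG", "TCGGCGTC", "AACCGTAA"]}

expanded = []
for seq in index_dict_tenX[row["index"]]:
    new_row = dict(row)
    new_row["index"] = seq
    expanded.append(new_row)
# -> four rows identical to `row` except that each carries one actual index sequence.
# ---------------------------------------------------------------------------------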
Sort by lane and then Sample_ID - ssparser.data.sort(key=lambda item: (item.get('Lane'), item.get('Sample_ID'))) + ssparser.data.sort(key=lambda item: (item.get("Lane"), item.get("Sample_ID"))) if not fields_to_remove: fields_to_remove = [] # Header - output += '[Header]{}'.format(os.linesep) + output += f"[Header]{os.linesep}" for field in sorted(ssparser.header): - output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip()) + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" output += os.linesep # Data - output += '[Data]{}'.format(os.linesep) + output += f"[Data]{os.linesep}" datafields = [] for field in ssparser.datafields: if field not in fields_to_remove: datafields.append(field) - output += ','.join(datafields) + output += ",".join(datafields) output += os.linesep for line in ssparser.data: line_ar = [] @@ -608,79 +831,108 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None try: if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR: # Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix - value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm])) + value = re.sub( + "__qPCR_$", "", f"Sample_{line[ssparser.dfield_snm]}" + ) else: # Substitute SampleID with SampleName, add Sample_ as prefix - value ='Sample_{}'.format(line[ssparser.dfield_snm]) + value = f"Sample_{line[ssparser.dfield_snm]}" except: - # Otherwise add Sample_ as prefix - value = 'Sample_{}'.format(line[ssparser.dfield_sid]) + # Otherwise add Sample_ as prefix + value = f"Sample_{line[ssparser.dfield_sid]}" elif rename_qPCR_suffix and field in fields_qPCR: - value = re.sub('__qPCR_$', '', line[field]) + value = re.sub("__qPCR_$", "", line[field]) line_ar.append(value) - output += ','.join(line_ar) + output += ",".join(line_ar) output += os.linesep return output - def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, software, sample_type, index1_size, index2_size, base_mask, CONFIG): - output = u'' + def _generate_samplesheet_subset( + self, + ssparser, + samples_to_include, + runSetup, + software, + sample_type, + index1_size, + index2_size, + base_mask, + CONFIG, + ): + output = "" # Prepare index cycles index_cycles = [0, 0] for read in runSetup: - if read['IsIndexedRead'] == 'Y': - if int(read['Number']) == 2: - index_cycles[0] = int(read['NumCycles']) + if read["IsIndexedRead"] == "Y": + if int(read["Number"]) == 2: + index_cycles[0] = int(read["NumCycles"]) else: - index_cycles[1] = int(read['NumCycles']) + index_cycles[1] = int(read["NumCycles"]) # Header - output += '[Header]{}'.format(os.linesep) + output += f"[Header]{os.linesep}" for field in sorted(ssparser.header): - output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip()) + output += f"{field.rstrip()},{ssparser.header[field].rstrip()}" output += os.linesep # Settings for BCL Convert - if software == 'bclconvert': - output += '[Settings]{}'.format(os.linesep) - output += 'OverrideCycles,{}{}'.format(';'.join(base_mask), os.linesep) + if software == "bclconvert": + output += f"[Settings]{os.linesep}" + output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep) - if CONFIG.get('bclconvert'): - if CONFIG['bclconvert'].get('settings'): + if CONFIG.get("bclconvert"): + if CONFIG["bclconvert"].get("settings"): # Put common settings - if CONFIG['bclconvert']['settings'].get('common'): - for setting in CONFIG['bclconvert']['settings']['common']: + if 
CONFIG["bclconvert"]["settings"].get("common"): + for setting in CONFIG["bclconvert"]["settings"]["common"]: for k, v in setting.items(): - output += '{},{}{}'.format(k, v, os.linesep) + output += f"{k},{v}{os.linesep}" # Put special settings: - if sample_type in CONFIG['bclconvert']['settings'].keys(): - for setting in CONFIG['bclconvert']['settings'][sample_type]: + if sample_type in CONFIG["bclconvert"]["settings"].keys(): + for setting in CONFIG["bclconvert"]["settings"][sample_type]: for k, v in setting.items(): - if (k == 'BarcodeMismatchesIndex1' and index1_size != 0) or (k == 'BarcodeMismatchesIndex2' and index2_size != 0) or 'BarcodeMismatchesIndex' not in k: - output += '{},{}{}'.format(k, v, os.linesep) + if ( + ( + k == "BarcodeMismatchesIndex1" + and index1_size != 0 + ) + or ( + k == "BarcodeMismatchesIndex2" + and index2_size != 0 + ) + or "BarcodeMismatchesIndex" not in k + ): + output += f"{k},{v}{os.linesep}" # Data - output += '[Data]{}'.format(os.linesep) + output += f"[Data]{os.linesep}" datafields = [] for field in ssparser.datafields: datafields.append(field) - output += ','.join(datafields) + output += ",".join(datafields) output += os.linesep for line in ssparser.data: - sample_name = line.get('Sample_Name') or line.get('SampleName') - lane = line['Lane'] + sample_name = line.get("Sample_Name") or line.get("SampleName") + lane = line["Lane"] noindex_flag = False if lane in samples_to_include.keys(): if sample_name in samples_to_include.get(lane): line_ar = [] for field in datafields: # Case with NoIndex - if field == 'index' and 'NOINDEX' in line['index'].upper(): - line[field] = 'T'*index_cycles[0] if index_cycles[0] !=0 else '' + if field == "index" and "NOINDEX" in line["index"].upper(): + line[field] = ( + "T" * index_cycles[0] if index_cycles[0] != 0 else "" + ) noindex_flag = True - if field == 'index2' and noindex_flag: - line[field] = 'A'*index_cycles[1] if index_cycles[1] !=0 else '' + if field == "index2" and noindex_flag: + line[field] = ( + "A" * index_cycles[1] if index_cycles[1] != 0 else "" + ) noindex_flag = False # Case of IDT UMI - if (field == 'index' or field == 'index2') and IDT_UMI_PAT.findall(line[field]): - line[field] = line[field].replace('N', '') + if ( + field == "index" or field == "index2" + ) and IDT_UMI_PAT.findall(line[field]): + line[field] = line[field].replace("N", "") line_ar.append(line[field]) - output += ','.join(line_ar) + output += ",".join(line_ar) output += os.linesep return output diff --git a/taca/illumina/__init__.py b/taca/illumina/__init__.py index 14e36756..50a56a43 100644 --- a/taca/illumina/__init__.py +++ b/taca/illumina/__init__.py @@ -1,3 +1,3 @@ """ Runs class to parse and work with illumina flowcells -""" \ No newline at end of file +""" diff --git a/taca/log/__init__.py b/taca/log/__init__.py index 0946603e..0ce995d1 100644 --- a/taca/log/__init__.py +++ b/taca/log/__init__.py @@ -8,25 +8,28 @@ # Console logger stream_handler = logging.StreamHandler() -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") stream_handler.setFormatter(formatter) ROOT_LOG.addHandler(stream_handler) LOG_LEVELS = { - 'ERROR': logging.ERROR, - 'WARN': logging.WARN, - 'INFO': logging.INFO, - 'DEBUG': logging.DEBUG + "ERROR": logging.ERROR, + "WARN": logging.WARN, + "INFO": logging.INFO, + "DEBUG": logging.DEBUG, } -def init_logger_file(log_file, log_level='INFO'): - """ Append a FileHandler to the root logger. 
+ +def init_logger_file(log_file, log_level="INFO"): + """Append a FileHandler to the root logger. :param str log_file: Path to the log file :param str log_level: Logging level """ - ROOT_LOG.handlers=[] - log_level = LOG_LEVELS[log_level] if log_level in LOG_LEVELS.keys() else logging.INFO + ROOT_LOG.handlers = [] + log_level = ( + LOG_LEVELS[log_level] if log_level in LOG_LEVELS.keys() else logging.INFO + ) ROOT_LOG.setLevel(log_level) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 5f839058..675edcd2 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -1,18 +1,18 @@ -import os -import logging import csv -import shutil import glob -import re import json -import pandas as pd -import subprocess +import logging import os +import re +import shutil +import subprocess +from datetime import datetime from typing import Union -from taca.utils.statusdb import NanoporeRunsConnection -from datetime import datetime +import pandas as pd + from taca.utils.config import CONFIG +from taca.utils.statusdb import NanoporeRunsConnection from taca.utils.transfer import RsyncAgent, RsyncError logger = logging.getLogger(__name__) @@ -22,24 +22,27 @@ ) -class ONT_run(object): +class ONT_run: """General Nanopore run. Expects instantiation from absolute path of run directory on preprocessing server. """ def __init__(self, run_abspath: str): - # Get paths and names of MinKNOW experiment, sample and run self.run_name = os.path.basename(run_abspath) self.run_abspath = run_abspath + self.run_type: str | None = ( + None # This will be defined upon instantiation of a child class + ) + assert re.match( ONT_RUN_PATTERN, self.run_name ), f"Run {self.run_name} doesn't look like a run dir" # Parse MinKNOW sample and experiment name - with open(self.get_file("/run_path.txt"), "r") as stream: + with open(self.get_file("/run_path.txt")) as stream: self.experiment_name, self.sample_name, _ = stream.read().split("/") # Get info from run name @@ -122,7 +125,7 @@ def assert_contents(self): def is_transferred(self) -> bool: """Return True if run ID in transfer.tsv, else False.""" - with open(self.transfer_details["transfer_log"], "r") as f: + with open(self.transfer_details["transfer_log"]) as f: return self.run_name in f.read() # DB update @@ -159,7 +162,7 @@ def update_db_entry(self, force_update=False): self.touch_db_entry() # If the run document is marked as "ongoing" or database is being manually updated - if self.db.check_run_status(self) == "ongoing" or force_update == True: + if self.db.check_run_status(self) == "ongoing" or force_update is True: logger.info( f"{self.run_name}: Run exists in the database with run status: {self.db.check_run_status(self)}." 
) @@ -185,7 +188,6 @@ def update_db_entry(self, force_update=False): ) def parse_pore_activity(self, db_update): - logger.info(f"{self.run_name}: Parsing pore activity...") pore_activity = {} @@ -230,7 +232,7 @@ def parse_minknow_json(self, db_update): logger.info(f"{self.run_name}:Parsing report JSON...") - dict_json_report = json.load(open(self.get_file("/report*.json"), "r")) + dict_json_report = json.load(open(self.get_file("/report*.json"))) # Initialize return dict parsed_data = {} @@ -257,7 +259,10 @@ def parse_minknow_json(self, db_update): # -- Run output subsection seq_metadata_trimmed["acquisition_output"] = [] for section in seq_metadata["acquisition_output"]: - if section["type"] in ["AllData", "SplitByBarcode"]: + if "type" not in section.keys() or section["type"] in [ + "AllData", + "SplitByBarcode", + ]: seq_metadata_trimmed["acquisition_output"].append(section) # -- Read length subseqtion @@ -282,11 +287,13 @@ def copy_metadata(self): "**/bam*/***", "**/fast5*/***", "**/fastq*/***", + "**/pod5*/***", # Any files found elsewhere "*.bam*", "*.bai*", "*.fast5*", "*.fastq*", + "*.pod5*", ] exclude_patterns_quoted = ["'" + pattern + "'" for pattern in exclude_patterns] @@ -299,7 +306,6 @@ def copy_metadata(self): ) def copy_html_report(self): - logger.info(f"{self.run_name}: Transferring .html report to ngi-internal...") # Transfer the MinKNOW .html report file to ngi-internal, renaming it to the full run ID. Requires password-free SSH access. @@ -352,10 +358,10 @@ def update_transfer_log(self): with open(self.transfer_details["transfer_log"], "a") as f: tsv_writer = csv.writer(f, delimiter="\t") tsv_writer.writerow([self.run_name, str(datetime.now())]) - except IOError: + except OSError: msg = f"{self.run_name}: Could not update the transfer logfile {self.transfer_details['transfer_log']}" logger.error(msg) - raise IOError(msg) + raise OSError(msg) # Archive run @@ -372,7 +378,7 @@ class ONT_user_run(ONT_run): def __init__(self, run_abspath: str): self.run_type = "user_run" - super(ONT_user_run, self).__init__(run_abspath) + super().__init__(run_abspath) class ONT_qc_run(ONT_run): @@ -380,7 +386,7 @@ class ONT_qc_run(ONT_run): def __init__(self, run_abspath: str): self.run_type = "qc_run" - super(ONT_qc_run, self).__init__(run_abspath) + super().__init__(run_abspath) # Get Anglerfish attributes from run self.anglerfish_done_abspath = f"{self.run_abspath}/.anglerfish_done" @@ -404,7 +410,7 @@ def get_anglerfish_exit_code(self) -> Union[int, None]: Return exit code or None. """ if os.path.exists(self.anglerfish_done_abspath): - return int(open(self.anglerfish_done_abspath, "r").read()) + return int(open(self.anglerfish_done_abspath).read()) else: return None @@ -413,7 +419,7 @@ def get_anglerfish_pid(self) -> Union[str, None]: Return process ID or None.""" if os.path.exists(self.anglerfish_ongoing_abspath): - return str(open(self.anglerfish_ongoing_abspath, "r").read()) + return str(open(self.anglerfish_ongoing_abspath).read()) else: return None @@ -458,12 +464,32 @@ def fetch_anglerfish_samplesheet(self) -> bool: f"{self.run_name}: Error occured when copying anglerfish samplesheet to run dir." 
) + def has_fastq_output(self) -> bool: + """Check whether run has fastq output.""" + + reads_dir = os.path.join(self.run_abspath, "fastq_pass") + + return os.path.exists(reads_dir) + + def has_barcode_dirs(self) -> bool: + barcode_dir_pattern = r"barcode\d{2}" + + for dir in os.listdir(os.path.join(self.run_abspath, "fastq_pass")): + if re.search(barcode_dir_pattern, dir): + return True + + return False + def run_anglerfish(self): """Run Anglerfish as subprocess within it's own Conda environment. Dump files to indicate ongoing and finished processes. """ + timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S") + + # "anglerfish_run*" is the dir pattern recognized by the LIMS script parsing the results anglerfish_run_name = "anglerfish_run" + n_threads = 2 # This could possibly be changed anglerfish_command = [ @@ -473,9 +499,18 @@ def run_anglerfish(self): f"--run_name {anglerfish_run_name}", f"--threads {n_threads}", "--lenient", - "--ont_barcodes", "--skip_demux", ] + if self.has_barcode_dirs(): + anglerfish_command.append("--barcoding") + + # Create dir to trace TACA executing Anglerfish as a subprocess + taca_anglerfish_run_dir = f"taca_anglerfish_run_{timestamp}" + os.mkdir(taca_anglerfish_run_dir) + # Copy samplesheet used for traceability + shutil.copy(self.anglerfish_samplesheet, f"{taca_anglerfish_run_dir}/") + # Create files to dump subprocess std + stderr_relpath = f"{taca_anglerfish_run_dir}/stderr.txt" full_command = [ # Dump subprocess PID into 'run-ongoing'-indicator file. @@ -484,20 +519,29 @@ def run_anglerfish(self): "conda run -n anglerfish " + " ".join(anglerfish_command), # Dump Anglerfish exit code into file f"echo $? > {self.anglerfish_done_abspath}", - # Copy the Anglerfish samplesheet used to start the run into the run dir, for traceability - # (The correct anglerfish run dir is identified by it being younger than the "run-ongoing" file) - f"new_runs=$(find . -type d -name 'anglerfish_run*' -newer {self.anglerfish_ongoing_abspath})", - f"if [[ $(echo '${{new_runs}}' | wc -l) -eq 1 ]] ; then cp {self.anglerfish_samplesheet} ${{new_runs}}/ ; fi", - # Regardless of exit status: Remove 'run-ongoing' file. + # Move run to subdir + # 1) Find the latest Anglerfish run dir (younger than the 'run-ongoing' file) + f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing ' + # 2) Move the Anglerfish run dir into the TACA Anglerfish run dir + + "-exec mv \{\} " + + f"{self.run_abspath}/{taca_anglerfish_run_dir}/ \; " + # 3) Only do this once + + "-quit", + # Remove 'run-ongoing' file. f"rm {self.anglerfish_ongoing_abspath}", ] + with open(f"{taca_anglerfish_run_dir}/command.sh", "w") as stream: + stream.write("\n".join(full_command)) + # Start Anglerfish subprocess - process = subprocess.Popen( - "; ".join(full_command), - shell=True, - cwd=self.run_abspath, - ) + with open(stderr_relpath, "w") as stderr: + process = subprocess.Popen( + f"bash {taca_anglerfish_run_dir}/command.sh", + shell=True, + cwd=self.run_abspath, + stderr=stderr, + ) logger.info( f"{self.run_name}: Anglerfish subprocess started with process ID {process.pid}." 
) diff --git a/taca/nanopore/__init__.py b/taca/nanopore/__init__.py index c8b7802c..5063a460 100644 --- a/taca/nanopore/__init__.py +++ b/taca/nanopore/__init__.py @@ -1,3 +1,3 @@ """ Classes to parse and work with ONT data -""" \ No newline at end of file +""" diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py index 75c2d56d..978701aa 100644 --- a/taca/nanopore/instrument_transfer.py +++ b/taca/nanopore/instrument_transfer.py @@ -2,14 +2,14 @@ """ __version__ = "1.0.13" +import argparse import logging import os import re import shutil -import argparse import subprocess -from glob import glob from datetime import datetime as dt +from glob import glob def main(args): @@ -48,19 +48,18 @@ def main(args): # Iterate over runs for run_path in run_paths: - logging.info(f"Handling {run_path}...") if run_path.split(os.sep)[-2][0:3] == "QC_": # For QC runs, the sample name should start with "QC_" - logging.info(f"Run categorized as QC.") + logging.info("Run categorized as QC.") rsync_dest = args.dest_dir_qc else: rsync_dest = args.dest_dir - logging.info(f"Dumping run path...") + logging.info("Dumping run path...") dump_path(run_path) - logging.info(f"Dumping QC and MUX history...") + logging.info("Dumping QC and MUX history...") dump_pore_count_history(run_path, pore_counts) if not sequencing_finished(run_path): @@ -96,7 +95,7 @@ def write_finished_indicator(run_path): open(new_file, "w").close() -def sync_to_storage(run_dir, destination, log): +def sync_to_storage(run_dir: str, destination: str, rsync_log: str): """Sync the run to storage using rsync. Skip if rsync is already running on the run.""" @@ -104,7 +103,7 @@ def sync_to_storage(run_dir, destination, log): "run-one", "rsync", "-rvu", - "--log-file=" + log, + "--log-file=" + rsync_log, run_dir, destination, ] @@ -115,17 +114,19 @@ def sync_to_storage(run_dir, destination, log): ) -def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log: list[str]): +def final_sync_to_storage( + run_dir: str, destination: str, archive_dir: str, rsync_log: str +): """Do a final sync of the run to storage, then archive it. Skip if rsync is already running on the run.""" - logging.info("Performing a final sync of {} to storage".format(run_dir)) + logging.info(f"Performing a final sync of {run_dir} to storage") command = [ "run-one", "rsync", "-rvu", - "--log-file=" + log, + "--log-file=" + rsync_log, run_dir, destination, ] @@ -140,9 +141,7 @@ def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log: archive_finished_run(run_dir, archive_dir) else: logging.info( - "Previous rsync might be running still. Skipping {} for now.".format( - run_dir - ) + f"Previous rsync might be running still. Skipping {run_dir} for now." 
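# --- Illustrative aside (not part of the patch) ---------------------------------
# Hedged usage sketch of the sync helpers above. `run-one` (assumed available on
# the instrument host) lets a second invocation with identical arguments exit
# quietly, which is what makes repeated cron-triggered syncs of the same run dir
# safe. All paths below are hypothetical.
run_dir = "/data/20240101_1205_1A_PAK12345_abcdef12"
sync_to_storage(run_dir, "nas.example.com:/ont_runs/", "/var/log/rsync_ont.log")

# Once sequencing_finished() is true, the same rsync runs one final time and the
# run is moved off the instrument into a (hypothetical) archive dir:
# final_sync_to_storage(
#     run_dir, "nas.example.com:/ont_runs/", "/data/nosync", "/var/log/rsync_ont.log"
# )
# ---------------------------------------------------------------------------------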
) return @@ -155,7 +154,7 @@ def archive_finished_run(run_dir: str, archive_dir: str): sample_dir = os.path.dirname(run_dir) exp_dir = os.path.dirname(sample_dir) - run_name = os.path.basename(run_dir) + os.path.basename(run_dir) sample_name = os.path.basename(sample_dir) exp_name = os.path.basename(exp_dir) @@ -214,9 +213,9 @@ def parse_position_logs(minknow_logs_dir: str) -> list: for row in "ABCDEFGH": positions.append(col + row) - entries = [] + headers = [] + header: dict | None = None for position in positions: - log_files = glob( os.path.join(minknow_logs_dir, position, "control_server_log-*.txt") ) @@ -227,32 +226,35 @@ def parse_position_logs(minknow_logs_dir: str) -> list: for log_file in log_files: with open(log_file) as stream: lines = stream.readlines() - for i in range(0, len(lines)): - line = lines[i] - if line[0:4] != " ": + + # Iterate across log lines + for line in lines: + if not line[0:4] == " ": # Line is log header split_header = line.split(" ") timestamp = " ".join(split_header[0:2]) category = " ".join(split_header[2:]) - entry = { + header = { "position": position, "timestamp": timestamp.strip(), "category": category.strip(), } - entries.append(entry) - else: + headers.append(header) + + elif header: # Line is log body - if "body" not in entry: - entry["body"] = {} + if "body" not in header.keys(): + body: dict = {} + header["body"] = body key = line.split(": ")[0].strip() val = ": ".join(line.split(": ")[1:]).strip() - entry["body"][key] = val + header["body"][key] = val - entries.sort(key=lambda x: x["timestamp"]) - logging.info(f"Parsed {len(entries)} log entries.") + headers.sort(key=lambda x: x["timestamp"]) + logging.info(f"Parsed {len(headers)} log entries.") - return entries + return headers def get_pore_counts(position_logs: list) -> list: @@ -260,7 +262,6 @@ def get_pore_counts(position_logs: list) -> list: pore_counts = [] for entry in position_logs: - if "INFO: platform_qc.report (user_messages)" in entry["category"]: type = "qc" elif "INFO: mux_scan_result (user_messages)" in entry["category"]: @@ -269,7 +270,6 @@ def get_pore_counts(position_logs: list) -> list: type = "other" if type in ["qc", "mux"]: - new_entry = { "flow_cell_id": entry["body"]["flow_cell_id"], "timestamp": entry["timestamp"], @@ -329,6 +329,7 @@ def dump_pore_count_history(run: str, pore_counts: list) -> str: return new_file_path + # BEGIN_EXCLUDE if __name__ == "__main__": # This is clunky but should be fine since it will only ever run as a cronjob @@ -367,4 +368,4 @@ def dump_pore_count_history(run: str, pore_counts: list) -> str: args = parser.parse_args() main(args) -# END_EXCLUDE \ No newline at end of file +# END_EXCLUDE diff --git a/taca/server_status/cli.py b/taca/server_status/cli.py index 723410df..1833035f 100644 --- a/taca/server_status/cli.py +++ b/taca/server_status/cli.py @@ -1,41 +1,46 @@ -import click import logging +import click + +from taca.server_status import ( + cronjobs as cj, # to avoid similar names with command, otherwise exception +) from taca.server_status import server_status as status from taca.utils.config import CONFIG -from taca.server_status import cronjobs as cj # to avoid similar names with command, otherwise exception -@click.group(name='server_status') +@click.group(name="server_status") def server_status(): - """ Monitor server status """ + """Monitor server status""" + # server status subcommands @server_status.command() -@click.option('--statusdb', is_flag=True, help="Update the statusdb") +@click.option("--statusdb", is_flag=True, 
help="Update the statusdb") def nases(statusdb): - """ Checks the available space on all the nases - """ - if not CONFIG.get('server_status', ''): + """Checks the available space on all the nases""" + if not CONFIG.get("server_status", ""): logging.warning("Configuration missing required entries: server_status") disk_space = status.get_nases_disk_space() if statusdb: - status.update_status_db(disk_space, server_type='nas') + status.update_status_db(disk_space, server_type="nas") + @server_status.command() def cronjobs(): - """ Monitors cronjobs and updates statusdb - """ + """Monitors cronjobs and updates statusdb""" cj.update_cronjob_db() + @server_status.command() def monitor_promethion(): - """ Checks the status of PromethION and if ngi-nas is mounted - """ - if not CONFIG.get('promethion_status', ''): + """Checks the status of PromethION and if ngi-nas is mounted""" + if not CONFIG.get("promethion_status", ""): logging.warning("Configuration missing required entries: server_status") promethion_status = status.check_promethion_status() if promethion_status: logging.info("No issues encountered with the PromethION") else: - logging.warning("An issue with the PromethION was encountered. Operator has been notified by email.") \ No newline at end of file + logging.warning( + "An issue with the PromethION was encountered. Operator has been notified by email." + ) diff --git a/taca/server_status/cronjobs.py b/taca/server_status/cronjobs.py index 9b808bd8..1f1605c4 100644 --- a/taca/server_status/cronjobs.py +++ b/taca/server_status/cronjobs.py @@ -1,77 +1,84 @@ +import datetime +import getpass import logging import platform -import getpass -import datetime from crontab import CronTab + from taca.utils import statusdb from taca.utils.config import CONFIG + def _parse_crontab(): result = {} user = getpass.getuser() - logging.info('Getting crontab for user {}'.format(user)) + logging.info(f"Getting crontab for user {user}") try: crontab = CronTab(user=user) except Exception as e: - logging.error('Cannot get a crontab for user: {}'.format(user)) + logging.error(f"Cannot get a crontab for user: {user}") logging.error(e.message) else: result[user] = [] for job in crontab.crons: # this is for special syntax like @monthly or @reboot - special_syntax = str(job).split()[0] if str(job).startswith('@') else '' - result[user].append({'Command': job.command, - 'Comment': job.comment, - 'Enabled': job.enabled, - 'Minute': str(job.minutes), - 'Hour': str(job.hours), - 'Day of month' : str(job.dom), - 'Month': str(job.month), - 'Day of week': str(job.dow), - 'Special syntax': special_syntax}) + special_syntax = str(job).split()[0] if str(job).startswith("@") else "" + result[user].append( + { + "Command": job.command, + "Comment": job.comment, + "Enabled": job.enabled, + "Minute": str(job.minutes), + "Hour": str(job.hours), + "Day of month": str(job.dom), + "Month": str(job.month), + "Day of week": str(job.dow), + "Special syntax": special_syntax, + } + ) return result def update_cronjob_db(): - server = platform.node().split('.')[0] + server = platform.node().split(".")[0] timestamp = datetime.datetime.now() # parse results result = _parse_crontab() # connect to db - statusdb_conf = CONFIG.get('statusdb') - logging.info('Connecting to database: {}'.format(CONFIG.get('statusdb', {}).get('url'))) + statusdb_conf = CONFIG.get("statusdb") + logging.info( + "Connecting to database: {}".format(CONFIG.get("statusdb", {}).get("url")) + ) try: couch_connection = statusdb.StatusdbSession(statusdb_conf).connection 
except Exception as e: logging.error(e.message) else: # update document - crontab_db = couch_connection['cronjobs'] - view = crontab_db.view('server/alias') + crontab_db = couch_connection["cronjobs"] + view = crontab_db.view("server/alias") # to be safe doc = {} # create doc if not exist if not view[server].rows: - logging.info('Creating a document') + logging.info("Creating a document") doc = { - 'users': {user: cronjobs for user, cronjobs in result.items()}, - 'Last updated': str(timestamp), - 'server': server, + "users": {user: cronjobs for user, cronjobs in result.items()}, + "Last updated": str(timestamp), + "server": server, } # else: get existing doc for row in view[server]: - logging.info('Updating the document') + logging.info("Updating the document") doc = crontab_db.get(row.value) - doc['users'].update(result) - doc['Last updated'] = str(timestamp) + doc["users"].update(result) + doc["Last updated"] = str(timestamp) if doc: try: crontab_db.save(doc) except Exception as e: logging.error(e.message) else: - logging.info('{} has been successfully updated'.format(server)) + logging.info(f"{server} has been successfully updated") else: - logging.warning('Document has not been created/updated') - + logging.warning("Document has not been created/updated") diff --git a/taca/server_status/server_status.py b/taca/server_status/server_status.py index a03a107a..3431da31 100644 --- a/taca/server_status/server_status.py +++ b/taca/server_status/server_status.py @@ -1,6 +1,6 @@ -import subprocess -import logging import datetime +import logging +import subprocess from taca.utils import statusdb from taca.utils.config import CONFIG @@ -9,41 +9,42 @@ def get_nases_disk_space(): result = {} - config = CONFIG['server_status'] - servers = config.get('servers', dict()) + config = CONFIG["server_status"] + servers = config.get("servers", dict()) for server_url, path in servers.items(): - # Get command - command = '{command} {path}'.format(command=config['command'], path=path) + command = "{command} {path}".format(command=config["command"], path=path) # If localhost, don't connect to ssh - if server_url == 'localhost': + if server_url == "localhost": command = command.split() else: - if 'promethion' in server_url: - user = 'prom' + if "promethion" in server_url: + user = "prom" else: - user = config['user'] + user = config["user"] # Connect via ssh to server and execute the command - command = ['ssh', '-t', '{}@{}'.format(user, server_url), command] + command = ["ssh", "-t", f"{user}@{server_url}", command] result[server_url] = _run_cmd(command) # Storage systems are mouted locally, e.g. 
ngi-nas - for storage_system, path in config.get('storage_systems', {}).items(): + for storage_system, path in config.get("storage_systems", {}).items(): # Get command - command = '{command} {path}'.format(command=config['command'], path=path) + command = "{command} {path}".format(command=config["command"], path=path) result[storage_system] = _run_cmd(command.split()) return result + def _run_cmd(command): proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = proc.stdout.read().decode("utf-8") return _parse_output(output) -def _parse_output(output): # for nases + +def _parse_output(output): # for nases # command = df -h /home # output = Filesystem Size Used Avail Use% Mounted on # /dev/mapper/VGStor-lv_illumina @@ -59,39 +60,40 @@ def _parse_output(output): # for nases disk_size = output[-5] filesystem = output[-6] - available_percentage = str(100 - int(used_percentage.replace('%',''))) + '%' + available_percentage = str(100 - int(used_percentage.replace("%", ""))) + "%" result = { - 'disk_size': disk_size, - 'space_used': space_used, - 'space_available': space_available, - 'used_percentage': used_percentage, - 'available_percentage': available_percentage, - 'mounted_on': mounted_on, - 'filesystem': filesystem + "disk_size": disk_size, + "space_used": space_used, + "space_available": space_available, + "used_percentage": used_percentage, + "available_percentage": available_percentage, + "mounted_on": mounted_on, + "filesystem": filesystem, } except: # Sometimes it fails for whatever reason as Popen returns not what it is supposed to result = { - 'disk_size': 'NaN', - 'space_used': 'NaN', - 'space_available': 'NaN', - 'used_percentage': 'NaN', - 'available_percentage': 'NaN', - 'mounted_on': 'NaN', - 'filesystem': 'NaN' + "disk_size": "NaN", + "space_used": "NaN", + "space_available": "NaN", + "used_percentage": "NaN", + "available_percentage": "NaN", + "mounted_on": "NaN", + "filesystem": "NaN", } - logging.error('Can not parse the output: {}'.format(output)) + logging.error(f"Can not parse the output: {output}") return result + def update_status_db(data, server_type=None): - """ Pushed the data to status db. + """Pushed the data to status db. data can be from nases server_type should be 'nas'. 
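# --- Illustrative aside (not part of the patch) ---------------------------------
# Hedged worked example of _parse_output() above. With hypothetical `df -h` output
#
#   Filesystem                      Size  Used Avail Use% Mounted on
#   /dev/mapper/VGStor-lv_illumina   24T   18T  6.0T  75% /srv/illumina
#
# the whitespace-split tokens are indexed from the end, so the parsed result
# would be:
example_parsed = {
    "disk_size": "24T",
    "space_used": "18T",
    "space_available": "6.0T",
    "used_percentage": "75%",
    "available_percentage": "25%",
    "mounted_on": "/srv/illumina",
    "filesystem": "/dev/mapper/VGStor-lv_illumina",
}
# ---------------------------------------------------------------------------------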
""" - db_config = CONFIG.get('statusdb') + db_config = CONFIG.get("statusdb") if db_config is None: logging.error('"statusdb" must be present in the config file!') raise RuntimeError('"statusdb" must be present in the config file!') @@ -101,14 +103,14 @@ def update_status_db(data, server_type=None): logging.error(e.message) raise - db = couch_connection['server_status'] - logging.info('Connection established') - for key in data.keys(): # data is dict of dicts - server = data[key] # data[key] is dictionary (the command output) - server['name'] = key # key is nas url + db = couch_connection["server_status"] + logging.info("Connection established") + for key in data.keys(): # data is dict of dicts + server = data[key] # data[key] is dictionary (the command output) + server["name"] = key # key is nas url # datetime.datetime(2015, 11, 18, 9, 54, 33, 473189) is not JSON serializable - server['time'] = datetime.datetime.now().isoformat() - server['server_type'] = server_type or 'unknown' + server["time"] = datetime.datetime.now().isoformat() + server["server_type"] = server_type or "unknown" try: db.save(server) @@ -116,27 +118,30 @@ def update_status_db(data, server_type=None): logging.error(e.message) raise else: - logging.info('{}: Server status has been updated'.format(key)) + logging.info(f"{key}: Server status has been updated") + def check_promethion_status(): - config = CONFIG.get('promethion_status') - server = config.get('server') - path = config.get('path') - command = config.get('command') - command_to_run = f'{command} {path}' - user = config.get('user') + config = CONFIG.get("promethion_status") + server = config.get("server") + path = config.get("path") + command = config.get("command") + command_to_run = f"{command} {path}" + user = config.get("user") try: - subprocess.run(['ssh', '-t', f'{user}@{server}', command_to_run], - check=True) + subprocess.run(["ssh", "-t", f"{user}@{server}", command_to_run], check=True) except subprocess.CalledProcessError: _send_promethion_warning_email() return False return True + def _send_promethion_warning_email(): - email_recipients = CONFIG.get('mail').get('recipients') - email_subject = ('An issue with the PromethION has been detected.') - email_message = ('An issue with the PromethION has been detected. ' - 'Please investigate and consider pausing the transfer cronjob on preproc1') - send_mail(email_subject, email_message, email_recipients) \ No newline at end of file + email_recipients = CONFIG.get("mail").get("recipients") + email_subject = "An issue with the PromethION has been detected." + email_message = ( + "An issue with the PromethION has been detected. " + "Please investigate and consider pausing the transfer cronjob on preproc1" + ) + send_mail(email_subject, email_message, email_recipients) diff --git a/taca/testing/cli.py b/taca/testing/cli.py index 63b89a35..2abcea9e 100644 --- a/taca/testing/cli.py +++ b/taca/testing/cli.py @@ -1,67 +1,93 @@ - """ CLI for the testing commands """ -from __future__ import print_function import os + import click + import taca.testing.create_uppmax_like_env as createupp -@click.group(name='uppmax_env') + +@click.group(name="uppmax_env") def uppmax_env(): - """ Create a local set of folders that resembles the uppmax-ngi env. Creates config file for ngi_pipeline, taca, and taca ngi-pipeline. 
Only a minimal taca config is needed (statusdb and log) - The condig file (in general saved in variable NGI_CONFIG needs to looks something similar to: + """Create a local set of folders that resembles the uppmax-ngi env. Creates config file for ngi_pipeline, taca, and taca ngi-pipeline. Only a minimal taca config is needed (statusdb and log) + The condig file (in general saved in variable NGI_CONFIG needs to looks something similar to: - \b - environment: - project_id: ngi1234 #CAN BE ANYTHING - ngi_scripts_dir: /Users/vezzi/opt/ngi_pipeline/scripts #CAN BE ANYTHING - conda_env: TACA #CAN BE ANYTHING - flowcell_inbox: - - /Users/vezzi/opt/uppmax_env/incoming/ #NEEDS TO EXISTS - analysis: - best_practice_analysis: - whole_genome_reseq: - analysis_engine: ngi_pipeline.engines.piper_ngi - IGN: - analysis_engine: ngi_pipeline.engines.piper_ngi + \b + environment: + project_id: ngi1234 #CAN BE ANYTHING + ngi_scripts_dir: /Users/vezzi/opt/ngi_pipeline/scripts #CAN BE ANYTHING + conda_env: TACA #CAN BE ANYTHING + flowcell_inbox: + - /Users/vezzi/opt/uppmax_env/incoming/ #NEEDS TO EXISTS + analysis: + best_practice_analysis: + whole_genome_reseq: + analysis_engine: ngi_pipeline.engines.piper_ngi + IGN: + analysis_engine: ngi_pipeline.engines.piper_ngi - qc: + qc: - analysis_engine: ngi_pipeline.engines.qc_ngi + analysis_engine: ngi_pipeline.engines.qc_ngi - base_root: /Users/vezzi/opt/ #NEEDS TO EXISTS - sthlm_root: uppmax_env #NEEDS TO EXISTS - top_dir: nobackup/NGI #NEEDS TO EXISTS - upps_root: nothing #CAN BE ANYTHING - logging: - log_file: "/Users/vezzi/opt/log/ngi_pipeline.log" #NEEDS TO BE REAL + base_root: /Users/vezzi/opt/ #NEEDS TO EXISTS + sthlm_root: uppmax_env #NEEDS TO EXISTS + top_dir: nobackup/NGI #NEEDS TO EXISTS + upps_root: nothing #CAN BE ANYTHING + logging: + log_file: "/Users/vezzi/opt/log/ngi_pipeline.log" #NEEDS TO BE REAL - \b - The requested project will be divided into the following sets: - - 2/3 will be selected among the projects with application equeal to 'WG re-seq'. These will be divided up in: - - 1/4: closed more than 3 months ago - - 1/4: closed more than 1 month ago, less than 3 months - - 1/4: closed less than 1 month ago - - 1/4: open - - 1/3 will be selected amonf the projects with application different from 'WG re-seq': - - 1/4: closed more than 3 months ago - - 1/4: closed more than 1 month ago, less than 3 months - - 1/4: closed less than 1 month ago - - 1/4: open + \b + The requested project will be divided into the following sets: + - 2/3 will be selected among the projects with application equeal to 'WG re-seq'. 
These will be divided up in: + - 1/4: closed more than 3 months ago + - 1/4: closed more than 1 month ago, less than 3 months + - 1/4: closed less than 1 month ago + - 1/4: open + - 1/3 will be selected amonf the projects with application different from 'WG re-seq': + - 1/4: closed more than 3 months ago + - 1/4: closed more than 1 month ago, less than 3 months + - 1/4: closed less than 1 month ago + - 1/4: open - """ + """ pass -@uppmax_env.command() -@click.option('-p', '--projects', type=int, default=30, help='number of projects to be extracted from statusdb') -@click.option('-nc', '--ngi-config', type=str, default=os.environ.get('NGI_CONFIG') , help='path to ngi configuration file (expected in variable NGI_CONFIG)') -@click.option('-fq1', '--fastq_1', type=click.Path(exists=True, dir_okay=False), default=None , help='Path to fastq file for read 1') -@click.option('-fq2', '--fastq_2', type=click.Path(exists=True, dir_okay=False), default=None , help='Path to fastq file for read 2') +@uppmax_env.command() +@click.option( + "-p", + "--projects", + type=int, + default=30, + help="number of projects to be extracted from statusdb", +) +@click.option( + "-nc", + "--ngi-config", + type=str, + default=os.environ.get("NGI_CONFIG"), + help="path to ngi configuration file (expected in variable NGI_CONFIG)", +) +@click.option( + "-fq1", + "--fastq_1", + type=click.Path(exists=True, dir_okay=False), + default=None, + help="Path to fastq file for read 1", +) +@click.option( + "-fq2", + "--fastq_2", + type=click.Path(exists=True, dir_okay=False), + default=None, + help="Path to fastq file for read 2", +) def create(projects, ngi_config, fastq_1, fastq_2): - """creates a uppmax like env - """ - if (fastq_1 is None and fastq_2 is not None) or (fastq_1 is not None and fastq_2 is None): + """creates a uppmax like env""" + if (fastq_1 is None and fastq_2 is not None) or ( + fastq_1 is not None and fastq_2 is None + ): print("ERROR: either both fastq_1 and fastq_2 are specified or none of them") return 1 if fastq_1 is not None: @@ -71,11 +97,13 @@ def create(projects, ngi_config, fastq_1, fastq_2): if which("ngi_pipeline_start.py"): createupp.create(projects, ngi_config, fastq_1, fastq_2) else: - print("ERROR: ngi_pipeline_start.py needs to be available and properly installed") + print( + "ERROR: ngi_pipeline_start.py needs to be available and properly installed" + ) def which(file): for path in os.environ["PATH"].split(os.pathsep): if os.path.exists(os.path.join(path, file)): - return True + return True return False diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py index e4852b42..d3c4a615 100644 --- a/taca/testing/create_uppmax_like_env.py +++ b/taca/testing/create_uppmax_like_env.py @@ -1,140 +1,169 @@ """ Load and parse configuration file.""" -from __future__ import print_function +import datetime import logging import os -import datetime import random import subprocess +import sys from dateutil.relativedelta import relativedelta -from taca.utils.config import CONFIG + from taca.utils import config as conf from taca.utils import filesystem as fs from taca.utils import statusdb -from io import open - +from taca.utils.config import CONFIG logger = logging.getLogger(__name__) def create_version_report(path): # Creates the file version_report.txt for stuff run ngi_pipeline - with open(os.path.join(path, 'version_report.txt'), 'w') as VERSION_REPORT: - VERSION_REPORT.write(u'******\n') - VERSION_REPORT.write(u'README\n') - VERSION_REPORT.write(u'******\n') - 
VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. The pipeline used was Piper, see below for more information.\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'The versions of programs and references used:\n') - VERSION_REPORT.write(u'piper: unknown\n') - VERSION_REPORT.write(u'bwa: 0.7.12\n') - VERSION_REPORT.write(u'samtools: 0.1.19\n') - VERSION_REPORT.write(u'qualimap: v2.2\n') - VERSION_REPORT.write(u'snpEff: 4.1\n') - VERSION_REPORT.write(u'snpEff reference: GRCh37.75\n') - VERSION_REPORT.write(u'gatk: 3.3-0-geee94ec\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'reference: human_g1k_v37.fasta\n') - VERSION_REPORT.write(u'db_snp: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'hapmap: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'omni: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'1000G_indels: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n') - VERSION_REPORT.write(u'indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n') - VERSION_REPORT.write(u'\n') - VERSION_REPORT.write(u'piper\n') - VERSION_REPORT.write(u'-----\n') - VERSION_REPORT.write(u'Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n') + with open(os.path.join(path, "version_report.txt"), "w") as VERSION_REPORT: + VERSION_REPORT.write("******\n") + VERSION_REPORT.write("README\n") + VERSION_REPORT.write("******\n") + VERSION_REPORT.write("\n") + VERSION_REPORT.write( + "Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. 
The pipeline used was Piper, see below for more information.\n" + ) + VERSION_REPORT.write("\n") + VERSION_REPORT.write("The versions of programs and references used:\n") + VERSION_REPORT.write("piper: unknown\n") + VERSION_REPORT.write("bwa: 0.7.12\n") + VERSION_REPORT.write("samtools: 0.1.19\n") + VERSION_REPORT.write("qualimap: v2.2\n") + VERSION_REPORT.write("snpEff: 4.1\n") + VERSION_REPORT.write("snpEff reference: GRCh37.75\n") + VERSION_REPORT.write("gatk: 3.3-0-geee94ec\n") + VERSION_REPORT.write("\n") + VERSION_REPORT.write("reference: human_g1k_v37.fasta\n") + VERSION_REPORT.write("db_snp: gatk-bundle/2.8\n") + VERSION_REPORT.write("hapmap: gatk-bundle/2.8\n") + VERSION_REPORT.write("omni: gatk-bundle/2.8\n") + VERSION_REPORT.write("1000G_indels: gatk-bundle/2.8\n") + VERSION_REPORT.write( + "Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n" + ) + VERSION_REPORT.write("\n") + VERSION_REPORT.write( + "indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n" + ) + VERSION_REPORT.write( + "indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n" + ) + VERSION_REPORT.write("\n") + VERSION_REPORT.write("piper\n") + VERSION_REPORT.write("-----\n") + VERSION_REPORT.write( + "Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n" + ) -def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None ): + +def create_FC(incoming_dir, run_name, samplesheet, fastq_1=None, fastq_2=None): # Create something like 160217_ST-E00201_0063_AHJHNYCCXX path_to_fc = os.path.join(incoming_dir, run_name) if os.path.exists(path_to_fc): # This FC exists, skip it return fs.create_folder(path_to_fc) - fs.touch(os.path.join(path_to_fc, 'RTAComplete.txt')) + fs.touch(os.path.join(path_to_fc, "RTAComplete.txt")) # Create folder Demultiplexing - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing')) + fs.create_folder(os.path.join(path_to_fc, "Demultiplexing")) # Create folder Demultiplexing/Reports - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', 'Reports')) + fs.create_folder(os.path.join(path_to_fc, "Demultiplexing", "Reports")) # Create folder Demultiplexing/Stats - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', 'Stats')) + fs.create_folder(os.path.join(path_to_fc, "Demultiplexing", "Stats")) # Memorise SampleSheet stats header = [] for key in samplesheet[0]: header.append(key) counter = 1 - current_lane = '' + current_lane = "" for line in samplesheet: - project_name = line.get('Sample_Project', line.get('Project', '')) - lane = line['Lane'] - if current_lane == '': + project_name = line.get("Sample_Project", line.get("Project", "")) + lane = line["Lane"] + if current_lane == "": current_lane = lane elif current_lane != lane: counter = 1 current_lane = lane - sample_id = line.get('SampleID', line.get('Sample_ID', '')) - sample_name = line.get('SampleName', line.get('Sample_Name', '')) + sample_id = line.get("SampleID", line.get("Sample_ID", "")) + sample_name = line.get("SampleName", line.get("Sample_Name", "")) # Create dir structure - fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', project_name, sample_id)) + fs.create_folder( + os.path.join(path_to_fc, "Demultiplexing", project_name, sample_id) + ) # Now create the data - fastq_1_dest = '{}_S{}_L00{}_R1_001.fastq.gz'.format(sample_name, counter, lane) - 
fastq_2_dest = '{}_S{}_L00{}_R2_001.fastq.gz'.format(sample_name, counter, lane) + fastq_1_dest = f"{sample_name}_S{counter}_L00{lane}_R1_001.fastq.gz" + fastq_2_dest = f"{sample_name}_S{counter}_L00{lane}_R2_001.fastq.gz" counter += 1 if fastq_1 is None: - fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name, - sample_id, fastq_1_dest)) - fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name, - sample_id, fastq_2_dest)) + fs.touch( + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_1_dest + ) + ) + fs.touch( + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_2_dest + ) + ) else: - fs.do_symlink(fastq_1, os.path.join(path_to_fc, 'Demultiplexing', - project_name, sample_id, fastq_1_dest)) - fs.do_symlink(fastq_2, os.path.join(path_to_fc, 'Demultiplexing', - project_name, sample_id, fastq_2_dest)) + fs.do_symlink( + fastq_1, + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_1_dest + ), + ) + fs.do_symlink( + fastq_2, + os.path.join( + path_to_fc, "Demultiplexing", project_name, sample_id, fastq_2_dest + ), + ) - with open(os.path.join(path_to_fc, 'SampleSheet.csv'), 'w') as Samplesheet_file: - Samplesheet_file.write(u'[Header]\n') - Samplesheet_file.write(u'Date,2016-03-29\n') - Samplesheet_file.write(u'Investigator Name,Christian Natanaelsson\n') - Samplesheet_file.write(u'[Data]\n') + with open(os.path.join(path_to_fc, "SampleSheet.csv"), "w") as Samplesheet_file: + Samplesheet_file.write("[Header]\n") + Samplesheet_file.write("Date,2016-03-29\n") + Samplesheet_file.write("Investigator Name,Christian Natanaelsson\n") + Samplesheet_file.write("[Data]\n") for key in header: - Samplesheet_file.write(u'{},'.format(key)) - Samplesheet_file.write(u'\n') + Samplesheet_file.write(f"{key},") + Samplesheet_file.write("\n") for line in samplesheet: for key in header: - Samplesheet_file.write(u'{},'.format(line[key])) - Samplesheet_file.write(u'\n') + Samplesheet_file.write(f"{line[key]},") + Samplesheet_file.write("\n") + def create_uppmax_env(ngi_config): paths = {} - if 'analysis' not in ngi_config: - sys.exit('ERROR: analysis must be a field of NGI_CONFIG.') + if "analysis" not in ngi_config: + sys.exit("ERROR: analysis must be a field of NGI_CONFIG.") try: - base_root = ngi_config['analysis']['base_root'] - paths['base_root'] = base_root - sthlm_root = ngi_config['analysis']['sthlm_root'] - paths['sthlm_root'] = sthlm_root - top_dir = ngi_config['analysis']['top_dir'] - paths['top_dir'] = top_dir + base_root = ngi_config["analysis"]["base_root"] + paths["base_root"] = base_root + sthlm_root = ngi_config["analysis"]["sthlm_root"] + paths["sthlm_root"] = sthlm_root + top_dir = ngi_config["analysis"]["top_dir"] + paths["top_dir"] = top_dir except KeyError as e: - raise SystemExit('Config file is missing the key {}, make sure it have all required information'.format(str(e))) - if 'environment' not in ngi_config: - sys.exit('ERROR: environment must be a field of NGI_CONFIG.') + raise SystemExit( + f"Config file is missing the key {str(e)}, make sure it have all required information" + ) + if "environment" not in ngi_config: + sys.exit("ERROR: environment must be a field of NGI_CONFIG.") try: # Get base root - flowcell_inboxes = ngi_config['environment']['flowcell_inbox'] - flowcell_inbox = flowcell_inboxes[0] # I assume there is only one - paths['flowcell_inbox'] = flowcell_inbox + flowcell_inboxes = ngi_config["environment"]["flowcell_inbox"] + flowcell_inbox = flowcell_inboxes[0] # I assume 
there is only one + paths["flowcell_inbox"] = flowcell_inbox except ValueError as e: - sys.exit('key error, flowcell_inbox not found in "{}": {}'.format(ngi_config, e)) + sys.exit(f'key error, flowcell_inbox not found in "{ngi_config}": {e}') # Now I need to create the folders for this if not os.path.exists(base_root): - sys.exit('base_root needs to exists: {}'.format(base_root)) + sys.exit(f"base_root needs to exists: {base_root}") fs.create_folder(flowcell_inbox) if sthlm_root is None: path_to_analysis = os.path.join(base_root, top_dir) @@ -143,72 +172,89 @@ def create_uppmax_env(ngi_config): fs.create_folder(path_to_analysis) return paths + def produce_analysis_qc_ngi(ngi_config, project_id): - analysis_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'ANALYSIS', project_id) - data_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'DATA', project_id) + analysis_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "ANALYSIS", + project_id, + ) + data_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "DATA", + project_id, + ) - qc_ngi_dir = os.path.join(analysis_dir, 'qc_ngi') + qc_ngi_dir = os.path.join(analysis_dir, "qc_ngi") fs.create_folder(qc_ngi_dir) for sample_id in os.listdir(data_dir): sample_dir_qc = os.path.join(qc_ngi_dir, sample_id) fs.create_folder(sample_dir_qc) - fastqc_dir = os.path.join(sample_dir_qc, 'fastqc') + fastqc_dir = os.path.join(sample_dir_qc, "fastqc") fs.create_folder(fastqc_dir) - fastq_screen_dir = os.path.join(sample_dir_qc, 'fastq_screen') + fastq_screen_dir = os.path.join(sample_dir_qc, "fastq_screen") fs.create_folder(fastq_screen_dir) # Do not create more than this... 
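A minimal illustrative sketch (not part of the changeset above) of the skeleton that produce_analysis_qc_ngi() fakes for one organised project: the qc_ngi_skeleton helper name and the dry-run approach are assumptions for illustration; only the folder names come from the function itself.

import os

def qc_ngi_skeleton(analysis_dir, data_dir):
    """Return the folders produce_analysis_qc_ngi() would create, without touching disk."""
    qc_ngi_dir = os.path.join(analysis_dir, "qc_ngi")
    folders = [qc_ngi_dir]
    for sample_id in os.listdir(data_dir):
        sample_dir_qc = os.path.join(qc_ngi_dir, sample_id)
        # One fastqc/ and one fastq_screen/ folder per sample, nothing deeper
        folders.extend(
            [
                sample_dir_qc,
                os.path.join(sample_dir_qc, "fastqc"),
                os.path.join(sample_dir_qc, "fastq_screen"),
            ]
        )
    return folders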
+ def produce_analysis_piper(ngi_config, project_id): # Create piper_ngi - analysis_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'ANALYSIS', project_id) - data_dir = os.path.join(ngi_config['analysis']['base_root'], - ngi_config['analysis']['sthlm_root'], - ngi_config['analysis']['top_dir'], - 'DATA', project_id) + analysis_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "ANALYSIS", + project_id, + ) + data_dir = os.path.join( + ngi_config["analysis"]["base_root"], + ngi_config["analysis"]["sthlm_root"], + ngi_config["analysis"]["top_dir"], + "DATA", + project_id, + ) - piper_ngi_dir = os.path.join(analysis_dir, 'piper_ngi') + piper_ngi_dir = os.path.join(analysis_dir, "piper_ngi") fs.create_folder(piper_ngi_dir) - piper_dirs = ['01_raw_alignments', - '02_preliminary_alignment_qc', - '03_genotype_concordance', - '04_merged_aligments', - '05_processed_alignments', - '06_final_alignment_qc', - '07_variant_calls', - '08_misc'] + piper_dirs = [ + "01_raw_alignments", + "02_preliminary_alignment_qc", + "03_genotype_concordance", + "04_merged_aligments", + "05_processed_alignments", + "06_final_alignment_qc", + "07_variant_calls", + "08_misc", + ] for piper_dir in piper_dirs: - current_dir = os.path.join(piper_ngi_dir, piper_dir) + current_dir = os.path.join(piper_ngi_dir, piper_dir) fs.create_folder(current_dir) - if piper_dir == '05_processed_alignments': + if piper_dir == "05_processed_alignments": for sample_id in os.listdir(data_dir): - bam_file = '{}.clean.dedup.bam'.format(sample_id) + bam_file = f"{sample_id}.clean.dedup.bam" fs.touch(os.path.join(current_dir, bam_file)) - if piper_dir == '07_variant_calls': + if piper_dir == "07_variant_calls": for sample_id in os.listdir(data_dir): - vcf_file = '{}.clean.dedup.recal.bam.raw.indel.vcf.gz'.format(sample_id) + vcf_file = f"{sample_id}.clean.dedup.recal.bam.raw.indel.vcf.gz" fs.touch(os.path.join(current_dir, vcf_file)) - current_dir = os.path.join(piper_ngi_dir, 'sbatch') + current_dir = os.path.join(piper_ngi_dir, "sbatch") fs.create_folder(current_dir) - current_dir = os.path.join(piper_ngi_dir, 'setup_xml_files') + current_dir = os.path.join(piper_ngi_dir, "setup_xml_files") fs.create_folder(current_dir) - current_dir = os.path.join(piper_ngi_dir, 'logs') + current_dir = os.path.join(piper_ngi_dir, "logs") fs.create_folder(current_dir) create_version_report(current_dir) + def select_random_projects(projects_in, num_proj, application, projects_out, label): chosen_projects = 0 - iterations = 0 # Safe guard to avoid infinite loops - application_not_in_other = ['WG re-seq'] - while chosen_projects != num_proj and iterations < 4*len(projects_in): + iterations = 0 # Safe guard to avoid infinite loops + application_not_in_other = ["WG re-seq"] + while chosen_projects != num_proj and iterations < 4 * len(projects_in): iterations += 1 selected_proj = random.choice(list(projects_in.keys())) # Check if I have already picked up this element @@ -217,139 +263,190 @@ def select_random_projects(projects_in, num_proj, application, projects_out, lab if selected_proj == project_pair[0]: already_chosen = True if already_chosen: - continue # I am reprocessing an element I already saw. I skip it. iterations will avoid infinite loops + continue # I am reprocessing an element I already saw. I skip it. 
iterations will avoid infinite loops proj_value = projects_in[selected_proj] - if application == 'other': + if application == "other": # In this case everything expcept - if proj_value['application'] not in application_not_in_other: + if proj_value["application"] not in application_not_in_other: # I select this one projects_out.append([selected_proj, label]) chosen_projects += 1 - elif application == proj_value['application']: + elif application == proj_value["application"]: # I select this one projects_out.append([selected_proj, label]) chosen_projects += 1 + def create(projects, ngi_config_file, fastq_1, fastq_2): - statusdb_conf = CONFIG.get('statusdb') + statusdb_conf = CONFIG.get("statusdb") if statusdb_conf is None: - logger.error('No statusdb field in taca configuration file') + logger.error("No statusdb field in taca configuration file") return 1 - if 'dev' not in statusdb_conf['url']: - logger.error('url for status db is {}, but dev must be specified in this case'.format(statusdb_conf['url'])) + if "dev" not in statusdb_conf["url"]: + logger.error( + "url for status db is {}, but dev must be specified in this case".format( + statusdb_conf["url"] + ) + ) couch_connection = statusdb.StatusdbSession(statusdb_conf).connection - projectsDB = couch_connection['projects'] - project_summary = projectsDB.view('project/summary') + projectsDB = couch_connection["projects"] + project_summary = projectsDB.view("project/summary") projects_closed_more_than_three_months = {} projects_closed_more_than_one_month_less_than_three = {} projects_closed_less_than_one_month = {} projects_opened = {} current_date = datetime.datetime.today() - date_limit_one_year = current_date - relativedelta(months=6) #yes yes I know.. but in this way i am sure all data in in xflocell_db + date_limit_one_year = current_date - relativedelta( + months=6 + ) # yes yes I know.. 
but in this way i am sure all data in in xflocell_db date_limit_one_month = current_date - relativedelta(months=1) date_limit_three_month = current_date - relativedelta(months=3) for row in project_summary: - project_id = row['key'][1] - project_status = row['key'][0] - if 'application' not in row['value']: + project_id = row["key"][1] + project_status = row["key"][0] + if "application" not in row["value"]: continue - if row['value']['no_samples'] > 50: - continue # Skip large projects - application = row['value']['application'] - if project_status == 'closed': - if 'close_date' in row['value']: - close_date = datetime.datetime.strptime(row['value']['close_date'], '%Y-%m-%d') - if close_date > date_limit_one_year: # If the project has been closed after the date limit + if row["value"]["no_samples"] > 50: + continue # Skip large projects + application = row["value"]["application"] + if project_status == "closed": + if "close_date" in row["value"]: + close_date = datetime.datetime.strptime( + row["value"]["close_date"], "%Y-%m-%d" + ) + if ( + close_date > date_limit_one_year + ): # If the project has been closed after the date limit if close_date >= date_limit_one_month: - projects_closed_less_than_one_month[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} - elif close_date < date_limit_one_month and close_date >= date_limit_three_month: - projects_closed_more_than_one_month_less_than_three[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} + projects_closed_less_than_one_month[project_id] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } + elif ( + close_date < date_limit_one_month + and close_date >= date_limit_three_month + ): + projects_closed_more_than_one_month_less_than_three[ + project_id + ] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } elif close_date < date_limit_three_month: - projects_closed_more_than_three_months[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} - elif project_status == 'open': - if 'lanes_sequenced' in row['value'] and row['value']['lanes_sequenced'] > 0: - projects_opened[project_id] = {'project_name': row['value']['project_name'], - 'application': application, - 'no_samples': row['value']['no_samples']} + projects_closed_more_than_three_months[project_id] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } + elif project_status == "open": + if ( + "lanes_sequenced" in row["value"] + and row["value"]["lanes_sequenced"] > 0 + ): + projects_opened[project_id] = { + "project_name": row["value"]["project_name"], + "application": application, + "no_samples": row["value"]["no_samples"], + } else: - print('status {}'.format(project_status)) + print(f"status {project_status}") ## Now I can parse the x_flowcell db to check what I can and cannot use - whole_genome_projects = int(2*projects/3) + whole_genome_projects = int(2 * projects / 3) projects_to_reproduce = [] - select_random_projects(projects_closed_more_than_three_months, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_tot_closed') - 
select_random_projects(projects_closed_more_than_one_month_less_than_three, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_closed_clean_no_del') - select_random_projects(projects_closed_less_than_one_month, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_closed_no_clean') - select_random_projects(projects_opened, - whole_genome_projects/4+1, - 'WG re-seq', - projects_to_reproduce, - 'WGreseq_open') + select_random_projects( + projects_closed_more_than_three_months, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_tot_closed", + ) + select_random_projects( + projects_closed_more_than_one_month_less_than_three, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_closed_clean_no_del", + ) + select_random_projects( + projects_closed_less_than_one_month, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_closed_no_clean", + ) + select_random_projects( + projects_opened, + whole_genome_projects / 4 + 1, + "WG re-seq", + projects_to_reproduce, + "WGreseq_open", + ) - other_projects = int(projects/3) - select_random_projects(projects_closed_more_than_three_months, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_tot_closed') - select_random_projects(projects_closed_more_than_one_month_less_than_three, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_closed_clean_no_del') - select_random_projects(projects_closed_less_than_one_month, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_closed_no_clean') - select_random_projects(projects_opened, - other_projects/4+1, - 'other', - projects_to_reproduce, - 'noWGreseq_open') + other_projects = int(projects / 3) + select_random_projects( + projects_closed_more_than_three_months, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_tot_closed", + ) + select_random_projects( + projects_closed_more_than_one_month_less_than_three, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_closed_clean_no_del", + ) + select_random_projects( + projects_closed_less_than_one_month, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_closed_no_clean", + ) + select_random_projects( + projects_opened, + other_projects / 4 + 1, + "other", + projects_to_reproduce, + "noWGreseq_open", + ) # Create ngi_pipeline enviroment - print('#NGI_CONFIG varaible is {}. This variable needs to be in the .bashrc file'.format(ngi_config_file)) - print('NGI_CONFIG={}'.format(ngi_config_file)) + print( + f"#NGI_CONFIG varaible is {ngi_config_file}. This variable needs to be in the .bashrc file" + ) + print(f"NGI_CONFIG={ngi_config_file}") try: ngi_config = conf.load_config(ngi_config_file) - except IOError as e: - print('ERROR: {}'.format(e.message)) + except OSError as e: + print(f"ERROR: {e.message}") # Create uppmax env paths = create_uppmax_env(ngi_config) - print('#Going to reproduce {} projects (if this number is different from the one you specified.... trust me... do not worry'.format(len(projects_to_reproduce))) + print( + f"#Going to reproduce {len(projects_to_reproduce)} projects (if this number is different from the one you specified.... trust me... 
do not worry" + ) # Scan over x_flowcell and reproduce FCs - flowcellDB = couch_connection['x_flowcells'] + flowcellDB = couch_connection["x_flowcells"] reproduced_projects = {} for fc_doc in flowcellDB: try: - samplesheet_csv = flowcellDB[fc_doc]['samplesheet_csv'] + samplesheet_csv = flowcellDB[fc_doc]["samplesheet_csv"] except KeyError: - continue # Parse only FC that have a samplesheet + continue # Parse only FC that have a samplesheet # Check if this FC contains one of the proejcts I need to replicate. projects_in_FC = set() - if 'SampleName' in samplesheet_csv[0]: - projects_in_FC = set([line['SampleName'].split('_')[0] for line in samplesheet_csv]) + if "SampleName" in samplesheet_csv[0]: + projects_in_FC = set( + [line["SampleName"].split("_")[0] for line in samplesheet_csv] + ) else: - projects_in_FC = set([line['Sample_Name'].split('_')[0] for line in samplesheet_csv]) + projects_in_FC = set( + [line["Sample_Name"].split("_")[0] for line in samplesheet_csv] + ) found = False for project_pair in projects_to_reproduce: project = project_pair[0] @@ -357,31 +454,46 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): # This FC needs to be created if not found: # Create the FC only the first time I see a project belonging to it - create_FC(paths['flowcell_inbox'] , flowcellDB[fc_doc]['RunInfo']['Id'], samplesheet_csv, fastq_1, fastq_2) + create_FC( + paths["flowcell_inbox"], + flowcellDB[fc_doc]["RunInfo"]["Id"], + samplesheet_csv, + fastq_1, + fastq_2, + ) found = True # But I keep track of all projects-run I need to organise if project not in reproduced_projects: reproduced_projects[project] = [] - reproduced_projects[project].append(flowcellDB[fc_doc]['RunInfo']['Id']) - print('#Reproduced {} project (if the numbers diffear do not worry, most likely we selected projects without runs)'.format(len(reproduced_projects))) + reproduced_projects[project].append(flowcellDB[fc_doc]["RunInfo"]["Id"]) + print( + f"#Reproduced {len(reproduced_projects)} project (if the numbers diffear do not worry, most likely we selected projects without runs)" + ) for project in projects_to_reproduce: if project[0] in reproduced_projects: - print('# {}: {}'.format(project[0], project[1])) + print(f"# {project[0]}: {project[1]}") # Need to output the command to organise to_be_deleted = [] for project in reproduced_projects: for FC in reproduced_projects[project]: - print('Running: ngi_pipeline_start.py organize flowcell {} -p {}'.format(FC, project)) - with open('ngi_pipeline_local.logs', 'w') as NGILOGS: - return_value = subprocess.call(['ngi_pipeline_start.py', - 'organize', - 'flowcell', - '{}'.format(FC), - '-p', - '{}'.format(project)], - stdout=NGILOGS, stderr=NGILOGS) + print(f"Running: ngi_pipeline_start.py organize flowcell {FC} -p {project}") + with open("ngi_pipeline_local.logs", "w") as NGILOGS: + return_value = subprocess.call( + [ + "ngi_pipeline_start.py", + "organize", + "flowcell", + f"{FC}", + "-p", + f"{project}", + ], + stdout=NGILOGS, + stderr=NGILOGS, + ) if return_value > 0: - print('#project {} not organised: have a look to the logs, but most likely this projec is not in charon'.format(project)) + print( + f"#project {project} not organised: have a look to the logs, but most likely this projec is not in charon" + ) if project not in to_be_deleted: to_be_deleted.append(project) @@ -390,13 +502,15 @@ def create(projects, ngi_config_file, fastq_1, fastq_2): # Create ANALYSIS -- for project in projects_to_reproduce: - if project[0] in reproduced_projects: # Only for projects that I 
know I have organised + if ( + project[0] in reproduced_projects + ): # Only for projects that I know I have organised produce_analysis_qc_ngi(ngi_config, project[0]) - if project[1].startswith('WGreseq'): + if project[1].startswith("WGreseq"): produce_analysis_piper(ngi_config, project[0]) # Store in a file the results - with open('projects.txt', 'w') as PROJECTS: + with open("projects.txt", "w") as PROJECTS: for project in projects_to_reproduce: if project[0] in reproduced_projects: - PROJECTS.write(u'{}:{}\n'.format(project[0], project[1])) + PROJECTS.write(f"{project[0]}:{project[1]}\n") diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index 47da90a9..33ea19b5 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -1,13 +1,14 @@ -import os +import datetime import glob -import re import logging -import datetime +import os +import re +from collections import OrderedDict, defaultdict + +from flowcell_parser.classes import RunParametersParser, SampleSheetParser -from taca.utils.config import CONFIG from taca.utils import statusdb -from flowcell_parser.classes import SampleSheetParser, RunParametersParser -from collections import defaultdict, OrderedDict +from taca.utils.config import CONFIG from taca.utils.misc import send_mail logger = logging.getLogger(__name__) @@ -15,8 +16,9 @@ class Tree(defaultdict): """Constructor for a search tree.""" + def __init__(self, value=None): - super(Tree, self).__init__(Tree) + super().__init__(Tree) self.value = value @@ -25,31 +27,36 @@ def collect_runs(): found_runs = [] # Pattern explained: # 6-8Digits_(maybe ST-)AnythingLetterornumberNumber_Number_AorBLetterornumberordash - rundir_re = re.compile('\d{6,8}_[ST-]*\w+\d+_\d+_[AB]?[A-Z0-9\-]+') - for data_dir in CONFIG['bioinfo_tab']['data_dirs']: + rundir_re = re.compile("\d{6,8}_[ST-]*\w+\d+_\d+_[AB]?[A-Z0-9\-]+") + for data_dir in CONFIG["bioinfo_tab"]["data_dirs"]: if os.path.exists(data_dir): - potential_run_dirs = glob.glob(os.path.join(data_dir, '*')) + potential_run_dirs = glob.glob(os.path.join(data_dir, "*")) for run_dir in potential_run_dirs: - if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir): + if rundir_re.match( + os.path.basename(os.path.abspath(run_dir)) + ) and os.path.isdir(run_dir): found_runs.append(os.path.basename(run_dir)) - logger.info('Working on {}'.format(run_dir)) + logger.info(f"Working on {run_dir}") update_statusdb(run_dir) - nosync_data_dir = os.path.join(data_dir, 'nosync') - potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, '*')) + nosync_data_dir = os.path.join(data_dir, "nosync") + potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, "*")) for run_dir in potential_nosync_run_dirs: - if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir): + if rundir_re.match( + os.path.basename(os.path.abspath(run_dir)) + ) and os.path.isdir(run_dir): update_statusdb(run_dir) + def update_statusdb(run_dir): """Gets status for a project.""" # Fetch individual fields project_info = get_ss_projects(run_dir) run_id = os.path.basename(os.path.abspath(run_dir)) - statusdb_conf = CONFIG.get('statusdb') + statusdb_conf = CONFIG.get("statusdb") couch_connection = statusdb.StatusdbSession(statusdb_conf).connection valueskey = datetime.datetime.now().isoformat() - db = couch_connection['bioinfo_analysis'] - view = db.view('latest_data/sample_id') + db = couch_connection["bioinfo_analysis"] + view = db.view("latest_data/sample_id") # Construction and 
sending of individual records, if samplesheet is incorrectly formatted the loop is skipped if project_info: for flowcell in project_info: @@ -58,14 +65,20 @@ def update_statusdb(run_dir): for project in project_info[flowcell][lane][sample]: project_info[flowcell][lane][sample].value = get_status(run_dir) sample_status = project_info[flowcell][lane][sample].value - obj = {'run_id': run_id, - 'project_id': project, - 'flowcell': flowcell, - 'lane': lane, - 'sample': sample, - 'status': sample_status, - 'values': {valueskey: {'user': 'taca', - 'sample_status': sample_status}}} + obj = { + "run_id": run_id, + "project_id": project, + "flowcell": flowcell, + "lane": lane, + "sample": sample, + "status": sample_status, + "values": { + valueskey: { + "user": "taca", + "sample_status": sample_status, + } + }, + } # If entry exists, append to existing # Special if case to handle lanes written as int, can be safely removed when old lanes # is no longer stored as int @@ -74,151 +87,193 @@ def update_statusdb(run_dir): if len(view[[project, run_id, lane, sample]].rows) >= 1: remote_id = view[[project, run_id, lane, sample]].rows[0].id lane = str(lane) - remote_doc = db[remote_id]['values'] - remote_status = db[remote_id]['status'] + remote_doc = db[remote_id]["values"] + remote_status = db[remote_id]["status"] # Only updates the listed statuses - if remote_status in ['New', 'ERROR', 'Sequencing', 'Demultiplexing'] and sample_status != remote_status: + if ( + remote_status + in ["New", "ERROR", "Sequencing", "Demultiplexing"] + and sample_status != remote_status + ): # Appends old entry to new. Essentially merges the two for k, v in remote_doc.items(): - obj['values'][k] = v - logger.info('Updating {} {} {} {} {} as {}'.format(run_id, - project, - flowcell, - lane, - sample, - sample_status)) + obj["values"][k] = v + logger.info( + "Updating {} {} {} {} {} as {}".format( + run_id, + project, + flowcell, + lane, + sample, + sample_status, + ) + ) # Sorts timestamps - obj['values'] = OrderedDict(sorted(obj['values'].items(), key=lambda k_v: k_v[0], reverse=True)) + obj["values"] = OrderedDict( + sorted( + obj["values"].items(), + key=lambda k_v: k_v[0], + reverse=True, + ) + ) # Update record cluster - obj['_rev'] = db[remote_id].rev - obj['_id'] = remote_id + obj["_rev"] = db[remote_id].rev + obj["_id"] = remote_id db.save(obj) # Creates new entry else: - logger.info('Creating {} {} {} {} {} as {}'.format(run_id, - project, - flowcell, - lane, - sample, - sample_status)) + logger.info( + "Creating {} {} {} {} {} as {}".format( + run_id, + project, + flowcell, + lane, + sample, + sample_status, + ) + ) # Creates record db.save(obj) # Sets FC error flag - if not project_info[flowcell].value == None: - if (('Failed' in project_info[flowcell].value and 'Failed' not in sample_status) - or ('Failed' in sample_status and 'Failed' not in project_info[flowcell].value)): - project_info[flowcell].value = 'Ambiguous' + if project_info[flowcell].value is not None: + if ( + "Failed" in project_info[flowcell].value + and "Failed" not in sample_status + ) or ( + "Failed" in sample_status + and "Failed" not in project_info[flowcell].value + ): + project_info[flowcell].value = "Ambiguous" else: project_info[flowcell].value = sample_status # Checks if a flowcell needs partial re-doing # Email error per flowcell - if not project_info[flowcell].value == None: - if 'Ambiguous' in project_info[flowcell].value: - error_emailer('failed_run', run_id) + if project_info[flowcell].value is not None: + if "Ambiguous" in 
project_info[flowcell].value: + error_emailer("failed_run", run_id) + def get_status(run_dir): """Gets status of a sample run, based on flowcell info (folder structure).""" # Default state, should never occur - status = 'ERROR' - xten_dmux_folder = os.path.join(run_dir, 'Demultiplexing') - unaligned_folder = glob.glob(os.path.join(run_dir, 'Unaligned_*')) - nosync_pattern = re.compile('nosync') + status = "ERROR" + xten_dmux_folder = os.path.join(run_dir, "Demultiplexing") + unaligned_folder = glob.glob(os.path.join(run_dir, "Unaligned_*")) + nosync_pattern = re.compile("nosync") # If we're in a nosync folder if nosync_pattern.search(run_dir): - status = 'New' + status = "New" # If demux folder exist (or similar) - elif (os.path.exists(xten_dmux_folder) or unaligned_folder): - status = 'Demultiplexing' + elif os.path.exists(xten_dmux_folder) or unaligned_folder: + status = "Demultiplexing" # If RTAcomplete doesn't exist - elif not (os.path.exists(os.path.join(run_dir, 'RTAComplete.txt'))): - status = 'Sequencing' + elif not (os.path.exists(os.path.join(run_dir, "RTAComplete.txt"))): + status = "Sequencing" return status + def get_ss_projects(run_dir): """Fetches project, FC, lane & sample (sample-run) status for a given folder.""" proj_tree = Tree() - lane_pattern = re.compile('^([1-8]{1,2})$') - sample_proj_pattern = re.compile('^((P[0-9]{3,5})_[0-9]{3,5})') + lane_pattern = re.compile("^([1-8]{1,2})$") + sample_proj_pattern = re.compile("^((P[0-9]{3,5})_[0-9]{3,5})") run_name = os.path.basename(os.path.abspath(run_dir)) - run_date = run_name.split('_')[0] + run_date = run_name.split("_")[0] if len(run_date) == 6: - current_year = '20' + run_date[0:2] - elif len(run_name.split('_')[0]) == 8: # NovaSeqXPlus case + current_year = "20" + run_date[0:2] + elif len(run_name.split("_")[0]) == 8: # NovaSeqXPlus case current_year = run_date[0:4] - run_name_components = run_name.split('_') - if 'VH' in run_name_components[1]: + run_name_components = run_name.split("_") + if "VH" in run_name_components[1]: FCID = run_name_components[3] else: FCID = run_name_components[3][1:] miseq = False # FIXME: this check breaks if the system is case insensitive - if os.path.exists(os.path.join(run_dir, 'runParameters.xml')): - run_parameters_file = 'runParameters.xml' - elif os.path.exists(os.path.join(run_dir, 'RunParameters.xml')): - run_parameters_file = 'RunParameters.xml' + if os.path.exists(os.path.join(run_dir, "runParameters.xml")): + run_parameters_file = "runParameters.xml" + elif os.path.exists(os.path.join(run_dir, "RunParameters.xml")): + run_parameters_file = "RunParameters.xml" else: - logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run_dir)) + logger.error( + f"Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run_dir}" + ) return [] rp = RunParametersParser(os.path.join(run_dir, run_parameters_file)) - if 'Setup' in rp.data['RunParameters']: - runtype = rp.data['RunParameters']['Setup'].get('Flowcell', '') + if "Setup" in rp.data["RunParameters"]: + runtype = rp.data["RunParameters"]["Setup"].get("Flowcell", "") if not runtype: - logger.warn('Parsing runParameters to fetch instrument type, ' - 'not found Flowcell information in it. 
Using ApplicationName') - runtype = rp.data['RunParameters']['Setup'].get('ApplicationName', '') - elif 'InstrumentType' in rp.data['RunParameters']: - runtype = rp.data['RunParameters'].get('InstrumentType') + logger.warn( + "Parsing runParameters to fetch instrument type, " + "not found Flowcell information in it. Using ApplicationName" + ) + runtype = rp.data["RunParameters"]["Setup"].get("ApplicationName", "") + elif "InstrumentType" in rp.data["RunParameters"]: + runtype = rp.data["RunParameters"].get("InstrumentType") else: - runtype = rp.data['RunParameters'].get('Application') + runtype = rp.data["RunParameters"].get("Application") if not runtype: - logger.warn("Couldn't find 'Application', could be NextSeq. Trying 'ApplicationName'") - runtype = rp.data['RunParameters'].get('ApplicationName', '') + logger.warn( + "Couldn't find 'Application', could be NextSeq. Trying 'ApplicationName'" + ) + runtype = rp.data["RunParameters"].get("ApplicationName", "") # Miseq case - if 'MiSeq' in runtype: - if os.path.exists(os.path.join(run_dir, 'Data', 'Intensities', 'BaseCalls', 'SampleSheet.csv')): - FCID_samplesheet_origin = os.path.join(run_dir, 'Data', 'Intensities', 'BaseCalls', 'SampleSheet.csv') - elif os.path.exists(os.path.join(run_dir, 'SampleSheet.csv')): - FCID_samplesheet_origin = os.path.join(run_dir, 'SampleSheet.csv') + if "MiSeq" in runtype: + if os.path.exists( + os.path.join(run_dir, "Data", "Intensities", "BaseCalls", "SampleSheet.csv") + ): + FCID_samplesheet_origin = os.path.join( + run_dir, "Data", "Intensities", "BaseCalls", "SampleSheet.csv" + ) + elif os.path.exists(os.path.join(run_dir, "SampleSheet.csv")): + FCID_samplesheet_origin = os.path.join(run_dir, "SampleSheet.csv") else: - logger.warn('No samplesheet found for {}'.format(run_dir)) + logger.warn(f"No samplesheet found for {run_dir}") miseq = True lanes = str(1) # Pattern is a bit more rigid since we're no longer also checking for lanes - sample_proj_pattern=re.compile('^((P[0-9]{3,5})_[0-9]{3,5})$') + sample_proj_pattern = re.compile("^((P[0-9]{3,5})_[0-9]{3,5})$") data = parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=True) # HiSeq X case - elif 'HiSeq X' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['xten_samplesheets'], - current_year, '{}.csv'.format(FCID)) + elif "HiSeq X" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["xten_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # HiSeq 2500 case - elif 'HiSeq' in runtype or 'TruSeq' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['hiseq_samplesheets'], - current_year, '{}.csv'.format(FCID)) + elif "HiSeq" in runtype or "TruSeq" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["hiseq_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) - elif 'NovaSeqXPlus' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseqxplus_samplesheets'], - current_year, '{}.csv'.format(FCID)) + elif "NovaSeqXPlus" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["novaseqxplus_samplesheets"], + current_year, + f"{FCID}.csv", + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # NovaSeq 6000 case - elif 'NovaSeq' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseq_samplesheets'], - current_year, '{}.csv'.format(FCID)) + elif "NovaSeq" in runtype: + 
FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["novaseq_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) # NextSeq Case - elif 'NextSeq' in runtype: - FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['nextseq_samplesheets'], - current_year, '{}.csv'.format(FCID)) + elif "NextSeq" in runtype: + FCID_samplesheet_origin = os.path.join( + CONFIG["bioinfo_tab"]["nextseq_samplesheets"], current_year, f"{FCID}.csv" + ) data = parse_samplesheet(FCID_samplesheet_origin, run_dir) else: - logger.warn('Cannot locate the samplesheet for run {}'.format(run_dir)) + logger.warn(f"Cannot locate the samplesheet for run {run_dir}") return [] # If samplesheet is empty, don't bother going through it if data == []: - return data + return data proj_n_sample = False lane = False @@ -244,87 +299,110 @@ def get_ss_projects(run_dir): lane = False if list(proj_tree.keys()) == []: - logger.info('INCORRECTLY FORMATTED SAMPLESHEET, CHECK {}'.format(run_name)) + logger.info(f"INCORRECTLY FORMATTED SAMPLESHEET, CHECK {run_name}") return proj_tree + def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False): """Parses a samplesheet with SampleSheetParser - :param FCID_samplesheet_origin sample sheet path + :param FCID_samplesheet_origin sample sheet path """ data = [] try: ss_reader = SampleSheetParser(FCID_samplesheet_origin) data = ss_reader.data except: - logger.warn('Cannot initialize SampleSheetParser for {}. Most likely due to poor comma separation'.format(run_dir)) + logger.warn( + f"Cannot initialize SampleSheetParser for {run_dir}. Most likely due to poor comma separation" + ) return [] if is_miseq: - if not 'Description' in ss_reader.header or not \ - ('Production' in ss_reader.header['Description'] or 'Application' in ss_reader.header['Description']): - logger.warn('Run {} not labelled as production or application. Disregarding it.'.format(run_dir)) + if "Description" not in ss_reader.header or not ( + "Production" in ss_reader.header["Description"] + or "Application" in ss_reader.header["Description"] + ): + logger.warn( + f"Run {run_dir} not labelled as production or application. Disregarding it." 
+ ) # Skip this run return [] return data + def error_emailer(flag, info): """Sends a custom error e-mail :param flag e-mail state :param info variable that describes the record in some way """ - recipients = CONFIG['mail']['recipients'] + recipients = CONFIG["mail"]["recipients"] # Failed_run: Samplesheet for a given project couldn't be found - body = 'TACA has encountered an issue that might be worth investigating\n' - body += 'The offending entry is: ' + body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " body += info - body += '\n\nSincerely, TACA' + body += "\n\nSincerely, TACA" - if (flag == 'no_samplesheet'): - subject='ERROR, Samplesheet error' - elif (flag == "failed_run"): - subject='WARNING, Reinitialization of partially failed FC' - elif (flag == 'weird_samplesheet'): - subject='ERROR, Incorrectly formatted samplesheet' + if flag == "no_samplesheet": + subject = "ERROR, Samplesheet error" + elif flag == "failed_run": + subject = "WARNING, Reinitialization of partially failed FC" + elif flag == "weird_samplesheet": + subject = "ERROR, Incorrectly formatted samplesheet" hour_now = datetime.datetime.now().hour if hour_now == 7 or hour_now == 12 or hour_now == 16: send_mail(subject, body, recipients) + def fail_run(runid, project): """Updates status of specified run or project-run to Failed.""" - statusdb_conf = CONFIG.get('statusdb') - logger.info('Connecting to status db: {}'.format(statusdb_conf.get('url'))) + statusdb_conf = CONFIG.get("statusdb") + logger.info("Connecting to status db: {}".format(statusdb_conf.get("url"))) try: status_db = statusdb.StatusdbSession(statusdb_conf).connection except Exception as e: - logger.error('Can not connect to status_db: https://{}:*****@{}'.format( - statusdb_conf.get('username'), - statusdb_conf.get('url'))) + logger.error( + "Can not connect to status_db: https://{}:*****@{}".format( + statusdb_conf.get("username"), statusdb_conf.get("url") + ) + ) logger.error(e) raise e - bioinfo_db = status_db['bioinfo_analysis'] + bioinfo_db = status_db["bioinfo_analysis"] if project is not None: - view = bioinfo_db.view('full_doc/pj_run_to_doc') + view = bioinfo_db.view("full_doc/pj_run_to_doc") rows = view[[project, runid]].rows - logger.info('Updating status of {} objects with flowcell_id: {} and project_id {}'.format(len(rows), runid, project)) + logger.info( + f"Updating status of {len(rows)} objects with flowcell_id: {runid} and project_id {project}" + ) else: - view = bioinfo_db.view('full_doc/run_id_to_doc') + view = bioinfo_db.view("full_doc/run_id_to_doc") rows = view[[runid]].rows - logger.info('Updating status of {} objects with flowcell_id: {}'.format(len(rows), runid)) + logger.info(f"Updating status of {len(rows)} objects with flowcell_id: {runid}") new_timestamp = datetime.datetime.now().isoformat() updated = 0 for row in rows: - if row.value['status'] != 'Failed': - row.value['values'][new_timestamp] = {'sample_status' : 'Failed', 'user': 'taca'} - row.value['status'] = 'Failed' + if row.value["status"] != "Failed": + row.value["values"][new_timestamp] = { + "sample_status": "Failed", + "user": "taca", + } + row.value["status"] = "Failed" try: bioinfo_db.save(row.value) updated += 1 except Exception as e: - logger.error('Cannot update object project-sample-run-lane: {}-{}-{}-{}'.format(row.value.get('project_id'), row.value.get('sample'), row.value.get('run_id'), row.value.get('lane'))) + logger.error( + "Cannot update object project-sample-run-lane: {}-{}-{}-{}".format( + 
row.value.get("project_id"), + row.value.get("sample"), + row.value.get("run_id"), + row.value.get("lane"), + ) + ) logger.error(e) raise e - logger.info('Successfully updated {} objects'.format(updated)) + logger.info(f"Successfully updated {updated} objects") diff --git a/taca/utils/cli.py b/taca/utils/cli.py index bbfdb819..3bef6eef 100644 --- a/taca/utils/cli.py +++ b/taca/utils/cli.py @@ -1,27 +1,34 @@ """CLI for the bioinfo subcommand.""" import click + import taca.utils.bioinfo_tab as bt -@click.group(name='bioinfo_deliveries') + +@click.group(name="bioinfo_deliveries") def bioinfo_deliveries(): """Update statusdb with information about FC entry point.""" pass + # bioinfo subcommands @bioinfo_deliveries.command() -@click.argument('rundir') +@click.argument("rundir") def updaterun(rundir): """Saves the bioinfo data to statusdb.""" bt.update_statusdb(rundir) + @bioinfo_deliveries.command() def update(): """Saves the bioinfo data of everything that can be found to statusdb.""" bt.collect_runs() -@bioinfo_deliveries.command(name='fail_run') -@click.argument('runid') -@click.option('-p','--project', is_flag=False, help='Fail run for the specified project') + +@bioinfo_deliveries.command(name="fail_run") +@click.argument("runid") +@click.option( + "-p", "--project", is_flag=False, help="Fail run for the specified project" +) def fail_run(runid, project=None): """Updates the status of the specified run to 'Failed'. Example of RUNID: 170113_ST-E00269_0163_BHCVH7ALXX""" diff --git a/taca/utils/config.py b/taca/utils/config.py index 74b8876f..e2710ba7 100644 --- a/taca/utils/config.py +++ b/taca/utils/config.py @@ -1,21 +1,23 @@ """Load and parse configuration file.""" + import yaml -from io import open CONFIG = {} + def load_config(config_file): """Loads a configuration file.""" config = {} try: - with open(config_file, 'r') as f: + with open(config_file) as f: content = yaml.load(f, Loader=yaml.FullLoader) config.update(content) return content - except IOError as e: - e.message = 'Could not open configuration file "{}".'.format(config_file) + except OSError as e: + e.message = f'Could not open configuration file "{config_file}".' raise e + def load_yaml_config(config_file): """Load YAML config file @@ -26,10 +28,10 @@ def load_yaml_config(config_file): :raises IOError: If the config file cannot be opened. """ try: - with open(config_file, 'r') as f: + with open(config_file) as f: content = yaml.load(f, Loader=yaml.FullLoader) CONFIG.update(content) return content - except IOError as e: - e.message = 'Could not open configuration file "{}".'.format(config_file) + except OSError as e: + e.message = f'Could not open configuration file "{config_file}".' raise e diff --git a/taca/utils/filesystem.py b/taca/utils/filesystem.py index f1db6968..a001615e 100644 --- a/taca/utils/filesystem.py +++ b/taca/utils/filesystem.py @@ -3,8 +3,9 @@ import os import shutil -RUN_RE = '^\d{6,8}_[a-zA-Z\d\-]+_\d{2,}_[AB0][A-Z\d\-]+$' -RUN_RE_ONT = '^(\d{8})_(\d{4})_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$' +RUN_RE = "^\d{6,8}_[a-zA-Z\d\-]+_\d{2,}_[AB0][A-Z\d\-]+$" +RUN_RE_ONT = "^(\d{8})_(\d{4})_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$" + @contextlib.contextmanager def chdir(new_dir): @@ -16,28 +17,32 @@ def chdir(new_dir): finally: os.chdir(cur_dir) + def create_folder(target_folder): - """ Ensure that a folder exists and create it if it doesn't, including any - parent folders, as necessary. + """Ensure that a folder exists and create it if it doesn't, including any + parent folders, as necessary. 
- :param target_folder: the target folder - :returns: True if the folder exists or was created, False if the folder - does not exists and could not be created + :param target_folder: the target folder + :returns: True if the folder exists or was created, False if the folder + does not exists and could not be created """ try: os.makedirs(target_folder) - except OSError as e: + except OSError: pass return os.path.exists(target_folder) + def touch(file): - open(file, 'w').close() + open(file, "w").close() + def do_symlink(src_file, dst_file): link_f = os.symlink if not os.path.isfile(dst_file): link_f(os.path.realpath(src_file), dst_file) + def do_copy(src_path, dst_path): # copies folder structure and files (recursively) # if symlinks, will copy content, not the links diff --git a/taca/utils/misc.py b/taca/utils/misc.py index 3f9bec6a..a180bcfd 100755 --- a/taca/utils/misc.py +++ b/taca/utils/misc.py @@ -5,12 +5,11 @@ import smtplib import subprocess import sys - from datetime import datetime from email.mime.text import MIMEText + from taca.utils import statusdb -from io import open -from six.moves import input + def send_mail(subject, content, receiver): """Sends an email. @@ -20,17 +19,18 @@ def send_mail(subject, content, receiver): :param str receiver: Address to send the email """ if not receiver: - raise SystemExit('No receiver was given to send mail') + raise SystemExit("No receiver was given to send mail") msg = MIMEText(content) - msg['Subject'] = 'TACA - {}'.format(subject) - msg['From'] = 'TACA@scilifelab.se' - msg['to'] = receiver + msg["Subject"] = f"TACA - {subject}" + msg["From"] = "TACA@scilifelab.se" + msg["to"] = receiver - s = smtplib.SMTP('localhost') - s.sendmail('TACA', [receiver], msg.as_string()) + s = smtplib.SMTP("localhost") + s.sendmail("TACA", [receiver], msg.as_string()) s.quit() -def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''): + +def call_external_command(cl, with_log_files=False, prefix=None, log_dir=""): """Executes an external command. 
:param string cl: Command line to be executed (command + options and parameters)
@@ -38,59 +38,60 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''):
     :param string prefix: the prefix to add to log file
     :param string log_dir: where to write the log file (to avoid problems with rights)
     """
-    if type(cl) == str:
-        cl = cl.split(' ')
+    if isinstance(cl, str):
+        cl = cl.split(" ")
     logFile = os.path.basename(cl[0])
     stdout = sys.stdout
     stderr = sys.stderr
     if with_log_files:
         if prefix:
-            logFile = '{}_{}'.format(prefix, logFile)
+            logFile = f"{prefix}_{logFile}"
         # Create log dir if it didn't exist in CWD
         if log_dir and not os.path.exists(log_dir):
            os.mkdir(log_dir)
         logFile = os.path.join(log_dir, logFile)
-        stdout = open(logFile + '.out', 'a')
-        stderr = open(logFile + '.err', 'a')
-        started = 'Started command {} on {}'.format(' '.join(cl), datetime.now())
-        stdout.write(started + u'\n')
-        stdout.write(''.join(['=']*len(cl)) + u'\n')
+        stdout = open(logFile + ".out", "a")
+        stderr = open(logFile + ".err", "a")
+        started = "Started command {} on {}".format(" ".join(cl), datetime.now())
+        stdout.write(started + "\n")
+        stdout.write("".join(["="] * len(cl)) + "\n")
     try:
         subprocess.check_call(cl, stdout=stdout, stderr=stderr)
     except subprocess.CalledProcessError as e:
-        e.message = 'The command {} failed.'.format(' '.join(cl))
+        e.message = "The command {} failed.".format(" ".join(cl))
         raise e
     finally:
         if with_log_files:
             stdout.close()
             stderr.close()

+
 def call_external_command_detached(cl, with_log_files=False, prefix=None):
     """Executes an external command.

     :param string cl: Command line to be executed (command + options and parameters)
     :param bool with_log_files: Create log files for stdout and stderr
     """
-    if type(cl) == str:
-        cl = cl.split(' ')
+    if isinstance(cl, str):
+        cl = cl.split(" ")
     command = os.path.basename(cl[0])
     stdout = sys.stdout
     stderr = sys.stderr
     if with_log_files:
         if prefix:
-            command = '{}_{}'.format(prefix, command)
-        stdout = open(command + '.out', 'a')
-        stderr = open(command + '.err', 'a')
-        started = 'Started command {} on {}'.format(' '.join(cl), datetime.now())
-        stdout.write(started + u'\n')
-        stdout.write(''.join(['=']*len(cl)) + u'\n')
+            command = f"{prefix}_{command}"
+        stdout = open(command + ".out", "a")
+        stderr = open(command + ".err", "a")
+        started = "Started command {} on {}".format(" ".join(cl), datetime.now())
+        stdout.write(started + "\n")
+        stdout.write("".join(["="] * len(cl)) + "\n")
     try:
         p_handle = subprocess.Popen(cl, stdout=stdout, stderr=stderr)
     except subprocess.CalledProcessError as e:
-        e.message = 'The command {} failed.'.format(' '.join(cl))
+        e.message = "The command {} failed.".format(" ".join(cl))
         raise e
     finally:
         if with_log_files:
@@ -98,6 +99,7 @@ def call_external_command_detached(cl, with_log_files=False, prefix=None):
             stderr.close()
     return p_handle

+
 def to_seconds(days=None, hours=None):
     """Convert given day/hours to seconds and return.

@@ -116,7 +118,8 @@ def to_seconds(days=None, hours=None):
     # 1 hour == 60*60 seconds --> 3600
     return 3600 * hours

-def hashfile(afile, hasher='sha1', blocksize=65536):
+
+def hashfile(afile, hasher="sha1", blocksize=65536):
     """Calculate the hash digest of a file with the specified algorithm and return it.
@@ -130,14 +133,15 @@ def hashfile(afile, hasher='sha1', blocksize=65536): if not os.path.isfile(afile): return None hashobj = hashlib.new(hasher) - with open(afile,'rb') as fh: + with open(afile, "rb") as fh: buf = fh.read(blocksize) while len(buf) > 0: hashobj.update(buf) buf = fh.read(blocksize) return hashobj.hexdigest() -def query_yes_no(question, default='yes', force=False): + +def query_yes_no(question, default="yes", force=False): """Ask a yes/no question via raw_input() and return their answer. "question" is a string that is presented to the user. "default" is the presumed answer if the user just hits . It must be @@ -150,14 +154,13 @@ def query_yes_no(question, default='yes', force=False): :param force: set answer to default :returns: yes or no """ - valid = {'yes': True, 'y': True, 'ye': True, - 'no': False, 'n': False} - if default == None: - prompt = ' [y/n] ' - elif default == 'yes': - prompt = ' [Y/n] ' - elif default == 'no': - prompt = ' [y/N] ' + valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} + if default is None: + prompt = " [y/n] " + elif default == "yes": + prompt = " [Y/n] " + elif default == "no": + prompt = " [y/N] " else: raise ValueError('invalid default answer: "%s"' % default) @@ -166,56 +169,60 @@ def query_yes_no(question, default='yes', force=False): if not force: choice = input().lower() else: - choice = 'yes' - if default is not None and choice == '': + choice = "yes" + if default is not None and choice == "": return valid[default] elif choice in valid: return valid[choice] else: - sys.stdout.write('Please respond with "yes" or "no" '\ - '(or "y" or "n").\n') + sys.stdout.write('Please respond with "yes" or "no" ' '(or "y" or "n").\n') + def return_unique(seq): seen = set() seen_add = seen.add - return [ x for x in seq if not (x in seen or seen_add(x))] + return [x for x in seq if not (x in seen or seen_add(x))] + def run_is_demuxed(run, couch_info=None, seq_run_type=None): - """ + """ For ONT runs: check that .sync_finished exists, which is created by TACA when the sync is finalized. Since demux is done on the sequencers in parallel to sequencing, the presence of this file also implies that demux is done. 
- + For Illumina runs: Check in StatusDB 'x_flowcells' database if the given run has an entry which means it was demultiplexed (as TACA only creates a document upon successfull demultiplexing) :param dict couch_info: a dict with 'statusDB' info """ - if seq_run_type in ['promethion', 'minion']: + if seq_run_type in ["promethion", "minion"]: if os.path.exists(os.path.join(run.abs_path, ".sync_finished")): return True else: return False else: if not couch_info: - raise SystemExit('To check for demultiplexing is enabled in config file but no "statusDB" info was given') - run_terms = run.name.split('_') + raise SystemExit( + 'To check for demultiplexing is enabled in config file but no "statusDB" info was given' + ) + run_terms = run.name.split("_") run_date = run_terms[0] - if len(run_date)>6: + if len(run_date) > 6: run_date = run_date[2:] run_fc = run_terms[-1] - run_name = '{}_{}'.format(run_date, run_fc) + run_name = f"{run_date}_{run_fc}" try: couch_connection = statusdb.StatusdbSession(couch_info).connection - fc_db = couch_connection[couch_info['xten_db']] - for fc in fc_db.view('names/name', reduce=False, descending=True): + fc_db = couch_connection[couch_info["xten_db"]] + for fc in fc_db.view("names/name", reduce=False, descending=True): if fc.key != run_name: continue fc_doc = fc_db.get(fc.id) - if not fc_doc or not fc_doc.get('illumina', {}).get('Demultiplex_Stats', {}): + if not fc_doc or not fc_doc.get("illumina", {}).get( + "Demultiplex_Stats", {} + ): return False return True except Exception as e: raise e - diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index 3ae4d291..939e0606 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -1,24 +1,26 @@ """Classes for handling connection to StatusDB.""" -import couchdb -import logging import csv - +import logging from datetime import datetime +import couchdb + logger = logging.getLogger(__name__) -class StatusdbSession(object): + +class StatusdbSession: """Wrapper class for couchdb.""" + def __init__(self, config, db=None): - user = config.get('username') - password = config.get('password') - url = config.get('url') - url_string = 'https://{}:{}@{}'.format(user, password, url) - display_url_string = 'https://{}:{}@{}'.format(user, '*********', url) + user = config.get("username") + password = config.get("password") + url = config.get("url") + url_string = f"https://{user}:{password}@{url}" + display_url_string = "https://{}:{}@{}".format(user, "*********", url) self.connection = couchdb.Server(url=url_string) if not self.connection: - raise Exception('Couchdb connection failed for url {}'.format(display_url_string)) + raise Exception(f"Couchdb connection failed for url {display_url_string}") if db: self.db_connection = self.connection[db] @@ -40,9 +42,11 @@ def save_db_doc(self, doc, db=None): db = db or self.db db.save(doc) except Exception as e: - raise Exception('Failed saving document due to {}'.format(e)) + raise Exception(f"Failed saving document due to {e}") - def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format='%Y-%m-%d'): + def get_project_flowcell( + self, project_id, open_date="2015-01-01", date_format="%Y-%m-%d" + ): """From information available in flowcell db connection, collect the flowcell this project was sequenced. 
@@ -53,68 +57,91 @@ def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format=' try: open_date = datetime.strptime(open_date, date_format) except: - open_date = datetime.strptime('2015-01-01', '%Y-%m-%d') + open_date = datetime.strptime("2015-01-01", "%Y-%m-%d") project_flowcells = {} - date_sorted_fcs = sorted(list(self.proj_list.keys()), key=lambda k: datetime.strptime(k.split('_')[0], '%y%m%d'), reverse=True) + date_sorted_fcs = sorted( + list(self.proj_list.keys()), + key=lambda k: datetime.strptime(k.split("_")[0], "%y%m%d"), + reverse=True, + ) for fc in date_sorted_fcs: - fc_date, fc_name = fc.split('_') - if datetime.strptime(fc_date,'%y%m%d') < open_date: + fc_date, fc_name = fc.split("_") + if datetime.strptime(fc_date, "%y%m%d") < open_date: break - if project_id in self.proj_list[fc] and fc_name not in project_flowcells.keys(): - project_flowcells[fc_name] = {'name':fc_name,'run_name':fc, 'date':fc_date, 'db':self.db.name} + if ( + project_id in self.proj_list[fc] + and fc_name not in project_flowcells.keys() + ): + project_flowcells[fc_name] = { + "name": fc_name, + "run_name": fc, + "date": fc_date, + "db": self.db.name, + } return project_flowcells + class ProjectSummaryConnection(StatusdbSession): - def __init__(self, config, dbname='projects'): - super(ProjectSummaryConnection, self).__init__(config) + def __init__(self, config, dbname="projects"): + super().__init__(config) self.db = self.connection[dbname] - self.name_view = {k.key: k.id for k in self.db.view('project/project_name', reduce=False)} - self.id_view = {k.key: k.id for k in self.db.view('project/project_id', reduce=False)} + self.name_view = { + k.key: k.id for k in self.db.view("project/project_name", reduce=False) + } + self.id_view = { + k.key: k.id for k in self.db.view("project/project_id", reduce=False) + } class FlowcellRunMetricsConnection(StatusdbSession): - def __init__(self, config, dbname='flowcells'): - super(FlowcellRunMetricsConnection, self).__init__(config) + def __init__(self, config, dbname="flowcells"): + super().__init__(config) self.db = self.connection[dbname] - self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)} - self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key} + self.name_view = {k.key: k.id for k in self.db.view("names/name", reduce=False)} + self.proj_list = { + k.key: k.value + for k in self.db.view("names/project_ids_list", reduce=False) + if k.key + } class X_FlowcellRunMetricsConnection(StatusdbSession): - def __init__(self, config, dbname='x_flowcells'): - super(X_FlowcellRunMetricsConnection, self).__init__(config) + def __init__(self, config, dbname="x_flowcells"): + super().__init__(config) self.db = self.connection[dbname] - self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)} - self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key} + self.name_view = {k.key: k.id for k in self.db.view("names/name", reduce=False)} + self.proj_list = { + k.key: k.value + for k in self.db.view("names/project_ids_list", reduce=False) + if k.key + } class NanoporeRunsConnection(StatusdbSession): - - def __init__(self, config, dbname='nanopore_runs'): - super(NanoporeRunsConnection, self).__init__(config) + def __init__(self, config, dbname="nanopore_runs"): + super().__init__(config) self.db = self.connection[dbname] def check_run_exists(self, ont_run) -> bool: - view_names = self.db.view('names/name') + view_names = 
self.db.view("names/name") if len(view_names[ont_run.run_name].rows) > 0: return True else: return False - + def check_run_status(self, ont_run) -> str: - view_all_stats = self.db.view('names/name') + view_all_stats = self.db.view("names/name") doc_id = view_all_stats[ont_run.run_name].rows[0].id return self.db[doc_id]["run_status"] def create_ongoing_run( self, ont_run, run_path_file: str, pore_count_history_file: str ): - - run_path = open(run_path_file, "r").read().strip() + run_path = open(run_path_file).read().strip() pore_counts = [] - with open(pore_count_history_file, "r") as stream: + with open(pore_count_history_file) as stream: for line in csv.DictReader(stream): pore_counts.append(line) @@ -130,7 +157,7 @@ def create_ongoing_run( ) def finish_ongoing_run(self, ont_run, dict_json: dict): - view_names = self.db.view('names/name') + view_names = self.db.view("names/name") doc_id = view_names[ont_run.run_name].rows[0].id doc = self.db[doc_id] @@ -140,23 +167,23 @@ def finish_ongoing_run(self, ont_run, dict_json: dict): def update_doc(db, obj, over_write_db_entry=False): - view = db.view('info/name') - if len(view[obj['name']].rows) == 1: - remote_doc = view[obj['name']].rows[0].value - doc_id = remote_doc.pop('_id') - doc_rev = remote_doc.pop('_rev') + view = db.view("info/name") + if len(view[obj["name"]].rows) == 1: + remote_doc = view[obj["name"]].rows[0].value + doc_id = remote_doc.pop("_id") + doc_rev = remote_doc.pop("_rev") if remote_doc != obj: if not over_write_db_entry: obj = merge_dicts(obj, remote_doc) - obj['_id'] = doc_id - obj['_rev'] = doc_rev + obj["_id"] = doc_id + obj["_rev"] = doc_rev db[doc_id] = obj - logger.info('Updating {}'.format(obj['name'])) - elif len(view[obj['name']].rows) == 0: + logger.info("Updating {}".format(obj["name"])) + elif len(view[obj["name"]].rows) == 0: db.save(obj) - logger.info('Saving {}'.format(obj['name'])) + logger.info("Saving {}".format(obj["name"])) else: - logger.warn('More than one row with name {} found'.format(obj['name'])) + logger.warn("More than one row with name {} found".format(obj["name"])) def merge_dicts(d1, d2): @@ -166,12 +193,14 @@ def merge_dicts(d1, d2): for key in d2: if key in d1: if isinstance(d1[key], dict) and isinstance(d2[key], dict): - merge(d1[key], d2[key]) + merge_dicts(d1[key], d2[key]) elif d1[key] == d2[key]: pass # same leaf value else: - logger.debug('Values for key {key} in d1 and d2 differ, ' - 'using the value of d1'.format(key=key)) + logger.debug( + f"Values for key {key} in d1 and d2 differ, " + "using the value of d1" + ) else: d1[key] = d2[key] return d1 diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py index 34e6b314..693f0725 100644 --- a/taca/utils/transfer.py +++ b/taca/utils/transfer.py @@ -6,109 +6,108 @@ import subprocess from taca.utils.filesystem import create_folder -from taca.utils.misc import hashfile, call_external_command -from io import open +from taca.utils.misc import call_external_command, hashfile logger = logging.getLogger(__name__) -class TransferAgent(object): +class TransferAgent: """ - (Abstract) superclass representing an Agent that performs file transfers. - Agents implementing specific methods for transferring files should extend - this and implement the transfer() method. + (Abstract) superclass representing an Agent that performs file transfers. + Agents implementing specific methods for transferring files should extend + this and implement the transfer() method. 
""" - def __init__( - self, - src_path=None, - dest_path=None, - opts={}, - **kwargs): - """ Creates an agent instance - :param string src_path: the file or folder that should be transferred - :param string dest_path: the destination file or folder - :param bool validate: whether to validate the transferred files - :param opts: options that will be passed to the transfer command + + def __init__(self, src_path=None, dest_path=None, opts={}, **kwargs): + """Creates an agent instance + :param string src_path: the file or folder that should be transferred + :param string dest_path: the destination file or folder + :param bool validate: whether to validate the transferred files + :param opts: options that will be passed to the transfer command """ self.src_path = src_path self.dest_path = dest_path - self.validate = kwargs.get('validate', False) + self.validate = kwargs.get("validate", False) self.cmdopts = opts def __str__(self): return type(self).__name__ def format_options(self): - """ Format the options dictionary stored in this instance's cmdopts - attribute and return the formatted options as a list of strings. - A key in the dictionary represents the option name. If - the corresponding value is None, the option will be assumed to - represent a flag. If the value is a list, the option will be given - multiple times. + """Format the options dictionary stored in this instance's cmdopts + attribute and return the formatted options as a list of strings. + A key in the dictionary represents the option name. If + the corresponding value is None, the option will be assumed to + represent a flag. If the value is a list, the option will be given + multiple times. - For example: + For example: - opts = {'opt1': None, 'opt2': 'val1', 'opt3': ['val2','val3']} + opts = {'opt1': None, 'opt2': 'val1', 'opt3': ['val2','val3']} - will be expanded to: + will be expanded to: - ['--opt1','--opt2=val1','--opt3=val2','--opt3=val3'] + ['--opt1','--opt2=val1','--opt3=val2','--opt3=val3'] - :returns: List of formatted options as strings + :returns: List of formatted options as strings """ cmdopts = [] for param, val in self.cmdopts.items(): if val is None: cmdopts.append(param) else: - if type(val) == str: + if isinstance(type(val), str): val = [val] for v in val: - cmdopts.append('{}={}'.format(param,v)) + cmdopts.append(f"{param}={v}") return cmdopts def transfer(self): """Abstract method, should be implemented by subclasses.""" - raise NotImplementedError('This method should be implemented by subclass') + raise NotImplementedError("This method should be implemented by subclass") def validate_src_path(self): """Validates that the src_path attribute of the Agent instance. - :raises transfer.TransferError: if src_path is not valid + :raises transfer.TransferError: if src_path is not valid """ if self.src_path is None: raise TransferError( - msg='src_path cannot be None', + msg="src_path cannot be None", src_path=self.src_path, - dest_path=self.dest_path) + dest_path=self.dest_path, + ) if not os.path.exists(self.src_path): raise TransferError( - msg='src_path "{}" does not exist'.format(self.src_path), + msg=f'src_path "{self.src_path}" does not exist', src_path=self.src_path, - dest_path=self.dest_path) + dest_path=self.dest_path, + ) def validate_dest_path(self): """Validates that the dest_path attribute of the Agent instance. 
- :raises transfer.TransferError: if dest_path is not valid + :raises transfer.TransferError: if dest_path is not valid """ if self.dest_path is None: raise TransferError( - msg='dest_path cannot be None', + msg="dest_path cannot be None", src_path=self.src_path, - dest_path=self.dest_path) + dest_path=self.dest_path, + ) def validate_transfer(self): """Abstract method, should be implemented by subclasses.""" - raise NotImplementedError('This method should be implemented by subclass') + raise NotImplementedError("This method should be implemented by subclass") class RsyncAgent(TransferAgent): """An agent that knows how to perform an rsync transfer locally or - between hosts. If supplied with a checksum file, the transfer can - be validated on the receiving side. + between hosts. If supplied with a checksum file, the transfer can + be validated on the receiving side. """ - CMD = 'rsync' + + CMD = "rsync" DEFAULT_OPTS = { - '-a': None, + "-a": None, } def __init__( @@ -120,68 +119,65 @@ def __init__( validate=True, digestfile=None, opts=None, - **kwargs): + **kwargs, + ): """Creates an RsyncAgent instance - :param string src_path: the file or folder that should be transferred - :param string dest_path: the destination file or folder - :param string remote_host: the remote host to transfer to. - If None, the transfer will be on the local filesystem - :param string remote_user: the remote user to connect with. - If None, the local user will be used - :param bool validate: whether to validate the transferred files - using a supplied file with checksums - :param string digestfile: a file with checksums for the files to be - transferred. Must be specified if validate is True. The checksum - algorithm will be inferred from the extension of the digest file - :param opts: options that will be passed to the rsync command + :param string src_path: the file or folder that should be transferred + :param string dest_path: the destination file or folder + :param string remote_host: the remote host to transfer to. + If None, the transfer will be on the local filesystem + :param string remote_user: the remote user to connect with. + If None, the local user will be used + :param bool validate: whether to validate the transferred files + using a supplied file with checksums + :param string digestfile: a file with checksums for the files to be + transferred. Must be specified if validate is True. The checksum + algorithm will be inferred from the extension of the digest file + :param opts: options that will be passed to the rsync command """ - super(RsyncAgent, self).__init__( + super().__init__( src_path=src_path, dest_path=dest_path, opts=opts or self.DEFAULT_OPTS, validate=validate, - **kwargs) + **kwargs, + ) self.remote_host = remote_host self.remote_user = remote_user self.digestfile = digestfile def transfer(self, transfer_log=None): """Execute the transfer as set up by this instance and, if requested, - validate the transfer. + validate the transfer. 
- :param string transfer_log: path prefix to log files where stderr - and stdout streams will be directed if this option is specified - :returns True on success, False if the validation failed - :raises transfer.TransferError: if src_path or dest_path were not valid - :raises transfer.RsyncError: if the rsync command did not exit successfully + :param string transfer_log: path prefix to log files where stderr + and stdout streams will be directed if this option is specified + :returns True on success, False if the validation failed + :raises transfer.TransferError: if src_path or dest_path were not valid + :raises transfer.RsyncError: if the rsync command did not exit successfully """ self.validate_src_path() self.validate_dest_path() - command = [self.CMD] + self.format_options() + [self.src_path, self.remote_path()] + command = ( + [self.CMD] + self.format_options() + [self.src_path, self.remote_path()] + ) try: call_external_command( - command, - with_log_files=(transfer_log is not None), - prefix=transfer_log) + command, with_log_files=(transfer_log is not None), prefix=transfer_log + ) except subprocess.CalledProcessError as e: raise RsyncError(e) return (not self.validate) or self.validate_transfer() def remote_path(self): """Construct the remote path according to what has been specified. - :returns: the remote path string on the form - [remote_user]@[remote_host]:[dest_path] + :returns: the remote path string on the form + [remote_user]@[remote_host]:[dest_path] """ - return '{}{}{}'.format( - '{}@'.format(self.remote_user) \ - if self.remote_user is not None \ - else '', - '{}:'.format(self.remote_host) \ - if self.remote_host is not None \ - else '', - self.dest_path \ - if self.dest_path is not None \ - else '' + return "{}{}{}".format( + f"{self.remote_user}@" if self.remote_user is not None else "", + f"{self.remote_host}:" if self.remote_host is not None else "", + self.dest_path if self.dest_path is not None else "", ) def validate_dest_path(self): @@ -193,63 +189,60 @@ def validate_dest_path(self): """ if self.dest_path is None and self.remote_host is None: raise TransferError( - msg='dest_path and remote_host cannot both be None', - src_path=self.src_path) + msg="dest_path and remote_host cannot both be None", + src_path=self.src_path, + ) if self.remote_user is not None and self.remote_host is None: raise TransferError( - msg='dest_path cannot be None if remote_user is not None', - src_path=self.src_path) + msg="dest_path cannot be None if remote_user is not None", + src_path=self.src_path, + ) def validate_transfer(self): """Validate the transferred files by computing checksums and comparing - to the pre-computed checksums, supplied in the digestfile attribute - of this Agent instance. The hash algorithm is inferred from the file - extension of the digestfile. The paths of the files to check are - assumed to be relative to the location of the digestfile. + to the pre-computed checksums, supplied in the digestfile attribute + of this Agent instance. The hash algorithm is inferred from the file + extension of the digestfile. The paths of the files to check are + assumed to be relative to the location of the digestfile. - Currently not implemented for remote transfers. + Currently not implemented for remote transfers. - :returns: False if any checksum does not match, or if a file does - not exist. True otherwise. - :raises transfer.RsyncValidationError: if the digestfile was not - supplied + :returns: False if any checksum does not match, or if a file does + not exist. 
True otherwise. + :raises transfer.RsyncValidationError: if the digestfile was not + supplied """ if self.remote_host is not None: - raise NotImplementedError('Validation on remote host not implemented') + raise NotImplementedError("Validation on remote host not implemented") try: with open(self.digestfile) as fh: - hasher = self.digestfile.split('.')[-1] + hasher = self.digestfile.split(".")[-1] dpath = os.path.dirname(self.digestfile) for line in fh: - digest,fpath = line.split() - tfile = os.path.join(dpath,fpath) + digest, fpath = line.split() + tfile = os.path.join(dpath, fpath) if not os.path.exists(tfile) or digest != hashfile( - tfile, - hasher=hasher): + tfile, hasher=hasher + ): return False - except TypeError as e: + except TypeError: raise RsyncValidationError( - 'no digest file specified', - self.src_path, - self.dest_path) + "no digest file specified", self.src_path, self.dest_path + ) return True class SymlinkAgent(TransferAgent): - def __init__(self, src_path, dest_path, overwrite=True, relative=True, **kwargs): """Creates an SymlinkAgent instance for creating symlinks. - :param string src_path: the file or folder that should be symlinked - :param string dest_path: the destination symlink - :param bool overwrite: if true, the destination file or folder will - be overwritten if it already exists - :param bool relative: if true, the destination symlink will be relative + :param string src_path: the file or folder that should be symlinked + :param string dest_path: the destination symlink + :param bool overwrite: if true, the destination file or folder will + be overwritten if it already exists + :param bool relative: if true, the destination symlink will be relative """ - super(SymlinkAgent,self).__init__( - src_path=src_path, - dest_path=dest_path, - **kwargs) + super().__init__(src_path=src_path, dest_path=dest_path, **kwargs) self.overwrite = overwrite self.relative = relative @@ -268,22 +261,23 @@ def transfer(self): # If the existing target is a symlink that points to the # source, we're all good if self.validate_transfer(): - logger.debug('target exists and points to the correct ' - 'source path: "{}"'.format(self.src_path)) + logger.debug( + "target exists and points to the correct " + f'source path: "{self.src_path}"' + ) return True # If we are not overwriting, return False if not self.overwrite: - logger.debug('target "{}" exists and will not be ' - 'overwritten'.format(self.dest_path)) + logger.debug( + f'target "{self.dest_path}" exists and will not be ' "overwritten" + ) return False # If the target is a mount, let's not mess with it if os.path.ismount(self.dest_path): - raise SymlinkError('target exists and is a mount') + raise SymlinkError("target exists and is a mount") # If the target is a link or a file, we remove it - if os.path.islink(self.dest_path) or \ - os.path.isfile(self.dest_path): - logger.debug('removing existing target file "{}"' - .format(self.dest_path)) + if os.path.islink(self.dest_path) or os.path.isfile(self.dest_path): + logger.debug(f'removing existing target file "{self.dest_path}"') try: os.unlink(self.dest_path) except OSError as e: @@ -291,44 +285,55 @@ def transfer(self): # If the target is a directory, we remove it and # everything underneath elif os.path.isdir(self.dest_path): - logger.debug('removing existing target folder "{}"' - .format(self.dest_path)) + logger.debug(f'removing existing target folder "{self.dest_path}"') try: shutil.rmtree(self.dest_path) except OSError as e: raise SymlinkError(e) # If it's something else, 
let's bail out else: - raise SymlinkError('target exists and will not be overwritten') + raise SymlinkError("target exists and will not be overwritten") if not create_folder(os.path.dirname(self.dest_path)): - raise SymlinkError('failed to create target folder hierarchy') + raise SymlinkError("failed to create target folder hierarchy") try: # If we should create a relative symlink, determine the relative path os.symlink( - os.path.relpath(self.src_path,os.path.dirname(self.dest_path)) \ - if self.relative else self.src_path, - self.dest_path) + os.path.relpath(self.src_path, os.path.dirname(self.dest_path)) + if self.relative + else self.src_path, + self.dest_path, + ) except OSError as e: raise SymlinkError(e) return (not self.validate) or self.validate_transfer() def validate_transfer(self): """Validates the symlinked files by verifying that the dest_path was - created, is a link and resolves to the same file as src_path. + created, is a link and resolves to the same file as src_path. - :returns: True if link is valid, False otherwise + :returns: True if link is valid, False otherwise """ - return os.path.exists(self.dest_path) and \ - os.path.islink(self.dest_path) and \ - os.path.samefile(self.src_path, self.dest_path) + return ( + os.path.exists(self.dest_path) + and os.path.islink(self.dest_path) + and os.path.samefile(self.src_path, self.dest_path) + ) class TransferError(Exception): def __init__(self, msg, src_path=None, dest_path=None): - super(TransferError, self).__init__(msg) + super().__init__(msg) self.src_path = src_path self.dest_path = dest_path -class SymlinkError(TransferError): pass -class RsyncError(TransferError): pass -class RsyncValidationError(TransferError): pass + +class SymlinkError(TransferError): + pass + + +class RsyncError(TransferError): + pass + + +class RsyncValidationError(TransferError): + pass diff --git a/tests/data/Stats.json b/tests/data/Stats.json index 5090f4ac..1d2e2bd1 100644 --- a/tests/data/Stats.json +++ b/tests/data/Stats.json @@ -1,75 +1,75 @@ { - "RunNumber":131, - "Flowcell":"FCIDXX", - "RunId":"141124_ST-COMPLEX1_01_AFCIDXX", - "ConversionResults":[ - { - "LaneNumber":1, - "DemuxResults":[ + "RunNumber": 131, + "Flowcell": "FCIDXX", + "RunId": "141124_ST-COMPLEX1_01_AFCIDXX", + "ConversionResults": [ + { + "LaneNumber": 1, + "DemuxResults": [ + { + "SampleId": "Sample_P12345_1001", + "SampleName": "P12345_1001", + "NumberReads": 494288265, + "Yield": 58820303535, + "ReadMetrics": [ { - "SampleId":"Sample_P12345_1001", - "SampleName":"P12345_1001", - "NumberReads":494288265, - "Yield":58820303535, - "ReadMetrics":[ - { - "ReadNumber":1, - "Yield":13840071420, - "YieldQ30":13329609381, - "QualityScoreSum":503672520160, - "TrimmedBases":0 - } - ] + "ReadNumber": 1, + "Yield": 13840071420, + "YieldQ30": 13329609381, + "QualityScoreSum": 503672520160, + "TrimmedBases": 0 } - ], - "Undetermined":{ - "NumberReads":17709745, - "Yield":2036620675, - "ReadMetrics":[ - { - "ReadNumber":1, - "Yield":885487250, - "YieldQ30":680049984, - "QualityScoreSum":28815661398, - "TrimmedBases":0 - }, - { - "ReadNumber":2, - "Yield":283355920, - "YieldQ30":179655904, - "QualityScoreSum":8324058259, - "TrimmedBases":0 - } - ] - } + ] + } + ], + "Undetermined": { + "NumberReads": 17709745, + "Yield": 2036620675, + "ReadMetrics": [ + { + "ReadNumber": 1, + "Yield": 885487250, + "YieldQ30": 680049984, + "QualityScoreSum": 28815661398, + "TrimmedBases": 0 + }, + { + "ReadNumber": 2, + "Yield": 283355920, + "YieldQ30": 179655904, + "QualityScoreSum": 8324058259, + 
"TrimmedBases": 0 + } + ] } - ], - "ReadInfosForLanes":[ - { - "LaneNumber":1, - "ReadInfos":[ - { - "Number":1, - "NumCycles":28, - "IsIndexedRead":"false" - } - ] + } + ], + "ReadInfosForLanes": [ + { + "LaneNumber": 1, + "ReadInfos": [ + { + "Number": 1, + "NumCycles": 28, + "IsIndexedRead": "false" + } + ] + } + ], + "UnknownBarcodes": [ + { + "Lane": 1, + "Barcodes": { + "GGGGGGGG": 3203920, + "CCCTAACA": 290420 } - ], - "UnknownBarcodes":[ - { - "Lane":1, - "Barcodes":{ - "GGGGGGGG":3203920, - "CCCTAACA":290420 - } - }, - { - "Lane":2, - "Barcodes":{ - "GGGGGGGG":3075440, - "CCCTAACA":296260 - } + }, + { + "Lane": 2, + "Barcodes": { + "GGGGGGGG": 3075440, + "CCCTAACA": 296260 } - ] + } + ] } diff --git a/tests/data/lane.html b/tests/data/lane.html index 0079244b..435f6b29 100644 --- a/tests/data/lane.html +++ b/tests/data/lane.html @@ -1,85 +1,92 @@ - - - - - -

[tests/data/lane.html: the HTML markup of this hunk did not survive extraction, so only the report content is summarised here. The hunk re-indents the bcl2fastq lane report for flowcell H5YKFDSXY; the report values are identical on both sides of the diff: Flowcell Summary (Clusters (Raw) 15,320,088,576; Clusters (PF) 12,662,815,755; Yield 1,506,875 MBases) and a Lane Summary table for lanes 1-4 with PF clusters, yield, %>=Q30 bases and mean quality score per lane.]

diff --git a/tests/data/laneBarcode.html b/tests/data/laneBarcode.html
index 24d51031..089bd90f 100644
--- a/tests/data/laneBarcode.html
+++ b/tests/data/laneBarcode.html
@@ -1,96 +1,97 @@
[tests/data/laneBarcode.html: HTML markup lost in extraction; report values are identical on both sides of the diff. The hunk re-indents the bcl2fastq lane/barcode report for flowcell FCIDXX: Flowcell Summary (Clusters (Raw) 1,276,674,048; Clusters (PF) 959,057,323; Yield 114,128 MBases), a Lane Summary for lanes 1-2 of project N__One_20_01 / sample P12345_1001, and a Top Unknown Barcodes table.]

diff --git a/tests/data/lane_result.html b/tests/data/lane_result.html
index e94bfef0..b187c938 100644
--- a/tests/data/lane_result.html
+++ b/tests/data/lane_result.html
@@ -1,85 +1,92 @@
[tests/data/lane_result.html: HTML markup lost in extraction; report values are identical on both sides of the diff. The hunk re-indents the bcl2fastq lane report for flowcell C6L1WANXX: Flowcell Summary (Clusters (Raw) 15,320,088,576; Clusters (PF) 12,662,815,755; Yield 1,506,875 MBases) and a Lane Summary table for lanes 1-4.]
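A short sketch (not part of the patch) of the rsync option handling documented in taca/utils/transfer.py above; host, user and paths are placeholders, and the options mimic the rsync_options block of tests/data/taca_test_nanopore_cfg.yaml further down.

from taca.utils.transfer import RsyncAgent

agent = RsyncAgent(
    "data/nanopore_data/run4",  # src_path, placeholder
    dest_path="some_dir",
    remote_host="some_host",
    remote_user="some_user",
    validate=False,
    opts={"-LtDrv": None, "--chown": ":ngi2016003", "--exclude": "work"},
)

# format_options(): None values become bare flags, strings become opt=value pairs.
agent.format_options()  # ['-LtDrv', '--chown=:ngi2016003', '--exclude=work']

# remote_path(): [remote_user]@[remote_host]:[dest_path]
agent.remote_path()  # 'some_user@some_host:some_dir'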

+ diff --git a/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/still_running_and_should_not_be_moved b/tests/data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2/still_running_and_should_not_be_moved deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/final_summary.txt b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/final_summary.txt deleted file mode 100644 index 56e06611..00000000 --- a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/final_summary.txt +++ /dev/null @@ -1,17 +0,0 @@ -instrument=MN19414 -position= -flow_cell_id=AAU644 -sample_id=blah -protocol_group_id=blah -protocol=4a127386521a00415c821269a588a9271276dfd0 -protocol_run_id=5db4c5bc-34c9-452b-9d10-70e013228328 -acquisition_run_id=8b4541217a81f608d0562c0a0847b703ba77d13d -started=2020-08-03T15:05:12.504585+02:00 -acquisition_stopped=2020-08-04T09:05:16.104836+02:00 -processing_stopped=2020-08-04T09:05:17.311324+02:00 -basecalling_enabled=0 -sequencing_summary_file= -fast5_files_in_final_dest=42 -fast5_files_in_fallback=0 -fastq_files_in_final_dest=0 -fastq_files_in_fallback=0 diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/finished_sequencing_and_ready_to_analyse b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/finished_sequencing_and_ready_to_analyse deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md deleted file mode 100644 index bfad9e66..00000000 --- a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/report.md +++ /dev/null @@ -1,44 +0,0 @@ -Tracking ID -=========== - -{ - "asic_id": "755228278", - "asic_id_eeprom": "2866631", - "asic_temp": "33.002907", - "asic_version": "IA02D", - "auto_update": "0", - "auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", - "bream_is_standard": "0", - "device_id": "MN19414", - "device_type": "minion", - "distribution_status": "stable", - "distribution_version": "19.10.1", - "exp_script_name": "N/A", - "exp_script_purpose": "sequencing_run", - "exp_start_time": "2020-08-03T13:05:12Z", - "flongle_adapter_id": "FA-00577", - "flow_cell_id": "ACG995", - "flow_cell_product_code": "FLO-FLG001", - "guppy_version": "3.2.6+afc8e14", - "heatsink_temp": "34.574219", - "hostname": "ngi-squiggle", - "installation_type": "nc", - "local_firmware_file": "1", - "operating_system": "ubuntu 16.04", - "protocol_group_id": "blah", - "protocol_run_id": "", - "protocols_version": "4.2.11", - "run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", - "sample_id": "blahblah", - "usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", - "version": "3.5.5" -} - -Duty Time -========= - -ID: 8b4hdksolsjdfj020kpojrn3o239834akslash23409j39ruhqw39u - -Channel State,Experiment Time (minutes),State Time (samples), -strand,0,477917 -strand,1,1370562 diff --git 
a/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv deleted file mode 100644 index bc77384f..00000000 --- a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -sample,fastq,barcode,genome,transcriptome -P15608_1025,,0,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/final_summary.txt b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/final_summary.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/nanoseq_output/some_intermediary_results b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/nanoseq_output/some_intermediary_results deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_anglerfish b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_anglerfish deleted file mode 100644 index 573541ac..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_anglerfish +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq deleted file mode 100644 index 573541ac..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv deleted file mode 100644 index 9842c8d4..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -group,replicate,barcode,input_file,genome,transcriptome -P15608_1005,1,3,,, -P15608_1025,1,8,,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141922/anglerfish_stats.txt b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141922/anglerfish_stats.txt deleted file mode 100644 index e69de29b..00000000 diff --git 
a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141923/.file b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output/anglerfish_2020_09_23_141923/.file deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_sample_sheet.csv b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_sample_sheet.csv deleted file mode 100644 index 069eff1f..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,truseq,ACAGTG,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode03.fastq.gz -P15608_1025,truseq,ACTGAT,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode08.fastq.gz diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/expected_sample_sheet.csv b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/expected_sample_sheet.csv deleted file mode 100644 index d5b94506..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/expected_sample_sheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq,barcode,genome,transcriptome -P15608_1005,,3,, -P15608_1025,,0,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/final_summary.txt b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/final_summary.txt deleted file mode 100644 index 56e06611..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/final_summary.txt +++ /dev/null @@ -1,17 +0,0 @@ -instrument=MN19414 -position= -flow_cell_id=AAU644 -sample_id=blah -protocol_group_id=blah -protocol=4a127386521a00415c821269a588a9271276dfd0 -protocol_run_id=5db4c5bc-34c9-452b-9d10-70e013228328 -acquisition_run_id=8b4541217a81f608d0562c0a0847b703ba77d13d -started=2020-08-03T15:05:12.504585+02:00 -acquisition_stopped=2020-08-04T09:05:16.104836+02:00 -processing_stopped=2020-08-04T09:05:17.311324+02:00 -basecalling_enabled=0 -sequencing_summary_file= -fast5_files_in_final_dest=42 -fast5_files_in_fallback=0 -fastq_files_in_final_dest=0 -fastq_files_in_fallback=0 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/final_results b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/final_results deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md deleted file mode 100644 index bfad9e66..00000000 --- a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/report.md +++ /dev/null @@ -1,44 +0,0 @@ -Tracking ID -=========== - -{ - "asic_id": "755228278", - "asic_id_eeprom": "2866631", - "asic_temp": "33.002907", - "asic_version": "IA02D", - "auto_update": "0", - "auto_update_source": "https://mirror.oxfordnanoportal.com/software/MinKNOW/", - "bream_is_standard": "0", - "device_id": "MN19414", - "device_type": "minion", - 
"distribution_status": "stable", - "distribution_version": "19.10.1", - "exp_script_name": "N/A", - "exp_script_purpose": "sequencing_run", - "exp_start_time": "2020-08-03T13:05:12Z", - "flongle_adapter_id": "FA-00577", - "flow_cell_id": "ACG995", - "flow_cell_product_code": "FLO-FLG001", - "guppy_version": "3.2.6+afc8e14", - "heatsink_temp": "34.574219", - "hostname": "ngi-squiggle", - "installation_type": "nc", - "local_firmware_file": "1", - "operating_system": "ubuntu 16.04", - "protocol_group_id": "blah", - "protocol_run_id": "", - "protocols_version": "4.2.11", - "run_id": "8b4541218d0562c0a02857ws9dh983eqwba77d13d", - "sample_id": "blahblah", - "usb_config": "MinION_fx3_1.1.1_ONT#MinION_fpga_1.1.0#bulk#Auto", - "version": "3.5.5" -} - -Duty Time -========= - -ID: 8b4hdksolsjdfj020kpojrn3o239834akslash23409j39ruhqw39u - -Channel State,Experiment Time (minutes),State Time (samples), -strand,0,477917 -strand,1,1370562 diff --git a/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/final_summary.txt b/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/final_summary.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq deleted file mode 100644 index d00491fd..00000000 --- a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq +++ /dev/null @@ -1 +0,0 @@ -1 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv deleted file mode 100644 index fd0b7b5b..00000000 --- a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -sample,fastq,barcode,genome,transcriptome -P15608_1005,,13,, \ No newline at end of file diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/final_summary.txt b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/final_summary.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/nanoseq_output/failed_output b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/nanoseq_output/failed_output deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/squigglefile.fast5 b/tests/data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/squigglefile.fast5 deleted file mode 100644 index e69de29b..00000000 diff --git 
a/tests/data/nanopore_data/transfer.tsv b/tests/data/nanopore_data/transfer.tsv deleted file mode 100644 index 429a49f1..00000000 --- a/tests/data/nanopore_data/transfer.tsv +++ /dev/null @@ -1,2 +0,0 @@ -20200105_1412_MN19414_AAU645_68125dc2 20200105 -20200106_1412_MN19414_AAU646_68125dc2 20200106 diff --git a/tests/data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv b/tests/data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv deleted file mode 100644 index 3bbf5719..00000000 --- a/tests/data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,CCTGGTAACTGGGACACAAGACTC,truseq,ACAGTG -P15608_1025,ACGTAACTTGGTTTGTTCCCTGAA,truseq,ACTGAT \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv b/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv deleted file mode 100644 index 6c6e8ad0..00000000 --- a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,CCTGGTAACTGGGACACAAGACTC,truseq,ACAGTG -P15608_1025,CCTGGTAACTGGGACACAAGACTC,truseq,ACTGAT \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU645_Samplesheet_24-594126.csv b/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU645_Samplesheet_24-594126.csv deleted file mode 100644 index 14cb73ae..00000000 --- a/tests/data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU645_Samplesheet_24-594126.csv +++ /dev/null @@ -1,74 +0,0 @@ -P15051_103_3,ACAGACGACTACAAACGGAATCGA,truseq,AGCTGGAT -P15051_102_1,ACAGACGACTACAAACGGAATCGA,truseq,ATAGTTAC -P15051_101_4,ACAGACGACTACAAACGGAATCGA,truseq,ATTGGCGT -P15051_103_1,ACAGACGACTACAAACGGAATCGA,truseq,CATGAACA -P15051_102_3,ACAGACGACTACAAACGGAATCGA,truseq,CCTACGTA -P15051_101_3,ACAGACGACTACAAACGGAATCGA,truseq,CGCCATCG -P15051_102_4,ACAGACGACTACAAACGGAATCGA,truseq,GAGCACCG -P15051_101_1,ACAGACGACTACAAACGGAATCGA,truseq,GCAACAAA -P15051_103_4,ACAGACGACTACAAACGGAATCGA,truseq,GTGACTTG -P15051_101_2,ACAGACGACTACAAACGGAATCGA,truseq,TAGTTGTC -P15051_103_2,ACAGACGACTACAAACGGAATCGA,truseq,TCACTCGC -P15051_102_2,ACAGACGACTACAAACGGAATCGA,truseq,TGCTGAGT -P15608_1005,CCTGGTAACTGGGACACAAGACTC,truseq,ACAGTG -P15608_1025,CCTGGTAACTGGGACACAAGACTC,truseq,ACTGAT -P15608_1008,CCTGGTAACTGGGACACAAGACTC,truseq,ACTTGA -P15608_1013,CCTGGTAACTGGGACACAAGACTC,truseq,AGTCAA -P15608_1014,CCTGGTAACTGGGACACAAGACTC,truseq,AGTTCC -P15608_1001,CCTGGTAACTGGGACACAAGACTC,truseq,ATCACG -P15608_1026,CCTGGTAACTGGGACACAAGACTC,truseq,ATGAGC -P15608_1015,CCTGGTAACTGGGACACAAGACTC,truseq,ATGTCA -P15608_1027,CCTGGTAACTGGGACACAAGACTC,truseq,ATTCCT -P15608_1028,CCTGGTAACTGGGACACAAGACTC,truseq,CAAAAG -P15608_1029,CCTGGTAACTGGGACACAAGACTC,truseq,CAACTA -P15608_1030,CCTGGTAACTGGGACACAAGACTC,truseq,CACCGG -P15608_1031,CCTGGTAACTGGGACACAAGACTC,truseq,CACGAT -P15608_1032,CCTGGTAACTGGGACACAAGACTC,truseq,CACTCA -P15608_1007,CCTGGTAACTGGGACACAAGACTC,truseq,CAGATC -P15608_1033,CCTGGTAACTGGGACACAAGACTC,truseq,CAGGCG -P15608_1034,CCTGGTAACTGGGACACAAGACTC,truseq,CATGGC -P15608_1035,CCTGGTAACTGGGACACAAGACTC,truseq,CATTTT -P15608_1036,CCTGGTAACTGGGACACAAGACTC,truseq,CCAACA -P15608_1016,CCTGGTAACTGGGACACAAGACTC,truseq,CCGTCC -P15608_1002,CCTGGTAACTGGGACACAAGACTC,truseq,CGATGT -P15608_1037,CCTGGTAACTGGGACACAAGACTC,truseq,CGGAAT -P15608_1022,CCTGGTAACTGGGACACAAGACTC,truseq,CGTACG 
-P15608_1038,CCTGGTAACTGGGACACAAGACTC,truseq,CTAGCT -P15608_1012,CCTGGTAACTGGGACACAAGACTC,truseq,CTTGTA -P15608_1023,CCTGGTAACTGGGACACAAGACTC,truseq,GAGTGG -P15608_1009,CCTGGTAACTGGGACACAAGACTC,truseq,GATCAG -P15608_1006,CCTGGTAACTGGGACACAAGACTC,truseq,GCCAAT -P15608_1011,CCTGGTAACTGGGACACAAGACTC,truseq,GGCTAC -P15608_1024,CCTGGTAACTGGGACACAAGACTC,truseq,GGTAGC -P15608_1017,CCTGGTAACTGGGACACAAGACTC,truseq,GTAGAG -P15608_1018,CCTGGTAACTGGGACACAAGACTC,truseq,GTCCGC -P15608_1019,CCTGGTAACTGGGACACAAGACTC,truseq,GTGAAA -P15608_1020,CCTGGTAACTGGGACACAAGACTC,truseq,GTGGCC -P15608_1021,CCTGGTAACTGGGACACAAGACTC,truseq,GTTTCG -P15608_1010,CCTGGTAACTGGGACACAAGACTC,truseq,TAGCTT -P15608_1004,CCTGGTAACTGGGACACAAGACTC,truseq,TGACCA -P15608_1003,CCTGGTAACTGGGACACAAGACTC,truseq,TTAGGC -P15951_1007,AAGGTTACACAAACCCTGGACAAG,truseq_dual,AAGAGGCA-TGATGAAA -P15951_1003,AAGGTTACACAAACCCTGGACAAG,truseq_dual,AGGCAGAA-TATCCTCT -P15951_1004,AAGGTTACACAAACCCTGGACAAG,truseq_dual,CAGAGAGG-CTAAGCCT -P15951_1006,AAGGTTACACAAACCCTGGACAAG,truseq_dual,CGAGGCTG-AACATGAT -P15951_1002,AAGGTTACACAAACCCTGGACAAG,truseq_dual,CGTACTAG-CTCTCTAT -P15951_1005,AAGGTTACACAAACCCTGGACAAG,truseq_dual,GCTACGCT-TGGAAATC -P15951_1008,AAGGTTACACAAACCCTGGACAAG,truseq_dual,GTAGAGGA-GTCGGACT -P15951_1001,AAGGTTACACAAACCCTGGACAAG,truseq_dual,TAAGGCGA-TAGATCGC -P15951_1009,AAGGTTACACAAACCCTGGACAAG,truseq_dual,TGGATCTG-TTTCTAGC -P14604_101,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,AAGAGGCA-CTAAGCCT -P14604_106,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,AGGCAGAA-CGTCTAAT -P14604_105,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,CGTACTAG-CGTCTAAT -P14604_103,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,GCTCATGA-CTAAGCCT -P14604_108,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,GGACTCCT-CGTCTAAT -P14604_102,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,GTAGAGGA-CTAAGCCT -P14604_104,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,TAAGGCGA-CGTCTAAT -P14604_109,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,TAGGCATG-CGTCTAAT -P14604_107,GACTACTTTCTGCCTTTGCGAGAA,truseq_dual,TCCTGAGC-CGTCTAAT -P15759_1001,AAGGATTCATTCCCACGGTAACAC,truseq, -P14654_169,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-AGGCGAAG -P14654_121,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-ATAGAGGC -P14654_137,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-CCTATCCT -P14654_153,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-GGCTCTGA -P14654_185,ACGTAACTTGGTTTGTTCCCTGAA,truseq_dual,TCCGGAGA-TAATCTTA \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv b/tests/data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv deleted file mode 100644 index 9842c8d4..00000000 --- a/tests/data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -group,replicate,barcode,input_file,genome,transcriptome -P15608_1005,1,3,,, -P15608_1025,1,8,,, \ No newline at end of file diff --git a/tests/data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv b/tests/data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv deleted file mode 100644 index 069eff1f..00000000 --- a/tests/data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv +++ /dev/null @@ -1,2 +0,0 @@ -P15608_1005,truseq,ACAGTG,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode03.fastq.gz -P15608_1025,truseq,ACTGAT,data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/nanoseq_output/guppy/fastq/barcode08.fastq.gz diff --git a/tests/data/taca_test_cfg.yaml b/tests/data/taca_test_cfg.yaml index 34437461..6bb3e657 
100644 --- a/tests/data/taca_test_cfg.yaml +++ b/tests/data/taca_test_cfg.yaml @@ -14,77 +14,77 @@ statusdb: xten_db: x_flowcells analysis: - status_dir: data/ - data_dir: data/test_data/ - deliver_runfolder: - analysis_server: - host: b5.biotech.kth.se - port: - user: sara.sjunnebo - destination: test - mfs_path: - miseq: data/ - hiseqx: data/ - novaseq: data/ - NovaSeq: - bcl2fastq: - bin: path_to_bcl_to_fastq - tenX_index_path: "data/test_10X_indexes" - smartseq_index_path: "data/test_smartseq_indexes" - options: - common: - - output-dir: Demultiplexing - - opt: b - - c - SMARTSEQ: - - d - 10X_SINGLE: - - a - 10X_DUAL: - - e - samplesheets_dir: "data" - analysis_server: - host: - port: - user: - sync: - data_archive: - include: - - "*.file" - MiSeq: - bcl2fastq: - bin: path_to_bcl_to_fastq - options: - common: - - output-dir: Demultiplexing - samplesheets_dir: "data" - analysis_server: - host: - port: - user: - sync: - data_archive: - include: - - "*.file" - NextSeq: - samplesheets_dir: "data" - bcl2fastq: - bin: path_to_bcl_to_fastq - tenX_index_path: "data/test_10X_indexes" - smartseq_index_path: "data/test_smartseq_indexes" - options: - common: - - output-dir: Demultiplexing - analysis_server: - host: - port: - user: - sync: - data_archive: - include: - - "*.file" - DummySeq: - samplesheets_dir: "data" + status_dir: data/ + data_dir: data/test_data/ + deliver_runfolder: + analysis_server: + host: b5.biotech.kth.se + port: + user: sara.sjunnebo + destination: test + mfs_path: + miseq: data/ + hiseqx: data/ + novaseq: data/ + NovaSeq: + bcl2fastq: + bin: path_to_bcl_to_fastq + tenX_index_path: "data/test_10X_indexes" + smartseq_index_path: "data/test_smartseq_indexes" + options: + common: + - output-dir: Demultiplexing + - opt: b + - c + SMARTSEQ: + - d + 10X_SINGLE: + - a + 10X_DUAL: + - e + samplesheets_dir: "data" + analysis_server: + host: + port: + user: + sync: + data_archive: + include: + - "*.file" + MiSeq: + bcl2fastq: + bin: path_to_bcl_to_fastq + options: + common: + - output-dir: Demultiplexing + samplesheets_dir: "data" + analysis_server: + host: + port: + user: + sync: + data_archive: + include: + - "*.file" + NextSeq: + samplesheets_dir: "data" + bcl2fastq: + bin: path_to_bcl_to_fastq + tenX_index_path: "data/test_10X_indexes" + smartseq_index_path: "data/test_smartseq_indexes" + options: + common: + - output-dir: Demultiplexing + analysis_server: + host: + port: + user: + sync: + data_archive: + include: + - "*.file" + DummySeq: + samplesheets_dir: "data" bioinfo_tab: data_dirs: @@ -92,5 +92,4 @@ bioinfo_tab: xten_samplesheets: "data" mail: - recipients: - some_user@some_email.com + recipients: some_user@some_email.com diff --git a/tests/data/taca_test_cfg_backup.yaml b/tests/data/taca_test_cfg_backup.yaml index e9c0662e..f66f5528 100644 --- a/tests/data/taca_test_cfg_backup.yaml +++ b/tests/data/taca_test_cfg_backup.yaml @@ -1,12 +1,12 @@ backup: - archive_dirs: - miseq: data/nas/miseq.lab/nosync - hiseq: blah - data_dirs: - miseq: data/nas/miseq.lab - keys_path: data/nas/run_keys - gpg_receiver: some.user - check_demux: True + archive_dirs: + miseq: data/nas/miseq.lab/nosync + hiseq: blah + data_dirs: + miseq: data/nas/miseq.lab + keys_path: data/nas/run_keys + gpg_receiver: some.user + check_demux: True statusdb: url: url @@ -15,5 +15,4 @@ statusdb: xten_db: x_flowcells mail: - recipients: - some_user@some_email.com + recipients: some_user@some_email.com diff --git a/tests/data/taca_test_cfg_cleanup.yaml b/tests/data/taca_test_cfg_cleanup.yaml index 
2a7d9b92..7610a91d 100644 --- a/tests/data/taca_test_cfg_cleanup.yaml +++ b/tests/data/taca_test_cfg_cleanup.yaml @@ -14,20 +14,19 @@ storage: HiSeq: data/test_data/nosync cleanup: - miarka: - flowcell: - root: - - "data/miarka/incoming" - - "data/miarka/archive" - relative_project_source: Demultiplexing - undet_file_pattern: "*Undetermined_*.fastq.gz" - data_dir: "data/miarka/nobackup/NGI/DATA" - analysis: - root: "../../nobackup/NGI/ANALYSIS" - files_to_remove: - piper_ngi: - - "*.bam" + miarka: + flowcell: + root: + - "data/miarka/incoming" + - "data/miarka/archive" + relative_project_source: Demultiplexing + undet_file_pattern: "*Undetermined_*.fastq.gz" + data_dir: "data/miarka/nobackup/NGI/DATA" + analysis: + root: "../../nobackup/NGI/ANALYSIS" + files_to_remove: + piper_ngi: + - "*.bam" mail: - recipients: - some_user@some_email.com + recipients: some_user@some_email.com diff --git a/tests/data/taca_test_nanopore_cfg.yaml b/tests/data/taca_test_nanopore_cfg.yaml index df34c652..c5c32cc4 100644 --- a/tests/data/taca_test_nanopore_cfg.yaml +++ b/tests/data/taca_test_nanopore_cfg.yaml @@ -1,13 +1,13 @@ log: - file: "data/taca.log" + file: "data/taca.log" nanopore_analysis: minion_qc_run: nanoseq_version: 2.0.1 data_dir: data/nanopore_data/ ignore_dirs: - - 'nosync' - - '.nextflow' + - "nosync" + - ".nextflow" samplesheets_dir: data/nanopore_samplesheets lims_results_dir: some/dir transfer: @@ -18,11 +18,11 @@ nanopore_analysis: user: some_user destination: some_dir rsync_options: - '-LtDrv': None - '--chown': ':ngi2016003' - '--chmod' : 'Dg+s,g+rw' - '-r' : None - '--exclude' : 'work' + "-LtDrv": None + "--chown": ":ngi2016003" + "--chmod": "Dg+s,g+rw" + "-r": None + "--exclude": "work" finished_dir: data/nanopore_data/nosync mail: diff --git a/tests/test_analysis.py b/tests/test_analysis.py index c3150f1d..114f8316 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -1,15 +1,16 @@ #!/usr/bin/env python +import json import os -import tempfile import shutil -import json +import tempfile import unittest -import mock +from unittest import mock from taca.analysis import analysis as an from taca.utils import config -CONFIG = config.load_yaml_config('data/taca_test_cfg.yaml') +CONFIG = config.load_yaml_config("data/taca_test_cfg.yaml") + class TestAnalysis(unittest.TestCase): """Tests for the Analysis functions.""" @@ -28,22 +29,36 @@ def setUpClass(self): | |__ RTAComplete.txt | |__ SampleSheet.csv """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') - self.completed = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") + self.completed = os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX") # Create runs directory structure os.makedirs(self.tmp_dir) - os.makedirs(os.path.join(self.completed, 'Demultiplexing', 'Stats')) + os.makedirs(os.path.join(self.completed, "Demultiplexing", "Stats")) # Set up files - open(os.path.join(self.completed, 'RTAComplete.txt'), 'w').close() - shutil.copy('data/samplesheet.csv', os.path.join(self.completed, 'SampleSheet.csv')) - open(os.path.join(self.completed, 'Demultiplexing', 'Stats', 'DemultiplexingStats.xml'), 'w').close() - open(os.path.join(self.completed, 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz'), 'w').close() - with open(os.path.join(self.completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w') as stats_json: - json.dump({'silly': 1}, stats_json) - shutil.copy('data/RunInfo.xml', self.completed) - shutil.copy('data/runParameters.xml', 
self.completed) + open(os.path.join(self.completed, "RTAComplete.txt"), "w").close() + shutil.copy( + "data/samplesheet.csv", os.path.join(self.completed, "SampleSheet.csv") + ) + open( + os.path.join( + self.completed, "Demultiplexing", "Stats", "DemultiplexingStats.xml" + ), + "w", + ).close() + open( + os.path.join( + self.completed, "Demultiplexing", "Undetermined_S0_L001_R1_001.fastq.gz" + ), + "w", + ).close() + with open( + os.path.join(self.completed, "Demultiplexing", "Stats", "Stats.json"), "w" + ) as stats_json: + json.dump({"silly": 1}, stats_json) + shutil.copy("data/RunInfo.xml", self.completed) + shutil.copy("data/runParameters.xml", self.completed) @classmethod def tearDownClass(self): @@ -51,127 +66,159 @@ def tearDownClass(self): def test_get_runObj_miseq(self): """Return MiSeq run object.""" - miseq_run = os.path.join(self.tmp_dir, '141124_ST-MISEQ1_01_AFCIDXX') + miseq_run = os.path.join(self.tmp_dir, "141124_ST-MISEQ1_01_AFCIDXX") os.mkdir(miseq_run) - shutil.copy('data/runParameters_miseq.xml', os.path.join(miseq_run, 'runParameters.xml')) + shutil.copy( + "data/runParameters_miseq.xml", os.path.join(miseq_run, "runParameters.xml") + ) got_miseq_run = an.get_runObj(miseq_run) - self.assertEqual(got_miseq_run.sequencer_type, 'MiSeq') + self.assertEqual(got_miseq_run.sequencer_type, "MiSeq") def test_get_runObj_nextseq(self): """Return NextSeq run object.""" - nextseq_run = os.path.join(self.tmp_dir, '141124_ST-NEXTSEQ1_01_AFCIDXX') + nextseq_run = os.path.join(self.tmp_dir, "141124_ST-NEXTSEQ1_01_AFCIDXX") os.mkdir(nextseq_run) - shutil.copy('data/runParameters_nextseq.xml', os.path.join(nextseq_run, 'runParameters.xml')) + shutil.copy( + "data/runParameters_nextseq.xml", + os.path.join(nextseq_run, "runParameters.xml"), + ) got_nextseq_run = an.get_runObj(nextseq_run) - self.assertEqual(got_nextseq_run.sequencer_type, 'NextSeq') + self.assertEqual(got_nextseq_run.sequencer_type, "NextSeq") def test_get_runObj_novaseq(self): """Return NovaSeq run object.""" - novaseq_run = os.path.join(self.tmp_dir, '141124_ST-NOVASEQ1_01_AFCIDXX') + novaseq_run = os.path.join(self.tmp_dir, "141124_ST-NOVASEQ1_01_AFCIDXX") os.mkdir(novaseq_run) - shutil.copy('data/runParameters_novaseq.xml', os.path.join(novaseq_run, 'RunParameters.xml')) + shutil.copy( + "data/runParameters_novaseq.xml", + os.path.join(novaseq_run, "RunParameters.xml"), + ) got_novaseq_run = an.get_runObj(novaseq_run) - self.assertEqual(got_novaseq_run.sequencer_type, 'NovaSeq') + self.assertEqual(got_novaseq_run.sequencer_type, "NovaSeq") - @mock.patch('taca.analysis.analysis.get_runObj') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') + @mock.patch("taca.analysis.analysis.get_runObj") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") def test_upload_to_statusdb(self, mock_upload_to_statusdb, mock_get_runobj): """Get run object and initiate upload to statusdb.""" - mock_get_runobj.return_value = 'Standard_run_object' + mock_get_runobj.return_value = "Standard_run_object" an.upload_to_statusdb(self.completed) - mock_upload_to_statusdb.assert_called_once_with('Stan_run_object') + mock_upload_to_statusdb.assert_called_once_with("Stan_run_object") - @mock.patch('taca.analysis.analysis.statusdb') + @mock.patch("taca.analysis.analysis.statusdb") def test__upload_to_statusdb(self, mock_statusdb): """Upload to statusdb.""" - run = os.path.join(self.tmp_dir, '141124_ST-NOINDEX1_01_AFCIDYX') + run = os.path.join(self.tmp_dir, "141124_ST-NOINDEX1_01_AFCIDYX") os.mkdir(run) - 
shutil.copy('data/runParameters_minimal.xml', os.path.join(run, 'runParameters.xml')) - demux_dir = os.path.join(run, 'Demultiplexing', 'Stats') + shutil.copy( + "data/runParameters_minimal.xml", os.path.join(run, "runParameters.xml") + ) + demux_dir = os.path.join(run, "Demultiplexing", "Stats") os.makedirs(demux_dir) - shutil.copy('data/DemuxSummaryF1L1.txt', demux_dir) - reports_dir = os.path.join(run, 'Demultiplexing', 'Reports', 'html', 'FCIDYX', 'all', 'all', 'all') + shutil.copy("data/DemuxSummaryF1L1.txt", demux_dir) + reports_dir = os.path.join( + run, "Demultiplexing", "Reports", "html", "FCIDYX", "all", "all", "all" + ) os.makedirs(reports_dir) - shutil.copy('data/laneBarcode.html', (reports_dir)) - shutil.copy('data/lane.html', (reports_dir)) + shutil.copy("data/laneBarcode.html", (reports_dir)) + shutil.copy("data/lane.html", (reports_dir)) noindex_run = an.get_runObj(run) an._upload_to_statusdb(noindex_run) mock_statusdb.update_doc.assert_called_once() - @mock.patch('taca.analysis.analysis.Standard_Run.transfer_run') + @mock.patch("taca.analysis.analysis.Standard_Run.transfer_run") def test_transfer_run(self, mock_transfer_run): """Transfer run to Uppmax.""" - run_dir = (self.completed) + run_dir = self.completed an.transfer_run(run_dir) - mock_transfer_run.assert_called_once_with('nosync/data/transfer.tsv', 'some_user@some_email.com') - - @mock.patch('taca.analysis.analysis.RsyncAgent.transfer') - @mock.patch('taca.analysis.analysis.subprocess.call') - @mock.patch('taca.analysis.analysis.os.remove') - @mock.patch('taca.analysis.analysis.open') - def test_transfer_runfolder(self, mock_open, mock_remove, mock_subprocess_call, mock_transfer): + mock_transfer_run.assert_called_once_with( + "nosync/data/transfer.tsv", "some_user@some_email.com" + ) + + @mock.patch("taca.analysis.analysis.RsyncAgent.transfer") + @mock.patch("taca.analysis.analysis.subprocess.call") + @mock.patch("taca.analysis.analysis.os.remove") + @mock.patch("taca.analysis.analysis.open") + def test_transfer_runfolder( + self, mock_open, mock_remove, mock_subprocess_call, mock_transfer + ): """Transfer runfolder to uppmax.""" - run_dir = (self.completed) - pid = 'P1775' - exclude_lane = '' + run_dir = self.completed + pid = "P1775" + exclude_lane = "" an.transfer_runfolder(run_dir, pid, exclude_lane) mock_subprocess_call.assert_called() mock_transfer.assert_called() def test_extract_project_samplesheet(self): """Extract project specific lines from sample sheet.""" - sample_sheet = 'data/samplesheet.csv' - pid = 'P1775' + sample_sheet = "data/samplesheet.csv" + pid = "P1775" samplesheet_content = an.extract_project_samplesheet(sample_sheet, pid) expected_samplesheet_content = """Lane,SampleID,SampleName,SamplePlate,SampleWell,index,Project 1,Sample_P1775_147,P1775_147,FCB_150423,1:1,GAATTCGT,J_Lundeberg_14_24 """ self.assertEqual(samplesheet_content, expected_samplesheet_content) - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - def test_run_preprocessing_sequencing(self, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + def test_run_preprocessing_sequencing( + self, mock_upload_to_statusdb, mock_get_run_status + ): """Run preprocess run still sequencing.""" run = self.completed - mock_get_run_status.return_value = 'SEQUENCING' + mock_get_run_status.return_value = "SEQUENCING" an.run_preprocessing(run, 
force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.demultiplex_run') - def test_run_preprocessing_to_start(self, mock_demultiplex_run, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.demultiplex_run") + def test_run_preprocessing_to_start( + self, mock_demultiplex_run, mock_upload_to_statusdb, mock_get_run_status + ): """Run preprocessing start demux.""" run = self.completed - mock_get_run_status.return_value = 'TO_START' + mock_get_run_status.return_value = "TO_START" an.run_preprocessing(run, force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() mock_demultiplex_run.assert_called_once() - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.check_run_status') - def test_run_preprocessing_in_progress(self, mock_check_run_status, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.check_run_status") + def test_run_preprocessing_in_progress( + self, mock_check_run_status, mock_upload_to_statusdb, mock_get_run_status + ): """Run preprocessing demux in progress.""" run = self.completed - mock_get_run_status.return_value = 'IN_PROGRESS' + mock_get_run_status.return_value = "IN_PROGRESS" an.run_preprocessing(run, force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() mock_check_run_status.assert_called_once() - @mock.patch('taca.analysis.analysis.NovaSeq_Run.get_run_status') - @mock.patch('taca.analysis.analysis._upload_to_statusdb') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.send_mail') - @mock.patch('taca.analysis.analysis.NovaSeq_Run.transfer_run') - @mock.patch('taca.analysis.analysis.os.mkdir') - @mock.patch('taca.analysis.analysis.copyfile') - def test_run_preprocessing_completed(self, mock_copy, mock_mkdir, mock_transfer_run, mock_send_mail, mock_upload_to_statusdb, mock_get_run_status): + @mock.patch("taca.analysis.analysis.NovaSeq_Run.get_run_status") + @mock.patch("taca.analysis.analysis._upload_to_statusdb") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.send_mail") + @mock.patch("taca.analysis.analysis.NovaSeq_Run.transfer_run") + @mock.patch("taca.analysis.analysis.os.mkdir") + @mock.patch("taca.analysis.analysis.copyfile") + def test_run_preprocessing_completed( + self, + mock_copy, + mock_mkdir, + mock_transfer_run, + mock_send_mail, + mock_upload_to_statusdb, + mock_get_run_status, + ): """Run preprocessing demux completed.""" run = self.completed - mock_get_run_status.return_value = 'COMPLETED' + mock_get_run_status.return_value = "COMPLETED" an.run_preprocessing(run, force_trasfer=True, statusdb=True) mock_upload_to_statusdb.assert_called_once() - message = 'The run 141124_ST-COMPLETED1_01_AFCIDXX has been demultiplexed.\n The Run will be transferred to the analysis cluster for further analysis.\n\n \ - The run is available at : https://genomics-status.scilifelab.se/flowcells/141124_ST-COMPLETED1_01_AFCIDXX\n\n ' - 
mock_send_mail.assert_called_once_with(message, rcp='some_user@some_email.com') - mock_transfer_run.assert_called_once_with('data/transfer.tsv', 'some_user@some_email.com') + message = "The run 141124_ST-COMPLETED1_01_AFCIDXX has been demultiplexed.\n The Run will be transferred to the analysis cluster for further analysis.\n\n \ + The run is available at : https://genomics-status.scilifelab.se/flowcells/141124_ST-COMPLETED1_01_AFCIDXX\n\n " + mock_send_mail.assert_called_once_with(message, rcp="some_user@some_email.com") + mock_transfer_run.assert_called_once_with( + "data/transfer.tsv", "some_user@some_email.com" + ) diff --git a/tests/test_analysis_nanopore.py b/tests/test_analysis_nanopore.py deleted file mode 100644 index 1b3158dc..00000000 --- a/tests/test_analysis_nanopore.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python -import unittest -import logging -import mock -import os - -from taca.analysis.analysis_nanopore import * -from taca.nanopore.minion import MinIONqc -from taca.utils import config as conf - - -CONFIG = conf.load_yaml_config('data/taca_test_nanopore_cfg.yaml') - -class TestNanoporeAnalysis(unittest.TestCase): - def test_find_runs_to_process(self): - """Find all expected nanopore runs to process.""" - expected_dirs = ["data/nanopore_data/run1/still_sequencing/20200101_1412_MN19414_AAU641_68125dc2", - "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2", - "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2", - "data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2", - "data/nanopore_data/run7/done_no_sample_sheet/20200107_1412_MN19417_AAU645_68125dc2", - "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2"] - nanopore_data_dir = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('data_dir') - skip_dirs = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('ignore_dirs') - found_dirs = find_minion_runs(nanopore_data_dir, skip_dirs) - self.assertEqual(sorted(found_dirs), sorted(expected_dirs)) - - @mock.patch('taca.analysis.analysis_nanopore.os.path.isfile') - @mock.patch('taca.nanopore.minion.MinIONqc.start_nanoseq') - def test_process_minion_run_start_analysis(self, mock_start, mock_isfile): - """Start nanoseq analysis for minion.""" - nanoseq_sample_sheet = 'data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2/SQK-LSK109_sample_sheet.csv' - anglerfish_sample_sheet = 'some/path' - mock_isfile.return_value = True - run_dir = 'data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2' - minion_run = MinIONqc(run_dir, nanoseq_sample_sheet, anglerfish_sample_sheet) - process_minion_qc_run(minion_run) - mock_start.assert_called_once() - - @mock.patch('taca.nanopore.minion.MinIONqc.copy_results_for_lims') - @mock.patch('taca.nanopore.minion.Nanopore.transfer_run') - @mock.patch('taca.nanopore.minion.Nanopore.update_transfer_log') - @mock.patch('taca.nanopore.minion.Nanopore.archive_run') - @mock.patch('taca.analysis.analysis_nanopore.send_mail') - def test_process_minion_run_transfer(self, mock_mail, mock_archive, mock_update, mock_transfer, mock_cp): - """Start transfer of run directory.""" - mock_transfer.return_value = True - mock_cp.return_value = True - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - minion_run = MinIONqc(run_dir, 'dummy/path', None) - email_subject = ('Run successfully processed: 20200104_1412_MN19414_AAU644_68125dc2') - email_message = 'Run 
20200104_1412_MN19414_AAU644_68125dc2 has been analysed, transferred and archived successfully.' - email_recipients = 'test@test.com' - process_minion_qc_run(minion_run) - expected_calls = [mock.call('Anglerfish successfully processed run 20200104_1412_MN19414_AAU644_68125dc2', - 'Anglerfish has successfully finished for run 20200104_1412_MN19414_AAU644_68125dc2. Please finish the QC step in lims.', - 'test@test.com'), - mock.call('Run successfully processed: 20200104_1412_MN19414_AAU644_68125dc2', - 'Run 20200104_1412_MN19414_AAU644_68125dc2 has been analysed, transferred and archived successfully.', - 'test@test.com')] - mock_mail.assert_has_calls(expected_calls) - - @mock.patch('taca.analysis.analysis_nanopore.send_mail') - def test_process_minion_run_fail_analysis(self, mock_mail): - """Send email to operator if nanoseq analysis failed.""" - run_dir = 'data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2' - minion_run = MinIONqc(run_dir, None, None) - minion_run.qc_run = True - process_minion_qc_run(minion_run) - email_subject = ('Analysis failed for run 20200108_1412_MN19414_AAU648_68125dc2') - email_message = 'The nanoseq analysis failed for run {}.'.format(minion_run.run_id) - email_recipients = 'test@test.com' - mock_mail.assert_called_once_with(email_subject, email_message, email_recipients) diff --git a/tests/test_backup.py b/tests/test_backup.py index c170f79f..204761e9 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -1,14 +1,14 @@ #!/usr/bin/env python -import unittest -import mock -import tempfile import os import shutil +import tempfile +import unittest +from unittest import mock from taca.backup import backup from taca.utils import config as conf -CONFIG = conf.load_yaml_config('data/taca_test_cfg_backup.yaml') +CONFIG = conf.load_yaml_config("data/taca_test_cfg_backup.yaml") class TestRunVars(unittest.TestCase): @@ -16,12 +16,18 @@ class TestRunVars(unittest.TestCase): def test_backup_variables(self): """Set up backup variables.""" - run_variables = backup.run_vars('data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(run_variables.name, '190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(run_variables.zip, '190201_A00621_0032_BHHFCFDSXX.tar.gz') - self.assertEqual(run_variables.key, '190201_A00621_0032_BHHFCFDSXX.key') - self.assertEqual(run_variables.key_encrypted, '190201_A00621_0032_BHHFCFDSXX.key.gpg') - self.assertEqual(run_variables.zip_encrypted, '190201_A00621_0032_BHHFCFDSXX.tar.gz.gpg') + run_variables = backup.run_vars( + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX" + ) + self.assertEqual(run_variables.name, "190201_A00621_0032_BHHFCFDSXX") + self.assertEqual(run_variables.zip, "190201_A00621_0032_BHHFCFDSXX.tar.gz") + self.assertEqual(run_variables.key, "190201_A00621_0032_BHHFCFDSXX.key") + self.assertEqual( + run_variables.key_encrypted, "190201_A00621_0032_BHHFCFDSXX.key.gpg" + ) + self.assertEqual( + run_variables.zip_encrypted, "190201_A00621_0032_BHHFCFDSXX.tar.gz.gpg" + ) class TestBackupUtils(unittest.TestCase): @@ -29,131 +35,164 @@ class TestBackupUtils(unittest.TestCase): def test_fetch_config_info(self): """Get backup info from config.""" - config_info = backup.backup_utils('data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(config_info.data_dirs, {'miseq': 'data/nas/miseq.lab'}) - self.assertEqual(config_info.archive_dirs, {'hiseq': 'blah', 'miseq': 'data/nas/miseq.lab/nosync'}) - self.assertEqual(config_info.keys_path, 'data/nas/run_keys') - 
self.assertEqual(config_info.gpg_receiver, 'some.user') - self.assertEqual(config_info.mail_recipients, 'some_user@some_email.com') + config_info = backup.backup_utils( + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX" + ) + self.assertEqual(config_info.data_dirs, {"miseq": "data/nas/miseq.lab"}) + self.assertEqual( + config_info.archive_dirs, + {"hiseq": "blah", "miseq": "data/nas/miseq.lab/nosync"}, + ) + self.assertEqual(config_info.keys_path, "data/nas/run_keys") + self.assertEqual(config_info.gpg_receiver, "some.user") + self.assertEqual(config_info.mail_recipients, "some_user@some_email.com") self.assertEqual(config_info.check_demux, True) - self.assertEqual(config_info.couch_info, {'url': 'url', 'username': 'username', 'password': 'pwd', 'xten_db': 'x_flowcells'}) + self.assertEqual( + config_info.couch_info, + { + "url": "url", + "username": "username", + "password": "pwd", + "xten_db": "x_flowcells", + }, + ) def test_collect_runs(self): """Get backup runs from archive directories.""" backup_object = backup.backup_utils() - backup_object.collect_runs(ext='.tar.gz', filter_by_ext=True) + backup_object.collect_runs(ext=".tar.gz", filter_by_ext=True) run = backup_object.runs[0].name - self.assertEqual(run, '200201_A00621_0032_BHHFCFDSXY') + self.assertEqual(run, "200201_A00621_0032_BHHFCFDSXY") def test_collect_runs_specific_run(self): """Collect only specific run.""" - backup_object = backup.backup_utils(run='data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX') + backup_object = backup.backup_utils( + run="data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX" + ) backup_object.collect_runs() run = backup_object.runs[0].name - self.assertEqual(run, '200201_A00621_0032_BHHFCFDSXX') + self.assertEqual(run, "200201_A00621_0032_BHHFCFDSXX") - missing_object = backup.backup_utils(run='some/missing/path/run') + missing_object = backup.backup_utils(run="some/missing/path/run") with self.assertRaises(SystemExit): missing_object.collect_runs() - @mock.patch('taca.backup.backup.sp.Popen.communicate') - @mock.patch('taca.backup.backup.misc') + @mock.patch("taca.backup.backup.sp.Popen.communicate") + @mock.patch("taca.backup.backup.misc") def test_avail_disk_space(self, mock_misc, mock_sp): """Check backup disk space.""" backup_object = backup.backup_utils() - mock_sp.return_value = ['Filesystem 512-blocks Used Available Capacity iused ifree %iused Mounted on\n/dev/disk1s1 976490576 100 813074776 15% 1086272 4881366608 0% /System/Volumes/Data', None] - path = 'data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX' - run = '190201_A00621_0032_BHHFCFDSXX' + mock_sp.return_value = [ + "Filesystem 512-blocks Used Available Capacity iused ifree %iused Mounted on\n/dev/disk1s1 976490576 100 813074776 15% 1086272 4881366608 0% /System/Volumes/Data", + None, + ] + path = "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX" + run = "190201_A00621_0032_BHHFCFDSXX" with self.assertRaises(SystemExit): backup_object.avail_disk_space(path, run) - @mock.patch('taca.backup.backup.sp.check_call') + @mock.patch("taca.backup.backup.sp.check_call") def test_file_in_pdc(self, mock_call): """Check if files exist in PDC.""" - mock_call.return_value = 'Whatever' + mock_call.return_value = "Whatever" backup_object = backup.backup_utils() - src_file = 'data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt' + src_file = "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt" self.assertTrue(backup_object.file_in_pdc(src_file, silent=True)) def test_get_run_type(self): """Get run types from 
flowcell names.""" backup_object = backup.backup_utils() - novaseq_run = backup_object._get_run_type('190201_A00621_0032_BHHFCFDSXX') - self.assertEqual(novaseq_run, 'novaseq') - hiseqx_run = backup_object._get_run_type('190711_ST-E00266_0356_AH2L32CCX2') - self.assertEqual(hiseqx_run, 'hiseqx') - miseq_run = backup_object._get_run_type('200604_M01320_0329_000000000-J668J') - self.assertEqual(miseq_run, 'miseq') - hiseq_run = backup_object._get_run_type('190628_D00415_0465_BH2HVYBCX3') - self.assertEqual(hiseq_run, 'hiseq') - nextseq_run = backup_object._get_run_type('200602_NS500688_0656_AHGCKWBGXF') - self.assertEqual(nextseq_run, 'nextseq') + novaseq_run = backup_object._get_run_type("190201_A00621_0032_BHHFCFDSXX") + self.assertEqual(novaseq_run, "novaseq") + hiseqx_run = backup_object._get_run_type("190711_ST-E00266_0356_AH2L32CCX2") + self.assertEqual(hiseqx_run, "hiseqx") + miseq_run = backup_object._get_run_type("200604_M01320_0329_000000000-J668J") + self.assertEqual(miseq_run, "miseq") + hiseq_run = backup_object._get_run_type("190628_D00415_0465_BH2HVYBCX3") + self.assertEqual(hiseq_run, "hiseq") + nextseq_run = backup_object._get_run_type("200602_NS500688_0656_AHGCKWBGXF") + self.assertEqual(nextseq_run, "nextseq") def test_call_commands(self): """Call expernal backup command.""" backup_object = backup.backup_utils() - got_output = backup_object._call_commands(cmd1='ls data/nas/miseq.lab', mail_failed=False, return_out=True) - expected_output = (True, b'190201_A00621_0032_BHHFCFDSXX\nnosync\n') + got_output = backup_object._call_commands( + cmd1="ls data/nas/miseq.lab", mail_failed=False, return_out=True + ) + expected_output = (True, b"190201_A00621_0032_BHHFCFDSXX\nnosync\n") self.assertEqual(got_output, expected_output) def test_call_commands_double(self): """Call external backup command, given two commands.""" backup_object = backup.backup_utils() - tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') - tmp_file = os.path.join(tmp_dir, 'output.out') + tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") + tmp_file = os.path.join(tmp_dir, "output.out") os.makedirs(tmp_dir) - cmd1 = 'ls data/nas/miseq.lab' - cmd2 = 'ls data/nas/miseq.lab' - got_output = backup_object._call_commands(cmd1, cmd2, out_file=tmp_file, mail_failed=False) + cmd1 = "ls data/nas/miseq.lab" + cmd2 = "ls data/nas/miseq.lab" + backup_object._call_commands(cmd1, cmd2, out_file=tmp_file, mail_failed=False) self.assertTrue(os.path.isfile(tmp_file)) shutil.rmtree(tmp_dir) def test_check_status(self): """Check subprocess status.""" backup_object = backup.backup_utils() - cmd = 'ls' + cmd = "ls" status_pass = 0 - err_msg = 'Error' - got_status_pass = backup_object._check_status(cmd, status_pass, err_msg, mail_failed=False) + err_msg = "Error" + got_status_pass = backup_object._check_status( + cmd, status_pass, err_msg, mail_failed=False + ) self.assertTrue(got_status_pass) status_fail = 1 - got_status_fail = backup_object._check_status(cmd, status_fail, err_msg, mail_failed=False) + got_status_fail = backup_object._check_status( + cmd, status_fail, err_msg, mail_failed=False + ) self.assertFalse(got_status_fail) - @mock.patch('taca.backup.backup.os.remove') + @mock.patch("taca.backup.backup.os.remove") def test_clean_tmp_files(self, mock_remove): """Remove file if it exist.""" backup_object = backup.backup_utils() - files = ['data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt', 'data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/missing_file.txt'] + files = [ + 
"data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt", + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/missing_file.txt", + ] backup_object._clean_tmp_files(files) - mock_remove.assert_called_once_with('data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt') + mock_remove.assert_called_once_with( + "data/nas/miseq.lab/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt" + ) - @mock.patch('taca.backup.backup.statusdb', autospec=True) - @mock.patch('taca.backup.backup.logger') + @mock.patch("taca.backup.backup.statusdb", autospec=True) + @mock.patch("taca.backup.backup.logger") def test_log_pdc_statusdb(self, mock_logger, mock_couch): """Update statusdb if transfer was successful.""" backup_object = backup.backup_utils() - run = '190201_A00621_0032_BHHFCFDSXX' + run = "190201_A00621_0032_BHHFCFDSXX" backup_object._log_pdc_statusdb(run) mock_logger.warn.assert_called_once() - @mock.patch('taca.backup.backup.backup_utils._call_commands', return_value=True) - @mock.patch('taca.backup.backup.shutil') - @mock.patch('taca.backup.backup.backup_utils._clean_tmp_files') - @mock.patch('taca.backup.backup.backup_utils.avail_disk_space') + @mock.patch("taca.backup.backup.backup_utils._call_commands", return_value=True) + @mock.patch("taca.backup.backup.shutil") + @mock.patch("taca.backup.backup.backup_utils._clean_tmp_files") + @mock.patch("taca.backup.backup.backup_utils.avail_disk_space") def test_encrypt_runs(self, mock_space, mock_clean, mock_shutil, mock_command): """Encrypt found runs.""" - backup_object = backup.backup_utils(run='data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX') - run = 'data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX' + backup_object = backup.backup_utils( + run="data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX" + ) + run = "data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX" force = True backup_object.encrypt_runs(run, force) mock_clean.assert_called_once() - os.remove('data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX.encrypting') + os.remove("data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX.encrypting") - @mock.patch('taca.backup.backup.logger.error') + @mock.patch("taca.backup.backup.logger.error") def test_pdc_put(self, mock_logger): """Put runs on PDC.""" - backup_object = backup.backup_utils(run='data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX') - run = 'data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX' + backup_object = backup.backup_utils( + run="data/nas/miseq.lab/nosync/200201_A00621_0032_BHHFCFDSXX" + ) + run = "data/nas/miseq.lab/nosync/190201_A00621_0032_BHHFCFDSXX" backup_object.pdc_put(run) mock_logger.assert_called_once() diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index d7e04869..c5a0d53f 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -4,50 +4,54 @@ import shutil import tempfile import unittest -import mock from datetime import datetime +from unittest import mock from taca.cleanup import cleanup from taca.utils import config as conf -CONFIG = conf.load_yaml_config('data/taca_test_cfg_cleanup.yaml') +CONFIG = conf.load_yaml_config("data/taca_test_cfg_cleanup.yaml") class TestCleanup(unittest.TestCase): """Tests for TACA Cleanup module.""" - @mock.patch('taca.cleanup.cleanup.shutil.move') - @mock.patch('taca.cleanup.cleanup.os.listdir') + @mock.patch("taca.cleanup.cleanup.shutil.move") + @mock.patch("taca.cleanup.cleanup.os.listdir") def test_cleanup_nas(self, mock_listdir, mock_move): """Locate and move old data on NAS.""" seconds = 1 - run = 
'190201_A00621_0032_BHHFCFDSXX' + run = "190201_A00621_0032_BHHFCFDSXX" mock_listdir.return_value = [run] cleanup.cleanup_nas(seconds) - mock_move.assert_called_once_with(run, 'nosync') + mock_move.assert_called_once_with(run, "nosync") - @mock.patch('taca.cleanup.cleanup.shutil.rmtree') - @mock.patch('taca.cleanup.cleanup.os.listdir') + @mock.patch("taca.cleanup.cleanup.shutil.rmtree") + @mock.patch("taca.cleanup.cleanup.os.listdir") def test_cleanup_processing(self, mock_listdir, mock_rmtree): """Locate and move old data on preproc.""" seconds = 1 - run = '190201_A00621_0032_BHHFCFDSXY' + run = "190201_A00621_0032_BHHFCFDSXY" mock_listdir.return_value = [run] cleanup.cleanup_processing(seconds) mock_rmtree.assert_called_once_with(run) - @mock.patch('taca.cleanup.cleanup.statusdb') - @mock.patch('taca.cleanup.cleanup.get_closed_proj_info') - @mock.patch('taca.cleanup.cleanup.misc.query_yes_no') - @mock.patch('taca.cleanup.cleanup._remove_files') - @mock.patch('taca.cleanup.cleanup._touch_cleaned') - def test_cleanup_miarka(self, mock_touch, mock_rm, mock_query, mock_info, mock_statusdb): + @mock.patch("taca.cleanup.cleanup.statusdb") + @mock.patch("taca.cleanup.cleanup.get_closed_proj_info") + @mock.patch("taca.cleanup.cleanup.misc.query_yes_no") + @mock.patch("taca.cleanup.cleanup._remove_files") + @mock.patch("taca.cleanup.cleanup._touch_cleaned") + def test_cleanup_miarka( + self, mock_touch, mock_rm, mock_query, mock_info, mock_statusdb + ): """Locate and move old data on Miarka.""" - mock_info.return_value = {'closed_date': '2019-04-07', - 'bioinfo_responsible': 'O.B. One', - 'pid': 'P1234', - 'name': 'N.Owens_19_01', - 'closed_days': 5} + mock_info.return_value = { + "closed_date": "2019-04-07", + "bioinfo_responsible": "O.B. One", + "pid": "P1234", + "name": "N.Owens_19_01", + "closed_days": 5, + } mock_query.return_value = True mock_rm.return_value = True days_fastq = 1 @@ -55,71 +59,102 @@ def test_cleanup_miarka(self, mock_touch, mock_rm, mock_query, mock_info, mock_ only_fastq = False only_analysis = False clean_undetermined = False - status_db_config = 'data/taca_test_cfg_cleanup.yaml' + status_db_config = "data/taca_test_cfg_cleanup.yaml" exclude_projects = False list_only = False - date = '2016-01-31' - calls = [mock.call('data/miarka/incoming/190201_A00621_0032_BHHFCFDSXX/Demultiplexing/N.Owens_19_01'), - mock.call('../../nobackup/NGI/ANALYSIS/P1234')] - cleanup.cleanup_miarka(days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, status_db_config, exclude_projects, list_only, date, dry_run=False) + date = "2016-01-31" + calls = [ + mock.call( + "data/miarka/incoming/190201_A00621_0032_BHHFCFDSXX/Demultiplexing/N.Owens_19_01" + ), + mock.call("../../nobackup/NGI/ANALYSIS/P1234"), + ] + cleanup.cleanup_miarka( + days_fastq, + days_analysis, + only_fastq, + only_analysis, + clean_undetermined, + status_db_config, + exclude_projects, + list_only, + date, + dry_run=False, + ) mock_touch.assert_has_calls(calls) def test_get_closed_proj_info(self): """Return a dict if project is closed.""" - pid = 'P1234' - pdoc = {'close_date': '2019-04-07', - 'project_name': 'A.Name_19_01', - 'project_id': 'P1234', - 'project_summary': {'bioinfo_responsible': 'O.B. One'}} - tdate = datetime.strptime('2019-04-08', '%Y-%m-%d') + pid = "P1234" + pdoc = { + "close_date": "2019-04-07", + "project_name": "A.Name_19_01", + "project_id": "P1234", + "project_summary": {"bioinfo_responsible": "O.B. 
One"}, + } + tdate = datetime.strptime("2019-04-08", "%Y-%m-%d") got_data = cleanup.get_closed_proj_info(pid, pdoc, tdate) - expected_data = {'closed_date': '2019-04-07', - 'bioinfo_responsible': b'O.B. One', - 'pid': 'P1234', - 'name': 'A.Name_19_01', - 'closed_days': 1} + expected_data = { + "closed_date": "2019-04-07", + "bioinfo_responsible": b"O.B. One", + "pid": "P1234", + "name": "A.Name_19_01", + "closed_days": 1, + } self.assertEqual(got_data, expected_data) def test_collect_analysis_data_miarka(self): """Get analysis data on Miarka.""" - pid = 'P1234' - analysis_root = 'data/test_data/analysis' - file_list, size = cleanup.collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}) - self.assertEqual(file_list, 'cleaned') + pid = "P1234" + analysis_root = "data/test_data/analysis" + file_list, size = cleanup.collect_analysis_data_miarka( + pid, analysis_root, files_ext_to_remove={} + ) + self.assertEqual(file_list, "cleaned") def test_collect_fastq_data_miarka(self): """Collect removed files.""" - fc_root = 'data/test_data/190201_A00621_0032_BHHFCFDSXX' - fc_proj_src = 'N.Owens_19_01' + fc_root = "data/test_data/190201_A00621_0032_BHHFCFDSXX" + fc_proj_src = "N.Owens_19_01" file_list, size = cleanup.collect_fastq_data_miarka(fc_root, fc_proj_src) - expected_data = {'flowcells': - {'190201_A00621_0032_BHHFCFDSXX': - {'proj_root': 'data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01', - 'fq_files': ['data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample1.fastq.gz', - 'data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample2.fastq.gz']}}} + expected_data = { + "flowcells": { + "190201_A00621_0032_BHHFCFDSXX": { + "proj_root": "data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01", + "fq_files": [ + "data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample1.fastq.gz", + "data/test_data/190201_A00621_0032_BHHFCFDSXX/N.Owens_19_01/sample2.fastq.gz", + ], + } + } + } self.assertEqual(file_list, expected_data) self.assertEqual(size, 0) def test_collect_files_by_ext(self): """Return found paths.""" - path = 'data/test_data' - ext = ['*.txt'] + path = "data/test_data" + ext = ["*.txt"] found_files = cleanup.collect_files_by_ext(path, ext) - expected_files = ['data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY/RTAComplete.txt', - 'data/test_data/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt'] + expected_files = [ + "data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY/RTAComplete.txt", + "data/test_data/190201_A00621_0032_BHHFCFDSXX/RTAComplete.txt", + ] self.assertEqual(found_files, expected_files) def test_get_proj_meta_info(self): """Get project metadata.""" - info = {'name': 'Nobody Owens', - 'pid': 'P1234', - 'bioinfo_responsible': 'O.B. One', - 'closed_days': 1, - 'closed_date': '2020-04-07', - 'fastq_size': 1001} - days_fastq = '' + info = { + "name": "Nobody Owens", + "pid": "P1234", + "bioinfo_responsible": "O.B. One", + "closed_days": 1, + "closed_date": "2020-04-07", + "fastq_size": 1001, + } + days_fastq = "" got_data = cleanup.get_proj_meta_info(info, days_fastq) - expected_data = ''' + expected_data = """ Project overview: Nobody Owens Project ID: P1234 Bioinfo Responsible: O.B. 
One @@ -127,42 +162,44 @@ def test_get_proj_meta_info(self): Closed from (date): 2020-04-07 Project analysis: No analysis directory Estimated data size: ~2kb -''' +""" self.assertEqual(got_data, expected_data) def test_get_files_size_text(self): """Format file size string.""" - plist = {'P1': {'fastq_size': 1001, 'analysis_size': 1000000}, - 'P2': {'fastq_size': 1001, 'analysis_size': 1000000}} + plist = { + "P1": {"fastq_size": 1001, "analysis_size": 1000000}, + "P2": {"fastq_size": 1001, "analysis_size": 1000000}, + } got_data = cleanup.get_files_size_text(plist) - expected_data = '(~~2kb fastq data and ~~2mb analysis data) ' + expected_data = "(~~2kb fastq data and ~~2mb analysis data) " self.assertEqual(got_data, expected_data) def test_def_get_size_unit(self): """Convert size.""" - #function broken if size < 1000 + # function broken if size < 1000 size = 1001 - self.assertEqual(cleanup._def_get_size_unit(size), '~1kb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1kb") size *= 1000 - self.assertEqual(cleanup._def_get_size_unit(size), '~1mb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1mb") size *= 1000 - self.assertEqual(cleanup._def_get_size_unit(size), '~1gb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1gb") size *= 1000 - self.assertEqual(cleanup._def_get_size_unit(size), '~1tb') + self.assertEqual(cleanup._def_get_size_unit(size), "~1tb") - @mock.patch('taca.cleanup.cleanup.os.remove') + @mock.patch("taca.cleanup.cleanup.os.remove") def test_remove_files(self, mock_remove): """Remove files in given list.""" - files = ['file1', 'file2'] + files = ["file1", "file2"] cleanup._remove_files(files) - calls = [mock.call('file1'), mock.call('file2')] + calls = [mock.call("file1"), mock.call("file2")] mock_remove.assert_has_calls(calls) def test_touch_cleaned(self): """Create empty file in specified dir.""" - tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') + tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") os.makedirs(tmp_dir) cleanup._touch_cleaned(tmp_dir) - expected_file = os.path.join(tmp_dir, 'cleaned') + expected_file = os.path.join(tmp_dir, "cleaned") self.assertTrue(os.path.exists(expected_file)) shutil.rmtree(tmp_dir) diff --git a/tests/test_illumina.py b/tests/test_illumina.py index 5bbf323a..c8c59a33 100644 --- a/tests/test_illumina.py +++ b/tests/test_illumina.py @@ -1,25 +1,21 @@ #!/usr/bin/env python +import filecmp +import json import os -import io import shutil -import tempfile -import unittest -import csv -import json -import mock -import filecmp import subprocess -from datetime import datetime import sys +import tempfile +import unittest +from unittest import mock + +from flowcell_parser.classes import LaneBarcodeParser -from taca.analysis.analysis import * -from taca.illumina.Runs import Run, _create_folder_structure, _generate_lane_html -from taca.illumina.Standard_Runs import Standard_Runs, _generate_clean_samplesheet, _classify_samples, parse_10X_indexes, parse_smartseq_indexes, _generate_samplesheet_subset -from taca.illumina.MiSeq_Runs import MiSeq_Run -from taca.illumina.NovaSeq_Runs import NovaSeq_Run from taca.illumina.NextSeq_Runs import NextSeq_Run -from flowcell_parser.classes import LaneBarcodeParser, SampleSheetParser +from taca.illumina.NovaSeq_Runs import NovaSeq_Run +from taca.illumina.Runs import Run, _create_folder_structure, _generate_lane_html +from taca.illumina.Standard_Runs import Standard_Run from taca.utils import config as conf if sys.version_info[0] >= 3: @@ -27,14 +23,15 @@ # This is only run if TACA is 
called from the CLI, as this is a test, we need to # call it explicitely -CONFIG = conf.load_yaml_config('data/taca_test_cfg.yaml') +CONFIG = conf.load_yaml_config("data/taca_test_cfg.yaml") class TestRuns(unittest.TestCase): """Tests for the Run base class.""" + @classmethod def setUpClass(self): - """ Creates the following directory tree for testing purposes: + """Creates the following directory tree for testing purposes: tmp/ |__ 141124_ST-COMPLETED_01_AFCIDXX @@ -90,107 +87,277 @@ def setUpClass(self): | |__lots of files |__ archive """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') - self.transfer_file = os.path.join(self.tmp_dir, 'transfer.tsv') - - running = os.path.join(self.tmp_dir, '141124_ST-RUNNING1_03_AFCIDXX') - to_start = os.path.join(self.tmp_dir, '141124_ST-TOSTART1_04_FCIDXXX') - in_progress = os.path.join(self.tmp_dir, '141124_ST-INPROGRESS1_02_AFCIDXX') - in_progress_done = os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX') - completed = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX') - dummy = os.path.join(self.tmp_dir, '141124_ST-DUMMY1_01_AFCIDXX') - complex_run_dir = os.path.join(self.tmp_dir, '141124_ST-COMPLEX1_01_AFCIDXX') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") + self.transfer_file = os.path.join(self.tmp_dir, "transfer.tsv") + + running = os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX") + to_start = os.path.join(self.tmp_dir, "141124_ST-TOSTART1_04_FCIDXXX") + in_progress = os.path.join(self.tmp_dir, "141124_ST-INPROGRESS1_02_AFCIDXX") + in_progress_done = os.path.join( + self.tmp_dir, "141124_ST-INPROGRESSDONE1_02_AFCIDXX" + ) + completed = os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX") + dummy = os.path.join(self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX") + complex_run_dir = os.path.join(self.tmp_dir, "141124_ST-COMPLEX1_01_AFCIDXX") finished_runs = [to_start, in_progress, in_progress_done, completed] # Create runs directory structure os.makedirs(self.tmp_dir) os.makedirs(running) os.makedirs(to_start) - os.makedirs(os.path.join(in_progress, 'Demultiplexing')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_1')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_2')) - os.makedirs(os.path.join(in_progress, 'Demultiplexing_3')) - os.makedirs(os.path.join(in_progress_done, 'Demultiplexing')) - os.makedirs(os.path.join(in_progress_done, 'Demultiplexing_0/Stats')) - os.makedirs(os.path.join(completed, 'Demultiplexing', 'Stats')) + os.makedirs(os.path.join(in_progress, "Demultiplexing")) + os.makedirs( + os.path.join( + in_progress, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ) + ) + os.makedirs(os.path.join(in_progress, "Demultiplexing_1")) + os.makedirs(os.path.join(in_progress, "Demultiplexing_2")) + os.makedirs(os.path.join(in_progress, "Demultiplexing_3")) + os.makedirs(os.path.join(in_progress_done, "Demultiplexing")) + os.makedirs(os.path.join(in_progress_done, "Demultiplexing_0/Stats")) + os.makedirs(os.path.join(completed, "Demultiplexing", "Stats")) os.makedirs(dummy) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing')) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing_0', 'Stats')) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing_1', 'Stats')) - os.makedirs(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001')) - 
os.makedirs(os.path.join(complex_run_dir,'Demultiplexing_0', 'Reports', 'html','FCIDXX', 'all', 'all', 'all')) - os.makedirs(os.path.join(complex_run_dir,'Demultiplexing_1', 'Reports', 'html','FCIDXX', 'all', 'all', 'all')) + os.makedirs(os.path.join(complex_run_dir, "Demultiplexing")) + os.makedirs(os.path.join(complex_run_dir, "Demultiplexing_0", "Stats")) + os.makedirs(os.path.join(complex_run_dir, "Demultiplexing_1", "Stats")) + os.makedirs( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "N__One_20_01", + "Sample_P12345_1001", + ) + ) + os.makedirs( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ) + ) + os.makedirs( + os.path.join( + complex_run_dir, + "Demultiplexing_1", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ) + ) # Create files indicating that the run is finished for run in finished_runs: - open(os.path.join(run, 'RTAComplete.txt'), 'w').close() + open(os.path.join(run, "RTAComplete.txt"), "w").close() # Create sample sheets for running demultiplexing - open(os.path.join(in_progress, 'SampleSheet_0.csv'), 'w').close() - open(os.path.join(in_progress, 'SampleSheet_1.csv'), 'w').close() - open(os.path.join(in_progress, 'SampleSheet_2.csv'), 'w').close() - open(os.path.join(in_progress, 'SampleSheet_3.csv'), 'w').close() - open(os.path.join(in_progress_done, 'SampleSheet_0.csv'), 'w').close() - shutil.copy('data/samplesheet.csv', os.path.join(completed, 'SampleSheet.csv')) - shutil.copy('data/samplesheet.csv', os.path.join(complex_run_dir, 'SampleSheet_0.csv')) - shutil.copy('data/samplesheet.csv', os.path.join(complex_run_dir, 'SampleSheet_1.csv')) + open(os.path.join(in_progress, "SampleSheet_0.csv"), "w").close() + open(os.path.join(in_progress, "SampleSheet_1.csv"), "w").close() + open(os.path.join(in_progress, "SampleSheet_2.csv"), "w").close() + open(os.path.join(in_progress, "SampleSheet_3.csv"), "w").close() + open(os.path.join(in_progress_done, "SampleSheet_0.csv"), "w").close() + shutil.copy("data/samplesheet.csv", os.path.join(completed, "SampleSheet.csv")) + shutil.copy( + "data/samplesheet.csv", os.path.join(complex_run_dir, "SampleSheet_0.csv") + ) + shutil.copy( + "data/samplesheet.csv", os.path.join(complex_run_dir, "SampleSheet_1.csv") + ) # Create files indicating that demultiplexing is ongoing - open(os.path.join(in_progress_done, 'Demultiplexing_0', 'Stats', 'DemultiplexingStats.xml'), 'w').close() - open(os.path.join(in_progress_done, 'Demultiplexing_0', 'Stats', 'DemuxSummaryF1L1.txt'), 'w').close() - shutil.copy('data/lane.html', os.path.join(in_progress,'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) + open( + os.path.join( + in_progress_done, "Demultiplexing_0", "Stats", "DemultiplexingStats.xml" + ), + "w", + ).close() + open( + os.path.join( + in_progress_done, "Demultiplexing_0", "Stats", "DemuxSummaryF1L1.txt" + ), + "w", + ).close() + shutil.copy( + "data/lane.html", + os.path.join( + in_progress, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) # Create files indicating that the preprocessing is done - open(os.path.join(completed, 'Demultiplexing', 'Stats', 'DemultiplexingStats.xml'), 'w').close() - open(os.path.join(completed, 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz'), 'w').close() - open(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R1_001.fastq.gz'), 'w').close() - open(os.path.join(complex_run_dir, 
'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R2_001.fastq.gz'), 'w').close() - with io.open(os.path.join(completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w', encoding="utf-8") as stats_json: - stats_json.write(unicode(json.dumps({'silly': 1}, ensure_ascii=False))) + open( + os.path.join( + completed, "Demultiplexing", "Stats", "DemultiplexingStats.xml" + ), + "w", + ).close() + open( + os.path.join( + completed, "Demultiplexing", "Undetermined_S0_L001_R1_001.fastq.gz" + ), + "w", + ).close() + open( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "N__One_20_01", + "Sample_P12345_1001", + "P16510_1001_S1_L001_R1_001.fastq.gz", + ), + "w", + ).close() + open( + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "N__One_20_01", + "Sample_P12345_1001", + "P16510_1001_S1_L001_R2_001.fastq.gz", + ), + "w", + ).close() + with open( + os.path.join(completed, "Demultiplexing", "Stats", "Stats.json"), + "w", + encoding="utf-8", + ) as stats_json: + stats_json.write(unicode(json.dumps({"silly": 1}, ensure_ascii=False))) # Copy transfer file with the completed run - shutil.copy('data/test_transfer.tsv', self.transfer_file) + shutil.copy("data/test_transfer.tsv", self.transfer_file) # Move sample RunInfo.xml file to every run directory - for run in [running, to_start, in_progress, in_progress_done, completed, dummy, complex_run_dir]: - shutil.copy('data/RunInfo.xml', run) - shutil.copy('data/runParameters.xml', run) + for run in [ + running, + to_start, + in_progress, + in_progress_done, + completed, + dummy, + complex_run_dir, + ]: + shutil.copy("data/RunInfo.xml", run) + shutil.copy("data/runParameters.xml", run) # Create files for complex case - shutil.copy('data/Stats.json', os.path.join(complex_run_dir, 'Demultiplexing_0', 'Stats', 'Stats.json')) - shutil.copy('data/Stats.json', os.path.join(complex_run_dir, 'Demultiplexing_1', 'Stats', 'Stats.json')) - shutil.copy('data/lane.html', os.path.join(complex_run_dir,'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - shutil.copy('data/lane.html', os.path.join(complex_run_dir,'Demultiplexing_1', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - shutil.copy('data/laneBarcode.html', os.path.join(complex_run_dir,'Demultiplexing_0', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) - shutil.copy('data/laneBarcode.html', os.path.join(complex_run_dir,'Demultiplexing_1', 'Reports', 'html', 'FCIDXX', 'all', 'all', 'all')) + shutil.copy( + "data/Stats.json", + os.path.join(complex_run_dir, "Demultiplexing_0", "Stats", "Stats.json"), + ) + shutil.copy( + "data/Stats.json", + os.path.join(complex_run_dir, "Demultiplexing_1", "Stats", "Stats.json"), + ) + shutil.copy( + "data/lane.html", + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) + shutil.copy( + "data/lane.html", + os.path.join( + complex_run_dir, + "Demultiplexing_1", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) + shutil.copy( + "data/laneBarcode.html", + os.path.join( + complex_run_dir, + "Demultiplexing_0", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) + shutil.copy( + "data/laneBarcode.html", + os.path.join( + complex_run_dir, + "Demultiplexing_1", + "Reports", + "html", + "FCIDXX", + "all", + "all", + "all", + ), + ) # Create archive dir - self.archive_dir = os.path.join(self.tmp_dir, 'archive') + self.archive_dir = os.path.join(self.tmp_dir, "archive") os.makedirs(self.archive_dir) # Create run 
objects - self.running = Standard_Run(os.path.join(self.tmp_dir, - '141124_ST-RUNNING1_03_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.to_start = Run(os.path.join(self.tmp_dir, - '141124_ST-TOSTART1_04_FCIDXXX'), - CONFIG['analysis']['NovaSeq']) - self.in_progress = Standard_Run(os.path.join(self.tmp_dir, - '141124_ST-INPROGRESS1_02_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.in_progress_done = Standard_Run(os.path.join(self.tmp_dir, - '141124_ST-INPROGRESSDONE1_02_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.completed = Run(os.path.join(self.tmp_dir, - '141124_ST-COMPLETED1_01_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) - self.dummy_run = Run(os.path.join(self.tmp_dir, - '141124_ST-DUMMY1_01_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.running = Standard_Run( + os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.to_start = Run( + os.path.join(self.tmp_dir, "141124_ST-TOSTART1_04_FCIDXXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.in_progress = Standard_Run( + os.path.join(self.tmp_dir, "141124_ST-INPROGRESS1_02_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.in_progress_done = Standard_Run( + os.path.join(self.tmp_dir, "141124_ST-INPROGRESSDONE1_02_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.completed = Run( + os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) + self.dummy_run = Run( + os.path.join(self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) self.finished_runs = [self.to_start, self.in_progress, self.completed] - self.complex_run = Run(os.path.join(self.tmp_dir, '141124_ST-COMPLEX1_01_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.complex_run = Run( + os.path.join(self.tmp_dir, "141124_ST-COMPLEX1_01_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) @classmethod def tearDownClass(self): @@ -200,13 +367,13 @@ def test_run_setup(self): """Raise RuntimeError if files are missing.""" # if rundir missing with self.assertRaises(RuntimeError): - Run('missing_dir', CONFIG['analysis']['NovaSeq']) + Run("missing_dir", CONFIG["analysis"]["NovaSeq"]) # if config incomplete with self.assertRaises(RuntimeError): - Run(self.tmp_dir, CONFIG['analysis']['DummySeq']) + Run(self.tmp_dir, CONFIG["analysis"]["DummySeq"]) # if runParameters.xml missing with self.assertRaises(RuntimeError): - Run(self.tmp_dir, CONFIG['analysis']['NovaSeq']) + Run(self.tmp_dir, CONFIG["analysis"]["NovaSeq"]) def test_is_sequencing_done(self): """Is finished should be True only if "RTAComplete.txt" file is present.""" @@ -215,22 +382,24 @@ def test_is_sequencing_done(self): def test_get_run_status(self): """Get the run status based on present files.""" - self.assertEqual('SEQUENCING', self.running.get_run_status()) - self.assertEqual('TO_START', self.to_start.get_run_status()) - self.assertEqual('IN_PROGRESS', self.in_progress.get_run_status()) - self.assertEqual('COMPLETED', self.completed.get_run_status()) + self.assertEqual("SEQUENCING", self.running.get_run_status()) + self.assertEqual("TO_START", self.to_start.get_run_status()) + self.assertEqual("IN_PROGRESS", self.in_progress.get_run_status()) + self.assertEqual("COMPLETED", self.completed.get_run_status()) def test_is_transferred(self): """is_transferred should rely on the info in transfer.tsv.""" - os.makedirs(os.path.join(self.tmp_dir, '141124_ST-DUMMY1_01_AFCIDXX', 'transferring')) + os.makedirs( + os.path.join(self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX", "transferring") + ) 
self.assertTrue(self.dummy_run.is_transferred(self.transfer_file)) self.assertTrue(self.completed.is_transferred(self.transfer_file)) self.assertFalse(self.running.is_transferred(self.transfer_file)) self.assertFalse(self.to_start.is_transferred(self.transfer_file)) - self.assertFalse(self.in_progress.is_transferred( self.transfer_file)) - self.assertFalse(self.completed.is_transferred('missing_file')) + self.assertFalse(self.in_progress.is_transferred(self.transfer_file)) + self.assertFalse(self.completed.is_transferred("missing_file")) - @mock.patch('taca.illumina.Standard_Runs.Standard_Run._aggregate_demux_results') + @mock.patch("taca.illumina.Standard_Runs.Standard_Run._aggregate_demux_results") def test_check_run_status_done(self, mock_aggregate_demux_results): """Recognize if a demultiplexing run is finished or not.""" self.in_progress.check_run_status() @@ -238,26 +407,26 @@ def test_check_run_status_done(self, mock_aggregate_demux_results): self.in_progress_done.check_run_status() mock_aggregate_demux_results.assert_called_once() - @mock.patch('taca.illumina.Runs.Run.get_run_status') + @mock.patch("taca.illumina.Runs.Run.get_run_status") def test_check_run_status_completed(self, mock_status): """Return None if run is finished.""" - mock_status.return_value = 'COMPLETED' + mock_status.return_value = "COMPLETED" self.assertEqual(self.in_progress.check_run_status(), None) def test_get_run_type(self): """Return runtype if set.""" - self.assertEqual('NGI-RUN', self.running.get_run_type()) + self.assertEqual("NGI-RUN", self.running.get_run_type()) self.to_start.run_type = False with self.assertRaises(RuntimeError): self.to_start.get_run_type() def test_get_demux_folder(self): """Return name of demux folder if set.""" - self.assertEqual('Demultiplexing', self.running._get_demux_folder()) + self.assertEqual("Demultiplexing", self.running._get_demux_folder()) def test_get_samplesheet(self): """Return location of sample sheet.""" - self.assertEqual('data/2014/FCIDXX.csv', self.running._get_samplesheet()) + self.assertEqual("data/2014/FCIDXX.csv", self.running._get_samplesheet()) def test_is_demultiplexing_done(self): """Return true if Stats.json exists, else false.""" @@ -274,110 +443,196 @@ def test_generate_per_lane_base_mask(self): with self.assertRaises(RuntimeError): self.dummy_run._generate_per_lane_base_mask() - shutil.copy('data/samplesheet_dummy_run.csv', os.path.join(self.tmp_dir,'141124_ST-DUMMY1_01_AFCIDXX', 'SampleSheet.csv')) - self.dummy_run._set_run_parser_obj(CONFIG['analysis']['NovaSeq']) - expected_mask = {'1': {'Y151I7N3I7N3': - {'base_mask': ['Y151', 'I7N3', 'I7N3'], - 'data': [{'index': 'CGCGCAG', - 'Lane': '1', - 'Sample_ID': 'Sample_P10000_1001', - 'Sample_Project': 'A_Test_18_01', - 'Sample_Name': 'Sample_P10000_1001', - 'index2': 'CTGCGCG'}]}, - 'Y151I7N3N10': - {'base_mask': ['Y151', 'I7N3', 'N10'], - 'data': [{'index': 'AGGTACC', - 'Lane': '1', - 'Sample_ID': 'Sample_P10000_1005', - 'Sample_Project': 'A_Test_18_01', - 'Sample_Name': 'Sample_P10000_1005', - 'index2': ''}]}}} + shutil.copy( + "data/samplesheet_dummy_run.csv", + os.path.join( + self.tmp_dir, "141124_ST-DUMMY1_01_AFCIDXX", "SampleSheet.csv" + ), + ) + self.dummy_run._set_run_parser_obj(CONFIG["analysis"]["NovaSeq"]) + expected_mask = { + "1": { + "Y151I7N3I7N3": { + "base_mask": ["Y151", "I7N3", "I7N3"], + "data": [ + { + "index": "CGCGCAG", + "Lane": "1", + "Sample_ID": "Sample_P10000_1001", + "Sample_Project": "A_Test_18_01", + "Sample_Name": "Sample_P10000_1001", + "index2": "CTGCGCG", + } + ], + 
}, + "Y151I7N3N10": { + "base_mask": ["Y151", "I7N3", "N10"], + "data": [ + { + "index": "AGGTACC", + "Lane": "1", + "Sample_ID": "Sample_P10000_1005", + "Sample_Project": "A_Test_18_01", + "Sample_Name": "Sample_P10000_1005", + "index2": "", + } + ], + }, + } + } got_mask = self.dummy_run._generate_per_lane_base_mask() self.assertEqual(expected_mask, got_mask) def test_compute_base_mask(self): """Compute Run base mask.""" - runSetup = [{'IsIndexedRead': 'N', 'NumCycles': '151', 'Number': '1'}, - {'IsIndexedRead': 'Y', 'NumCycles': '8', 'Number': '2'}, - {'IsIndexedRead': 'Y', 'NumCycles': '8', 'Number': '3'}, - {'IsIndexedRead': 'N', 'NumCycles': '151', 'Number': '4'}] + runSetup = [ + {"IsIndexedRead": "N", "NumCycles": "151", "Number": "1"}, + {"IsIndexedRead": "Y", "NumCycles": "8", "Number": "2"}, + {"IsIndexedRead": "Y", "NumCycles": "8", "Number": "3"}, + {"IsIndexedRead": "N", "NumCycles": "151", "Number": "4"}, + ] index_size = 7 dual_index_sample = True index2_size = 7 - got_mask = self.dummy_run._compute_base_mask(runSetup, index_size, dual_index_sample, index2_size) - expected_mask = ['Y151', 'I7N1', 'I7N1', 'Y151'] + got_mask = self.dummy_run._compute_base_mask( + runSetup, index_size, dual_index_sample, index2_size + ) + expected_mask = ["Y151", "I7N1", "I7N1", "Y151"] self.assertEqual(got_mask, expected_mask) - @mock.patch('taca.illumina.Runs.misc.call_external_command') + @mock.patch("taca.illumina.Runs.misc.call_external_command") def test_transfer_run(self, mock_call_external_command): """Call external rsync.""" self.completed.transfer_run(self.transfer_file) - command_line = ['rsync', '-LtDrv', '--chmod=g+rw', - '--exclude=Demultiplexing_*/*_*', - '--include=*/', '--include=*.file', - '--exclude=*', '--prune-empty-dirs', - os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX'), - 'None@None:None'] - mock_call_external_command.assert_called_once_with(command_line, - log_dir=os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX'), - prefix='', - with_log_files=True) - - @mock.patch('taca.illumina.Runs.misc.call_external_command') + command_line = [ + "rsync", + "-LtDrv", + "--chmod=g+rw", + "--exclude=Demultiplexing_*/*_*", + "--include=*/", + "--include=*.file", + "--exclude=*", + "--prune-empty-dirs", + os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + "None@None:None", + ] + mock_call_external_command.assert_called_once_with( + command_line, + log_dir=os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + prefix="", + with_log_files=True, + ) + + @mock.patch("taca.illumina.Runs.misc.call_external_command") def test_transfer_run_error(self, mock_call_external_command): """Handle external rsync error.""" - mock_call_external_command.side_effect = subprocess.CalledProcessError(1, 'some error') + mock_call_external_command.side_effect = subprocess.CalledProcessError( + 1, "some error" + ) with self.assertRaises(subprocess.CalledProcessError): self.completed.transfer_run(self.transfer_file) - @mock.patch('taca.illumina.Runs.shutil.move') + @mock.patch("taca.illumina.Runs.shutil.move") def test_archive_run(self, mock_move): """Move file to archive.""" self.completed.archive_run(self.archive_dir) - mock_move.assert_called_once_with(os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX'), - os.path.join(self.archive_dir, '141124_ST-COMPLETED1_01_AFCIDXX')) + mock_move.assert_called_once_with( + os.path.join(self.tmp_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + os.path.join(self.archive_dir, "141124_ST-COMPLETED1_01_AFCIDXX"), + ) - 
@mock.patch('taca.illumina.Runs.misc.send_mail') + @mock.patch("taca.illumina.Runs.misc.send_mail") def test_send_mail(self, mock_send_mail): """Send mail to user.""" - self.completed.send_mail('Hello', 'user@email.com') - mock_send_mail.assert_called_once_with('141124_ST-COMPLETED1_01_AFCIDXX', 'Hello', 'user@email.com') + self.completed.send_mail("Hello", "user@email.com") + mock_send_mail.assert_called_once_with( + "141124_ST-COMPLETED1_01_AFCIDXX", "Hello", "user@email.com" + ) def test_is_unpooled_lane(self): """Check if lane is unpooled.""" - self.assertTrue(self.in_progress.is_unpooled_lane('2')) + self.assertTrue(self.in_progress.is_unpooled_lane("2")) def test_get_samples_per_lane(self): """Return samples from samplesheet.""" - expected_samples = {'1': 'P10000_1001', '2': 'P10000_1005', '3': 'P10000_1006', '4': 'P10000_1007'} - got_samples = self.in_progress.get_samples_per_lane() + expected_samples = { + "1": "P10000_1001", + "2": "P10000_1005", + "3": "P10000_1006", + "4": "P10000_1007", + } + got_samples = self.in_progress.get_samples_per_lane() self.assertEqual(expected_samples, got_samples) - @mock.patch('taca.illumina.Runs.os.rename') + @mock.patch("taca.illumina.Runs.os.rename") def test_rename_undet(self, mock_rename): """Prepend sample name to file name.""" - samples_per_lane = {'1': 'P10000_1001', '2': 'P10000_1005'} - lane = '1' + samples_per_lane = {"1": "P10000_1001", "2": "P10000_1005"} + lane = "1" self.completed._rename_undet(lane, samples_per_lane) - old_name = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX', 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz') - new_name = os.path.join(self.tmp_dir, '141124_ST-COMPLETED1_01_AFCIDXX', 'Demultiplexing', 'P10000_1001_Undetermined_L011_R1_001.fastq.gz') + old_name = os.path.join( + self.tmp_dir, + "141124_ST-COMPLETED1_01_AFCIDXX", + "Demultiplexing", + "Undetermined_S0_L001_R1_001.fastq.gz", + ) + new_name = os.path.join( + self.tmp_dir, + "141124_ST-COMPLETED1_01_AFCIDXX", + "Demultiplexing", + "P10000_1001_Undetermined_L011_R1_001.fastq.gz", + ) mock_rename.assert_called_once_with(old_name, new_name) - @mock.patch('taca.illumina.Runs.os.symlink') + @mock.patch("taca.illumina.Runs.os.symlink") def test_aggregate_demux_results_simple_complex(self, mock_symlink): """Aggregare demux results simple case.""" self.assertTrue(self.in_progress_done._aggregate_demux_results_simple_complex()) - calls = [mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/DemultiplexingStats.xml'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/DemultiplexingStats.xml')), - mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/AdapterTrimming.txt'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/AdapterTrimming.txt')), - mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/ConversionStats.xml'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/ConversionStats.xml')), - mock.call(os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/Stats.json'), - os.path.join(self.tmp_dir, '141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/Stats.json'))] + calls = [ + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/DemultiplexingStats.xml", + ), + os.path.join( + self.tmp_dir, + 
"141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/DemultiplexingStats.xml", + ), + ), + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/AdapterTrimming.txt", + ), + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/AdapterTrimming.txt", + ), + ), + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/ConversionStats.xml", + ), + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/ConversionStats.xml", + ), + ), + mock.call( + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing_0/Stats/Stats.json", + ), + os.path.join( + self.tmp_dir, + "141124_ST-INPROGRESSDONE1_02_AFCIDXX/Demultiplexing/Stats/Stats.json", + ), + ), + ] mock_symlink.assert_has_calls(calls) - @mock.patch('taca.illumina.Runs.json.dump') + @mock.patch("taca.illumina.Runs.json.dump") def test_aggregate_demux_results_simple_complex_complex(self, mock_json_dump): """Aggregare demux results complex case.""" self.assertTrue(self.complex_run._aggregate_demux_results_simple_complex()) @@ -391,22 +646,23 @@ def test_aggregate_demux_results_simple_complex_fail(self): def test_create_folder_structure(self): """Make directory structure.""" root = self.tmp_dir - dirs = ['dir1', 'dir2'] + dirs = ["dir1", "dir2"] path = _create_folder_structure(root, dirs) - self.assertEqual(path, os.path.join(self.tmp_dir, 'dir1/dir2')) + self.assertEqual(path, os.path.join(self.tmp_dir, "dir1/dir2")) def test_generate_lane_html(self): """Generate lane HTML.""" - html_report = 'data/lane.html' + html_report = "data/lane.html" html_report_lane_parser = LaneBarcodeParser(html_report) - html_file = os.path.join(self.tmp_dir, 'generated_lane.html') - expected_file = 'data/lane_result.html' + html_file = os.path.join(self.tmp_dir, "generated_lane.html") + expected_file = "data/lane_result.html" _generate_lane_html(html_file, html_report_lane_parser) self.assertTrue(filecmp.cmp(html_file, expected_file)) class TestNovaSeqRuns(unittest.TestCase): """Tests for the NovaSeq_Run run class.""" + @classmethod def setUpClass(self): """Creates the following directory tree for testing purposes: @@ -415,23 +671,24 @@ def setUpClass(self): |__ 141124_ST-RUNNING1_03_AFCIDXX |__ RunInfo.xml """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") - running = os.path.join(self.tmp_dir, '141124_ST-RUNNING1_03_AFCIDXX') + running = os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX") os.makedirs(self.tmp_dir) os.makedirs(running) # Create files indicating that the run is finished - open(os.path.join(running, 'RTAComplete.txt'), 'w').close() + open(os.path.join(running, "RTAComplete.txt"), "w").close() # Move sample RunInfo.xml file to run directory - shutil.copy('data/RunInfo.xml', running) - shutil.copy('data/runParameters.xml', running) + shutil.copy("data/RunInfo.xml", running) + shutil.copy("data/runParameters.xml", running) # Create run objects - self.running = NovaSeq_Run(os.path.join(self.tmp_dir, - '141124_ST-RUNNING1_03_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.running = NovaSeq_Run( + os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) @classmethod def tearDownClass(self): @@ -439,12 +696,13 @@ def tearDownClass(self): def test_novaseq(self): """Set sequencer and run type NovaSeq.""" - 
self.assertEqual(self.running.sequencer_type, 'NovaSeq') - self.assertEqual(self.running.run_type, 'NGI-RUN') + self.assertEqual(self.running.sequencer_type, "NovaSeq") + self.assertEqual(self.running.run_type, "NGI-RUN") class TestNextSeqRuns(unittest.TestCase): """Tests for the NextSeq_Run run class.""" + @classmethod def setUpClass(self): """Creates the following directory tree for testing purposes: @@ -453,23 +711,24 @@ def setUpClass(self): |__ 141124_ST-RUNNING1_03_AFCIDXX |__ RunInfo.xml """ - self.tmp_dir = os.path.join(tempfile.mkdtemp(), 'tmp') + self.tmp_dir = os.path.join(tempfile.mkdtemp(), "tmp") - running = os.path.join(self.tmp_dir, '141124_ST-RUNNING1_03_AFCIDXX') + running = os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX") os.makedirs(self.tmp_dir) os.makedirs(running) # Create files indicating that the run is finished - open(os.path.join(running, 'RTAComplete.txt'), 'w').close() + open(os.path.join(running, "RTAComplete.txt"), "w").close() # Move sample RunInfo.xml file to run directory - shutil.copy('data/RunInfo.xml', running) - shutil.copy('data/runParameters.xml', running) + shutil.copy("data/RunInfo.xml", running) + shutil.copy("data/runParameters.xml", running) # Create run objects - self.running = NextSeq_Run(os.path.join(self.tmp_dir, - '141124_ST-RUNNING1_03_AFCIDXX'), - CONFIG['analysis']['NovaSeq']) + self.running = NextSeq_Run( + os.path.join(self.tmp_dir, "141124_ST-RUNNING1_03_AFCIDXX"), + CONFIG["analysis"]["NovaSeq"], + ) @classmethod def tearDownClass(self): @@ -477,5 +736,5 @@ def tearDownClass(self): def test_nextseq(self): """Set sequencer and run type NextSeq.""" - self.assertEqual(self.running.sequencer_type, 'NextSeq') - self.assertEqual(self.running.run_type, 'NGI-RUN') + self.assertEqual(self.running.sequencer_type, "NextSeq") + self.assertEqual(self.running.run_type, "NGI-RUN") diff --git a/tests/test_instrument_transfer.py b/tests/test_instrument_transfer.py index 60a1533b..81c3ae5d 100644 --- a/tests/test_instrument_transfer.py +++ b/tests/test_instrument_transfer.py @@ -1,10 +1,12 @@ -from taca.nanopore import instrument_transfer -from unittest.mock import patch, mock_open, call, Mock, MagicMock -import tempfile -import pytest +import json import os import re -import json +import tempfile +from unittest.mock import Mock, call, mock_open, patch + +import pytest + +from taca.nanopore import instrument_transfer DUMMY_RUN_NAME = "20240112_2342_MN19414_TEST12345_randomhash" @@ -13,7 +15,7 @@ @pytest.fixture -def setup_test_fixture() -> (Mock, tempfile.TemporaryDirectory, dict): +def setup_test_fixture(): """Set up tempdir to mimic an ONT instrument file system""" tmp = tempfile.TemporaryDirectory() @@ -78,14 +80,15 @@ def setup_test_fixture() -> (Mock, tempfile.TemporaryDirectory, dict): def test_main_ignore_CTC(setup_test_fixture): - """Check so that runs on configuration test cells are not picked up. 
- """ + """Check so that runs on configuration test cells are not picked up.""" # Run fixture args, tmp, file_paths = setup_test_fixture # Setup run - run_path = f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('TEST', 'CTC')}" + run_path = ( + f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('TEST', 'CTC')}" + ) os.makedirs(run_path) with patch("taca.nanopore.instrument_transfer.dump_path") as mock_dump_path: @@ -105,7 +108,9 @@ def test_main_ignore_col3(setup_test_fixture): args, tmp, file_paths = setup_test_fixture # Setup run - run_path = f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('MN19414', '3A')}" + run_path = ( + f"{args.source_dir}/experiment/sample/{DUMMY_RUN_NAME.replace('MN19414', '3A')}" + ) os.makedirs(run_path) with patch("taca.nanopore.instrument_transfer.dump_path") as mock_dump_path: @@ -113,7 +118,7 @@ def test_main_ignore_col3(setup_test_fixture): instrument_transfer.main(args) # Check dump_path was not called - mock_dump_path.assert_not_called() + mock_dump_path.assert_not_called() @pytest.mark.parametrize( @@ -156,9 +161,7 @@ def test_main(mock_sync, mock_final_sync, setup_test_fixture, finished, qc): # Check path was dumped assert os.path.exists(run_path + "/run_path.txt") - assert open(run_path + "/run_path.txt", "r").read() == "/".join( - run_path.split("/")[-3:] - ) + assert open(run_path + "/run_path.txt").read() == "/".join(run_path.split("/")[-3:]) # Check pore count history was dumped assert os.path.exists(run_path + "/pore_count_history.csv") @@ -179,7 +182,7 @@ def test_main(mock_sync, mock_final_sync, setup_test_fixture, finished, qc): ) + "\n" ) - assert open(run_path + "/pore_count_history.csv", "r").read() == template + assert open(run_path + "/pore_count_history.csv").read() == template def test_sequencing_finished(): @@ -236,7 +239,7 @@ def test_final_sync_to_storage( run_dir="run_dir", destination="destination", archive_dir="archive_dir", - log="log_path", + rsync_log="log_path", ) assert mock_run.call_args_list[0] == call( @@ -263,7 +266,7 @@ def test_final_sync_to_storage( run_dir="run_dir", destination="destination", archive_dir="archive_dir", - log="log_path", + rsync_log="log_path", ) assert mock_run.call_count == 3 @@ -326,7 +329,6 @@ def test_archive_finished_run(): def test_parse_position_logs(setup_test_fixture): - # Run fixture args, tmp, file_paths = setup_test_fixture @@ -340,7 +342,6 @@ def test_parse_position_logs(setup_test_fixture): assert len(logs) == len(set(logs_as_strings)) for entry in logs: - assert re.match(r"^(MN19414)|(1A)$", entry["position"]) assert re.match(r"^2024-01-01 0\d:0\d:0\d.0\d$", entry["timestamp"]) assert re.match(r"^INFO: [a-z\._]+ \(user_messages\)$", entry["category"]) @@ -351,7 +352,6 @@ def test_parse_position_logs(setup_test_fixture): def test_get_pore_counts(setup_test_fixture): - # Run fixture args, tmp, file_paths = setup_test_fixture @@ -366,7 +366,6 @@ def test_get_pore_counts(setup_test_fixture): assert len(logs) == len(set(pore_counts_as_strings)) for entry in pore_counts: - assert re.match(r"^(TEST12345)|(PAM12345)$", entry["flow_cell_id"]) assert re.match(r"^(MN19414)|(1A)$", entry["position"]) assert re.match(r"^2024-01-01 0\d:0\d:0\d.0\d$", entry["timestamp"]) @@ -377,7 +376,6 @@ def test_get_pore_counts(setup_test_fixture): def test_dump_pore_count_history(setup_test_fixture): - # Run fixture args, tmp, file_paths = setup_test_fixture @@ -389,7 +387,7 @@ def test_dump_pore_count_history(setup_test_fixture): run_path = tmp.name + 
f"/experiment/sample/{DUMMY_RUN_NAME.replace('TEST','FLG')}" os.makedirs(run_path) new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts) - assert open(new_file, "r").read() == "" + assert open(new_file).read() == "" tmp.cleanup() # Nothing to add, file is present @@ -398,7 +396,7 @@ def test_dump_pore_count_history(setup_test_fixture): os.makedirs(run_path) open(run_path + "/pore_count_history.csv", "w").write("test") new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts) - assert open(new_file, "r").read() == "test" + assert open(new_file).read() == "test" tmp.cleanup() # Something to add @@ -424,5 +422,5 @@ def test_dump_pore_count_history(setup_test_fixture): + "\n" ) - assert open(new_file, "r").read() == template + assert open(new_file).read() == template tmp.cleanup() diff --git a/tests/test_nanopore.py b/tests/test_nanopore.py index cb1e1a15..03ed4bc0 100644 --- a/tests/test_nanopore.py +++ b/tests/test_nanopore.py @@ -1,12 +1,12 @@ #!/usr/bin/env python -import unittest -import mock import filecmp import os import subprocess +import unittest +from unittest import mock -from taca.nanopore.ONT_run_classes import ONT_run from taca.nanopore.minion_run_class import MinIONqc +from taca.nanopore.ONT_run_classes import ONT_run from taca.utils import config CONFIG = config.load_yaml_config("data/taca_test_nanopore_cfg.yaml") @@ -14,42 +14,59 @@ class TestNanopore(unittest.TestCase): """Test Nanopore class""" + def test_is_not_transferred(self): """Check if nanopore run has been transferred.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" np_run = ONT_run(run_dir) - np_run.transfer_log = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('transfer').get('transfer_file') + np_run.transfer_log = ( + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("transfer") + .get("transfer_file") + ) self.assertTrue(np_run.is_not_transferred()) - run_dir_transf = 'data/nanopore_data/run4/done_demuxing/20200105_1412_MN19414_AAU645_68125dc2' + run_dir_transf = "data/nanopore_data/run4/done_demuxing/20200105_1412_MN19414_AAU645_68125dc2" np_run_transf = ONT_run(run_dir_transf) - np_run_transf.transfer_log = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('transfer').get('transfer_file') + np_run_transf.transfer_log = ( + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("transfer") + .get("transfer_file") + ) self.assertFalse(np_run_transf.is_not_transferred()) - @mock.patch('taca.nanopore.nanopore.RsyncAgent') + @mock.patch("taca.nanopore.nanopore.RsyncAgent") def test_transfer_run(self, mock_rsync): """Start rsync of finished run.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" np_run = ONT_run(run_dir) - transfer_details = CONFIG.get('nanopore_analysis').get('minion_qc_run').get('transfer') + transfer_details = ( + CONFIG.get("nanopore_analysis").get("minion_qc_run").get("transfer") + ) np_run.transfer_run(transfer_details) - rsync_opts = {'-LtDrv': None, - '--chown': ':ngi2016003', - '--chmod' : 'Dg+s,g+rw', - '-r' : None, - '--exclude' : 'work'} - mock_rsync.assert_called_with(run_dir, - dest_path='some_dir', - remote_host='some_host', - remote_user='some_user', - validate=False, - opts=rsync_opts) - - @mock.patch('taca.nanopore.nanopore.shutil.move') + 
rsync_opts = { + "-LtDrv": None, + "--chown": ":ngi2016003", + "--chmod": "Dg+s,g+rw", + "-r": None, + "--exclude": "work", + } + mock_rsync.assert_called_with( + run_dir, + dest_path="some_dir", + remote_host="some_host", + remote_user="some_user", + validate=False, + opts=rsync_opts, + ) + + @mock.patch("taca.nanopore.nanopore.shutil.move") def test_archive_run(self, mock_move): """Move directory to archive.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" np_run = ONT_run(run_dir) - np_run.archive_dir = '/some/dir' + np_run.archive_dir = "/some/dir" np_run.archive_run() mock_move.assert_called_once() @@ -59,89 +76,123 @@ class TestMinION(unittest.TestCase): def test_get_original_samplesheet(self): """Get location of lims sample sheet.""" - run_dir = 'data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2' + run_dir = "data/nanopore_data/run2/done_sequencing/20200102_1412_MN19414_AAU642_68125dc2" run = MinIONqc(run_dir, None, None) run._get_anglerfish_samplesheet() - expected_sample_sheet = 'data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv' + expected_sample_sheet = "data/nanopore_samplesheets/2020/QC_SQK-LSK109_AAU642_Samplesheet_22-594126.csv" self.assertEqual(run.lims_samplesheet, expected_sample_sheet) def test_parse_samplesheet(self): """Make nanoseq sample sheet from lims sample sheet.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) - run.lims_samplesheet = 'data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv' + run.lims_samplesheet = "data/nanopore_samplesheets/2020/DELIVERY_SQK-LSK109_AAU644_Samplesheet_24-594126.csv" run._parse_samplesheet() - self.assertTrue(filecmp.cmp(run.nanoseq_sample_sheet, 'data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv')) - self.assertTrue(filecmp.cmp(run.anglerfish_sample_sheet, 'data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv')) + self.assertTrue( + filecmp.cmp( + run.nanoseq_sample_sheet, + "data/nanopore_samplesheets/expected/SQK-LSK109_sample_sheet.csv", + ) + ) + self.assertTrue( + filecmp.cmp( + run.anglerfish_sample_sheet, + "data/nanopore_samplesheets/expected/anglerfish_sample_sheet.csv", + ) + ) - @mock.patch('taca.nanopore.minion.MinIONqc._get_flowcell_product_code') - @mock.patch('taca.nanopore.minion.MinIONqc._is_multiplexed') - @mock.patch('taca.nanopore.minion.subprocess.Popen') - def test_start_analysis_pipeline_multiplexed(self, mock_popen, mock_is_multiplexed, mock_get_fc_code): + @mock.patch("taca.nanopore.minion.MinIONqc._get_flowcell_product_code") + @mock.patch("taca.nanopore.minion.MinIONqc._is_multiplexed") + @mock.patch("taca.nanopore.minion.subprocess.Popen") + def test_start_analysis_pipeline_multiplexed( + self, mock_popen, mock_is_multiplexed, mock_get_fc_code + ): """Submit detached nanoseq job for multiplexed data.""" - mock_get_fc_code.return_value = 'FLO-FLG001' + mock_get_fc_code.return_value = "FLO-FLG001" mock_is_multiplexed.return_value = True - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - sample_sheet = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' + run_dir = 
"data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + sample_sheet = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" run = MinIONqc(run_dir, sample_sheet, None) run.start_nanoseq() - expected_parameters = ('nextflow run nf-core/nanoseq' - + ' -r ' + CONFIG.get('nanopore_analysis').get('minion_qc_run').get('nanoseq_version') - + ' --input ' + sample_sheet - + ' --protocol DNA' - + ' --input_path ' + os.path.join(run_dir, 'fast5') - + ' --outdir ' + os.path.join(run_dir, 'nanoseq_output') - + ' --flowcell FLO-FLG001' - + ' --guppy_gpu' - + ' --skip_alignment' - + ' --skip_quantification' - + ' --kit SQK-LSK109' - + ' --max_cpus 6' - + ' --max_memory 20.GB' - + ' --barcode_kit EXP-NBD104' - + ' -profile singularity; echo $? > .exitcode_for_nanoseq') - mock_popen.assert_called_once_with(expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir) - - @mock.patch('taca.nanopore.minion.MinIONqc._get_flowcell_product_code') - @mock.patch('taca.nanopore.minion.MinIONqc._is_multiplexed') - @mock.patch('taca.nanopore.minion.subprocess.Popen') - def test_start_analysis_pipeline_not_multiplexed(self, mock_popen, mock_is_multiplexed, mock_get_fc_code): + expected_parameters = ( + "nextflow run nf-core/nanoseq" + + " -r " + + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("nanoseq_version") + + " --input " + + sample_sheet + + " --protocol DNA" + + " --input_path " + + os.path.join(run_dir, "fast5") + + " --outdir " + + os.path.join(run_dir, "nanoseq_output") + + " --flowcell FLO-FLG001" + + " --guppy_gpu" + + " --skip_alignment" + + " --skip_quantification" + + " --kit SQK-LSK109" + + " --max_cpus 6" + + " --max_memory 20.GB" + + " --barcode_kit EXP-NBD104" + + " -profile singularity; echo $? > .exitcode_for_nanoseq" + ) + mock_popen.assert_called_once_with( + expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir + ) + + @mock.patch("taca.nanopore.minion.MinIONqc._get_flowcell_product_code") + @mock.patch("taca.nanopore.minion.MinIONqc._is_multiplexed") + @mock.patch("taca.nanopore.minion.subprocess.Popen") + def test_start_analysis_pipeline_not_multiplexed( + self, mock_popen, mock_is_multiplexed, mock_get_fc_code + ): """Submit detached nanoseq job for non multiplexed data.""" - mock_get_fc_code.return_value = 'FLO-FLG001' + mock_get_fc_code.return_value = "FLO-FLG001" mock_is_multiplexed.return_value = False - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - sample_sheet = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + sample_sheet = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" run = MinIONqc(run_dir, sample_sheet, None) run.start_nanoseq() - expected_parameters = ('nextflow run nf-core/nanoseq' - + ' -r ' + CONFIG.get('nanopore_analysis').get('minion_qc_run').get('nanoseq_version') - + ' --input ' + sample_sheet - + ' --protocol DNA' - + ' --input_path ' + os.path.join(run_dir, 'fast5') - + ' --outdir ' + os.path.join(run_dir, 'nanoseq_output') - + ' --flowcell FLO-FLG001' - + ' --guppy_gpu' - + ' --skip_alignment' - + ' --skip_quantification' - + ' --kit SQK-LSK109' - + ' --max_cpus 6' - + ' --max_memory 20.GB' - + ' -profile singularity; echo $? 
> .exitcode_for_nanoseq') - mock_popen.assert_called_once_with(expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir) + expected_parameters = ( + "nextflow run nf-core/nanoseq" + + " -r " + + CONFIG.get("nanopore_analysis") + .get("minion_qc_run") + .get("nanoseq_version") + + " --input " + + sample_sheet + + " --protocol DNA" + + " --input_path " + + os.path.join(run_dir, "fast5") + + " --outdir " + + os.path.join(run_dir, "nanoseq_output") + + " --flowcell FLO-FLG001" + + " --guppy_gpu" + + " --skip_alignment" + + " --skip_quantification" + + " --kit SQK-LSK109" + + " --max_cpus 6" + + " --max_memory 20.GB" + + " -profile singularity; echo $? > .exitcode_for_nanoseq" + ) + mock_popen.assert_called_once_with( + expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir + ) def test_get_flowcell_product_code(self): """Get flowcell product code from report.md.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) got_id = run._get_flowcell_product_code() - expected_id = 'FLO-FLG001' + expected_id = "FLO-FLG001" self.assertEqual(got_id, expected_id) def test_is_multiplexed(self): """Return True if run is multiplexed, else False.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - multiplexed_sample_sheet = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' - non_multiplexed_sample_sheet = 'data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + multiplexed_sample_sheet = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" + non_multiplexed_sample_sheet = "data/nanopore_data/run3/demultiplexing/20200103_1412_MN19414_AAU643_68125dc2/SQK-LSK109_AAU643_sample_sheet.csv" multiplexed_run = MinIONqc(run_dir, multiplexed_sample_sheet, None) non_multiplexed_run = MinIONqc(run_dir, non_multiplexed_sample_sheet, None) self.assertTrue(multiplexed_run._is_multiplexed()) @@ -149,60 +200,81 @@ def test_is_multiplexed(self): def test_get_barcode_kit(self): """Return EXP-NBD104 or EXP-NBD114 barcode kit based on sample sheet.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - sample_sheet_104 = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + sample_sheet_104 = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/SQK-LSK109_sample_sheet.csv" run_104 = MinIONqc(run_dir, sample_sheet_104, None) got_kit_104 = run_104._get_barcode_kit() - sample_sheet_114 = 'data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv' + sample_sheet_114 = "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/SQK-LSK109_sample_sheet.csv" run_114 = MinIONqc(run_dir, sample_sheet_114, None) got_kit_114 = run_114._get_barcode_kit() - self.assertEqual(got_kit_104, 'EXP-NBD104') - self.assertEqual(got_kit_114, 'EXP-NBD114') + self.assertEqual(got_kit_104, "EXP-NBD104") + self.assertEqual(got_kit_114, "EXP-NBD114") def test_check_exit_status(self): """Check nanoseq exit status 
from file.""" - run_dir_success = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir_success = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" success_run = MinIONqc(run_dir_success, None, None) - self.assertTrue(success_run.check_exit_status('data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq')) - run_dir_fail = 'data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2' + self.assertTrue( + success_run.check_exit_status( + "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/.exitcode_for_nanoseq" + ) + ) + run_dir_fail = ( + "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2" + ) fail_run = MinIONqc(run_dir_fail, None, None) - self.assertFalse(fail_run.check_exit_status('data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq')) + self.assertFalse( + fail_run.check_exit_status( + "data/nanopore_data/run8/demux_failed/20200108_1412_MN19414_AAU648_68125dc2/.exitcode_for_nanoseq" + ) + ) - @mock.patch('taca.nanopore.minion.os.makedirs') - @mock.patch('taca.nanopore.minion.subprocess.Popen') + @mock.patch("taca.nanopore.minion.os.makedirs") + @mock.patch("taca.nanopore.minion.subprocess.Popen") def test_start_anglerfish(self, mock_popen, mock_mkdir): """Start Anglerfish.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' - af_sample_sheet = 'anglerfish_sample_sheet.csv' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" + af_sample_sheet = "anglerfish_sample_sheet.csv" run = MinIONqc(run_dir, None, af_sample_sheet) run.start_anglerfish() - expected_parameters = ('anglerfish.py' - + ' --samplesheet anglerfish_sample_sheet.csv' - + ' --out_fastq data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output' - + ' --threads 2' - + ' --skip_demux' - + ' --skip_fastqc; echo $? > .exitcode_for_anglerfish') - mock_popen.assert_called_once_with(expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir) - - @mock.patch('taca.nanopore.minion.MinIONqc._find_anglerfish_results') - @mock.patch('taca.nanopore.minion.shutil.copyfile') + expected_parameters = ( + "anglerfish.py" + + " --samplesheet anglerfish_sample_sheet.csv" + + " --out_fastq data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output" + + " --threads 2" + + " --skip_demux" + + " --skip_fastqc; echo $? 
> .exitcode_for_anglerfish" + ) + mock_popen.assert_called_once_with( + expected_parameters, stdout=subprocess.PIPE, shell=True, cwd=run_dir + ) + + @mock.patch("taca.nanopore.minion.MinIONqc._find_anglerfish_results") + @mock.patch("taca.nanopore.minion.shutil.copyfile") def test_copy_results_for_lims(self, mock_copy, mock_results): """Copy Anglerfish results to lims.""" - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) - anglerfish_results_path = 'anglerfish_output' - anglerfish_results_file = os.path.join(run_dir, anglerfish_results_path, 'anglerfish_2020_09_23_141922', 'anglerfish_stats.txt') - lims_results_file = 'some/dir/2020/anglerfish_stats_AAU644.txt' + anglerfish_results_path = "anglerfish_output" + anglerfish_results_file = os.path.join( + run_dir, + anglerfish_results_path, + "anglerfish_2020_09_23_141922", + "anglerfish_stats.txt", + ) + lims_results_file = "some/dir/2020/anglerfish_stats_AAU644.txt" mock_results.return_value = anglerfish_results_file run.copy_results_for_lims() mock_copy.assert_called_once_with(anglerfish_results_file, lims_results_file) def test_find_anglerfish_results(self): """Locate Anglerfish results file.""" - anglerfish_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output' - run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2' + anglerfish_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2/anglerfish_output" + run_dir = "data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2" run = MinIONqc(run_dir, None, None) found_file = run._find_anglerfish_results() - expected_file = os.path.join(anglerfish_dir, 'anglerfish_2020_09_23_141922', 'anglerfish_stats.txt') + expected_file = os.path.join( + anglerfish_dir, "anglerfish_2020_09_23_141922", "anglerfish_stats.txt" + ) self.assertEqual(expected_file, found_file) diff --git a/tests/test_server_status.py b/tests/test_server_status.py index 2d24c83d..44690644 100644 --- a/tests/test_server_status.py +++ b/tests/test_server_status.py @@ -1,17 +1,19 @@ #!/usr/bin/env python import unittest -import mock +from unittest import mock + import crontab -from taca.server_status import server_status, cronjobs +from taca.server_status import cronjobs, server_status from taca.utils import config -CONFIG = config.load_yaml_config('data/taca_test_cfg.yaml') +CONFIG = config.load_yaml_config("data/taca_test_cfg.yaml") INITAL_TAB = """ # First Comment 0,30 * * * * firstcommand """ + class TestServerStatus(unittest.TestCase): def test_get_nases_disk_space(self): """Get disk space for disk specified in config file.""" @@ -20,81 +22,105 @@ def test_get_nases_disk_space(self): def test_parse_output_valid_case(self): """Parse valid disk space output.""" - valid_disk_space = 'Filesystem Size Used Avail Capacity iused ifree %iused Mounted on \ - /dev/disk1s1 466Gi 59Gi 393Gi 14% 1062712 4881390168 0% /System/Volumes/Data' - expected_result = {'disk_size': '14%', - 'mounted_on': '/System/Volumes/Data', - 'available_percentage': '100%', - 'space_used': '1062712', - 'used_percentage': '0%', - 'filesystem': '393Gi', - 'space_available': '4881390168'} + valid_disk_space = "Filesystem Size Used Avail Capacity iused ifree %iused Mounted on \ + /dev/disk1s1 466Gi 59Gi 393Gi 14% 1062712 4881390168 0% /System/Volumes/Data" + expected_result = { + 
"disk_size": "14%", + "mounted_on": "/System/Volumes/Data", + "available_percentage": "100%", + "space_used": "1062712", + "used_percentage": "0%", + "filesystem": "393Gi", + "space_available": "4881390168", + } got_result = server_status._parse_output(valid_disk_space) self.assertEqual(expected_result, got_result) def test_parse_output_invalid_case(self): """Parse invalid disk space output.""" - invalid_disk_space = '' + invalid_disk_space = "" expected_invalid_result = { - 'disk_size': 'NaN', - 'space_used': 'NaN', - 'space_available': 'NaN', - 'used_percentage': 'NaN', - 'available_percentage': 'NaN', - 'mounted_on': 'NaN', - 'filesystem': 'NaN' + "disk_size": "NaN", + "space_used": "NaN", + "space_available": "NaN", + "used_percentage": "NaN", + "available_percentage": "NaN", + "mounted_on": "NaN", + "filesystem": "NaN", } invalid_result = server_status._parse_output(invalid_disk_space) self.assertEqual(expected_invalid_result, invalid_result) - @mock.patch('taca.server_status.server_status.statusdb') + @mock.patch("taca.server_status.server_status.statusdb") def test_update_status_db(self, mock_couchdb): """Update statusdb.""" - disk_space = {'localhost': {'disk_size': '14%', 'mounted_on': '/System/Volumes/Data', 'available_percentage': '100%', 'space_used': '1061701', 'used_percentage': '0%', 'filesystem': '393Gi', 'space_available': '4881391179'}} - server_status.update_status_db(disk_space, server_type='nas') + disk_space = { + "localhost": { + "disk_size": "14%", + "mounted_on": "/System/Volumes/Data", + "available_percentage": "100%", + "space_used": "1061701", + "used_percentage": "0%", + "filesystem": "393Gi", + "space_available": "4881391179", + } + } + server_status.update_status_db(disk_space, server_type="nas") class TestCronjobs(unittest.TestCase): - @mock.patch('taca.server_status.cronjobs.CronTab') - @mock.patch('taca.server_status.cronjobs.getpass.getuser') + @mock.patch("taca.server_status.cronjobs.CronTab") + @mock.patch("taca.server_status.cronjobs.getpass.getuser") def test_parse_crontab(self, mock_getpass, mock_crontab): """Parse crontab.""" mock_crontab.return_value = crontab.CronTab(tab=INITAL_TAB) - mock_getpass.return_value = 'test_user' - expected_crontab = {'test_user': - [{'Comment': u'First Comment', - 'Day of month': '*', - 'Command': u'firstcommand', - 'Hour': '*', - 'Day of week': '*', - 'Enabled': True, - 'Special syntax': '', - 'Minute': '0,30', - 'Month': '*'}] + mock_getpass.return_value = "test_user" + expected_crontab = { + "test_user": [ + { + "Comment": "First Comment", + "Day of month": "*", + "Command": "firstcommand", + "Hour": "*", + "Day of week": "*", + "Enabled": True, + "Special syntax": "", + "Minute": "0,30", + "Month": "*", + } + ] } got_crontab = cronjobs._parse_crontab() self.assertEqual(expected_crontab, got_crontab) - @mock.patch('taca.server_status.cronjobs.statusdb') - @mock.patch('taca.server_status.cronjobs.logging') - @mock.patch('taca.server_status.cronjobs.platform') - @mock.patch('taca.server_status.cronjobs._parse_crontab') - def test_update_cronjob_db(self, mock_parser, mock_platform, mock_logging, mock_statusdb): + @mock.patch("taca.server_status.cronjobs.statusdb") + @mock.patch("taca.server_status.cronjobs.logging") + @mock.patch("taca.server_status.cronjobs.platform") + @mock.patch("taca.server_status.cronjobs._parse_crontab") + def test_update_cronjob_db( + self, mock_parser, mock_platform, mock_logging, mock_statusdb + ): """Update couchdb with cronjobs.""" - mock_parser.return_value = {'test_user': - [{'Comment': 
u'First Comment', - 'Day of month': '*', - 'Command': u'firstcommand', - 'Hour': '*', - 'Day of week': '*', - 'Enabled': True, - 'Special syntax': '', - 'Minute': '0,30', - 'Month': '*'}] + mock_parser.return_value = { + "test_user": [ + { + "Comment": "First Comment", + "Day of month": "*", + "Command": "firstcommand", + "Hour": "*", + "Day of week": "*", + "Enabled": True, + "Special syntax": "", + "Minute": "0,30", + "Month": "*", + } + ] } - mock_platform.node.return_value = 'server.name' + mock_platform.node.return_value = "server.name" cronjobs.update_cronjob_db() - calls = [mock.call.info('Connecting to database: url'), - mock.call.warning('Document has not been created/updated')] + calls = [ + mock.call.info("Connecting to database: url"), + mock.call.warning("Document has not been created/updated"), + ] mock_logging.assert_has_calls(calls) diff --git a/tests/test_utils.py b/tests/test_utils.py index c4f6f2d9..79e17645 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,17 +1,15 @@ """Unit tests for the utils helper functions.""" -import hashlib -import mock import os import shutil import subprocess import tempfile -import unittest import time -import couchdb +import unittest from collections import defaultdict -from taca.utils import misc, filesystem, transfer, config, bioinfo_tab, statusdb -from six.moves import map +from unittest import mock + +from taca.utils import bioinfo_tab, config, filesystem, misc, statusdb, transfer class TestMisc(unittest.TestCase): @@ -19,17 +17,15 @@ class TestMisc(unittest.TestCase): @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_misc') - self.hashfile = os.path.join(self.rootdir, 'test_hashfile') - with open(self.hashfile, 'w') as fh: - fh.write('This is some contents\n') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_misc") + self.hashfile = os.path.join(self.rootdir, "test_hashfile") + with open(self.hashfile, "w") as fh: + fh.write("This is some contents\n") self.hashfile_digests = { - 'SHA256': - '4f075ae76b480bb0200dab01cd304f4045e04cd2b73e88b89549e5ac1627f222', - 'MD5': - 'c8498fc299bc3e22690045f1b62ce4e9', - 'SHA1': - '098fb272dfdae2ea1ba57c795dd325fa70e3c3fb'} + "SHA256": "4f075ae76b480bb0200dab01cd304f4045e04cd2b73e88b89549e5ac1627f222", + "MD5": "c8498fc299bc3e22690045f1b62ce4e9", + "SHA1": "098fb272dfdae2ea1ba57c795dd325fa70e3c3fb", + } @classmethod def tearDownClass(self): @@ -46,42 +42,48 @@ def test_hashfile_dir(self): def test_multiple_hashfile_calls(self): """Ensure that the hasher object is cleared between subsequent calls.""" - assert misc.hashfile(self.hashfile, hasher='sha1') == misc.hashfile(self.hashfile, 'sha1') + assert misc.hashfile(self.hashfile, hasher="sha1") == misc.hashfile( + self.hashfile, "sha1" + ) - @mock.patch('taca.utils.misc.smtplib.SMTP') + @mock.patch("taca.utils.misc.smtplib.SMTP") def test_send_mail(self, mock_smtplib): """Test send email.""" - assert misc.send_mail('subject', 'content', 'receiver') is None - mock_smtplib.assert_called_with('localhost') - mock_smtplib().sendmail.assert_called_with('TACA', ['receiver'], mock.ANY) + assert misc.send_mail("subject", "content", "receiver") is None + mock_smtplib.assert_called_with("localhost") + mock_smtplib().sendmail.assert_called_with("TACA", ["receiver"], mock.ANY) with self.assertRaises(SystemExit): - misc.send_mail('subject', 'content', None) + misc.send_mail("subject", "content", None) def test_call_external_command_pass(self): """Call external command.""" - new_file = os.path.join(self.rootdir, 
'test_call_external') - command = 'touch ' + new_file - log_dir = os.path.join(self.rootdir, 'log_tests') - misc.call_external_command(command, with_log_files=True, prefix='test', log_dir=log_dir) + new_file = os.path.join(self.rootdir, "test_call_external") + command = "touch " + new_file + log_dir = os.path.join(self.rootdir, "log_tests") + misc.call_external_command( + command, with_log_files=True, prefix="test", log_dir=log_dir + ) assert os.path.isfile(new_file) - assert os.path.isfile(os.path.join(self.rootdir, 'log_tests', 'test_touch.out')) + assert os.path.isfile(os.path.join(self.rootdir, "log_tests", "test_touch.out")) def test_call_external_command_fail(self): """Call external command should handle error.""" - command = 'ls -E' + command = "ls -E" with self.assertRaises(subprocess.CalledProcessError): misc.call_external_command(command) def test_call_external_command_detached(self): """Call external command detached.""" - new_file = os.path.join(self.rootdir, 'test_call_external_det') - command = 'touch ' + new_file - misc.call_external_command_detached(command, with_log_files=True, prefix='test_det') + new_file = os.path.join(self.rootdir, "test_call_external_det") + command = "touch " + new_file + misc.call_external_command_detached( + command, with_log_files=True, prefix="test_det" + ) time.sleep(0.1) self.assertTrue(os.path.isfile(new_file)) - self.assertTrue(os.path.isfile('test_det_touch.out')) - os.remove('test_det_touch.out') - os.remove('test_det_touch.err') + self.assertTrue(os.path.isfile("test_det_touch.out")) + os.remove("test_det_touch.out") + os.remove("test_det_touch.err") def test_to_seconds(self): """Transform days and hours to seconds.""" @@ -92,85 +94,92 @@ def test_to_seconds(self): self.assertEqual(misc.to_seconds(days=1), 86400) self.assertEqual(misc.to_seconds(hours=1), 3600) - @mock.patch('taca.utils.misc.input', return_value='yes') + @mock.patch("taca.utils.misc.input", return_value="yes") def test_query_yes_no_true(self, mock_raw_input): """Return True from answer yes.""" - response = misc.query_yes_no('Some question') + response = misc.query_yes_no("Some question") self.assertTrue(response) - @mock.patch('taca.utils.misc.input', return_value='no') + @mock.patch("taca.utils.misc.input", return_value="no") def test_query_yes_no_false(self, mock_raw_input): """Return False from answer no.""" - response = misc.query_yes_no('Some question') + response = misc.query_yes_no("Some question") self.assertFalse(response) def test_return_unique(self): """Return unique items in a list.""" - input_list = ['a', 'b', 'a', 'c'] + input_list = ["a", "b", "a", "c"] returned_list = misc.return_unique(input_list) - expected_list = ['a', 'b', 'c'] + expected_list = ["a", "b", "c"] self.assertEqual(returned_list, expected_list) - @mock.patch('taca.utils.misc.statusdb') + @mock.patch("taca.utils.misc.statusdb") def test_run_is_demuxed(self, mock_couch): """Check in StatusDB if run was demultiplexed.""" - run = '200201_A00621_0032_BHHFCFDSXX' - couch_info = {'url': 'url', - 'username': 'username', - 'password': 'pwd', - 'db': 'db'} - is_demultiplexed = misc.run_is_demuxed(run, couch_info=couch_info) - #TODO: should add a check here but not sure how to mock this properly + run = "200201_A00621_0032_BHHFCFDSXX" + couch_info = { + "url": "url", + "username": "username", + "password": "pwd", + "db": "db", + } + misc.run_is_demuxed(run, couch_info=couch_info) + # TODO: should add a check here but not sure how to mock this properly + class TestFilesystem(unittest.TestCase): """Test 
class for the filesystem functions.""" def setUp(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_filesystem') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_filesystem") def tearDown(self): shutil.rmtree(self.rootdir) def test_crete_folder_non_existing(self): """Ensure that a non-existing folder is created.""" - target_folder = os.path.join(self.rootdir,'target-non-existing') + target_folder = os.path.join(self.rootdir, "target-non-existing") self.assertTrue( filesystem.create_folder(target_folder), - 'A non-existing target folder could not be created') + "A non-existing target folder could not be created", + ) self.assertTrue( os.path.exists(target_folder), - 'A non-existing target folder was not created \ - but method returned True' + "A non-existing target folder was not created \ + but method returned True", ) def test_crete_folder_existing(self): """Ensure that an existing folder is detected.""" self.assertTrue( filesystem.create_folder(self.rootdir), - 'A pre-existing target folder was not detected') + "A pre-existing target folder was not detected", + ) def test_crete_folder_parent_non_existing(self): """Ensure that a non-existing parent folder is created.""" target_folder = os.path.join( - self.rootdir, - 'parent-non-existing', - 'target-non-existing') + self.rootdir, "parent-non-existing", "target-non-existing" + ) self.assertTrue( filesystem.create_folder(target_folder), - 'A non-existing parent and target folder could not be created') + "A non-existing parent and target folder could not be created", + ) self.assertTrue( os.path.exists(target_folder), - 'A non-existing parent folder was not created \ - but method returned True' + "A non-existing parent folder was not created \ + but method returned True", ) def test_crete_folder_exception(self): """Ensure that create_folder handles thrown exceptions gracefully.""" - with mock.patch.object(filesystem.os, 'makedirs', side_effect=OSError): + with mock.patch.object(filesystem.os, "makedirs", side_effect=OSError): self.assertFalse( filesystem.create_folder( - os.path.join(self.rootdir,'target-non-existing')), - 'A raised exception was not handled properly') + os.path.join(self.rootdir, "target-non-existing") + ), + "A raised exception was not handled properly", + ) def test_chdir(self): """Ensure start dir and end dir are the same.""" @@ -181,34 +190,35 @@ def test_chdir(self): def test_touch(self): """Make empty file.""" - new_file = os.path.join(self.rootdir, 'empty') + new_file = os.path.join(self.rootdir, "empty") filesystem.touch(new_file) self.assertTrue(os.path.isfile(new_file)) def test_do_symlink(self): """Make a symlink.""" - src = os.path.join(self.rootdir, 'source_file') - open(src, 'w').close() - dst = os.path.join(self.rootdir, 'dest_file') + src = os.path.join(self.rootdir, "source_file") + open(src, "w").close() + dst = os.path.join(self.rootdir, "dest_file") filesystem.do_symlink(src, dst) self.assertTrue(os.path.islink(dst)) def test_do_copy(self): """Copy files.""" - src_dir = os.path.join(self.rootdir, 'source_dir') - src = os.path.join(src_dir, 'source_file') + src_dir = os.path.join(self.rootdir, "source_dir") + src = os.path.join(src_dir, "source_file") os.mkdir(src_dir) - open(src, 'w').close() - dst_dir = os.path.join(self.rootdir, 'dest_dir') + open(src, "w").close() + dst_dir = os.path.join(self.rootdir, "dest_dir") filesystem.do_copy(src_dir, dst_dir) - self.assertTrue(os.path.isfile(os.path.join(dst_dir, 'source_file'))) + self.assertTrue(os.path.isfile(os.path.join(dst_dir, 
"source_file"))) + class TestTransferAgent(unittest.TestCase): """Test class for the TransferAgent class.""" @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_transfer_src') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_transfer_src") self.testfile = tempfile.mkstemp(dir=self.rootdir) @classmethod @@ -216,10 +226,10 @@ def tearDownClass(self): shutil.rmtree(self.rootdir) def setUp(self): - self.destdir = tempfile.mkdtemp(prefix='test_taca_transfer_dest') + self.destdir = tempfile.mkdtemp(prefix="test_taca_transfer_dest") self.agent = transfer.TransferAgent( - src_path=self.rootdir, - dest_path=self.destdir) + src_path=self.rootdir, dest_path=self.destdir + ) def tearDown(self): shutil.rmtree(self.destdir) @@ -230,9 +240,7 @@ def test_transfer_validate_src_path(self): self.agent.src_path = None with self.assertRaises(transfer.TransferError): self.agent.validate_src_path() - self.agent.src_path = os.path.join( - self.rootdir, - 'this-file-does-not-exist') + self.agent.src_path = os.path.join(self.rootdir, "this-file-does-not-exist") with self.assertRaises(transfer.TransferError): self.agent.validate_src_path() @@ -259,11 +267,11 @@ class TestSymlinkAgent(unittest.TestCase): @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_symlink_src') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_symlink_src") path = self.rootdir for n in range(3): - open(os.path.join(path, 'file{}'.format(n)), 'w').close() - path = os.path.join(path, 'folder{}'.format(n)) + open(os.path.join(path, f"file{n}"), "w").close() + path = os.path.join(path, f"folder{n}") os.mkdir(path) @classmethod @@ -271,51 +279,49 @@ def tearDownClass(self): shutil.rmtree(self.rootdir) def setUp(self): - self.targetdir = tempfile.mkdtemp( - prefix='test_taca_filesystem_symlink_dest') + self.targetdir = tempfile.mkdtemp(prefix="test_taca_filesystem_symlink_dest") def tearDown(self): shutil.rmtree(self.targetdir) def test_symlink_validate_transfer(self): """Verify that the dest_path was created.""" - src = os.path.join(self.rootdir, 'file0') - dst = os.path.join(self.targetdir, 'file0') + src = os.path.join(self.rootdir, "file0") + dst = os.path.join(self.targetdir, "file0") os.symlink(src, dst) self.assertTrue(transfer.SymlinkAgent(src, dst).validate_transfer()) def test_symlink_file_top_folder(self): """Symlink a single file in the top folder.""" - src = os.path.join(self.rootdir, 'file0') - target = os.path.join(self.targetdir,os.path.basename(src)) + src = os.path.join(self.rootdir, "file0") + target = os.path.join(self.targetdir, os.path.basename(src)) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_file_make_dir(self): """Symlink a single file into a non-existing folder.""" - src = os.path.join(self.rootdir, 'folder0', 'folder1', 'file2') + src = os.path.join(self.rootdir, "folder0", "folder1", "file2") target = os.path.join( - self.targetdir, - 'these', 'folders', 'should', 'be', 'created') + self.targetdir, "these", "folders", "should", "be", "created" + ) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_file_overwrite(self): """Replace an existing file with overwrite.""" - src = os.path.join(self.rootdir, 'file0') + src = os.path.join(self.rootdir, "file0") target = os.path.join(self.targetdir, os.path.basename(src)) - open(target, 'w').close() + open(target, "w").close() self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_file_not_overwrite(self): """Don't 
replace an existing file without overwrite.""" - src = os.path.join(self.rootdir, 'file0') + src = os.path.join(self.rootdir, "file0") target = os.path.join(self.targetdir, os.path.basename(src)) - open(target, 'w').close() - self.assertFalse( - transfer.SymlinkAgent(src, target, overwrite=False).transfer()) + open(target, "w").close() + self.assertFalse(transfer.SymlinkAgent(src, target, overwrite=False).transfer()) def test_symlink_file_broken(self): """Don't create a broken symlink.""" - src = os.path.join(self.rootdir, 'non-existing-file') + src = os.path.join(self.rootdir, "non-existing-file") target = os.path.join(self.targetdir, os.path.basename(src)) with self.assertRaises(transfer.TransferError): transfer.SymlinkAgent(src, target).transfer() @@ -323,24 +329,23 @@ def test_symlink_file_broken(self): def test_symlink_file_unlink_fail(self): """Failing to remove existing file should raise SymlinkError.""" src = self.rootdir - target = os.path.join(self.targetdir, 'target-file') - open(target, 'w').close() + target = os.path.join(self.targetdir, "target-file") + open(target, "w").close() with mock.patch.object( - transfer.os, - 'unlink', - side_effect=OSError('Mocked error')): + transfer.os, "unlink", side_effect=OSError("Mocked error") + ): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() def test_symlink_folder_top_folder(self): """Symlinking a top-level folder.""" - src = os.path.join(self.rootdir, 'folder0') + src = os.path.join(self.rootdir, "folder0") target = os.path.join(self.targetdir, os.path.basename(src)) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) def test_symlink_folder_overwrite(self): """Replace an existing folder with overwrite.""" - src = os.path.join(self.rootdir, 'folder0') + src = os.path.join(self.rootdir, "folder0") target = os.path.join(self.targetdir, os.path.basename(src)) shutil.copytree(src, target) self.assertTrue(transfer.SymlinkAgent(src, target).transfer()) @@ -349,7 +354,7 @@ def test_symlink_folder_mount_point(self): """Don't overwrite a mount point.""" src = os.path.join(self.rootdir) target = os.path.join(self.targetdir) - with mock.patch.object(transfer.os.path, 'ismount', return_value=True): + with mock.patch.object(transfer.os.path, "ismount", return_value=True): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() @@ -357,19 +362,19 @@ def test_symlink_folder_not_overwrite(self): """Don't overwrite other existing paths.""" src = os.path.join(self.rootdir) target = os.path.join(self.targetdir) - with mock.patch('taca.utils.transfer.os.path') as mockobj: + with mock.patch("taca.utils.transfer.os.path") as mockobj: mockobj.ismount.return_value = False mockobj.isfile.return_value = False mockobj.islink.return_value = False mockobj.isdir.return_value = False with self.assertRaises(transfer.SymlinkError): - transfer.SymlinkAgent(src,target).transfer() + transfer.SymlinkAgent(src, target).transfer() def test_symlink_folder_parent_error(self): """Failing to create parent folder structure should raise SymlinkError.""" src = self.rootdir - target = os.path.join(self.targetdir, 'non-existing-folder', 'target-file') - with mock.patch.object(transfer, 'create_folder', return_value=False): + target = os.path.join(self.targetdir, "non-existing-folder", "target-file") + with mock.patch.object(transfer, "create_folder", return_value=False): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() @@ -378,9 +383,8 @@ def 
test_symlink_folder_rmtree_fail(self): src = self.rootdir target = self.targetdir with mock.patch.object( - transfer.shutil, - 'rmtree', - side_effect=OSError('Mocked error')): + transfer.shutil, "rmtree", side_effect=OSError("Mocked error") + ): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() @@ -389,9 +393,8 @@ def test_symlink_folder_symlink_error(self): src = self.rootdir target = os.path.join(self.targetdir, os.path.basename(src)) with mock.patch.object( - transfer.os, - 'symlink', - side_effect=OSError('Mocked error')): + transfer.os, "symlink", side_effect=OSError("Mocked error") + ): with self.assertRaises(transfer.SymlinkError): transfer.SymlinkAgent(src, target).transfer() @@ -400,11 +403,10 @@ def test_symlink_folder_unexpected(self): src = self.rootdir target = self.targetdir with mock.patch.object( - transfer.os.path, - 'exists', - side_effect=Exception('Mocked error')): + transfer.os.path, "exists", side_effect=Exception("Mocked error") + ): with self.assertRaises(Exception): - transfer.SymlinkAgent(src,target).transfer() + transfer.SymlinkAgent(src, target).transfer() class TestRsyncAgent(unittest.TestCase): @@ -412,25 +414,32 @@ class TestRsyncAgent(unittest.TestCase): @classmethod def setUpClass(cls): - cls.rootdir = tempfile.mkdtemp(prefix='test_taca_transfer_src') - (fh, cls.testfile) = tempfile.mkstemp( - prefix='test_taca_transfer_file') - os.write(fh, b'this is some content') + cls.rootdir = tempfile.mkdtemp(prefix="test_taca_transfer_src") + (fh, cls.testfile) = tempfile.mkstemp(prefix="test_taca_transfer_file") + os.write(fh, b"this is some content") os.close(fh) - open(os.path.join(cls.rootdir, 'file0'), 'w').close() - f = os.path.join(cls.rootdir, 'folder0') + open(os.path.join(cls.rootdir, "file0"), "w").close() + f = os.path.join(cls.rootdir, "folder0") os.mkdir(f) - open(os.path.join(f, 'file1'), 'w').close() + open(os.path.join(f, "file1"), "w").close() # create a digest file def _write_digest(rootdir, fhandle, fpath): - fhandle.write('{} {}\n'.format(misc.hashfile(fpath), os.path.relpath(fpath, rootdir))) - - cls.digestfile = os.path.join(cls.rootdir, 'digestfile.sha1') - with open(cls.digestfile, 'w') as digesth: - map(lambda x: - [_write_digest(cls.rootdir, digesth, os.path.join(x[0], y)) for y in [z for z in x[2] if os.path.join(x[0], z) != cls.digestfile]], - os.walk(cls.rootdir)) + fhandle.write( + f"{misc.hashfile(fpath)} {os.path.relpath(fpath, rootdir)}\n" + ) + + cls.digestfile = os.path.join(cls.rootdir, "digestfile.sha1") + with open(cls.digestfile, "w") as digesth: + map( + lambda x: [ + _write_digest(cls.rootdir, digesth, os.path.join(x[0], y)) + for y in [ + z for z in x[2] if os.path.join(x[0], z) != cls.digestfile + ] + ], + os.walk(cls.rootdir), + ) @classmethod def tearDownClass(cls): @@ -438,25 +447,26 @@ def tearDownClass(cls): os.unlink(cls.testfile) def setUp(self): - self.destdir = tempfile.mkdtemp(prefix='test_taca_transfer_dest') + self.destdir = tempfile.mkdtemp(prefix="test_taca_transfer_dest") self.agent = transfer.RsyncAgent( - self.rootdir, - dest_path=self.destdir, - validate=False) + self.rootdir, dest_path=self.destdir, validate=False + ) def tearDown(self): shutil.rmtree(self.destdir) def test_init(self): """Test initiation of agent instance.""" - args = ['arg1'] - kwargs = {'dest_path': 'arg2', - 'remote_host': 'arg3', - 'remote_user': 'arg4', - 'validate': True, - 'digestfile': 'arg5'} + args = ["arg1"] + kwargs = { + "dest_path": "arg2", + "remote_host": "arg3", + "remote_user": 
"arg4", + "validate": True, + "digestfile": "arg5", + } agent = transfer.RsyncAgent(*args, **kwargs) - self.assertEqual(getattr(agent, 'src_path'), args[0]) + self.assertEqual(getattr(agent, "src_path"), args[0]) for attribute, value in kwargs.items(): self.assertEqual(getattr(agent, attribute), value) self.assertEqual(agent.cmdopts, agent.DEFAULT_OPTS) @@ -464,7 +474,7 @@ def test_init(self): def test_rsync_validate_transfer(self): """Validate_transfer.""" # validation on remote hosts are not supported - self.agent.remote_host = 'not None' + self.agent.remote_host = "not None" with self.assertRaises(NotImplementedError): self.agent.validate_transfer() # validation without a digestfile throws an exception @@ -473,11 +483,17 @@ def test_rsync_validate_transfer(self): self.agent.validate_transfer() # validation with a valid digestfile should return true self.agent.digestfile = self.digestfile - self.assertTrue(self.agent.validate_transfer(), 'validation with a valid digestfile should return true') + self.assertTrue( + self.agent.validate_transfer(), + "validation with a valid digestfile should return true", + ) # modifying the contents of the digestfile should make validation fail - with open(self.digestfile, 'a') as fh: - fh.write('randomdigeststring this-file-does-not-exist') - self.assertFalse(self.agent.validate_transfer(), 'validation with an invalid digestfile should return false') + with open(self.digestfile, "a") as fh: + fh.write("randomdigeststring this-file-does-not-exist") + self.assertFalse( + self.agent.validate_transfer(), + "validation with an invalid digestfile should return false", + ) def test_rsync_validate_dest_path(self): """Destination path should be properly checked.""" @@ -486,7 +502,7 @@ def test_rsync_validate_dest_path(self): self.agent.dest_path = None with self.assertRaises(transfer.TransferError): self.agent.validate_dest_path() - self.agent.remote_user = 'user' + self.agent.remote_user = "user" self.agent.dest_path = self.destdir with self.assertRaises(transfer.TransferError): self.agent.validate_dest_path() @@ -496,76 +512,79 @@ def test_rsync_agent_dest_paths_constructed(self): self.assertEqual( self.destdir, self.agent.remote_path(), - 'Destination path was not correct for empty remote user ' \ - 'and empty destination host') - self.agent.remote_host = 'localhost' + "Destination path was not correct for empty remote user " + "and empty destination host", + ) + self.agent.remote_host = "localhost" self.assertEqual( - 'localhost:{}'.format(self.destdir), + f"localhost:{self.destdir}", self.agent.remote_path(), - 'Destination path was not correct for empty remote user') - self.agent.remote_user = 'user' + "Destination path was not correct for empty remote user", + ) + self.agent.remote_user = "user" self.assertEqual( - 'user@localhost:{}'.format(self.destdir), + f"user@localhost:{self.destdir}", self.agent.remote_path(), - 'Destination path was not correct for non-empty remote user') + "Destination path was not correct for non-empty remote user", + ) self.agent.dest_path = None self.assertEqual( - 'user@localhost:', + "user@localhost:", self.agent.remote_path(), - 'Destination path was not correct for empty destination path') + "Destination path was not correct for empty destination path", + ) def test_rsync_agent_propagate_error(self): """Wrap and propagate error thrown by the rsync subprocess.""" with mock.patch.object( - transfer.subprocess, 'check_call', - side_effect=subprocess.CalledProcessError( - cmd='mocked subprocess', - returncode=-1)): + 
transfer.subprocess, + "check_call", + side_effect=subprocess.CalledProcessError( + cmd="mocked subprocess", returncode=-1 + ), + ): with self.assertRaises(transfer.RsyncError): self.agent.transfer() def test_rsync_agent_file(self): """Rsync transfer of a single file.""" - self.agent.src_path = os.path.join(self.rootdir, 'file0') - self.assertTrue( - self.agent.transfer(), - 'transfer a single file failed') + self.agent.src_path = os.path.join(self.rootdir, "file0") + self.assertTrue(self.agent.transfer(), "transfer a single file failed") self.assertTrue( self.validate_files( self.agent.src_path, - os.path.join( - self.destdir, - os.path.basename(self.agent.src_path))), - 'test file was not properly transferred') + os.path.join(self.destdir, os.path.basename(self.agent.src_path)), + ), + "test file was not properly transferred", + ) def test_rsync_agent_dir(self): """Rsync transfer of a folder.""" - self.agent.src_path = os.path.join(self.rootdir, 'folder0') - self.assertTrue( - self.agent.transfer(), - 'transfer a folder failed') + self.agent.src_path = os.path.join(self.rootdir, "folder0") + self.assertTrue(self.agent.transfer(), "transfer a folder failed") self.assertTrue( self.validate_folders( self.agent.src_path, - os.path.join( - self.destdir, - os.path.basename(self.agent.src_path))), - 'folder was not properly transferred') + os.path.join(self.destdir, os.path.basename(self.agent.src_path)), + ), + "folder was not properly transferred", + ) def test_rsync_agent_symlink(self): """Rsync should be able to resolve symlinks.""" - self.agent.src_path = os.path.join(self.rootdir, 'folder0') - os.symlink(self.testfile,os.path.join(self.agent.src_path, 'link1')) - self.agent.cmdopts = {'-a': None, '--copy-links': None} + self.agent.src_path = os.path.join(self.rootdir, "folder0") + os.symlink(self.testfile, os.path.join(self.agent.src_path, "link1")) + self.agent.cmdopts = {"-a": None, "--copy-links": None} self.assertTrue( - self.agent.transfer(), - 'transfer a folder containing a symlink failed') + self.agent.transfer(), "transfer a folder containing a symlink failed" + ) self.assertEqual( - misc.hashfile(self.testfile, hasher='sha1'), + misc.hashfile(self.testfile, hasher="sha1"), misc.hashfile( - os.path.join(self.destdir, 'folder0', 'link1'), - hasher='sha1'), - 'symlink was not properly transferred') + os.path.join(self.destdir, "folder0", "link1"), hasher="sha1" + ), + "symlink was not properly transferred", + ) def validate_folders(self, src, dst): for root, dirs, files in os.walk(src): @@ -577,215 +596,256 @@ def validate_folders(self, src, dst): return True def validate_files(self, src, dst): - return os.path.exists(src) and \ - os.path.isfile(src) and \ - os.path.exists(dst) and \ - os.path.isfile(dst) and \ - misc.hashfile(src) == misc.hashfile(dst) + return ( + os.path.exists(src) + and os.path.isfile(src) + and os.path.exists(dst) + and os.path.isfile(dst) + and misc.hashfile(src) == misc.hashfile(dst) + ) -class TestConfig(unittest.TestCase): +class TestConfig(unittest.TestCase): def test_load_yaml_config(self): """Load a yaml config file""" - got_config_data = config.load_yaml_config('data/taca_test_cfg_minimal.yaml') - expexted_config_data = {'statusdb': - {'url': 'url', - 'username': 'username', - 'password': 'pwd'}, - 'log': - {'file': 'data/taca.log'}} + got_config_data = config.load_yaml_config("data/taca_test_cfg_minimal.yaml") + expexted_config_data = { + "statusdb": {"url": "url", "username": "username", "password": "pwd"}, + "log": {"file": "data/taca.log"}, + } 
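# test_rsync_agent_propagate_error above expects a failing rsync subprocess to be
# surfaced as a transfer.RsyncError. A minimal sketch of that wrap-and-reraise
# pattern; the class, options and command line here are illustrative stand-ins:
import subprocess


class RsyncError(Exception):
    """Stand-in for transfer.RsyncError."""


def run_rsync(src, dest, opts=("-a",)):
    """Run rsync and convert a non-zero exit status into RsyncError."""
    cmd = ["rsync", *opts, src, dest]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as err:
        raise RsyncError(
            f"rsync of {src} to {dest} failed with exit code {err.returncode}"
        ) from err
    return True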
self.assertEqual(expexted_config_data, got_config_data) with self.assertRaises(IOError): - missing_config_data = config.load_yaml_config('data/missing_file.yaml)') + config.load_yaml_config("data/missing_file.yaml)") def test_load_config(self): """Load a config file.""" - got_config_data = config.load_config('data/taca_test_cfg_minimal.yaml') - expexted_config_data = {'statusdb': - {'url': 'url', - 'username': 'username', - 'password': 'pwd'}, - 'log': - {'file': 'data/taca.log'}} + got_config_data = config.load_config("data/taca_test_cfg_minimal.yaml") + expexted_config_data = { + "statusdb": {"url": "url", "username": "username", "password": "pwd"}, + "log": {"file": "data/taca.log"}, + } self.assertEqual(expexted_config_data, got_config_data) with self.assertRaises(IOError): - missing_config_data = config.load_config('data/missing_file.yaml)') + config.load_config("data/missing_file.yaml)") + class TestBioinfoTab(unittest.TestCase): """Test class for bioinfo_tab.""" @classmethod def setUpClass(self): - self.rootdir = tempfile.mkdtemp(prefix='test_taca_bt') - self.new_run = os.path.join(self.rootdir,'nosync', '190821_M01545_0252_000000001') + self.rootdir = tempfile.mkdtemp(prefix="test_taca_bt") + self.new_run = os.path.join( + self.rootdir, "nosync", "190821_M01545_0252_000000001" + ) os.makedirs(self.new_run) - self.demux_run = os.path.join(self.rootdir, '190821_M01545_0252_000000002') - os.makedirs(os.path.join(self.demux_run, 'Unaligned_1')) - self.seq_run = os.path.join(self.rootdir, '190821_M01545_0252_000000003') + self.demux_run = os.path.join(self.rootdir, "190821_M01545_0252_000000002") + os.makedirs(os.path.join(self.demux_run, "Unaligned_1")) + self.seq_run = os.path.join(self.rootdir, "190821_M01545_0252_000000003") os.makedirs(self.seq_run) - self.error_run = os.path.join(self.rootdir, '190821_M01545_0252_000000004') + self.error_run = os.path.join(self.rootdir, "190821_M01545_0252_000000004") os.makedirs(self.error_run) - with open(os.path.join(self.error_run, 'RTAComplete.txt'), 'w') as fh: - fh.write('This is some contents\n') + with open(os.path.join(self.error_run, "RTAComplete.txt"), "w") as fh: + fh.write("This is some contents\n") @classmethod def tearDownClass(self): shutil.rmtree(self.rootdir) - @mock.patch('taca.utils.bioinfo_tab.update_statusdb', return_value=None) + @mock.patch("taca.utils.bioinfo_tab.update_statusdb", return_value=None) def test_collect_runs(self, mock_update_statusdb): """Find runs in specified directory.""" bioinfo_tab.collect_runs() - calls = [mock.call('data/test_data/190201_A00621_0032_BHHFCFDSXX'), mock.call('data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY')] + calls = [ + mock.call("data/test_data/190201_A00621_0032_BHHFCFDSXX"), + mock.call("data/test_data/nosync/190201_A00621_0032_BHHFCFDSXY"), + ] mock_update_statusdb.assert_has_calls(calls) def test_get_ss_projects(self): """Get project info.""" - run_dir = 'data/test_data/190201_A00621_0032_BHHFCFDSXX' + run_dir = "data/test_data/190201_A00621_0032_BHHFCFDSXX" got_info = bioinfo_tab.get_ss_projects(run_dir) - expected_info = defaultdict(bioinfo_tab.Tree, - {'HHFCFDSXX': defaultdict(bioinfo_tab.Tree, - {'1': defaultdict(bioinfo_tab.Tree, - {'P10000_1001': defaultdict(bioinfo_tab.Tree, - {'P10000': defaultdict(bioinfo_tab.Tree, {})})}), - '2': defaultdict(bioinfo_tab.Tree, - {'P10000_1005': defaultdict(bioinfo_tab.Tree, - {'P10000': defaultdict(bioinfo_tab.Tree, {})})})})}) + expected_info = defaultdict( + bioinfo_tab.Tree, + { + "HHFCFDSXX": defaultdict( + bioinfo_tab.Tree, + { 
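# The config tests above load a minimal YAML file into a plain dict and expect an
# IOError for a missing path. A sketch of that behaviour using PyYAML; TACA's actual
# loaders may do more (for example, merging into a shared config object):
import yaml


def load_yaml_config(config_file):
    """Parse a YAML config file into a dict.

    A missing file makes open() raise FileNotFoundError, a subclass of OSError/IOError,
    which is what the assertRaises(IOError) checks above rely on.
    """
    with open(config_file) as fh:
        return yaml.safe_load(fh) or {}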
+ "1": defaultdict( + bioinfo_tab.Tree, + { + "P10000_1001": defaultdict( + bioinfo_tab.Tree, + {"P10000": defaultdict(bioinfo_tab.Tree, {})}, + ) + }, + ), + "2": defaultdict( + bioinfo_tab.Tree, + { + "P10000_1005": defaultdict( + bioinfo_tab.Tree, + {"P10000": defaultdict(bioinfo_tab.Tree, {})}, + ) + }, + ), + }, + ) + }, + ) self.assertEqual(expected_info, got_info) - @mock.patch('taca.utils.bioinfo_tab.statusdb') + @mock.patch("taca.utils.bioinfo_tab.statusdb") def test_update_statusdb(self, mock_couch): """Update statusdb.""" - run_dir = 'data/test_data/190201_A00621_0032_BHHFCFDSXX' + run_dir = "data/test_data/190201_A00621_0032_BHHFCFDSXX" bioinfo_tab.update_statusdb(run_dir) - mock_couch.StatusdbSession.assert_called_with({'url': 'url', - 'username': 'username', - 'password': 'pwd', - 'xten_db': 'x_flowcells'}) + mock_couch.StatusdbSession.assert_called_with( + { + "url": "url", + "username": "username", + "password": "pwd", + "xten_db": "x_flowcells", + } + ) def test_get_status_new(self): """Return status New.""" got_status = bioinfo_tab.get_status(self.new_run) - self.assertEqual(got_status, 'New') + self.assertEqual(got_status, "New") def test_get_status_demultiplexing(self): """Return status Demultiplexing.""" got_status = bioinfo_tab.get_status(self.demux_run) - self.assertEqual(got_status, 'Demultiplexing') + self.assertEqual(got_status, "Demultiplexing") def test_get_status_sequencing(self): """Return status Sequencing.""" got_status = bioinfo_tab.get_status(self.seq_run) - self.assertEqual(got_status, 'Sequencing') + self.assertEqual(got_status, "Sequencing") def test_get_status_error(self): """Return status ERROR.""" got_status = bioinfo_tab.get_status(self.error_run) - self.assertEqual(got_status, 'ERROR') + self.assertEqual(got_status, "ERROR") def test_parse_sample_sheet(self): """Parse samplesheet.""" - sample_sheet = 'data/samplesheet.csv' - expected_data = [{'SampleWell': '1:1', - 'index': 'GAATTCGT', - 'Lane': '1', - 'SamplePlate': 'FCB_150423', - 'SampleName': 'P1775_147', - 'SampleID': 'Sample_P1775_147', - 'Project': 'J_Lundeberg_14_24'}] - parsed_data = bioinfo_tab.parse_samplesheet(sample_sheet, 'run_dir') + sample_sheet = "data/samplesheet.csv" + expected_data = [ + { + "SampleWell": "1:1", + "index": "GAATTCGT", + "Lane": "1", + "SamplePlate": "FCB_150423", + "SampleName": "P1775_147", + "SampleID": "Sample_P1775_147", + "Project": "J_Lundeberg_14_24", + } + ] + parsed_data = bioinfo_tab.parse_samplesheet(sample_sheet, "run_dir") self.assertEqual(expected_data, parsed_data) def test_parse_sample_sheet_is_miseq(self): """Parse MiSeq samplesheet.""" - sample_sheet = 'data/miseq_samplesheet.csv' - expected_data = [{'SampleWell': '1:1', - 'index': 'GAATTCGT', - 'Lane': '1', - 'SamplePlate': 'FCB_150423', - 'SampleName': 'P1775_147', - 'SampleID': 'Sample_P1775_147', - 'Project': 'J_Lundeberg_14_24'}] - parsed_data = bioinfo_tab.parse_samplesheet(sample_sheet, 'run_dir', is_miseq=True) + sample_sheet = "data/miseq_samplesheet.csv" + expected_data = [ + { + "SampleWell": "1:1", + "index": "GAATTCGT", + "Lane": "1", + "SamplePlate": "FCB_150423", + "SampleName": "P1775_147", + "SampleID": "Sample_P1775_147", + "Project": "J_Lundeberg_14_24", + } + ] + parsed_data = bioinfo_tab.parse_samplesheet( + sample_sheet, "run_dir", is_miseq=True + ) self.assertEqual(expected_data, parsed_data) def test_parse_sample_sheet_is_miseq_error(self): """Return empty list if not production or application in MiSeq samplesheet.""" - sample_sheet = 'data/samplesheet.csv' - parsed_data 
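# expected_info above is a nest of defaultdicts built from bioinfo_tab.Tree. That
# helper is presumably the usual autovivifying-dict recipe, along these lines:
from collections import defaultdict


def Tree():
    """A dict whose missing keys spring into existence as further Trees."""
    return defaultdict(Tree)


flowcells = Tree()
flowcells["HHFCFDSXX"]["1"]["P10000_1001"]["P10000"]  # creates every level on access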
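# The four fixtures in setUpClass pin down one status per situation: a run under
# nosync/ is "New", a run with an Unaligned_* folder is "Demultiplexing", a bare run
# folder is "Sequencing", and a finished run (RTAComplete.txt present) that was never
# demultiplexed is "ERROR". One ordering consistent with those fixtures, not
# necessarily bioinfo_tab.get_status itself:
import glob
import os


def get_status(run_dir):
    """Classify a run directory by inspecting its contents and location."""
    if glob.glob(os.path.join(run_dir, "Unaligned_*")):
        return "Demultiplexing"
    if "nosync" in os.path.normpath(run_dir).split(os.sep):
        return "New"
    if os.path.exists(os.path.join(run_dir, "RTAComplete.txt")):
        return "ERROR"
    return "Sequencing"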
= bioinfo_tab.parse_samplesheet(sample_sheet, 'run_dir', is_miseq=True) + sample_sheet = "data/samplesheet.csv" + parsed_data = bioinfo_tab.parse_samplesheet( + sample_sheet, "run_dir", is_miseq=True + ) self.assertEqual(parsed_data, []) - @mock.patch('taca.utils.bioinfo_tab.send_mail') - @mock.patch('taca.utils.bioinfo_tab.datetime.datetime') + @mock.patch("taca.utils.bioinfo_tab.send_mail") + @mock.patch("taca.utils.bioinfo_tab.datetime.datetime") def test_error_mailer_no_samplesheet(self, mock_datetime, mock_send_mail): """Send email if no_samplesheet error.""" - body='TACA has encountered an issue that might be worth investigating\n' - body+='The offending entry is: ' - body+= 'run_missing_samplesheet' - body+='\n\nSincerely, TACA' - subject='ERROR, Samplesheet error' + body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " + body += "run_missing_samplesheet" + body += "\n\nSincerely, TACA" + subject = "ERROR, Samplesheet error" mock_datetime.now() mock_datetime.now().hour = 7 - bioinfo_tab.error_emailer('no_samplesheet', 'run_missing_samplesheet') - mock_send_mail.assert_called_with(subject, body, 'some_user@some_email.com') + bioinfo_tab.error_emailer("no_samplesheet", "run_missing_samplesheet") + mock_send_mail.assert_called_with(subject, body, "some_user@some_email.com") - @mock.patch('taca.utils.bioinfo_tab.send_mail') - @mock.patch('taca.utils.bioinfo_tab.datetime.datetime') + @mock.patch("taca.utils.bioinfo_tab.send_mail") + @mock.patch("taca.utils.bioinfo_tab.datetime.datetime") def test_error_mailer_failed_run(self, mock_datetime, mock_send_mail): """Send email if failed_run error.""" - body='TACA has encountered an issue that might be worth investigating\n' - body+='The offending entry is: ' - body+= 'failed_run' - body+='\n\nSincerely, TACA' - subject='WARNING, Reinitialization of partially failed FC' + body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " + body += "failed_run" + body += "\n\nSincerely, TACA" + subject = "WARNING, Reinitialization of partially failed FC" mock_datetime.now() mock_datetime.now().hour = 7 - bioinfo_tab.error_emailer('failed_run', 'failed_run') - mock_send_mail.assert_called_with(subject, body, 'some_user@some_email.com') + bioinfo_tab.error_emailer("failed_run", "failed_run") + mock_send_mail.assert_called_with(subject, body, "some_user@some_email.com") - @mock.patch('taca.utils.bioinfo_tab.send_mail') - @mock.patch('taca.utils.bioinfo_tab.datetime.datetime') + @mock.patch("taca.utils.bioinfo_tab.send_mail") + @mock.patch("taca.utils.bioinfo_tab.datetime.datetime") def test_error_mailer_weird_samplesheet(self, mock_datetime, mock_send_mail): """Send email if weird_samplesheet error.""" - body='TACA has encountered an issue that might be worth investigating\n' - body+='The offending entry is: ' - body+= 'weird_samplesheet_run' - body+='\n\nSincerely, TACA' - subject='ERROR, Incorrectly formatted samplesheet' + body = "TACA has encountered an issue that might be worth investigating\n" + body += "The offending entry is: " + body += "weird_samplesheet_run" + body += "\n\nSincerely, TACA" + subject = "ERROR, Incorrectly formatted samplesheet" mock_datetime.now() mock_datetime.now().hour = 7 - bioinfo_tab.error_emailer('weird_samplesheet', 'weird_samplesheet_run') - mock_send_mail.assert_called_with(subject, body, 'some_user@some_email.com') + bioinfo_tab.error_emailer("weird_samplesheet", "weird_samplesheet_run") + 
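# The three error_emailer tests above fix the exact mail body and a per-error subject
# line; the datetime.now().hour mock suggests the real error_emailer also gates
# sending on the time of day. A minimal sketch of assembling the subject/body pair,
# with the subjects taken verbatim from the tests:
SUBJECTS = {
    "no_samplesheet": "ERROR, Samplesheet error",
    "failed_run": "WARNING, Reinitialization of partially failed FC",
    "weird_samplesheet": "ERROR, Incorrectly formatted samplesheet",
}


def build_error_mail(error_type, offending_entry):
    """Return the (subject, body) pair the tests above expect."""
    body = (
        "TACA has encountered an issue that might be worth investigating\n"
        "The offending entry is: "
        f"{offending_entry}"
        "\n\nSincerely, TACA"
    )
    return SUBJECTS[error_type], body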
mock_send_mail.assert_called_with(subject, body, "some_user@some_email.com") - @mock.patch('taca.utils.bioinfo_tab.statusdb') + @mock.patch("taca.utils.bioinfo_tab.statusdb") def test_fail_run(self, mock_couch): """Fail run in statusdb.""" - run_id = '190201_A00621_0032_BHHFCFDSXX' - project = 'P0001' + run_id = "190201_A00621_0032_BHHFCFDSXX" + project = "P0001" bioinfo_tab.fail_run(run_id, project) - mock_couch.StatusdbSession.assert_called_with({'url': 'url', - 'username': 'username', - 'password': 'pwd', - 'xten_db': - 'x_flowcells'}) + mock_couch.StatusdbSession.assert_called_with( + { + "url": "url", + "username": "username", + "password": "pwd", + "xten_db": "x_flowcells", + } + ) class TestStatusdb(unittest.TestCase): """Tests for statusdb utils.""" - @mock.patch('taca.utils.statusdb.couchdb') + @mock.patch("taca.utils.statusdb.couchdb") def test_get_entry(self, mock_couch): """Get an entry from statusdb.""" - couch_config = {'user': 'username', - 'url': 'some_url', - 'password': 'some_pwd'} - entry = statusdb.ProjectSummaryConnection(couch_config).get_entry('name') + couch_config = {"user": "username", "url": "some_url", "password": "some_pwd"} + entry = statusdb.ProjectSummaryConnection(couch_config).get_entry("name") self.assertEqual(entry, None) def test_merge_dicts(self): """Merge two dicts.""" - d1 = {'a': '1', 'b': '2'} - d2 = {'a': '3', 'c': '4'} + d1 = {"a": "1", "b": "2"} + d2 = {"a": "3", "c": "4"} merged_dict = statusdb.merge_dicts(d1, d2) - expected_dict = {'a': '1', 'b': '2', 'c': '4'} + expected_dict = {"a": "1", "b": "2", "c": "4"} self.assertEqual(merged_dict, expected_dict)
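# test_merge_dicts above shows values from d1 winning on overlapping keys ('a' stays
# '1'). A minimal sketch matching that flat case; statusdb.merge_dicts itself may
# well recurse into nested dicts as well:
def merge_dicts(d1, d2):
    """Merge two dicts, letting values from d1 take precedence."""
    merged = dict(d2)
    merged.update(d1)
    return merged


assert merge_dicts({"a": "1", "b": "2"}, {"a": "3", "c": "4"}) == {
    "a": "1",
    "b": "2",
    "c": "4",
}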