diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 9c0c3268..f4f1fd01 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,31 +1,29 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
- "name": "TACA",
- "build": {
- // Sets the run context to one level up instead of the .devcontainer folder.
- "context": "..",
- // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
- "dockerfile": "../Dockerfile"
+ "name": "TACA",
+ "build": {
+ // Sets the run context to one level up instead of the .devcontainer folder.
+ "context": "..",
+ // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
+ "dockerfile": "../Dockerfile",
+ },
+ "features": {},
+ "customizations": {
+ "vscode": {
+ "extensions": ["ms-python.python"],
},
- "features": {},
- "customizations": {
- "vscode": {
- "extensions": [
- "ms-python.python",
- ]
- }
- },
- // Features to add to the dev container. More info: https://containers.dev/features.
- // "features": {},
- // Use 'forwardPorts' to make a list of ports inside the container available locally.
- // "forwardPorts": [],
- "postCreateCommand": "cd ../flowcell_parser/ && pip3 install -e . && cd ../TACA && pip3 install -e .",
- // Configure tool-specific properties.
- // "customizations": {},
- // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
- // "remoteUser": "devcontainer"
- "mounts": [
- "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached"
- ]
-}
\ No newline at end of file
+ },
+ // Features to add to the dev container. More info: https://containers.dev/features.
+ // "features": {},
+ // Use 'forwardPorts' to make a list of ports inside the container available locally.
+ // "forwardPorts": [],
+ "postCreateCommand": "cd ../flowcell_parser/ && pip3 install -e . && cd ../TACA && pip3 install -e .",
+ // Configure tool-specific properties.
+ // "customizations": {},
+ // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
+ // "remoteUser": "devcontainer"
+ "mounts": [
+ "source=${localEnv:HOME}/repos/flowcell_parser,target=/workspaces/flowcell_parser,type=bind,consistency=cached",
+ ],
+}
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 00000000..ffb399a1
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,9 @@
+# Start adding here
+
+# 2024-01-31, Non-invasive fixes after merge with master //AKe
+a676908bb32d60bcbc5e42ce710f889980845af4
+2ba0179015e380b3b7e0ce8b9b99666533a7443f
+8ea4523b1c9789d03410178d75aea93b6b2ffa77
+d5330f615b237beadcec22d5422dff3c02aa54ff
+b9ee704ad4da26790e539b8fe1d39aa71f831ef1
+6a3edf3710b6fdd8c233662ebf900e9bd24e6bd5
diff --git a/.github/pr_labels.yml b/.github/pr_labels.yml
index d04f24f3..8db6d109 100644
--- a/.github/pr_labels.yml
+++ b/.github/pr_labels.yml
@@ -1,4 +1,4 @@
-version: '1'
+version: "1"
invalidStatus: "pending"
labelRule:
values:
diff --git a/.github/workflows/check-log.yml b/.github/workflows/check-log.yml
new file mode 100644
index 00000000..1447daba
--- /dev/null
+++ b/.github/workflows/check-log.yml
@@ -0,0 +1,26 @@
+name: Check VERSIONLOG.MD has been updated
+on: [pull_request]
+
+jobs:
+ check-versionlog:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout PR
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0 # Fetch all history for all branches and tags
+
+ - name: Check for VERSIONLOG.MD changes
+ id: versionlog_check
+ # 1) Find the common ancestor between the current HEAD and the base branch
+ # 2) Then see if the versionlog has been updated in the PR since it diverged
+ # from the common ancestor
+ run: |
+ PR_BASE_SHA=$(git merge-base HEAD ${{ github.event.pull_request.base.sha }})
+ FILE_CHANGED=$(git diff --name-only $PR_BASE_SHA HEAD | grep 'VERSIONLOG.md' || true)
+ if [ -n "$FILE_CHANGED" ]; then
+ echo "VERSIONLOG.MD has been changed."
+ else
+ echo "VERSIONLOG.MD has NOT been changed."
+ exit 1 # Fail the workflow if no changes in VERSIONLOG.MD
+ fi
diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml
new file mode 100644
index 00000000..1ab877de
--- /dev/null
+++ b/.github/workflows/lint-code.yml
@@ -0,0 +1,124 @@
+name: Lint code
+on: [push, pull_request]
+
+jobs:
+ # Use ruff to check for code style violations
+ ruff-check:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install ruff
+ - name: ruff --> Check for style violations
+ # Configured in pyproject.toml
+ run: ruff check .
+
+ # Use ruff to check code formatting
+ ruff-format:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install ruff
+ - name: ruff --> Check code formatting
+ run: ruff format --check .
+
+ # Use mypy for static type checking
+ mypy-check:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install mypy
+ # Start by installing type stubs
+ - name: mypy --> Install stubs
+ run: echo -e "y" | mypy --install-types **/*.py || exit 0
+ - name: mypy --> Static type checking
+        # Configured in pyproject.toml
+ run: mypy **/*.py
+
+ # Use pipreqs to check for missing dependencies
+ pipreqs-check:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+
+ - name: Install pipreqs
+ run: pip install pipreqs
+
+ - name: Install requirements
+ run: pip install -r requirements.txt
+
+ - name: Run pipreqs
+ run: pipreqs --savepath pipreqs.txt
+
+ - name: Compare requirements
+ run: |
+ # Extract and sort package names
+ awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' requirements.txt | sort -u > requirements.compare
+ awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' pipreqs.txt | sort -u > pipreqs.compare
+
+ # Compare package lists
+ if cmp -s requirements.compare pipreqs.compare
+ then
+ echo "Requirements are the same"
+
+ exit 0
+ else
+ echo "Requirements are different"
+ echo ""
+
+ echo "=== current requirements.txt ==="
+ echo ""
+ cat requirements.compare
+ echo ""
+
+ echo "=== pipreqs requirements ==="
+ echo ""
+ cat pipreqs.compare
+
+ exit 1
+ fi
+
+ # Use Prettier to check various file formats
+ prettier:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - name: Setup node
+ uses: actions/setup-node@v4
+ with:
+ node-version: "20"
+
+ - name: Install Prettier
+ run: npm install -g prettier
+
+ - name: Run Prettier --check
+ run: prettier --check .
diff --git a/.gitignore b/.gitignore
index eb7ce2ba..f60e5c99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,7 @@ _build
.benchmarks
.coverage
__pycache__
+.pytest_cache
+.vscode
+.ruff_cache
+.mypy_cache
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..c30ff77b
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,15 @@
+# .pre-commit-config.yaml
+repos:
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.1.6
+ hooks:
+ - id: ruff
+ - id: ruff-format
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: "v1.7.1"
+ hooks:
+ - id: mypy
+ - repo: https://github.com/pre-commit/mirrors-prettier
+ rev: "v4.0.0-alpha.8"
+ hooks:
+ - id: prettier
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index b1ae1922..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-language: python
-
-python:
- - "2.7"
- - "3.8"
-
-install:
- - python setup.py install
- - mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml
- - pip install codecov
-
-script:
- - cd tests && nosetests --with-coverage -v -s
-
-after_success:
- - codecov
-
-notifications:
- email: false
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c2345165..57b41e7d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,13 +2,14 @@
When contribution to this package please have the following things in mind:
-__NOTE__: _Please make sure that there are no exisiting [issues]((https://github.com/SciLifeLab/TACA/issues)) relating to whatever you want to report._
+**NOTE**: _Please make sure that there are no existing [issues](https://github.com/SciLifeLab/TACA/issues) relating to whatever you want to report._
####To contribute:
+
1. Create an issue describing the bug / suggestion / improvement / ... [here](https://github.com/SciLifeLab/TACA/issues).
2. Fork this repository to your GitHub account
3. Make the necessary changes / additions to your forked TACA repository
-4. Please *make sure* that you've documented your code and changes using [sphinx](http://sphinx.readthedocs.org/en/latest/tutorial.html) syntax, as the documentation will be automatically generated using this engine, and published to [ReadTheDocs](http://project-management.readthedocs.org/)
+4. Please _make sure_ that you've documented your code and changes using [sphinx](http://sphinx.readthedocs.org/en/latest/tutorial.html) syntax, as the documentation will be automatically generated using this engine, and published to [ReadTheDocs](http://project-management.readthedocs.org/)
5. Update the version number in `TACA/__init__.py`
6. Pull Request and wait for the responsible reviewer to review and merge the code
diff --git a/Dockerfile b/Dockerfile
index 3c63be1a..db00553a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,4 +24,4 @@ COPY requirements-dev.txt requirements-dev.txt
RUN python -m pip install -r requirements-dev.txt
RUN mkdir /root/.taca/
-COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml
\ No newline at end of file
+COPY tests/data/taca_test_cfg.yaml /root/.taca/taca.yaml
diff --git a/README.md b/README.md
index 50ce07c3..e38d2fee 100644
--- a/README.md
+++ b/README.md
@@ -4,34 +4,103 @@
-## Tool for the Automation of Cleanup and Analyses
+# Tool for the Automation of Cleanup and Analyses
[](http://badge.fury.io/py/taca)
-[](https://travis-ci.org/SciLifeLab/TACA)
[](https://readthedocs.org/projects/taca/?badge=latest)
[](https://codecov.io/gh/scilifelab/taca)
-This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://portal.scilifelab.se/genomics/) in Stockholm, Sweden.
+This package contains several tools for project and data management in the [National Genomics Infrastructure](https://ngisweden.scilifelab.se/) in Stockholm, Sweden.
-### Install for development
-You can install your own fork of taca in for instance a local conda environment for development. Provided you have conda installed:
+## Installation
+
+Inside the repo, run `pip install .`
+
+## Development
+
+Run `pip install -r requirements-dev.txt` to install packages used for development and `pip install -e .` to make the installation editable.
+
+### Automated linting
+
+This repo is configured for automated linting. Linter parameters are defined in `pyproject.toml`.
+
+As of now, we use:
+
+- [ruff](https://docs.astral.sh/ruff/) to perform automated formatting and a variety of lint checks.
+ - Run with `ruff check .` and `ruff format .`
+- [mypy](https://mypy.readthedocs.io/en/stable/) for static type checking and to prevent contradictory type annotations.
+ - Run with `mypy **/*.py`
+- [pipreqs](https://github.com/bndr/pipreqs) to check that the requirement files are up-to-date with the code.
+
+  - This is run with a custom Bash script in GitHub Actions, which only compares the lists of package names.
+
+ ```
+ # Extract and sort package names
+ awk '{print $1}' $1 | sort -u > "$1".compare
+ awk -F'==' '{print $1}' $2 | sort -u > "$2".compare
+
+ # Compare package lists
+ if cmp -s "$1".compare "$2".compare
+ then
+ echo "Requirements are the same"
+ exit 0
+ else
+ echo "Requirements are different"
+ exit 1
+ fi
+ ```
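+
+  Here `$1` and `$2` are the two requirement listings being compared (in the workflow, `requirements.txt` and the `pipreqs` output), and the check fails whenever the sorted package lists differ.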
+
+- [prettier](https://prettier.io/) to format common languages.
+ - Run with `prettier .`
+- [editorconfig-checker](https://github.com/editorconfig-checker/editorconfig-checker) to enforce `.editorconfig` rules for all files not covered by the tools above.
+ - Run with
+ ```
+ editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html')
+ ```
+
+#### [GitHub Actions](https://docs.github.com/en/actions)
+
+Configured in `.github/workflows/lint-code.yml`. Will test all commits in pushes or pull requests, but not change code or prevent merges.
+
+#### [Pre-commit](https://pre-commit.com/)
+
+Will prevent local commits that fail linting checks. Configured in `.pre-commit-config.yaml`.
+
+To set up pre-commit checking:
+
+1. Run `pip install pre-commit`
+2. Navigate to the repo root
+3. Run `pre-commit install`
+
+This can be disabled with `pre-commit uninstall`.
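+
+As a minimal sketch of day-to-day usage (assuming the hooks above have been installed), the same checks can also be run manually across all tracked files:
+
+```
+# Run every configured pre-commit hook against the whole repo
+pre-commit run --all-files
+```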
+
+#### VS Code automation
+
+To enable automated linting in VS Code, go to the user `settings.json` and include the following lines:
```
-# clone the repo
-git clone https://github.com//TACA.git
+"[python]": {
+ "editor.defaultFormatter": "charliermarsh.ruff",
+}
+```
-# create an environment
-conda create -n taca_dev python=2.7
-conda activate taca_dev
+This will run the `ruff`-mediated linting with the same parameters as the `GitHub Actions` and `pre-commit` checks every time VS Code is used to format the code in the repository.
-# install TACA and dependencies for developoment
-cd TACA
-python setup.py develop
-pip install -r ./requirements-dev.txt
+To run formatting on save, include the lines:
-# Check that tests pass:
-cd tests && nosetests -v -s
```
+"[python]": {
+ "editor.formatOnSave": true,
+}
+```
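+
+Note that `settings.json` only allows a single `"[python]"` entry, so in practice the `editor.defaultFormatter` and `editor.formatOnSave` keys shown above go into the same block.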
+
+### Git blame suppression
+
+When a non-invasive tool is used to tidy up a lot of code, it is useful to suppress the Git blame for that particular commit, so the original author can still be traced.
+
+To do this, add the hash of the commit containing the changes to `.git-blame-ignore-revs`, headed by an explanatory comment.
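+
+As a minimal sketch (assuming Git 2.23 or later), local `git blame` can also be pointed at this file so that the listed commits are skipped automatically:
+
+```
+# Make git blame ignore the revisions listed in .git-blame-ignore-revs
+git config blame.ignoreRevsFile .git-blame-ignore-revs
+```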
+
+### Deliver command
There is also a [plugin for the deliver command](https://github.com/SciLifeLab/taca-ngi-pipeline). To install this in the same development environment:
@@ -43,7 +112,8 @@ python setup.py develop
pip install -r ./requirements-dev.txt
# add required config files and env for taca delivery plugin
-echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
+echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
+echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml
export CHARON_BASE_URL="http://tracking.database.org"
export CHARON_API_TOKEN="charonapitokengoeshere"
diff --git a/VERSIONLOG.md b/VERSIONLOG.md
index 0216daf7..5b805399 100644
--- a/VERSIONLOG.md
+++ b/VERSIONLOG.md
@@ -1,5 +1,13 @@
# TACA Version Log
+## 20240123.1
+
+Exclude pod5 dir and files from being copied to metadata dir.
+
+## 20240122.1
+
+Adapt ONT analysis to new ONT JSON format (also backwards compatible).
+
## 20231204.1
Update ONT instrument transfer script to ignore runs started in the 3rd PromethION column, which will be used by Clinical Genomics.
@@ -21,9 +29,11 @@ Version 1.0.0
Fix bug with rsync permission issue cont.
## 20231031.1
+
Improve run_folder transfer
## 20231026.1
+
Fix bug with rsync permission issue
## 20231024.1
@@ -46,7 +56,6 @@ Fix bug that NovaSeqXPlus date format cause error in writing pdc_archived timest
Remove the temp change of creating links
-
## 20230920.1
Supplement last PR, primary purpose is to differentiate user runs from QC runs in the instrument transfer script rather than the installed TACA.
diff --git a/doc/conf.py b/doc/conf.py
index cb58a377..6a064945 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
#
# TACA documentation build configuration file, created by
# sphinx-quickstart on Wed Sep 17 12:39:41 2014.
@@ -12,168 +11,168 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.todo',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.ifconfig',
- 'sphinx.ext.viewcode',
+ "sphinx.ext.autodoc",
+ "sphinx.ext.todo",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.ifconfig",
+ "sphinx.ext.viewcode",
]
# Add any paths that contain templates here, relative to this directory.
-#templates_path = ['_templates']
+# templates_path = ['_templates']
# The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
# The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = u'TACA'
-copyright = u'2014, Guillermo Carrasco'
+project = "TACA"
+copyright = "2014, Guillermo Carrasco"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
-version = '1.0'
+version = "1.0"
# The full version, including alpha/beta/rc tags.
-release = '1.0'
+release = "1.0"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
-#language = None
+# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
# The reST default role (used for this markup: `text`) to use for all
# documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'default'
-on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+html_theme = "default"
+on_rtd = os.environ.get("READTHEDOCS", None) == "True"
if not on_rtd:
import sphinx_rtd_theme
- html_theme = 'sphinx_rtd_theme'
+
+ html_theme = "sphinx_rtd_theme"
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# " v documentation".
-#html_title = None
+# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-#html_favicon = None
+# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
html_show_copyright = False
@@ -181,68 +180,62 @@
# If true, an OpenSearch description file will be output, and all pages will
# contain a tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
# Output file base name for HTML help builder.
-htmlhelp_basename = 'TACAdoc'
+htmlhelp_basename = "TACAdoc"
# -- Options for LaTeX output ---------------------------------------------
-latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+latex_elements: dict = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- ('index', 'TACA.tex', u'TACA Documentation',
- u'Guillermo Carrasco', 'manual'),
+ ("index", "TACA.tex", "TACA Documentation", "Guillermo Carrasco", "manual"),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
# If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
# If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [
- ('index', 'taca', u'TACA Documentation',
- [u'Guillermo Carrasco'], 1)
-]
+man_pages = [("index", "taca", "TACA Documentation", ["Guillermo Carrasco"], 1)]
# If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
@@ -251,89 +244,95 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- ('index', 'TACA', u'TACA Documentation',
- u'Guillermo Carrasco', 'TACA', 'One line description of project.',
- 'Miscellaneous'),
+ (
+ "index",
+ "TACA",
+ "TACA Documentation",
+ "Guillermo Carrasco",
+ "TACA",
+ "One line description of project.",
+ "Miscellaneous",
+ ),
]
# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
# If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
-epub_title = u'TACA'
-epub_author = u'Guillermo Carrasco'
-epub_publisher = u'Guillermo Carrasco'
-epub_copyright = u'2014, Guillermo Carrasco'
+epub_title = "TACA"
+epub_author = "Guillermo Carrasco"
+epub_publisher = "Guillermo Carrasco"
+epub_copyright = "2014, Guillermo Carrasco"
# The basename for the epub file. It defaults to the project name.
-#epub_basename = u'TACA'
+# epub_basename = u'TACA'
# The HTML theme for the epub output. Since the default themes are not optimized
# for small screen space, using the same theme for HTML and epub output is
# usually not wise. This defaults to 'epub', a theme designed to save visual
# space.
-#epub_theme = 'epub'
+# epub_theme = 'epub'
# The language of the text. It defaults to the language option
# or en if the language is not set.
-#epub_language = ''
+# epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
-#epub_scheme = ''
+# epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
-#epub_identifier = ''
+# epub_identifier = ''
# A unique identification for the text.
-#epub_uid = ''
+# epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
-#epub_cover = ()
+# epub_cover = ()
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
-#epub_guide = ()
+# epub_guide = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
-#epub_pre_files = []
+# epub_pre_files = []
# HTML files shat should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
-#epub_post_files = []
+# epub_post_files = []
# A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
+epub_exclude_files = ["search.html"]
# The depth of the table of contents in toc.ncx.
-#epub_tocdepth = 3
+# epub_tocdepth = 3
# Allow duplicate toc entries.
-#epub_tocdup = True
+# epub_tocdup = True
# Choose between 'default' and 'includehidden'.
-#epub_tocscope = 'default'
+# epub_tocscope = 'default'
# Fix unsupported image types using the PIL.
-#epub_fix_images = False
+# epub_fix_images = False
# Scale large images.
-#epub_max_image_width = 0
+# epub_max_image_width = 0
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#epub_show_urls = 'inline'
+# epub_show_urls = 'inline'
# If false, no index is generated.
-#epub_use_index = True
+# epub_use_index = True
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..17ba1fbc
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+title = "taca"
+
+
+[tool.ruff.lint]
+select =[
+ # Ruff default rules
+ # ------------------------------
+ "E4", # pycodestyle Imports
+ "E7", # pycodestyle Statements
+ "E9", # pycodestyle Runtime
+ "F", # Pyflakes
+
+ # Additional Comment
+ # ------------------------------------------------------
+ "I", # isort Best-practice sorting of imports
+ "UP", # pyupgrade Make sure syntax is up-to-date
+]
+ignore = [
+ "E402", # Module level import not at top of file
+ "E722", # Do not use bare 'except'
+ "E741", # Ambiguous variable name
+]
+
+
+[tool.mypy]
+ignore_missing_imports = true
+follow_imports = 'skip'
diff --git a/requirements-dev.txt b/requirements-dev.txt
index af58407f..9118bd64 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -5,3 +5,8 @@ mock
sphinx
sphinx-rtd-theme
pytest
+ipython
+ipdb
+ruff
+mypy
+pipreqs
diff --git a/requirements.txt b/requirements.txt
index b2bc63b1..baf1d47c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,10 @@
+CouchDB
+PyYAML
click
-requests
-pyyaml
flowcell_parser @ git+https://github.com/SciLifeLab/flowcell_parser
-soupsieve<2.0
-beautifulsoup4
-python-crontab
pandas
+pytest
+python_crontab
+python_dateutil
+setuptools
+sphinx_rtd_theme
diff --git a/setup.py b/setup.py
index cc05b49c..e278a522 100644
--- a/setup.py
+++ b/setup.py
@@ -1,50 +1,49 @@
-from setuptools import setup, find_packages
import glob
-import os
-import sys
+
+from setuptools import find_packages, setup
from taca import __version__
-from io import open
try:
- with open("requirements.txt", "r") as f:
+ with open("requirements.txt") as f:
install_requires = [x.strip() for x in f.readlines()]
-except IOError:
+except OSError:
install_requires = []
try:
- with open("dependency_links.txt", "r") as f:
+ with open("dependency_links.txt") as f:
dependency_links = [x.strip() for x in f.readlines()]
-except IOError:
+except OSError:
dependency_links = []
-setup(name='taca',
+setup(
+ name="taca",
version=__version__,
description="Tool for the Automation of Cleanup and Analyses",
- long_description='This package contains a set of functionalities that are '
- 'useful in the day-to-day tasks of bioinformatitians in '
- 'National Genomics Infrastructure in Stockholm, Sweden.',
- keywords='bioinformatics',
- author='NGI-stockholm',
- author_email='ngi_pipeline_operators@scilifelab.se',
- url='http://taca.readthedocs.org/en/latest/',
- license='MIT',
- packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
- scripts=glob.glob('scripts/*.py'),
+ long_description="This package contains a set of functionalities that are "
+ "useful in the day-to-day tasks of bioinformatitians in "
+ "National Genomics Infrastructure in Stockholm, Sweden.",
+ keywords="bioinformatics",
+ author="NGI-stockholm",
+ author_email="ngi_pipeline_operators@scilifelab.se",
+ url="http://taca.readthedocs.org/en/latest/",
+ license="MIT",
+ packages=find_packages(exclude=["ez_setup", "examples", "tests"]),
+ scripts=glob.glob("scripts/*.py"),
include_package_data=True,
zip_safe=False,
entry_points={
- 'console_scripts': ['taca = taca.cli:cli'],
- 'taca.subcommands': [
- 'cleanup = taca.cleanup.cli:cleanup',
- 'analysis = taca.analysis.cli:analysis',
- 'bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries',
- 'server_status = taca.server_status.cli:server_status',
- 'backup = taca.backup.cli:backup',
- 'create_env = taca.testing.cli:uppmax_env'
- ]
+ "console_scripts": ["taca = taca.cli:cli"],
+ "taca.subcommands": [
+ "cleanup = taca.cleanup.cli:cleanup",
+ "analysis = taca.analysis.cli:analysis",
+ "bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries",
+ "server_status = taca.server_status.cli:server_status",
+ "backup = taca.backup.cli:backup",
+ "create_env = taca.testing.cli:uppmax_env",
+ ],
},
install_requires=install_requires,
- dependency_links=dependency_links
+ dependency_links=dependency_links,
)
diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py
index c817b064..c19f0582 100755
--- a/taca/analysis/analysis.py
+++ b/taca/analysis/analysis.py
@@ -2,22 +2,19 @@
import glob
import logging
import os
-import sys
import subprocess
+import sys
+from shutil import copyfile, copytree
+
+from flowcell_parser.classes import RunParametersParser
-from shutil import copyfile
-from shutil import copytree
-from taca.illumina.Standard_Runs import Standard_Run
from taca.illumina.MiSeq_Runs import MiSeq_Run
from taca.illumina.NextSeq_Runs import NextSeq_Run
from taca.illumina.NovaSeq_Runs import NovaSeq_Run
from taca.illumina.NovaSeqXPlus_Runs import NovaSeqXPlus_Run
+from taca.utils import statusdb
from taca.utils.config import CONFIG
from taca.utils.transfer import RsyncAgent
-from taca.utils import statusdb
-
-from flowcell_parser.classes import RunParametersParser
-from io import open
logger = logging.getLogger(__name__)
@@ -32,54 +29,68 @@ def get_runObj(run, software):
None if the sequencer type is unknown of there was an error
"""
- if os.path.exists(os.path.join(run, 'runParameters.xml')):
- run_parameters_file = 'runParameters.xml'
- elif os.path.exists(os.path.join(run, 'RunParameters.xml')):
- run_parameters_file = 'RunParameters.xml'
+ if os.path.exists(os.path.join(run, "runParameters.xml")):
+ run_parameters_file = "runParameters.xml"
+ elif os.path.exists(os.path.join(run, "RunParameters.xml")):
+ run_parameters_file = "RunParameters.xml"
else:
- logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run))
+ logger.error(
+ f"Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}"
+ )
return
run_parameters_path = os.path.join(run, run_parameters_file)
try:
run_parameters = RunParametersParser(run_parameters_path)
except OSError:
- logger.warn('Problems parsing the runParameters.xml file at {}. '
- 'This is quite unexpected. please archive the run {} manually'.format(run_parameters_path, run))
+ logger.warn(
+ f"Problems parsing the runParameters.xml file at {run_parameters_path}. "
+ f"This is quite unexpected. please archive the run {run} manually"
+ )
else:
# Do a case by case test because there are so many version of RunParameters that there is no real other way
- runtype = run_parameters.data['RunParameters'].get('InstrumentType',
- run_parameters.data['RunParameters'].get('ApplicationName',
- run_parameters.data['RunParameters'].get('Application',
- '')))
- if 'Setup' in run_parameters.data['RunParameters']:
+ runtype = run_parameters.data["RunParameters"].get(
+ "InstrumentType",
+ run_parameters.data["RunParameters"].get(
+ "ApplicationName",
+ run_parameters.data["RunParameters"].get("Application", ""),
+ ),
+ )
+ if "Setup" in run_parameters.data["RunParameters"]:
# This is the HiSeq2500, MiSeq, and HiSeqX case
try:
# Works for recent control software
- runtype = run_parameters.data['RunParameters']['Setup']['Flowcell']
+ runtype = run_parameters.data["RunParameters"]["Setup"]["Flowcell"]
except KeyError:
# Use this as second resource but print a warning in the logs
- logger.warn('Parsing runParameters to fecth instrument type, '
- 'not found Flowcell information in it. Using ApplicationName')
+ logger.warn(
+ "Parsing runParameters to fecth instrument type, "
+ "not found Flowcell information in it. Using ApplicationName"
+ )
# Here makes sense to use get with default value '' ->
# so that it doesn't raise an exception in the next lines
# (in case ApplicationName is not found, get returns None)
- runtype = run_parameters.data['RunParameters']['Setup'].get('ApplicationName', '')
-
- if 'MiSeq' in runtype:
- return MiSeq_Run(run, software, CONFIG['analysis']['MiSeq'])
- elif 'NextSeq' in runtype:
- return NextSeq_Run(run, software, CONFIG['analysis']['NextSeq'])
- elif 'NovaSeqXPlus' in runtype:
- return NovaSeqXPlus_Run(run, software, CONFIG['analysis']['NovaSeqXPlus'])
- elif 'NovaSeq' in runtype:
- return NovaSeq_Run(run, software, CONFIG['analysis']['NovaSeq'])
+ runtype = run_parameters.data["RunParameters"]["Setup"].get(
+ "ApplicationName", ""
+ )
+
+ if "MiSeq" in runtype:
+ return MiSeq_Run(run, software, CONFIG["analysis"]["MiSeq"])
+ elif "NextSeq" in runtype:
+ return NextSeq_Run(run, software, CONFIG["analysis"]["NextSeq"])
+ elif "NovaSeqXPlus" in runtype:
+ return NovaSeqXPlus_Run(run, software, CONFIG["analysis"]["NovaSeqXPlus"])
+ elif "NovaSeq" in runtype:
+ return NovaSeq_Run(run, software, CONFIG["analysis"]["NovaSeq"])
else:
- logger.warn('Unrecognized run type {}, cannot archive the run {}. '
- 'Someone as likely bought a new sequencer without telling '
- 'it to the bioinfo team'.format(runtype, run))
+ logger.warn(
+ f"Unrecognized run type {runtype}, cannot archive the run {run}. "
+ "Someone as likely bought a new sequencer without telling "
+ "it to the bioinfo team"
+ )
return None
+
def upload_to_statusdb(run_dir, software):
"""Function to upload run_dir informations to statusDB directly from click interface.
@@ -93,60 +104,80 @@ def upload_to_statusdb(run_dir, software):
# Make the actual upload
_upload_to_statusdb(runObj)
+
def _upload_to_statusdb(run):
"""Triggers the upload to statusdb using the dependency flowcell_parser.
:param Run run: the object run
"""
- couch_conf = CONFIG['statusdb']
+ couch_conf = CONFIG["statusdb"]
couch_connection = statusdb.StatusdbSession(couch_conf).connection
- db = couch_connection[couch_conf['xten_db']]
+ db = couch_connection[couch_conf["xten_db"]]
parser = run.runParserObj
# Check if I have NoIndex lanes
- for element in parser.obj['samplesheet_csv']:
- if 'NoIndex' in element['index'] or not element['index']: # NoIndex in the case of HiSeq, empty in the case of HiSeqX
- lane = element['Lane'] # This is a lane with NoIndex
+ for element in parser.obj["samplesheet_csv"]:
+ if (
+ "NoIndex" in element["index"] or not element["index"]
+ ): # NoIndex in the case of HiSeq, empty in the case of HiSeqX
+ lane = element["Lane"] # This is a lane with NoIndex
# In this case PF Cluster is the number of undetermined reads
try:
- PFclusters = parser.obj['Undetermined'][lane]['unknown']
+ PFclusters = parser.obj["Undetermined"][lane]["unknown"]
except KeyError:
- logger.error('While taking extra care of lane {} of NoIndex type ' \
- 'I found out that not all values were available'.format(lane))
+ logger.error(
+ f"While taking extra care of lane {lane} of NoIndex type "
+ "I found out that not all values were available"
+ )
continue
# In Lanes_stats fix the lane yield
- parser.obj['illumina']['Demultiplex_Stats']['Lanes_stats'][int(lane) - 1]['PF Clusters'] = str(PFclusters)
+ parser.obj["illumina"]["Demultiplex_Stats"]["Lanes_stats"][int(lane) - 1][
+ "PF Clusters"
+ ] = str(PFclusters)
# Now fix Barcode lane stats
- updated = 0 # Check that only one update is made
- for sample in parser.obj['illumina']['Demultiplex_Stats']['Barcode_lane_statistics']:
- if lane in sample['Lane']:
+ updated = 0 # Check that only one update is made
+ for sample in parser.obj["illumina"]["Demultiplex_Stats"][
+ "Barcode_lane_statistics"
+ ]:
+ if lane in sample["Lane"]:
updated += 1
- sample['PF Clusters'] = str(PFclusters)
+ sample["PF Clusters"] = str(PFclusters)
if updated != 1:
- logger.error('While taking extra care of lane {} of NoIndex type '
- 'I updated more than once the barcode_lane. '
- 'This is too much to continue so I will fail.'.format(lane))
+ logger.error(
+ f"While taking extra care of lane {lane} of NoIndex type "
+ "I updated more than once the barcode_lane. "
+ "This is too much to continue so I will fail."
+ )
os.sys.exit()
# If I am here it means I changed the HTML representation to something
# else to accomodate the wired things we do
# someone told me that in such cases it is better to put a place holder for this
- parser.obj['illumina']['Demultiplex_Stats']['NotOriginal'] = 'True'
+ parser.obj["illumina"]["Demultiplex_Stats"]["NotOriginal"] = "True"
# Update info about bcl2fastq tool
- if not parser.obj.get('DemultiplexConfig'):
- parser.obj['DemultiplexConfig'] = {'Setup': {'Software': run.CONFIG.get('bcl2fastq', {})}}
+ if not parser.obj.get("DemultiplexConfig"):
+ parser.obj["DemultiplexConfig"] = {
+ "Setup": {"Software": run.CONFIG.get("bcl2fastq", {})}
+ }
statusdb.update_doc(db, parser.obj, over_write_db_entry=True)
-def transfer_run(run_dir):
+
+def transfer_run(run_dir, software):
"""Interface for click to force a transfer a run to uppmax.
:param: string run_dir: the run to tranfer
"""
runObj = get_runObj(run_dir, software)
- mail_recipients = CONFIG.get('mail', {}).get('recipients')
+ mail_recipients = CONFIG.get("mail", {}).get("recipients")
if runObj is None:
- mail_recipients = CONFIG.get('mail', {}).get('recipients')
- logger.error('Trying to force a transfer of run {} but the sequencer was not recognized.'.format(run_dir))
+ mail_recipients = CONFIG.get("mail", {}).get("recipients")
+ logger.error(
+ f"Trying to force a transfer of run {run_dir} but the sequencer was not recognized."
+ )
else:
- runObj.transfer_run(os.path.join('nosync', CONFIG['analysis']['status_dir'], 'transfer.tsv'), mail_recipients)
+ runObj.transfer_run(
+ os.path.join("nosync", CONFIG["analysis"]["status_dir"], "transfer.tsv"),
+ mail_recipients,
+ )
+
def transfer_runfolder(run_dir, pid, exclude_lane):
"""Transfer the entire run folder for a specified project and run to uppmax.
@@ -159,82 +190,105 @@ def transfer_runfolder(run_dir, pid, exclude_lane):
# Validate whether run_dir exists or is valid
run_dir = os.path.abspath(run_dir)
if not os.path.exists(run_dir) or not os.path.isdir(run_dir):
- logger.error('Unable to locate the specified run directory for transfer.')
+ logger.error("Unable to locate the specified run directory for transfer.")
sys.exit()
- original_sample_sheet = os.path.join(run_dir, 'SampleSheet.csv')
- pid_list = list(set([x.strip() for x in pid.split(',')]))
- new_sample_sheet = os.path.join(run_dir, '_'.join(pid_list) + '_SampleSheet.txt')
+ original_sample_sheet = os.path.join(run_dir, "SampleSheet.csv")
+ pid_list = list(set([x.strip() for x in pid.split(",")]))
+ new_sample_sheet = os.path.join(run_dir, "_".join(pid_list) + "_SampleSheet.txt")
# Write new sample sheet including only rows for the specified project
try:
- with open(new_sample_sheet, 'w') as nss:
+ with open(new_sample_sheet, "w") as nss:
nss.write(extract_project_samplesheet(original_sample_sheet, pid_list))
- except IOError as e:
- logger.error('An error occured while parsing the samplesheet. '
- 'Please check the sample sheet and try again.')
+ except OSError as e:
+ logger.error(
+ "An error occured while parsing the samplesheet. "
+ "Please check the sample sheet and try again."
+ )
raise e
# Create a tar archive of the runfolder
dir_name = os.path.basename(run_dir)
- archive = run_dir + '.tar.gz'
+ archive = run_dir + ".tar.gz"
run_dir_path = os.path.dirname(run_dir)
# Prepare the options for excluding lanes
- if exclude_lane != '':
+ if exclude_lane != "":
dir_for_excluding_lane = []
- lane_to_exclude = exclude_lane.split(',')
+ lane_to_exclude = exclude_lane.split(",")
for lane in lane_to_exclude:
- if os.path.isdir('{}/{}/Thumbnail_Images/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Thumbnail_Images/L00{}'.format(lane)])
- if os.path.isdir('{}/{}/Images/Focus/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Images/Focus/L00{}'.format(lane)])
- if os.path.isdir('{}/{}/Data/Intensities/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/L00{}'.format(lane)])
- if os.path.isdir('{}/{}/Data/Intensities/BaseCalls/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/BaseCalls/L00{}'.format(lane)])
+ if os.path.isdir(f"{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}"):
+ dir_for_excluding_lane.extend(
+ ["--exclude", f"Thumbnail_Images/L00{lane}"]
+ )
+ if os.path.isdir(f"{run_dir_path}/{dir_name}/Images/Focus/L00{lane}"):
+ dir_for_excluding_lane.extend(["--exclude", f"Images/Focus/L00{lane}"])
+ if os.path.isdir(f"{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}"):
+ dir_for_excluding_lane.extend(
+ ["--exclude", f"Data/Intensities/L00{lane}"]
+ )
+ if os.path.isdir(
+ f"{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}"
+ ):
+ dir_for_excluding_lane.extend(
+ ["--exclude", f"Data/Intensities/BaseCalls/L00{lane}"]
+ )
try:
- exclude_options_for_tar = ['--exclude', 'Demultiplexing*',
- '--exclude', 'demux_*',
- '--exclude', 'rsync*',
- '--exclude', '*.csv']
- if exclude_lane != '':
+ exclude_options_for_tar = [
+ "--exclude",
+ "Demultiplexing*",
+ "--exclude",
+ "demux_*",
+ "--exclude",
+ "rsync*",
+ "--exclude",
+ "*.csv",
+ ]
+ if exclude_lane != "":
exclude_options_for_tar += dir_for_excluding_lane
- subprocess.call(['tar'] + exclude_options_for_tar + ['-cvzf', archive, '-C', run_dir_path, dir_name])
+ subprocess.call(
+ ["tar"]
+ + exclude_options_for_tar
+ + ["-cvzf", archive, "-C", run_dir_path, dir_name]
+ )
except subprocess.CalledProcessError as e:
- logger.error('Error creating tar archive')
+ logger.error("Error creating tar archive")
raise e
# Generate the md5sum under the same folder as run_dir
- md5file = archive + '.md5'
+ md5file = archive + ".md5"
try:
- f = open(md5file, 'w')
+ f = open(md5file, "w")
os.chdir(run_dir_path)
- subprocess.call(['md5sum', os.path.basename(archive)], stdout=f)
+ subprocess.call(["md5sum", os.path.basename(archive)], stdout=f)
f.close()
except subprocess.CalledProcessError as e:
- logger.error('Error creating md5 file')
+ logger.error("Error creating md5 file")
raise e
# Rsync the files to the analysis cluster
- destination = CONFIG['analysis']['deliver_runfolder'].get('destination')
- rsync_opts = {'-LtDrv': None,
- '--chmod': 'g+rw'}
- connection_details = CONFIG['analysis']['deliver_runfolder'].get('analysis_server')
- archive_transfer = RsyncAgent(archive,
- dest_path=destination,
- remote_host=connection_details['host'],
- remote_user=connection_details['user'],
- validate=False,
- opts=rsync_opts)
- md5_transfer = RsyncAgent(md5file,
- dest_path=destination,
- remote_host=connection_details['host'],
- remote_user=connection_details['user'],
- validate=False,
- opts=rsync_opts)
+ destination = CONFIG["analysis"]["deliver_runfolder"].get("destination")
+ rsync_opts = {"-LtDrv": None, "--chmod": "g+rw"}
+ connection_details = CONFIG["analysis"]["deliver_runfolder"].get("analysis_server")
+ archive_transfer = RsyncAgent(
+ archive,
+ dest_path=destination,
+ remote_host=connection_details["host"],
+ remote_user=connection_details["user"],
+ validate=False,
+ opts=rsync_opts,
+ )
+ md5_transfer = RsyncAgent(
+ md5file,
+ dest_path=destination,
+ remote_host=connection_details["host"],
+ remote_user=connection_details["user"],
+ validate=False,
+ opts=rsync_opts,
+ )
archive_transfer.transfer()
md5_transfer.transfer()
@@ -244,83 +298,108 @@ def transfer_runfolder(run_dir, pid, exclude_lane):
os.remove(new_sample_sheet)
os.remove(archive)
os.remove(md5file)
- except IOError as e:
- logger.error('Was not able to delete all temporary files')
+ except OSError as e:
+ logger.error("Was not able to delete all temporary files")
raise e
return
+
def extract_project_samplesheet(sample_sheet, pid_list):
- header_line = ''
- project_entries = ''
+ header_line = ""
+ project_entries = ""
with open(sample_sheet) as f:
for line in f:
- if line.split(',')[0] in ('Lane', 'FCID'): # include the header
+ if line.split(",")[0] in ("Lane", "FCID"): # include the header
header_line += line
elif any(pid in line for pid in pid_list):
- project_entries += line # include only lines related to the specified project
+ project_entries += (
+ line # include only lines related to the specified project
+ )
new_samplesheet_content = header_line + project_entries
return new_samplesheet_content
+
def run_preprocessing(run, software):
"""Run demultiplexing in all data directories.
:param str run: Process a particular run instead of looking for runs
"""
+
def _process(run):
"""Process a run/flowcell and transfer to analysis server.
:param taca.illumina.Run run: Run to be processed and transferred
"""
- logger.info('Checking run {}'.format(run.id))
- transfer_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv')
- if run.is_transferred(transfer_file): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot
- logger.info('Run {} already transferred to analysis server, skipping it'.format(run.id))
+ logger.info(f"Checking run {run.id}")
+ transfer_file = os.path.join(CONFIG["analysis"]["status_dir"], "transfer.tsv")
+ if run.is_transferred(
+ transfer_file
+ ): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot
+ logger.info(
+ f"Run {run.id} already transferred to analysis server, skipping it"
+ )
return
- if run.get_run_status() == 'SEQUENCING':
- logger.info('Run {} is not finished yet'.format(run.id))
- if 'statusdb' in CONFIG:
+ if run.get_run_status() == "SEQUENCING":
+ logger.info(f"Run {run.id} is not finished yet")
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
- elif run.get_run_status() == 'TO_START':
- if run.get_run_type() == 'NON-NGI-RUN':
+ elif run.get_run_status() == "TO_START":
+ if run.get_run_type() == "NON-NGI-RUN":
# For now MiSeq specific case. Process only NGI-run, skip all the others (PhD student runs)
- logger.warn('Run {} marked as {}, '
- 'TACA will skip this and move the run to '
- 'no-sync directory'.format(run.id, run.get_run_type()))
- if 'storage' in CONFIG:
- run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
+ logger.warn(
+ f"Run {run.id} marked as {run.get_run_type()}, "
+ "TACA will skip this and move the run to "
+ "no-sync directory"
+ )
+ if "storage" in CONFIG:
+ run.archive_run(
+ CONFIG["storage"]["archive_dirs"][run.sequencer_type]
+ )
return
- logger.info(('Starting BCL to FASTQ conversion and demultiplexing for run {}'.format(run.id)))
- if 'statusdb' in CONFIG:
+ logger.info(
+ f"Starting BCL to FASTQ conversion and demultiplexing for run {run.id}"
+ )
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
run.demultiplex_run()
- elif run.get_run_status() == 'IN_PROGRESS':
- logger.info(('BCL conversion and demultiplexing process in '
- 'progress for run {}, skipping it'.format(run.id)))
+ elif run.get_run_status() == "IN_PROGRESS":
+ logger.info(
+ "BCL conversion and demultiplexing process in "
+ f"progress for run {run.id}, skipping it"
+ )
# Upload to statusDB if applies
- if 'statusdb' in CONFIG:
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
# This function checks if demux is done
run.check_run_status()
# Previous elif might change the status to COMPLETED, therefore to avoid skipping
# a cycle take the last if out of the elif
- if run.get_run_status() == 'COMPLETED':
+ if run.get_run_status() == "COMPLETED":
run.check_run_status()
- logger.info(('Preprocessing of run {} is finished, transferring it'.format(run.id)))
+ logger.info(f"Preprocessing of run {run.id} is finished, transferring it")
# Upload to statusDB if applies
- if 'statusdb' in CONFIG:
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
demux_summary_message = []
for demux_id, demux_log in run.demux_summary.items():
- if demux_log['errors'] or demux_log['warnings']:
- demux_summary_message.append("Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(demux_id, demux_log['errors'], demux_log['warnings']))
- demux_summary_message.append("\n".join(demux_log['error_and_warning_messages'][:5]))
- if len(demux_log['error_and_warning_messages'])>5:
- demux_summary_message.append("...... Only the first 5 errors or warnings are displayed for Demultiplexing_{}.".format(demux_id))
+ if demux_log["errors"] or demux_log["warnings"]:
+ demux_summary_message.append(
+ "Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(
+ demux_id, demux_log["errors"], demux_log["warnings"]
+ )
+ )
+ demux_summary_message.append(
+ "\n".join(demux_log["error_and_warning_messages"][:5])
+ )
+ if len(demux_log["error_and_warning_messages"]) > 5:
+ demux_summary_message.append(
+ f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}."
+ )
# Notify with a mail run completion and stats uploaded
if demux_summary_message:
- sbt = ("{} Demultiplexing Completed with ERRORs or WARNINGS!".format(run.id))
+ sbt = f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!"
msg = """The run {run} has been demultiplexed with errors or warnings!
{errors_warnings}
@@ -329,9 +408,11 @@ def _process(run):
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
- """.format(errors_warnings='\n'.join(demux_summary_message), run=run.id)
+ """.format(
+ errors_warnings="\n".join(demux_summary_message), run=run.id
+ )
else:
- sbt = ("{} Demultiplexing Completed!".format(run.id))
+ sbt = f"{run.id} Demultiplexing Completed!"
msg = """The run {run} has been demultiplexed without any error or warning.
The Run will be transferred to the analysis cluster for further analysis.
@@ -339,67 +420,103 @@ def _process(run):
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
""".format(run=run.id)
- run.send_mail(sbt, msg, rcp=CONFIG['mail']['recipients'])
+ run.send_mail(sbt, msg, rcp=CONFIG["mail"]["recipients"])
# Copy demultiplex stats file, InterOp meta data and run xml files to shared file system for LIMS purpose
- if 'mfs_path' in CONFIG['analysis']:
+ if "mfs_path" in CONFIG["analysis"]:
try:
- mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id)
- logger.info('Copying demultiplex stats, InterOp metadata and XML files for run {} to {}'.format(run.id, mfs_dest))
+ mfs_dest = os.path.join(
+ CONFIG["analysis"]["mfs_path"][run.sequencer_type.lower()],
+ run.id,
+ )
+ logger.info(
+ f"Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}"
+ )
if not os.path.exists(mfs_dest):
os.mkdir(mfs_dest)
- demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports',
- 'html', run.flowcell_id, 'all', 'all', 'all', 'laneBarcode.html')
- copyfile(demulti_stat_src, os.path.join(mfs_dest, 'laneBarcode.html'))
+ demulti_stat_src = os.path.join(
+ run.run_dir,
+ run.demux_dir,
+ "Reports",
+ "html",
+ run.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
+ copyfile(
+ demulti_stat_src, os.path.join(mfs_dest, "laneBarcode.html")
+ )
# Copy RunInfo.xml
- run_info_xml_src = os.path.join(run.run_dir, 'RunInfo.xml')
+ run_info_xml_src = os.path.join(run.run_dir, "RunInfo.xml")
if os.path.isfile(run_info_xml_src):
- copyfile(run_info_xml_src, os.path.join(mfs_dest, 'RunInfo.xml'))
+ copyfile(
+ run_info_xml_src, os.path.join(mfs_dest, "RunInfo.xml")
+ )
# Copy RunParameters.xml
- run_parameters_xml_src = os.path.join(run.run_dir, 'RunParameters.xml')
+ run_parameters_xml_src = os.path.join(
+ run.run_dir, "RunParameters.xml"
+ )
if os.path.isfile(run_info_xml_src):
- copyfile(run_parameters_xml_src, os.path.join(mfs_dest, 'RunParameters.xml'))
+ copyfile(
+ run_parameters_xml_src,
+ os.path.join(mfs_dest, "RunParameters.xml"),
+ )
# Copy InterOp
- interop_src = os.path.join(run.run_dir, 'InterOp')
+ interop_src = os.path.join(run.run_dir, "InterOp")
if os.path.exists(interop_src):
- copytree(interop_src, os.path.join(mfs_dest, 'InterOp'), dirs_exist_ok=True)
+ copytree(
+ interop_src,
+ os.path.join(mfs_dest, "InterOp"),
+ dirs_exist_ok=True,
+ )
except:
- logger.warn('Could not copy demultiplex stats, InterOp metadata or XML files for run {}'.format(run.id))
+ logger.warn(
+ f"Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}"
+ )
# Transfer to analysis server if flag is True
if run.transfer_to_analysis_server:
- mail_recipients = CONFIG.get('mail', {}).get('recipients')
- logger.info('Transferring run {} to {} into {}'
- .format(run.id,
- run.CONFIG['analysis_server']['host'],
- run.CONFIG['analysis_server']['sync']['data_archive']))
+ mail_recipients = CONFIG.get("mail", {}).get("recipients")
+ logger.info(
+ "Transferring run {} to {} into {}".format(
+ run.id,
+ run.CONFIG["analysis_server"]["host"],
+ run.CONFIG["analysis_server"]["sync"]["data_archive"],
+ )
+ )
run.transfer_run(transfer_file, mail_recipients)
# Archive the run if indicated in the config file
- if 'storage' in CONFIG: #TODO: make sure archiving to PDC is not ongoing
- run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
+ if "storage" in CONFIG: # TODO: make sure archiving to PDC is not ongoing
+ run.archive_run(CONFIG["storage"]["archive_dirs"][run.sequencer_type])
if run:
# Determine the run type
runObj = get_runObj(run, software)
if not runObj:
- raise RuntimeError("Unrecognized instrument type or incorrect run folder {}".format(run))
+ raise RuntimeError(
+ f"Unrecognized instrument type or incorrect run folder {run}"
+ )
else:
_process(runObj)
else:
- data_dirs = CONFIG.get('analysis').get('data_dirs')
+ data_dirs = CONFIG.get("analysis").get("data_dirs")
for data_dir in data_dirs:
# Run folder looks like DATE_*_*_*, the last section is the FC name.
- runs = glob.glob(os.path.join(data_dir, '[1-9]*_*_*_*'))
+ runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*"))
for _run in runs:
runObj = get_runObj(_run, software)
if not runObj:
- logger.warning('Unrecognized instrument type or incorrect run folder {}'.format(run))
+ logger.warning(
+                        f"Unrecognized instrument type or incorrect run folder {_run}"
+ )
else:
try:
_process(runObj)
except:
# This function might throw and exception,
# it is better to continue processing other runs
- logger.warning('There was an error processing the run {}'.format(run))
+                    logger.warning(f"There was an error processing the run {_run}")
pass
diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py
index 74e4c3ef..4f8f9345 100644
--- a/taca/analysis/analysis_nanopore.py
+++ b/taca/analysis/analysis_nanopore.py
@@ -1,17 +1,17 @@
"""Nanopore analysis methods for TACA."""
-import os
import logging
+import os
import re
import traceback
-from taca.utils.config import CONFIG
-from taca.utils.misc import send_mail
from taca.nanopore.ONT_run_classes import (
+ ONT_RUN_PATTERN,
+ ONT_qc_run,
ONT_run,
ONT_user_run,
- ONT_qc_run,
- ONT_RUN_PATTERN,
)
+from taca.utils.config import CONFIG
+from taca.utils.misc import send_mail
logger = logging.getLogger(__name__)
@@ -36,7 +36,6 @@ def find_run_dirs(dir_to_search: str, skip_dirs: list):
def send_error_mail(run_name, error: BaseException):
-
email_subject = f"Run processed with errors: {run_name}"
email_message = f"{str(error)}\n\n{traceback.format_exc()}"
email_recipients = CONFIG["mail"]["recipients"]
@@ -75,7 +74,6 @@ def process_user_run(ont_user_run: ONT_user_run):
if not ont_user_run.is_synced():
logger.info(f"{ont_user_run.run_name}: Run is not fully synced, skipping.")
else:
-
if ont_user_run.is_transferred():
logger.warning(
f"{ont_user_run.run_name}: Run is already logged as transferred, sending mail."
@@ -157,7 +155,6 @@ def process_qc_run(ont_qc_run: ONT_qc_run):
if not ont_qc_run.is_synced():
logger.info(f"{ont_qc_run.run_name}: Run is not fully synced, skipping.")
else:
-
# Assert all files are in place
logger.info(f"{ont_qc_run.run_name}: Asserting run contents...")
ont_qc_run.assert_contents()
@@ -209,6 +206,10 @@ def process_qc_run(ont_qc_run: ONT_qc_run):
logger.info(
f"{ont_qc_run.run_name}: Could not find Anglerfish sample sheet, skipping."
)
+ elif not ont_qc_run.has_fastq_output():
+ logger.info(
+ f"{ont_qc_run.run_name}: Run has no fastq output, skipping."
+ )
else:
logger.info(f"{ont_qc_run.run_name}: Starting Anglerfish...")
ont_qc_run.run_anglerfish()
@@ -243,7 +244,7 @@ def process_qc_run(ont_qc_run: ONT_qc_run):
ont_qc_run.archive_run()
-def ont_transfer(run_abspath: str or None, qc: bool = False):
+def ont_transfer(run_abspath: str | None, qc: bool = False):
"""CLI entry function.
Find finished ONT runs in ngi-nas and transfer to HPC cluster.
@@ -257,7 +258,6 @@ def ont_transfer(run_abspath: str or None, qc: bool = False):
# If no run is specified, locate all runs
else:
-
for run_type in ["user_run", "qc_run"]:
logger.info(f"Looking for runs of type '{run_type}'...")
diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py
index ba101d66..52b6423b 100644
--- a/taca/analysis/cli.py
+++ b/taca/analysis/cli.py
@@ -13,21 +13,42 @@ def analysis():
# Illumina analysis subcommands
+
@analysis.command()
-@click.option('-r', '--run', type=click.Path(exists=True), default=None,
- help='Demultiplex only a particular run')
-@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq',
- help='Available software for demultiplexing: bcl2fastq (default), bclconvert')
+@click.option(
+ "-r",
+ "--run",
+ type=click.Path(exists=True),
+ default=None,
+ help="Demultiplex only a particular run",
+)
+@click.option(
+ "-s",
+ "--software",
+ type=click.Choice(["bcl2fastq", "bclconvert"]),
+ default="bcl2fastq",
+ help="Available software for demultiplexing: bcl2fastq (default), bclconvert",
+)
def demultiplex(run, software):
- """Demultiplex and transfer all runs present in the data directories."""
- an.run_preprocessing(run, software)
+ """Demultiplex and transfer all runs present in the data directories."""
+ an.run_preprocessing(run, software)
+
@analysis.command()
-@click.option('--runfolder-project', is_flag=False, help='Project IDs for runfolder transfer separated by comma')
-@click.option('--exclude-lane', default='', help='Lanes to exclude separated by comma')
-@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq',
- help='Available software for demultiplexing: bcl2fastq (default), bclconvert')
-@click.argument('rundir')
+@click.option(
+ "--runfolder-project",
+ is_flag=False,
+ help="Project IDs for runfolder transfer separated by comma",
+)
+@click.option("--exclude-lane", default="", help="Lanes to exclude separated by comma")
+@click.option(
+ "-s",
+ "--software",
+ type=click.Choice(["bcl2fastq", "bclconvert"]),
+ default="bcl2fastq",
+ help="Available software for demultiplexing: bcl2fastq (default), bclconvert",
+)
+@click.argument("rundir")
def transfer(rundir, runfolder_project, exclude_lane, software):
"""Transfers the run without qc."""
if not runfolder_project:
@@ -35,10 +56,16 @@ def transfer(rundir, runfolder_project, exclude_lane, software):
else:
an.transfer_runfolder(rundir, pid=runfolder_project, exclude_lane=exclude_lane)
+
@analysis.command()
-@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq',
- help='Available software for demultiplexing: bcl2fastq (default), bclconvert')
-@click.argument('rundir')
+@click.option(
+ "-s",
+ "--software",
+ type=click.Choice(["bcl2fastq", "bclconvert"]),
+ default="bcl2fastq",
+ help="Available software for demultiplexing: bcl2fastq (default), bclconvert",
+)
+@click.argument("rundir")
def updatedb(rundir, software):
"""Save the run to statusdb."""
an.upload_to_statusdb(rundir, software)
@@ -46,6 +73,7 @@ def updatedb(rundir, software):
# Nanopore analysis subcommands
+
@analysis.command()
@click.option(
"-r",
@@ -65,6 +93,7 @@ def ont_transfer(run, qc):
"""Find and process all runs"""
analysis_nanopore.ont_transfer(run, qc)
+
@analysis.command()
@click.argument("run")
def ont_updatedb(run):
diff --git a/taca/backup/backup.py b/taca/backup/backup.py
index 037b1ea6..8d43a558 100644
--- a/taca/backup/backup.py
+++ b/taca/backup/backup.py
@@ -1,55 +1,63 @@
"""Backup methods and utilities."""
+import csv
import logging
import os
import re
import shutil
import subprocess as sp
import time
-import csv
-
from datetime import datetime
+
+from taca.utils import filesystem, misc, statusdb
from taca.utils.config import CONFIG
-from taca.utils import statusdb, filesystem, misc
-from io import open
logger = logging.getLogger(__name__)
-class run_vars(object):
+
+class run_vars:
"""A simple variable storage class."""
+
def __init__(self, run, archive_path):
self.abs_path = os.path.abspath(run)
self.path, self.name = os.path.split(self.abs_path)
- self.name = self.name.split('.', 1)[0]
- self.zip = os.path.join(archive_path, f'{self.name}.tar.gz')
- self.key = '{}.key'.format(self.name)
- self.key_encrypted = '{}.key.gpg'.format(self.name)
- self.zip_encrypted = os.path.join(archive_path, f'{self.name}.tar.gz.gpg')
+ self.name = self.name.split(".", 1)[0]
+ self.zip = os.path.join(archive_path, f"{self.name}.tar.gz")
+ self.key = f"{self.name}.key"
+ self.key_encrypted = f"{self.name}.key.gpg"
+ self.zip_encrypted = os.path.join(archive_path, f"{self.name}.tar.gz.gpg")
+
-class backup_utils(object):
+class backup_utils:
"""A class object with main utility methods related to backing up."""
def __init__(self, run=None):
self.run = run
self.fetch_config_info()
- self.host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0]
+ self.host_name = os.getenv("HOSTNAME", os.uname()[1]).split(".", 1)[0]
def fetch_config_info(self):
"""Try to fecth required info from the config file. Log and exit if any neccesary info is missing."""
try:
- self.data_dirs = CONFIG['backup']['data_dirs']
- self.archive_dirs = CONFIG['backup']['archive_dirs']
- self.archived_dirs = CONFIG['backup']['archived_dirs']
- self.exclude_list = CONFIG['backup']['exclude_list']
- self.keys_path = CONFIG['backup']['keys_path']
- self.gpg_receiver = CONFIG['backup']['gpg_receiver']
- self.mail_recipients = CONFIG['mail']['recipients']
- self.check_demux = CONFIG.get('backup', {}).get('check_demux', False)
- self.couch_info = CONFIG.get('statusdb')
- self.finished_run_indicator = CONFIG.get('storage', {}).get('finished_run_indicator', 'RTAComplete.txt')
- self.copy_complete_indicator = CONFIG.get('storage', {}).get('copy_complete_indicator', 'CopyComplete.txt')
- self.archive_log_location = CONFIG['backup']['archive_log']
+ self.data_dirs = CONFIG["backup"]["data_dirs"]
+ self.archive_dirs = CONFIG["backup"]["archive_dirs"]
+ self.archived_dirs = CONFIG["backup"]["archived_dirs"]
+ self.exclude_list = CONFIG["backup"]["exclude_list"]
+ self.keys_path = CONFIG["backup"]["keys_path"]
+ self.gpg_receiver = CONFIG["backup"]["gpg_receiver"]
+ self.mail_recipients = CONFIG["mail"]["recipients"]
+ self.check_demux = CONFIG.get("backup", {}).get("check_demux", False)
+ self.couch_info = CONFIG.get("statusdb")
+ self.finished_run_indicator = CONFIG.get("storage", {}).get(
+ "finished_run_indicator", "RTAComplete.txt"
+ )
+ self.copy_complete_indicator = CONFIG.get("storage", {}).get(
+ "copy_complete_indicator", "CopyComplete.txt"
+ )
+ self.archive_log_location = CONFIG["backup"]["archive_log"]
except KeyError as e:
- logger.error('Config file is missing the key {}, make sure it have all required information'.format(str(e)))
+ logger.error(
+                f"Config file is missing the key {str(e)}, make sure it has all the required information"
+ )
raise SystemExit
def collect_runs(self, ext=None, filter_by_ext=False):
@@ -59,24 +67,30 @@ def collect_runs(self, ext=None, filter_by_ext=False):
run_type = self._get_run_type(self.run)
archive_path = self.archive_dirs[run_type]
run = run_vars(self.run, archive_path)
- if not (re.match(filesystem.RUN_RE, run.name) or re.match(filesystem.RUN_RE_ONT, run.name)):
- logger.error('Given run {} did not match a FC pattern'.format(self.run))
+ if not (
+ re.match(filesystem.RUN_RE, run.name)
+ or re.match(filesystem.RUN_RE_ONT, run.name)
+ ):
+ logger.error(f"Given run {self.run} did not match a FC pattern")
raise SystemExit
if self._is_ready_to_archive(run, ext):
self.runs.append(run)
else:
for adir in self.archive_dirs.values():
if not os.path.isdir(adir):
- logger.warn('Path {} does not exist or it is not a directory'.format(adir))
+ logger.warn(f"Path {adir} does not exist or it is not a directory")
continue
for item in os.listdir(adir):
if filter_by_ext and not item.endswith(ext):
continue
elif item.endswith(ext):
- item = item.replace(ext, '')
+ item = item.replace(ext, "")
elif not os.path.isdir(os.path.join(adir, item)):
continue
- if (re.match(filesystem.RUN_RE, item) or re.match(filesystem.RUN_RE_ONT, item)) and item not in self.runs:
+ if (
+ re.match(filesystem.RUN_RE, item)
+ or re.match(filesystem.RUN_RE_ONT, item)
+ ) and item not in self.runs:
run_type = self._get_run_type(item)
archive_path = self.archive_dirs[run_type]
run = run_vars(os.path.join(adir, item), archive_path)
@@ -86,7 +100,14 @@ def collect_runs(self, ext=None, filter_by_ext=False):
def avail_disk_space(self, path, run):
"""Check the space on file system based on parent directory of the run."""
# not able to fetch runtype use the max size as precaution, size units in GB
- illumina_run_sizes = {'novaseq': 1800, 'miseq': 20, 'nextseq': 250, 'NovaSeqXPlus': 3600, 'promethion': 3000, 'minion': 1000}
+ illumina_run_sizes = {
+ "novaseq": 1800,
+ "miseq": 20,
+ "nextseq": 250,
+ "NovaSeqXPlus": 3600,
+ "promethion": 3000,
+ "minion": 1000,
+ }
required_size = illumina_run_sizes.get(self._get_run_type(run), 900) * 2
# check for any ongoing runs and add up the required size accrdingly
for ddir in self.data_dirs.values():
@@ -95,19 +116,25 @@ def avail_disk_space(self, path, run):
for item in os.listdir(ddir):
if not re.match(filesystem.RUN_RE, item):
continue
- if not os.path.exists(os.path.join(ddir, item, 'RTAComplete.txt')):
- required_size += illumina_run_sizes.get(self._get_run_type(run), 900)
+ if not os.path.exists(os.path.join(ddir, item, "RTAComplete.txt")):
+ required_size += illumina_run_sizes.get(
+ self._get_run_type(run), 900
+ )
# get available free space from the file system
try:
- df_proc = sp.Popen(['df', path], stdout=sp.PIPE, stderr=sp.PIPE)
+ df_proc = sp.Popen(["df", path], stdout=sp.PIPE, stderr=sp.PIPE)
df_out, df_err = df_proc.communicate()
- available_size = int(df_out.strip().decode("utf-8").split('\n')[-1].strip().split()[3])/1024/1024
+ available_size = (
+ int(df_out.strip().decode("utf-8").split("\n")[-1].strip().split()[3])
+ / 1024
+ / 1024
+ )
except Exception as e:
- logger.error('Evaluation of disk space failed with error {}'.format(e))
+ logger.error(f"Evaluation of disk space failed with error {e}")
raise SystemExit
if available_size < required_size:
- e_msg = 'Required space for encryption is {}GB, but only {}GB available'.format(required_size, available_size)
- subjt = 'Low space for encryption - {}'.format(self.host_name)
+ e_msg = f"Required space for encryption is {required_size}GB, but only {available_size}GB available"
+ subjt = f"Low space for encryption - {self.host_name}"
logger.error(e_msg)
misc.send_mail(subjt, e_msg, self.mail_recipients)
raise SystemExit
@@ -118,47 +145,63 @@ def file_in_pdc(self, src_file, silent=True):
# non-zero/False though cmd is execudted but file not found
src_file_abs = os.path.abspath(src_file)
try:
- sp.check_call(['dsmc', 'query', 'archive', src_file_abs], stdout=sp.PIPE, stderr=sp.PIPE)
+ sp.check_call(
+ ["dsmc", "query", "archive", src_file_abs],
+ stdout=sp.PIPE,
+ stderr=sp.PIPE,
+ )
value = True
except sp.CalledProcessError:
value = False
if not silent:
- msg = 'File {} {} in PDC'.format(src_file_abs, 'exist' if value else 'do not exist')
+ msg = "File {} {} in PDC".format(
+                src_file_abs, "exists" if value else "does not exist"
+ )
logger.info(msg)
return value
def _get_run_type(self, run):
"""Returns run type based on the flowcell name."""
- run_type = ''
+ run_type = ""
try:
- if '_A0' in run:
- run_type = 'novaseq'
- elif '-' in run.split('_')[-1]:
- run_type = 'miseq'
- elif '_NS' in run or '_VH' in run:
- run_type = 'nextseq'
- elif '_LH' in run:
- run_type = 'NovaSeqXPlus'
- elif '_MN' in run:
- run_type = 'minion'
- elif re.match("^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$",run):
- run_type = 'promethion'
+ if "_A0" in run:
+ run_type = "novaseq"
+ elif "-" in run.split("_")[-1]:
+ run_type = "miseq"
+ elif "_NS" in run or "_VH" in run:
+ run_type = "nextseq"
+ elif "_LH" in run:
+ run_type = "NovaSeqXPlus"
+ elif "_MN" in run:
+ run_type = "minion"
+ elif re.match(
+                r"^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$", run
+ ):
+ run_type = "promethion"
else:
- run_type = ''
+ run_type = ""
except:
- logger.warn('Could not fetch run type for run {}'.format(run))
+ logger.warn(f"Could not fetch run type for run {run}")
return run_type
- def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_failed=False, tmp_files=[]):
+ def _call_commands(
+ self,
+ cmd1,
+ cmd2=None,
+ out_file=None,
+ return_out=False,
+ mail_failed=False,
+ tmp_files=[],
+ ):
"""Call an external command(s) with atmost two commands per function call.
Given 'out_file' is always used for the later cmd and also stdout can be return
for the later cmd. In case of failure, the 'tmp_files' are removed"""
if out_file:
if not cmd2:
- stdout1 = open(out_file, 'w')
+ stdout1 = open(out_file, "w")
else:
stdout1 = sp.PIPE
- stdout2 = open(out_file, 'w')
+ stdout2 = open(out_file, "w")
else:
stdout1 = sp.PIPE
stdout2 = sp.PIPE
@@ -171,7 +214,9 @@ def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_
p2 = sp.Popen(cmd2, stdin=p1.stdout, stdout=stdout2, stderr=sp.PIPE)
p2_stat = p2.wait()
p2_out, p2_err = p2.communicate()
- if not self._check_status(cmd2, p2_stat, p2_err, mail_failed, tmp_files):
+ if not self._check_status(
+ cmd2, p2_stat, p2_err, mail_failed, tmp_files
+ ):
return (False, p2_err) if return_out else False
p1_stat = p1.wait()
p1_out, p1_err = p1.communicate()
@@ -194,10 +239,12 @@ def _check_status(self, cmd, status, err_msg, mail_failed, files_to_remove=[]):
if status != 0:
self._clean_tmp_files(files_to_remove)
if mail_failed:
- subjt = 'Command call failed - {}'.format(self.host_name)
- e_msg = 'Called cmd: {}\n\nError msg: {}'.format(' '.join(cmd), err_msg)
+ subjt = f"Command call failed - {self.host_name}"
+ e_msg = "Called cmd: {}\n\nError msg: {}".format(" ".join(cmd), err_msg)
misc.send_mail(subjt, e_msg, self.mail_recipients)
- logger.error('Command "{}" failed with the error "{}"'.format(' '.join(cmd),err_msg))
+ logger.error(
+ 'Command "{}" failed with the error "{}"'.format(" ".join(cmd), err_msg)
+ )
return False
return True
@@ -210,22 +257,24 @@ def _clean_tmp_files(self, files):
def _log_pdc_statusdb(self, run):
"""Log the time stamp in statusDB if a file is succussfully sent to PDC."""
try:
- run_vals = run.split('_')
+ run_vals = run.split("_")
if len(run_vals[0]) == 8:
run_date = run_vals[0][2:]
else:
run_date = run_vals[0]
- run_fc = '{}_{}'.format(run_date, run_vals[-1])
+ run_fc = f"{run_date}_{run_vals[-1]}"
couch_connection = statusdb.StatusdbSession(self.couch_info).connection
- db = couch_connection[self.couch_info['db']]
- fc_names = {e.key:e.id for e in db.view('names/name', reduce=False)}
+ db = couch_connection[self.couch_info["db"]]
+ fc_names = {e.key: e.id for e in db.view("names/name", reduce=False)}
d_id = fc_names[run_fc]
doc = db.get(d_id)
- doc['pdc_archived'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ doc["pdc_archived"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
db.save(doc)
- logger.info('Logged "pdc_archived" timestamp for fc {} in statusdb doc "{}"'.format(run, d_id))
+ logger.info(
+ f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"'
+ )
except:
- logger.warn('Not able to log "pdc_archived" timestamp for run {}'.format(run))
+ logger.warn(f'Not able to log "pdc_archived" timestamp for run {run}')
def _is_ready_to_archive(self, run, ext):
"""Check if the run to be encrypted has finished sequencing and has been copied completely to nas"""
@@ -233,24 +282,35 @@ def _is_ready_to_archive(self, run, ext):
run_path = run.abs_path
rta_file = os.path.join(run_path, self.finished_run_indicator)
cp_file = os.path.join(run_path, self.copy_complete_indicator)
- if (os.path.exists(rta_file) and os.path.exists(cp_file) and (not self.file_in_pdc(run.zip_encrypted))) or (self._get_run_type(run.name) in ['promethion', 'minion'] and os.path.exists(os.path.join(run_path, ".sync_finished"))):
+ if (
+ os.path.exists(rta_file)
+ and os.path.exists(cp_file)
+ and (not self.file_in_pdc(run.zip_encrypted))
+ ) or (
+ self._get_run_type(run.name) in ["promethion", "minion"]
+ and os.path.exists(os.path.join(run_path, ".sync_finished"))
+ ):
# Case for encrypting
# Run has NOT been encrypted (run.tar.gz.gpg not exists)
- if ext == '.tar.gz' and (not os.path.exists(run.zip_encrypted)):
- logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for archiving')
+ if ext == ".tar.gz" and (not os.path.exists(run.zip_encrypted)):
+ logger.info(
+                    f"Sequencing has finished and copying is complete for run {os.path.basename(run_path)}, which is ready for archiving"
+ )
archive_ready = True
# Case for putting data to PDC
# Run has already been encrypted (run.tar.gz.gpg exists)
- elif ext == '.tar.gz.gpg' and os.path.exists(run.zip_encrypted):
- logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for sending to PDC')
+ elif ext == ".tar.gz.gpg" and os.path.exists(run.zip_encrypted):
+ logger.info(
+                    f"Sequencing has finished and copying is complete for run {os.path.basename(run_path)}, which is ready to be sent to PDC"
+ )
archive_ready = True
return archive_ready
def log_archived_run(self, file_name):
"""Write files archived to PDC to log file"""
- with open(self.archive_log_location, 'a') as archive_file:
- tsv_writer = csv.writer(archive_file, delimiter='\t')
+ with open(self.archive_log_location, "a") as archive_file:
+ tsv_writer = csv.writer(archive_file, delimiter="\t")
tsv_writer.writerow([file_name, str(datetime.now())])
def _move_run_to_archived(self, run):
@@ -258,7 +318,7 @@ def _move_run_to_archived(self, run):
run_type = self._get_run_type(run.name)
archived_path = self.archived_dirs[run_type]
if os.path.isdir(archived_path):
- logger.info('Moving run {} to the archived folder'.format(run.name))
+ logger.info(f"Moving run {run.name} to the archived folder")
shutil.move(run.name, archived_path)
else:
logger.warning("Cannot move run to archived, destination does not exist")
@@ -267,130 +327,207 @@ def _move_run_to_archived(self, run):
def encrypt_runs(cls, run, force):
"""Encrypt the runs that have been collected."""
bk = cls(run)
- bk.collect_runs(ext='.tar.gz')
- logger.info(f'In total, found {len(bk.runs)} run(s) to be encrypted')
+ bk.collect_runs(ext=".tar.gz")
+ logger.info(f"In total, found {len(bk.runs)} run(s) to be encrypted")
for run in bk.runs:
- run.flag = f'{run.name}.encrypting'
+ run.flag = f"{run.name}.encrypting"
run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
tmp_files = [run.zip_encrypted, run.key_encrypted, run.key, run.flag]
- logger.info(f'Encryption of run {run.name} is now started')
+ logger.info(f"Encryption of run {run.name} is now started")
# Check if there is enough space and exit if not
bk.avail_disk_space(run.path, run.name)
# Check if the run in demultiplexed
if not force and bk.check_demux:
- if not misc.run_is_demuxed(run, bk.couch_info, bk._get_run_type(run.name)):
- logger.warn(f'Run {run.name} is not demultiplexed yet, so skipping it')
+ if not misc.run_is_demuxed(
+ run, bk.couch_info, bk._get_run_type(run.name)
+ ):
+ logger.warn(
+ f"Run {run.name} is not demultiplexed yet, so skipping it"
+ )
continue
- logger.info(f'Run {run.name} is demultiplexed and proceeding with encryption')
+ logger.info(
+ f"Run {run.name} is demultiplexed and proceeding with encryption"
+ )
with filesystem.chdir(run.path):
# skip run if already ongoing
if os.path.exists(run.flag):
- logger.warn(f'Run {run.name} is already being encrypted, so skipping now')
+ logger.warn(
+ f"Run {run.name} is already being encrypted, so skipping now"
+ )
continue
- flag = open(run.flag, 'w').close()
+ open(run.flag, "w").close()
# zip the run directory
if os.path.exists(run.zip):
if os.path.isdir(run.name):
- logger.warn(f'Both run source and zipped archive exist for run {run.name}, skipping run as precaution')
+ logger.warn(
+                        f"Both run source and zipped archive exist for run {run.name}, skipping run as a precaution"
+ )
bk._clean_tmp_files([run.flag])
continue
- logger.info(f'Zipped archive already exist for run {run.name}, so using it for encryption')
+ logger.info(
+                        f"Zipped archive already exists for run {run.name}, so using it for encryption"
+ )
else:
- exclude_files = " ".join([f'--exclude {x}' for x in bk.exclude_list])
- logger.info(f'Creating zipped archive for run {run.name}')
- if bk._call_commands(cmd1=f'tar {exclude_files} -cf - {run.name}', cmd2='pigz --fast -c -',
- out_file=run.zip, mail_failed=True, tmp_files=[run.zip, run.flag]):
- logger.info(f'Run {run.name} was successfully compressed and transferred to {run.zip}')
+ exclude_files = " ".join(
+ [f"--exclude {x}" for x in bk.exclude_list]
+ )
+ logger.info(f"Creating zipped archive for run {run.name}")
+ if bk._call_commands(
+ cmd1=f"tar {exclude_files} -cf - {run.name}",
+ cmd2="pigz --fast -c -",
+ out_file=run.zip,
+ mail_failed=True,
+ tmp_files=[run.zip, run.flag],
+ ):
+ logger.info(
+ f"Run {run.name} was successfully compressed and transferred to {run.zip}"
+ )
else:
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
# Remove encrypted file if already exists
if os.path.exists(run.zip_encrypted):
- logger.warn((f'Removing already existing encrypted file for run {run.name}, this is a precaution '
- 'to make sure the file was encrypted with correct key file'))
- bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted])
+ logger.warn(
+ f"Removing already existing encrypted file for run {run.name}, this is a precaution "
+                    "to make sure the file was encrypted with the correct key file"
+ )
+ bk._clean_tmp_files(
+ [
+ run.zip_encrypted,
+ run.key,
+ run.key_encrypted,
+ run.dst_key_encrypted,
+ ]
+ )
# Generate random key to use as pasphrase
- if not bk._call_commands(cmd1='gpg --gen-random 1 256', out_file=run.key, tmp_files=tmp_files):
- logger.warn(f'Skipping run {run.name} and moving on')
+ if not bk._call_commands(
+ cmd1="gpg --gen-random 1 256", out_file=run.key, tmp_files=tmp_files
+ ):
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
- logger.info(f'Generated random phrase key for run {run.name}')
+                logger.info(f"Generated random key to use as passphrase for run {run.name}")
# Calculate md5 sum pre encryption
if not force:
- logger.info('Calculating md5sum before encryption')
- md5_call, md5_out = bk._call_commands(cmd1=f'md5sum {run.zip}', return_out=True, tmp_files=tmp_files)
+ logger.info("Calculating md5sum before encryption")
+ md5_call, md5_out = bk._call_commands(
+ cmd1=f"md5sum {run.zip}", return_out=True, tmp_files=tmp_files
+ )
if not md5_call:
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
md5_pre_encrypt = md5_out.split()[0]
# Encrypt the zipped run file
- logger.info('Encrypting the zipped run file')
- if not bk._call_commands(cmd1=(f'gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo '
- f'none -o {run.zip_encrypted} {run.zip}'), tmp_files=tmp_files):
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.info("Encrypting the zipped run file")
+ if not bk._call_commands(
+ cmd1=(
+ f"gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo "
+ f"none -o {run.zip_encrypted} {run.zip}"
+ ),
+ tmp_files=tmp_files,
+ ):
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
# Decrypt and check for md5
if not force:
- logger.info('Calculating md5sum after encryption')
- md5_call, md5_out = bk._call_commands(cmd1=f'gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}',
- cmd2='md5sum', return_out=True, tmp_files=tmp_files)
+ logger.info("Calculating md5sum after encryption")
+ md5_call, md5_out = bk._call_commands(
+ cmd1=f"gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}",
+ cmd2="md5sum",
+ return_out=True,
+ tmp_files=tmp_files,
+ )
if not md5_call:
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
md5_post_encrypt = md5_out.split()[0]
if md5_pre_encrypt != md5_post_encrypt:
- logger.error(f'md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on')
+ logger.error(
+ f"md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on"
+ )
bk._clean_tmp_files(tmp_files)
continue
- logger.info('Md5sum matches before and after encryption')
+ logger.info("Md5sum matches before and after encryption")
# Encrypt and move the key file
- if bk._call_commands(cmd1=f'gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}', tmp_files=tmp_files):
+ if bk._call_commands(
+ cmd1=f"gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}",
+ tmp_files=tmp_files,
+ ):
shutil.move(run.key_encrypted, run.dst_key_encrypted)
else:
- logger.error('Encryption of key file failed, skipping run')
+ logger.error("Encryption of key file failed, skipping run")
continue
bk._clean_tmp_files([run.zip, run.key, run.flag])
- logger.info(f'Encryption of run {run.name} is successfully done, removing zipped run file')
+ logger.info(
+                f"Encryption of run {run.name} completed successfully, removing zipped run file"
+ )
@classmethod
def pdc_put(cls, run):
"""Archive the collected runs to PDC."""
bk = cls(run)
- bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True)
- logger.info('In total, found {} run(s) to send PDC'.format(len(bk.runs)))
+ bk.collect_runs(ext=".tar.gz.gpg", filter_by_ext=True)
+        logger.info(f"In total, found {len(bk.runs)} run(s) to send to PDC")
for run in bk.runs:
- run.flag = '{}.archiving'.format(run.name)
+ run.flag = f"{run.name}.archiving"
run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
if run.path not in bk.archive_dirs.values():
- logger.error(('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate '
- 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name)))
+ logger.error(
+                    "Given run is not in one of the archive directories {}. Kindly move the run {} to the appropriate "
+ "archive dir before sending it to PDC".format(
+ ",".join(list(bk.archive_dirs.values())), run.name
+ )
+ )
continue
if not os.path.exists(run.dst_key_encrypted):
- logger.error('Encrypted key file {} is not found for file {}, skipping it'.format(run.dst_key_encrypted, run.zip_encrypted))
+ logger.error(
+ f"Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it"
+ )
continue
with filesystem.chdir(run.path):
- #skip run if being encrypted
- if os.path.exists('{}.encrypting'.format(run.name)):
- logger.warn('Run {} is currently being encrypted, so skipping now'.format(run.name))
+ # skip run if being encrypted
+ if os.path.exists(f"{run.name}.encrypting"):
+ logger.warn(
+ f"Run {run.name} is currently being encrypted, so skipping now"
+ )
continue
# skip run if already ongoing
if os.path.exists(run.flag):
- logger.warn('Run {} is already being archived, so skipping now'.format(run.name))
+ logger.warn(
+ f"Run {run.name} is already being archived, so skipping now"
+ )
continue
- if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False):
- logger.warn('Seems like files related to run {} already exist in PDC, check and cleanup'.format(run.name))
+ if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(
+ run.dst_key_encrypted, silent=False
+ ):
+ logger.warn(
+                        f"Seems like files related to run {run.name} already exist in PDC, check and clean up"
+ )
continue
- flag = open(run.flag, 'w').close()
- logger.info('Sending file {} to PDC'.format(run.zip_encrypted))
- if bk._call_commands(cmd1='dsmc archive {}'.format(run.zip_encrypted), tmp_files=[run.flag]):
- time.sleep(15) # give some time just in case 'dsmc' needs to settle
- if bk._call_commands(cmd1='dsmc archive {}'.format(run.dst_key_encrypted), tmp_files=[run.flag]):
- time.sleep(5) # give some time just in case 'dsmc' needs to settle
- if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted):
- logger.info('Successfully sent file {} to PDC, moving file locally from {} to archived folder'.format(run.zip_encrypted, run.path))
+ open(run.flag, "w").close()
+ logger.info(f"Sending file {run.zip_encrypted} to PDC")
+ if bk._call_commands(
+ cmd1=f"dsmc archive {run.zip_encrypted}", tmp_files=[run.flag]
+ ):
+ time.sleep(15) # give some time just in case 'dsmc' needs to settle
+ if bk._call_commands(
+ cmd1=f"dsmc archive {run.dst_key_encrypted}",
+ tmp_files=[run.flag],
+ ):
+ time.sleep(
+ 5
+ ) # give some time just in case 'dsmc' needs to settle
+ if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(
+ run.dst_key_encrypted
+ ):
+ logger.info(
+ f"Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to archived folder"
+ )
bk.log_archived_run(run.zip_encrypted)
if bk.couch_info:
bk._log_pdc_statusdb(run.name)
- bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag])
+ bk._clean_tmp_files(
+ [run.zip_encrypted, run.dst_key_encrypted, run.flag]
+ )
bk._move_run_to_archived(run)
continue
- logger.warn('Sending file {} to PDC failed'.format(run.zip_encrypted))
+ logger.warn(f"Sending file {run.zip_encrypted} to PDC failed")
diff --git a/taca/backup/cli.py b/taca/backup/cli.py
index 07cce810..60d8c442 100644
--- a/taca/backup/cli.py
+++ b/taca/backup/cli.py
@@ -1,39 +1,75 @@
"""CLI for the backup subcommand."""
import click
+
from taca.backup.backup import backup_utils as bkut
+
@click.group()
@click.pass_context
def backup(ctx):
- """ Backup management methods and utilities """
+ """Backup management methods and utilities"""
pass
+
@backup.command()
-@click.option('-r', '--run', type=click.Path(exists=True), help="A run (directory or a zipped archive) to be encrypted")
-@click.option('-f', '--force', is_flag=True, help="Ignore the checks and just try encryption. USE IT WITH CAUTION.")
+@click.option(
+ "-r",
+ "--run",
+ type=click.Path(exists=True),
+ help="A run (directory or a zipped archive) to be encrypted",
+)
+@click.option(
+ "-f",
+ "--force",
+ is_flag=True,
+ help="Ignore the checks and just try encryption. USE IT WITH CAUTION.",
+)
@click.pass_context
def encrypt(ctx, run, force):
bkut.encrypt_runs(run, force)
-@backup.command(name='put_data')
-@click.option('-r', '--run', type=click.Path(exists=True), help="A run name (without extension) to be sent to PDC")
+
+@backup.command(name="put_data")
+@click.option(
+ "-r",
+ "--run",
+ type=click.Path(exists=True),
+ help="A run name (without extension) to be sent to PDC",
+)
@click.pass_context
def put_data(ctx, run):
bkut.pdc_put(run)
-@backup.command(name='get_data')
-@click.option('-r', '--run', required=True, help="A run name (without extension) to download from PDC")
-@click.option('-o', '--outdir', type=click.Path(exists=True, file_okay=False, writable=True),
- help="Optional directory name to save the downloaded file. Directory should exist")
+
+@backup.command(name="get_data")
+@click.option(
+ "-r",
+ "--run",
+ required=True,
+ help="A run name (without extension) to download from PDC",
+)
+@click.option(
+ "-o",
+ "--outdir",
+ type=click.Path(exists=True, file_okay=False, writable=True),
+ help="Optional directory name to save the downloaded file. Directory should exist",
+)
@click.pass_context
def get_data(ctx, run, outdir):
## W I P ##
raise NotImplementedError
+
@backup.command()
-@click.option('-r', '--run', required=True, type=click.Path(exists=True, dir_okay=False), help="A encripted run file")
-@click.option('-k', '--key', required=True, help="Key file to be used for decryption")
-@click.option('-p', '--password', help="To pass decryption passphrase via command line")
+@click.option(
+ "-r",
+ "--run",
+ required=True,
+ type=click.Path(exists=True, dir_okay=False),
+    help="An encrypted run file",
+)
+@click.option("-k", "--key", required=True, help="Key file to be used for decryption")
+@click.option("-p", "--password", help="To pass decryption passphrase via command line")
@click.pass_context
def decrypt(ctx, run, key, password):
## W I P ##
diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py
index 07600870..df1e80ab 100644
--- a/taca/cleanup/cleanup.py
+++ b/taca/cleanup/cleanup.py
@@ -2,23 +2,28 @@
import logging
import os
import re
-
from collections import defaultdict
from datetime import datetime
from glob import glob
-from taca.utils.config import CONFIG, load_config
from taca.utils import filesystem, misc, statusdb
-from io import open
-from six.moves import map
+from taca.utils.config import CONFIG, load_config
logger = logging.getLogger(__name__)
-def cleanup_miarka(days_fastq, days_analysis,
- only_fastq, only_analysis,
- clean_undetermined, status_db_config,
- exclude_projects, list_only,
- date, dry_run=False):
+
+def cleanup_miarka(
+ days_fastq,
+ days_analysis,
+ only_fastq,
+ only_analysis,
+ clean_undetermined,
+ status_db_config,
+ exclude_projects,
+ list_only,
+ date,
+ dry_run=False,
+):
"""Remove fastq/analysis data for projects that have been closed more than given
days (as days_fastq/days_analysis) from the given 'miarka' cluster.
@@ -49,217 +54,334 @@ def cleanup_miarka(days_fastq, days_analysis,
- "*.bam"
"""
try:
- config = CONFIG['cleanup']['miarka']
- flowcell_dir_root = config['flowcell']['root']
- flowcell_project_source = config['flowcell']['relative_project_source']
- flowcell_undet_files = config['flowcell']['undet_file_pattern']
- data_dir = config['data_dir']
- analysis_dir = config['analysis']['root']
- analysis_data_to_remove = config['analysis']['files_to_remove']
+ config = CONFIG["cleanup"]["miarka"]
+ flowcell_dir_root = config["flowcell"]["root"]
+ flowcell_project_source = config["flowcell"]["relative_project_source"]
+ flowcell_undet_files = config["flowcell"]["undet_file_pattern"]
+ data_dir = config["data_dir"]
+ analysis_dir = config["analysis"]["root"]
+ analysis_data_to_remove = config["analysis"]["files_to_remove"]
if date:
- date = datetime.strptime(date, '%Y-%m-%d')
+ date = datetime.strptime(date, "%Y-%m-%d")
except KeyError as e:
- logger.error('Config file is missing the key {}, make sure it has all required information'.format(str(e)))
+ logger.error(
+ f"Config file is missing the key {str(e)}, make sure it has all required information"
+ )
raise SystemExit
- except ValueError as e:
- logger.error('Date given with "--date" option is not in required format, see help for more info')
+ except ValueError:
+ logger.error(
+            'Date given with the "--date" option is not in the required format, see help for more info'
+ )
raise SystemExit
# make a connection for project db
db_config = load_config(status_db_config)
- pcon = statusdb.ProjectSummaryConnection(db_config.get('statusdb'))
- assert pcon, 'Could not connect to project database in StatusDB'
+ pcon = statusdb.ProjectSummaryConnection(db_config.get("statusdb"))
+ assert pcon, "Could not connect to project database in StatusDB"
# make exclude project list if provided
exclude_list = []
if exclude_projects:
if os.path.isfile(exclude_projects):
- with open(exclude_projects, 'r') as in_file:
+ with open(exclude_projects) as in_file:
exclude_list.extend([p.strip() for p in in_file.readlines()])
else:
- exclude_list.extend(exclude_projects.split(','))
+ exclude_list.extend(exclude_projects.split(","))
# sanity check for mentioned project to exculde or valid
- invalid_projects = [p for p in exclude_list if p not in pcon.id_view.keys() and p not in pcon.name_view.keys()]
+ invalid_projects = [
+ p
+ for p in exclude_list
+ if p not in pcon.id_view.keys() and p not in pcon.name_view.keys()
+ ]
if invalid_projects:
- logger.error('"--exclude_projects" was called with some invalid projects "{}", '
- 'provide valid project name/id'.format(','.join(invalid_projects)))
+ logger.error(
+ '"--exclude_projects" was called with some invalid projects "{}", '
+ "provide valid project name/id".format(",".join(invalid_projects))
+ )
raise SystemExit
- #compile list for project to delete
+    # compile list of projects to delete
project_clean_list, project_processed_list = ({}, [])
if not list_only and not clean_undetermined:
- logger.info('Building initial project list for removing data...')
+ logger.info("Building initial project list for removing data...")
if only_fastq:
- logger.info('Option "--only_fastq" is given, so will not look for analysis data')
+ logger.info(
+ 'Option "--only_fastq" is given, so will not look for analysis data'
+ )
elif only_analysis:
- logger.info('Option "--only_analysis" is given, so will not look for fastq data')
+ logger.info(
+ 'Option "--only_analysis" is given, so will not look for fastq data'
+ )
if clean_undetermined:
all_undet_files = []
for flowcell_dir in flowcell_dir_root:
- for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
+ for fc in [
+ d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)
+ ]:
fc_abs_path = os.path.join(flowcell_dir, fc)
with filesystem.chdir(fc_abs_path):
if not os.path.exists(flowcell_project_source):
- logger.warn('Flowcell {} does not contain a "{}" directory'.format(fc, flowcell_project_source))
+ logger.warn(
+ f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory'
+ )
continue
- projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
- if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
- not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))]
+ projects_in_fc = [
+ d
+ for d in os.listdir(flowcell_project_source)
+ if re.match(r"^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$", d)
+ and not os.path.exists(
+ os.path.join(flowcell_project_source, d, "cleaned")
+ )
+ ]
# the above check looked for project directories and also that are not cleaned
# so if it could not find any project, means there is no project diretory at all
# or all the project directory is already cleaned. Then we can remove the undet
if len(projects_in_fc) > 0:
continue
- fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files))
+ fc_undet_files = glob(
+ os.path.join(flowcell_project_source, flowcell_undet_files)
+ )
if fc_undet_files:
- logger.info('All projects was cleaned for FC {}, found {} undeterminded files'.format(fc, len(fc_undet_files)))
- all_undet_files.extend(list(map(os.path.abspath, fc_undet_files)))
+ logger.info(
+                            f"All projects were cleaned for FC {fc}, found {len(fc_undet_files)} undetermined files"
+ )
+ all_undet_files.extend(
+ list(map(os.path.abspath, fc_undet_files))
+ )
if all_undet_files:
undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
- if misc.query_yes_no('In total found {} undetermined files which are {} in size, delete now ?'.format(len(all_undet_files),
- undet_size), default='no'):
- removed = _remove_files(all_undet_files)
+ if misc.query_yes_no(
+                "In total, found {} undetermined files which are {} in size, delete now?".format(
+ len(all_undet_files), undet_size
+ ),
+ default="no",
+ ):
+ _remove_files(all_undet_files)
return
elif only_analysis:
- for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
- not os.path.exists(os.path.join(analysis_dir, d, 'cleaned'))]:
- proj_abs_path = os.path.join(analysis_dir, pid)
- proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
- if proj_info and proj_info['closed_days'] >= days_analysis:
+ for pid in [
+ d
+ for d in os.listdir(analysis_dir)
+ if re.match(r"^P\d+$", d)
+ and not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))
+ ]:
+ os.path.join(analysis_dir, pid)
+ proj_info = get_closed_proj_info(
+ pid, pcon.get_entry(pid, use_id_view=True), date
+ )
+ if proj_info and proj_info["closed_days"] >= days_analysis:
# move on if this project has to be excluded
- if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
+ if (
+ proj_info["name"] in exclude_list
+ or proj_info["pid"] in exclude_list
+ ):
continue
- analysis_data, analysis_size = collect_analysis_data_miarka(pid, analysis_dir, analysis_data_to_remove)
- proj_info['analysis_to_remove'] = analysis_data
- proj_info['analysis_size'] = analysis_size
- proj_info['fastq_to_remove'] = 'not_selected'
- proj_info['fastq_size'] = 0
- project_clean_list[proj_info['name']] = proj_info
+ analysis_data, analysis_size = collect_analysis_data_miarka(
+ pid, analysis_dir, analysis_data_to_remove
+ )
+ proj_info["analysis_to_remove"] = analysis_data
+ proj_info["analysis_size"] = analysis_size
+ proj_info["fastq_to_remove"] = "not_selected"
+ proj_info["fastq_size"] = 0
+ project_clean_list[proj_info["name"]] = proj_info
else:
for flowcell_dir in flowcell_dir_root:
- for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
+ for fc in [
+ d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)
+ ]:
fc_abs_path = os.path.join(flowcell_dir, fc)
with filesystem.chdir(fc_abs_path):
if not os.path.exists(flowcell_project_source):
- logger.warn('Flowcell {} do not contain a "{}" direcotry'.format(fc, flowcell_project_source))
+ logger.warn(
+                        f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory'
+ )
continue
- projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
- if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \
- not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))]
+ projects_in_fc = [
+ d
+ for d in os.listdir(flowcell_project_source)
+ if re.match(r"^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$", d)
+ and not os.path.exists(
+ os.path.join(flowcell_project_source, d, "cleaned")
+ )
+ ]
for _proj in projects_in_fc:
- proj = re.sub(r'_+', '.', _proj, 1)
+ proj = re.sub(r"_+", ".", _proj, 1)
# if a project is already processed no need of fetching it again from status db
if proj in project_processed_list:
# if the project is closed more than threshold days collect the fastq files from FC
# no need of looking for analysis data as they would have been collected in the first time
- if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
- fc_fq_files, fq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj))
- project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
- project_clean_list[proj]['fastq_size'] += fq_size
+ if (
+ proj in project_clean_list
+ and project_clean_list[proj]["closed_days"]
+ >= days_fastq
+ ):
+ fc_fq_files, fq_size = collect_fastq_data_miarka(
+ fc_abs_path,
+ os.path.join(flowcell_project_source, _proj),
+ )
+ project_clean_list[proj]["fastq_to_remove"][
+ "flowcells"
+ ][fc] = fc_fq_files["flowcells"][fc]
+ project_clean_list[proj]["fastq_size"] += fq_size
continue
project_processed_list.append(proj)
- #by default assume all projects are not old enough for delete
- fastq_data, analysis_data = ('young', 'young')
+                        # by default, assume all projects are not old enough for deletion
+ fastq_data, analysis_data = ("young", "young")
fastq_size, analysis_size = (0, 0)
- proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date)
+ proj_info = get_closed_proj_info(
+ proj, pcon.get_entry(proj), date
+ )
if proj_info:
# move on if this project has to be excluded
- if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
+ if (
+ proj_info["name"] in exclude_list
+ or proj_info["pid"] in exclude_list
+ ):
continue
# if project not old enough for fastq files and only fastq files selected move on to next project
- if proj_info['closed_days'] >= days_fastq:
- fastq_data, fastq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj),
- data_dir, proj_info['pid'])
+ if proj_info["closed_days"] >= days_fastq:
+ fastq_data, fastq_size = collect_fastq_data_miarka(
+ fc_abs_path,
+ os.path.join(flowcell_project_source, _proj),
+ data_dir,
+ proj_info["pid"],
+ )
if not only_fastq:
# if project is old enough for fastq files and not 'only_fastq' try collect analysis files
- if proj_info['closed_days'] >= days_analysis:
- analysis_data, analysis_size = collect_analysis_data_miarka(proj_info['pid'], analysis_dir, analysis_data_to_remove)
+ if proj_info["closed_days"] >= days_analysis:
+ (
+ analysis_data,
+ analysis_size,
+ ) = collect_analysis_data_miarka(
+ proj_info["pid"],
+ analysis_dir,
+ analysis_data_to_remove,
+ )
# if both fastq and analysis files are not old enough move on
- if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == 'cleaned') and fastq_data == 'young'):
+ if (analysis_data == fastq_data) or (
+ (not analysis_data or analysis_data == "cleaned")
+ and fastq_data == "young"
+ ):
continue
- elif fastq_data == 'young':
+ elif fastq_data == "young":
continue
else:
- analysis_data = 'not_selected'
- proj_info['fastq_to_remove'] = fastq_data
- proj_info['fastq_size'] = fastq_size
- proj_info['analysis_to_remove'] = analysis_data
- proj_info['analysis_size'] = analysis_size
+ analysis_data = "not_selected"
+ proj_info["fastq_to_remove"] = fastq_data
+ proj_info["fastq_size"] = fastq_size
+ proj_info["analysis_to_remove"] = analysis_data
+ proj_info["analysis_size"] = analysis_size
project_clean_list[proj] = proj_info
if not project_clean_list:
- logger.info('There are no projects to clean')
+ logger.info("There are no projects to clean")
return
# list only the project and exit if 'list_only' option is selected
if list_only:
- print('Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size')
- for p_info in sorted(list(project_clean_list.values()), key=lambda d: d['closed_days'], reverse=True):
- print('\t'.join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'],
- str(p_info['closed_days']), p_info['closed_date'],
- _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])]))
+ print(
+            "Project Name\tProject ID\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size"
+ )
+ for p_info in sorted(
+ list(project_clean_list.values()),
+ key=lambda d: d["closed_days"],
+ reverse=True,
+ ):
+ print(
+ "\t".join(
+ [
+ p_info["name"],
+ p_info["pid"],
+ p_info["bioinfo_responsible"],
+ str(p_info["closed_days"]),
+ p_info["closed_date"],
+ _def_get_size_unit(p_info["fastq_size"]),
+ _def_get_size_unit(p_info["analysis_size"]),
+ ]
+ )
+ )
raise SystemExit
- logger.info('Initial list is built with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list)))
- if misc.query_yes_no('Interactively filter projects for cleanup ?', default='yes'):
+ logger.info(
+ f"Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}"
+ )
+    if misc.query_yes_no("Interactively filter projects for cleanup?", default="yes"):
filtered_project, proj_count = ([], 0)
- #go through complied project list and remove files
+        # go through compiled project list and remove files
for proj, info in project_clean_list.items():
proj_count += 1
- if not misc.query_yes_no('{}Delete files for this project ({}/{})'.format(get_proj_meta_info(info, days_fastq),
- proj_count, len(project_clean_list)), default='no'):
- logger.info('Will not remove files for project {}'.format(proj))
+ if not misc.query_yes_no(
+ "{}Delete files for this project ({}/{})".format(
+ get_proj_meta_info(info, days_fastq),
+ proj_count,
+ len(project_clean_list),
+ ),
+ default="no",
+ ):
+ logger.info(f"Will not remove files for project {proj}")
filtered_project.append(proj)
# remove projects that were decided not to delete
map(project_clean_list.pop, filtered_project)
- logger.info('Removed {}/{} projects from initial list'.format(len(filtered_project), proj_count))
+ logger.info(
+ f"Removed {len(filtered_project)}/{proj_count} projects from initial list"
+ )
if not project_clean_list:
- logger.info('There are no projects to clean after filtering')
+ logger.info("There are no projects to clean after filtering")
return
- logger.info('Final list is created with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list)))
- if not misc.query_yes_no('Proceed with cleanup ?', default='no'):
- logger.info('Aborting cleanup')
+ logger.info(
+ f"Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}"
+ )
+    if not misc.query_yes_no("Proceed with cleanup?", default="no"):
+ logger.info("Aborting cleanup")
return
- logger.info('Will start cleaning up project now')
+    logger.info("Will start cleaning up projects now")
for proj, info in project_clean_list.items():
- fastq_info = info.get('fastq_to_remove')
+ fastq_info = info.get("fastq_to_remove")
if fastq_info and isinstance(fastq_info, dict):
- logger.info('Cleaning fastq files for project {}'.format(proj))
- fastq_fc = fastq_info.get('flowcells', {})
+ logger.info(f"Cleaning fastq files for project {proj}")
+ fastq_fc = fastq_info.get("flowcells", {})
removed_fc = []
for fc, fc_info in fastq_fc.items():
- proj_fc_root = fc_info['proj_root']
- logger.info('Removing fastq files from {}'.format(proj_fc_root))
+ proj_fc_root = fc_info["proj_root"]
+ logger.info(f"Removing fastq files from {proj_fc_root}")
if not dry_run:
- if _remove_files(fc_info['fq_files']):
- logger.info('Removed fastq files from FC {} for project {}, marking it as cleaned'.format(fc, proj))
+ if _remove_files(fc_info["fq_files"]):
+ logger.info(
+ f"Removed fastq files from FC {fc} for project {proj}, marking it as cleaned"
+ )
_touch_cleaned(proj_fc_root)
removed_fc.append(fc)
if len(fastq_fc) == len(removed_fc):
try:
- proj_data_root = fastq_info['proj_data']['proj_data_root']
- logger.info('All flowcells cleaned for this project, marking it as cleaned in {}'.format(proj_data_root))
+ proj_data_root = fastq_info["proj_data"]["proj_data_root"]
+ logger.info(
+ f"All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}"
+ )
_touch_cleaned(proj_data_root)
except:
pass
- analysis_info = info.get('analysis_to_remove')
+ analysis_info = info.get("analysis_to_remove")
if analysis_info and isinstance(analysis_info, dict):
- proj_analysis_root = analysis_info['proj_analysis_root']
- logger.info('cleaning analysis data for project {}'.format(proj))
+ proj_analysis_root = analysis_info["proj_analysis_root"]
+            logger.info(f"Cleaning analysis data for project {proj}")
removed_qc = []
- for qc, files in analysis_info['analysis_files'].items():
- logger.info('Removing files of "{}" from {}'.format(qc, proj_analysis_root))
+ for qc, files in analysis_info["analysis_files"].items():
+ logger.info(f'Removing files of "{qc}" from {proj_analysis_root}')
if not dry_run:
if _remove_files(files):
removed_qc.append(qc)
else:
- logger.warn('Could not remove some files in qc directory "{}"'.format(qc))
- map(analysis_info['analysis_files'].pop, removed_qc)
- if len(analysis_info['analysis_files']) == 0:
- logger.info('Removed analysis data for project {}, marking it cleaned'.format(proj))
+ logger.warn(
+ f'Could not remove some files in qc directory "{qc}"'
+ )
+            # iterate explicitly so the pops actually run (a bare map() is lazy in Python 3)
+            for qc in removed_qc:
+                analysis_info["analysis_files"].pop(qc)
+ if len(analysis_info["analysis_files"]) == 0:
+ logger.info(
+ f"Removed analysis data for project {proj}, marking it cleaned"
+ )
_touch_cleaned(proj_analysis_root)
@@ -267,27 +389,38 @@ def cleanup_miarka(days_fastq, days_analysis,
# Class helper methods, not exposed as commands/subcommands #
#############################################################
+
def get_closed_proj_info(prj, pdoc, tdate=None):
"""Check and return a dict if project is closed."""
pdict = None
if not tdate:
tdate = datetime.today()
if not pdoc:
- logger.warn('Seems like project {} does not have a proper statusdb document, skipping it'.format(prj))
- elif 'close_date' in pdoc:
- closed_date = pdoc['close_date']
+ logger.warn(
+ f"Seems like project {prj} does not have a proper statusdb document, skipping it"
+ )
+ elif "close_date" in pdoc:
+ closed_date = pdoc["close_date"]
try:
- closed_days = tdate - datetime.strptime(closed_date, '%Y-%m-%d')
- pdict = {'name' : pdoc.get('project_name'),
- 'pid' : pdoc.get('project_id'),
- 'closed_date' : closed_date,
- 'closed_days' : closed_days.days,
- 'bioinfo_responsible' : pdoc.get('project_summary',{}).get('bioinfo_responsible','')}
+ closed_days = tdate - datetime.strptime(closed_date, "%Y-%m-%d")
+ pdict = {
+ "name": pdoc.get("project_name"),
+ "pid": pdoc.get("project_id"),
+ "closed_date": closed_date,
+ "closed_days": closed_days.days,
+ "bioinfo_responsible": pdoc.get("project_summary", {}).get(
+ "bioinfo_responsible", ""
+ ),
+ }
except:
- logger.warn('Problem calculating closed days for project {} with close date {}. Skipping it'.format(
- pdoc.get('project_name'), closed_date))
+ logger.warn(
+ "Problem calculating closed days for project {} with close date {}. Skipping it".format(
+ pdoc.get("project_name"), closed_date
+ )
+ )
return pdict
+
def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}):
"""Collect the analysis files that have to be removed from Miarka
return a tuple with files and total size of collected files."""
@@ -295,42 +428,57 @@ def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}):
proj_abs_path = os.path.join(analysis_root, pid)
if not os.path.exists(proj_abs_path):
file_list = None
- elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')):
- file_list = 'cleaned'
+ elif os.path.exists(os.path.join(proj_abs_path, "cleaned")):
+ file_list = "cleaned"
else:
- file_list = {'proj_analysis_root':proj_abs_path,
- 'analysis_files': defaultdict(list)}
- for qc_type,ext in files_ext_to_remove.items():
+ file_list = {
+ "proj_analysis_root": proj_abs_path,
+ "analysis_files": defaultdict(list),
+ }
+ for qc_type, ext in files_ext_to_remove.items():
qc_path = os.path.join(proj_abs_path, qc_type)
if os.path.exists(qc_path):
- file_list['analysis_files'][qc_type].extend(collect_files_by_ext(qc_path, ext))
+ file_list["analysis_files"][qc_type].extend(
+ collect_files_by_ext(qc_path, ext)
+ )
try:
- size += sum([sum(map(os.path.getsize, fls)) for fls in file_list['analysis_files'].values()])
+ size += sum(
+ [
+ sum(map(os.path.getsize, fls))
+ for fls in file_list["analysis_files"].values()
+ ]
+ )
except:
pass
return (file_list, size)
+
def collect_fastq_data_miarka(fc_root, fc_proj_src, proj_root=None, pid=None):
"""Collect the fastq files that have to be removed from Miarka.
Return a tuple with files and total size of collected files."""
size = 0
- file_list = {'flowcells': defaultdict(dict)}
+ file_list = {"flowcells": defaultdict(dict)}
fc_proj_path = os.path.join(fc_root, fc_proj_src)
fc_id = os.path.basename(fc_root)
- file_list['flowcells'][fc_id] = {'proj_root': fc_proj_path,
- 'fq_files': collect_files_by_ext(fc_proj_path, '*.fastq.gz')}
+ file_list["flowcells"][fc_id] = {
+ "proj_root": fc_proj_path,
+ "fq_files": collect_files_by_ext(fc_proj_path, "*.fastq.gz"),
+ }
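+    # If an organized project directory is known, also record its fastq files,
+    # or note that the directory is missing or already cleaned.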
if proj_root and pid:
proj_abs_path = os.path.join(proj_root, pid)
if not os.path.exists(proj_abs_path):
- file_list['proj_data'] = None
- elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')):
- file_list['proj_data'] = 'cleaned'
+ file_list["proj_data"] = None
+ elif os.path.exists(os.path.join(proj_abs_path, "cleaned")):
+ file_list["proj_data"] = "cleaned"
else:
- file_list['proj_data'] = {'proj_data_root': proj_abs_path,
- 'fastq_files' : collect_files_by_ext(proj_abs_path, '*.fastq.gz')}
- size += sum(map(os.path.getsize, file_list['flowcells'][fc_id]['fq_files']))
+ file_list["proj_data"] = {
+ "proj_data_root": proj_abs_path,
+ "fastq_files": collect_files_by_ext(proj_abs_path, "*.fastq.gz"),
+ }
+ size += sum(map(os.path.getsize, file_list["flowcells"][fc_id]["fq_files"]))
return (file_list, size)
+
def collect_files_by_ext(path, ext=[]):
"""Collect files with a given extension from a given path."""
if isinstance(ext, str):
@@ -343,60 +491,79 @@ def collect_files_by_ext(path, ext=[]):
collected_files.extend(collect_files_by_ext(d, ext))
return collected_files
+
def get_proj_meta_info(info, days_fastq):
"""From given info collect meta info for a project."""
- template = '\n'
+ template = "\n"
+
def _get_template_string(h, v):
try:
- v = '{}: {}\n'.format(h, v)
+ v = f"{h}: {v}\n"
except:
- v = '{}: Problem getting this'.format(h)
+ v = f"{h}: Problem getting this"
return v
- template += _get_template_string('Project overview', info.get('name'))
- template += _get_template_string('Project ID', info.get('pid'))
- template += _get_template_string('Bioinfo Responsible', info.get('bioinfo_responsible',''))
- template += _get_template_string('Closed for (days)', info.get('closed_days'))
- template += _get_template_string('Closed from (date)', info.get('closed_date'))
+
+ template += _get_template_string("Project overview", info.get("name"))
+ template += _get_template_string("Project ID", info.get("pid"))
+ template += _get_template_string(
+ "Bioinfo Responsible", info.get("bioinfo_responsible", "")
+ )
+ template += _get_template_string("Closed for (days)", info.get("closed_days"))
+ template += _get_template_string("Closed from (date)", info.get("closed_date"))
# set analysis info based upon what we have
- analysis_info = info.get('analysis_to_remove')
+ analysis_info = info.get("analysis_to_remove")
if not analysis_info:
- template += 'Project analysis: No analysis directory\n'
- elif isinstance(analysis_info, str) and analysis_info == 'cleaned':
- template += 'Project analysis: Analysis directory already cleaned\n'
+ template += "Project analysis: No analysis directory\n"
+ elif isinstance(analysis_info, str) and analysis_info == "cleaned":
+ template += "Project analysis: Analysis directory already cleaned\n"
elif isinstance(analysis_info, dict):
f_stat = []
- for qc_type, files in analysis_info['analysis_files'].items():
- f_stat.append('{} ({} files)'.format(qc_type, len(files)))
- template += 'Project analyzed: {}\n'.format(', '.join(f_stat))
+ for qc_type, files in analysis_info["analysis_files"].items():
+ f_stat.append(f"{qc_type} ({len(files)} files)")
+ template += "Project analyzed: {}\n".format(", ".join(f_stat))
# set fastq info based upon what we have
- fq_info = info.get('fastq_to_remove')
+ fq_info = info.get("fastq_to_remove")
if isinstance(fq_info, str) and fq_info == "young":
- template += 'Project been closed less than {} days, so will not remove any fastq files\n'.format(days_fastq)
+        template += f"Project has been closed for less than {days_fastq} days, so no fastq files will be removed\n"
elif isinstance(fq_info, dict):
- proj_fq_info = fq_info.get('proj_data')
+ proj_fq_info = fq_info.get("proj_data")
if not proj_fq_info:
- template += 'Project organized: No organized directory for project\n'
+ template += "Project organized: No organized directory for project\n"
elif isinstance(proj_fq_info, str) and proj_fq_info == "cleaned":
- template += 'Project organized: Project directory is already cleaned\n'
+ template += "Project organized: Project directory is already cleaned\n"
elif isinstance(proj_fq_info, dict):
- template += 'Project organized: Project is organized with {} fastq files\n'.format(len(proj_fq_info['fastq_files']))
- fc_fq_info = fq_info.get('flowcells', {})
+ template += (
+ "Project organized: Project is organized with {} fastq files\n".format(
+ len(proj_fq_info["fastq_files"])
+ )
+ )
+ fc_fq_info = fq_info.get("flowcells", {})
fc_num = len(fc_fq_info.keys())
- fc_files = sum(map(len, [fc_info.get('fq_files', [])for fc_info in fc_fq_info.values()]))
- template += 'Flowcells: There are {} FC with total {} fastq files\n'.format(fc_num, fc_files)
- template += 'Estimated data size: {}\n'.format(_def_get_size_unit(info.get('fastq_size',0) + info.get('fastq_size', 0)))
+ fc_files = sum(
+ map(len, [fc_info.get("fq_files", []) for fc_info in fc_fq_info.values()])
+ )
+ template += (
+ f"Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n"
+ )
+    template += "Estimated data size: {}\n".format(
+        _def_get_size_unit(info.get("fastq_size", 0) + info.get("analysis_size", 0))
+    )
return template
+
def get_files_size_text(plist):
"""Get project list dict and give back string with overll sizes."""
- fsize = _def_get_size_unit(sum([i.get('fastq_size',0) for i in plist.values()]))
- asize = _def_get_size_unit(sum([i.get('analysis_size',0) for i in plist.values()]))
- return '({f}{s}{a}) '.format(f = '~{} fastq data'.format(fsize) if fsize else '',
- a = '~{} analysis data'.format(asize) if asize else '',
- s = ' and ' if fsize and asize else '')
+ fsize = _def_get_size_unit(sum([i.get("fastq_size", 0) for i in plist.values()]))
+ asize = _def_get_size_unit(sum([i.get("analysis_size", 0) for i in plist.values()]))
+ return "({f}{s}{a}) ".format(
+ f=f"~{fsize} fastq data" if fsize else "",
+ a=f"~{asize} analysis data" if asize else "",
+ s=" and " if fsize and asize else "",
+ )
+
def _def_get_size_unit(s):
"""Change the given size to appropriate unit measurement for better readability."""
@@ -405,17 +572,18 @@ def _def_get_size_unit(s):
gb = mb * 1000
tb = gb * 1000
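+    # Uses decimal (1000-based) multipliers; the result is truncated to an
+    # integer of the largest matching unit.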
if s > tb:
- s = '~{}tb'.format(int(s/tb))
+ s = f"~{int(s/tb)}tb"
elif s > gb:
- s = '~{}gb'.format(int(s/gb))
+ s = f"~{int(s/gb)}gb"
elif s > mb:
- s = '~{}mb'.format(int(s/mb))
+ s = f"~{int(s/mb)}mb"
elif s > kb:
- s = '~{}kb'.format(int(s/kb))
+ s = f"~{int(s/kb)}kb"
elif s > 0:
- s = '~{}b'.format(int(s/b))
+ s = f"~{int(s)}b"
return str(s)
+
def _remove_files(files):
"""Remove files from given list."""
status = True
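+    # Best effort: continue deleting the remaining files even if one fails, but
+    # return False so callers do not mark the directory as cleaned.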
@@ -423,13 +591,16 @@ def _remove_files(files):
try:
os.remove(fl)
except Exception as e:
- logger.warn('Could not remove file {} due to "{}"'.format(fl, e.message))
+            logger.warn(f'Could not remove file {fl} due to "{e}"')
status = False
return status
+
def _touch_cleaned(path):
"""Touch a 'cleaned' file in a given path."""
try:
- open(os.path.join(path, 'cleaned'), 'w').close()
+ open(os.path.join(path, "cleaned"), "w").close()
except Exception as e:
- logger.warn('Could not create "cleaned" file in path {} due to "{}"'.format(path, e.message))
+        logger.warn(f'Could not create "cleaned" file in path {path} due to "{e}"')
diff --git a/taca/cleanup/cli.py b/taca/cleanup/cli.py
index 65abaf50..fe7e11ba 100644
--- a/taca/cleanup/cli.py
+++ b/taca/cleanup/cli.py
@@ -1,67 +1,125 @@
"""CLI for the storage subcommand."""
import click
+
from taca.cleanup import cleanup as cln
from taca.utils import misc
+
@click.group()
@click.pass_context
-@click.option('--status_db_config',
- type=click.Path(exists=True, dir_okay=False),
- envvar='STATUS_DB_CONFIG',
- help='Path to statusdb-configuration.')
+@click.option(
+ "--status_db_config",
+ type=click.Path(exists=True, dir_okay=False),
+ envvar="STATUS_DB_CONFIG",
+ help="Path to statusdb-configuration.",
+)
def cleanup(ctx, status_db_config):
"""Cleaning up servers - management methods and utilities."""
pass
+
# cleanup subcommands
@cleanup.command()
-@click.option('-d', '--days', type=click.IntRange(min=1),
- help='Days to consider as thershold, should not be combined with option "--hours"')
-@click.option('-h', '--hours', type=click.IntRange(min=1),
- help='Hours to consider as thershold, should not be combined with option "--days"')
+@click.option(
+ "-d",
+ "--days",
+ type=click.IntRange(min=1),
+    help='Days to consider as threshold, should not be combined with option "--hours"',
+)
+@click.option(
+ "-h",
+ "--hours",
+ type=click.IntRange(min=1),
+    help='Hours to consider as threshold, should not be combined with option "--days"',
+)
@click.pass_context
def preproc(ctx, days, hours):
"""Do appropriate cleanup on preproc."""
seconds = misc.to_seconds(days, hours)
cln.cleanup_processing(seconds)
+
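+# Example invocation (hypothetical values):
+#   taca cleanup miarka --days_fastq 90 --days_analysis 365 --list_only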
@cleanup.command()
-@click.option('--days_fastq', type=click.IntRange(min=1),
- help='Days to consider as thershold for removing "fastq" files')
-@click.option('--days_analysis', type=click.IntRange(min=1),
- help='Days to consider as thershold for removing analysis data')
-@click.option('--only_fastq', is_flag=True,
- help='Clean only fastq data in "miarka"')
-@click.option('--only_analysis', is_flag=True,
- help='Clean only analysis data in "miarka"')
-@click.option('--date', type=click.STRING,
- help='Consider the given date instead of today while collecting closed projects. '
- 'Date format should be "YYYY-MM-DD", ex: "2016-01-31"')
-@click.option('--exclude_projects', type=click.STRING,
- help='A project or a file with a list of projects to exclude from deleting. '
- 'Either name or id can be given. Examples: --exclude_projects P1234 or '
- '--exclude_projects P1234,P5678 or '
- '--exclude_projects file_with_projects_id.txt')
-@click.option('--clean_undetermined', is_flag=True,
- help='Remove only the undetermined reads for a flowcell that have '
- 'all project cleaned. All other parameters are ignored if this '
- 'flag is called.')
-@click.option('-l', '--list_only', is_flag=True,
- help='Only build the project list that will be cleaned')
-@click.option('-n', '--dry_run', is_flag=True,
- help='Perform dry run i.e. execute nothing but log')
+@click.option(
+ "--days_fastq",
+ type=click.IntRange(min=1),
+    help='Days to consider as threshold for removing "fastq" files',
+)
+@click.option(
+ "--days_analysis",
+ type=click.IntRange(min=1),
+    help="Days to consider as threshold for removing analysis data",
+)
+@click.option("--only_fastq", is_flag=True, help='Clean only fastq data in "miarka"')
+@click.option(
+ "--only_analysis", is_flag=True, help='Clean only analysis data in "miarka"'
+)
+@click.option(
+ "--date",
+ type=click.STRING,
+ help="Consider the given date instead of today while collecting closed projects. "
+ 'Date format should be "YYYY-MM-DD", ex: "2016-01-31"',
+)
+@click.option(
+ "--exclude_projects",
+ type=click.STRING,
+ help="A project or a file with a list of projects to exclude from deleting. "
+ "Either name or id can be given. Examples: --exclude_projects P1234 or "
+ "--exclude_projects P1234,P5678 or "
+ "--exclude_projects file_with_projects_id.txt",
+)
+@click.option(
+ "--clean_undetermined",
+ is_flag=True,
+ help="Remove only the undetermined reads for a flowcell that have "
+ "all project cleaned. All other parameters are ignored if this "
+ "flag is called.",
+)
+@click.option(
+ "-l",
+ "--list_only",
+ is_flag=True,
+ help="Only build the project list that will be cleaned",
+)
+@click.option(
+ "-n", "--dry_run", is_flag=True, help="Perform dry run i.e. execute nothing but log"
+)
@click.pass_context
-def miarka(ctx, days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, date, exclude_projects, list_only, dry_run):
+def miarka(
+ ctx,
+ days_fastq,
+ days_analysis,
+ only_fastq,
+ only_analysis,
+ clean_undetermined,
+ date,
+ exclude_projects,
+ list_only,
+ dry_run,
+):
"""Do appropriate cleanup on Miarka."""
- status_db_config = ctx.parent.params['status_db_config']
+ status_db_config = ctx.parent.params["status_db_config"]
if only_fastq and only_analysis:
- raise SystemExit('ERROR: Both option "only_fastq" and "only_analysis" is given, should only give either one')
+ raise SystemExit(
+            'ERROR: Options "only_fastq" and "only_analysis" are mutually exclusive, give only one of them'
+ )
if not days_fastq and not only_analysis and not clean_undetermined:
- raise SystemExit('ERROR: "days_fastq" is not given while not selecting "only_analysis" option')
+ raise SystemExit(
+            'ERROR: "days_fastq" is required unless "only_analysis" or "clean_undetermined" is selected'
+ )
if not days_analysis and not only_fastq and not clean_undetermined:
- raise SystemExit('ERROR: "days_analysis" is not given while not selecting "only_fastq" option')
- cln.cleanup_miarka(days_fastq, days_analysis,
- only_fastq, only_analysis,
- clean_undetermined, status_db_config,
- exclude_projects, list_only,
- date, dry_run)
+ raise SystemExit(
+            'ERROR: "days_analysis" is required unless "only_fastq" or "clean_undetermined" is selected'
+ )
+ cln.cleanup_miarka(
+ days_fastq,
+ days_analysis,
+ only_fastq,
+ only_analysis,
+ clean_undetermined,
+ status_db_config,
+ exclude_projects,
+ list_only,
+ date,
+ dry_run,
+ )
diff --git a/taca/cli.py b/taca/cli.py
index 1c78dabc..d777884a 100644
--- a/taca/cli.py
+++ b/taca/cli.py
@@ -1,35 +1,39 @@
-# -*- coding: utf-8 -*-
import logging
import os
-from pkg_resources import iter_entry_points
+
import click
-import taca.log
+from pkg_resources import iter_entry_points
+import taca.log
from taca import __version__
from taca.utils import config as conf
logger = logging.getLogger(__name__)
+
@click.group()
@click.version_option(__version__)
# Priority for the configuration file is: environment variable > -c option > default
-@click.option('-c', '--config-file',
- default=os.path.join(os.environ['HOME'], '.taca/taca.yaml'),
- envvar='TACA_CONFIG',
- type=click.File('r'),
- help='Path to TACA configuration file')
-
+@click.option(
+ "-c",
+ "--config-file",
+ default=os.path.join(os.environ["HOME"], ".taca/taca.yaml"),
+ envvar="TACA_CONFIG",
+ type=click.File("r"),
+ help="Path to TACA configuration file",
+)
@click.pass_context
def cli(ctx, config_file):
- """ Tool for the Automation of Storage and Analyses """
+ """Tool for the Automation of Storage and Analyses"""
ctx.obj = {}
config = conf.load_yaml_config(config_file.name)
- log_file = config.get('log', {}).get('file', None)
+ log_file = config.get("log", {}).get("file", None)
if log_file:
- level = config.get('log').get('log_level', 'INFO')
+ level = config.get("log").get("log_level", "INFO")
taca.log.init_logger_file(log_file, level)
- logger.debug('starting up CLI')
+ logger.debug("starting up CLI")
+
-#Add subcommands dynamically to the CLI
-for entry_point in iter_entry_points('taca.subcommands'):
+# Add subcommands dynamically to the CLI
+for entry_point in iter_entry_points("taca.subcommands"):
cli.add_command(entry_point.load())
diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py
index fd3d3b16..d6483823 100644
--- a/taca/illumina/MiSeq_Runs.py
+++ b/taca/illumina/MiSeq_Runs.py
@@ -1,21 +1,24 @@
+import logging
import os
import re
import shutil
-import logging
+
from flowcell_parser.classes import SampleSheetParser
+
from taca.illumina.Standard_Runs import Standard_Run
logger = logging.getLogger(__name__)
-TENX_SINGLE_PAT = re.compile('SI-(?:GA|NA)-[A-H][1-9][0-2]?')
-TENX_DUAL_PAT = re.compile('SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?')
-SMARTSEQ_PAT = re.compile('SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]')
-IDT_UMI_PAT = re.compile('([ATCG]{4,}N+$)')
-RECIPE_PAT = re.compile('[0-9]+-[0-9]+')
+TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?")
+TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?")
+SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]")
+IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")
+RECIPE_PAT = re.compile("[0-9]+-[0-9]+")
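+# Illustrative matches: TENX_SINGLE_PAT matches e.g. "SI-GA-A3",
+# TENX_DUAL_PAT matches e.g. "SI-TT-A1".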
+
class MiSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(MiSeq_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
self._copy_samplesheet()
@@ -30,8 +33,7 @@ def _get_samplesheet(self):
"""Locate and parse the samplesheet for a run.
In MiSeq case this is located in FC_DIR/SampleSheet.csv
"""
- ssname = os.path.join(self.run_dir,
- 'SampleSheet.csv')
+ ssname = os.path.join(self.run_dir, "SampleSheet.csv")
if os.path.exists(ssname):
# If exists parse the SampleSheet
return ssname
@@ -46,14 +48,14 @@ def _copy_samplesheet(self):
# Load index files
indexfile = dict()
try:
- indexfile['tenX'] = self.CONFIG[self.software]['tenX_index_path']
+ indexfile["tenX"] = self.CONFIG[self.software]["tenX_index_path"]
except KeyError:
- logger.error('Path to index file (10X) not found in the config file')
+ logger.error("Path to index file (10X) not found in the config file")
raise RuntimeError
try:
- indexfile['smartseq'] = self.CONFIG[self.software]['smartseq_index_path']
+ indexfile["smartseq"] = self.CONFIG[self.software]["smartseq_index_path"]
except KeyError:
- logger.error('Path to index file (Smart-seq) not found in the config file')
+ logger.error("Path to index file (Smart-seq) not found in the config file")
raise RuntimeError
if ssname is None:
return None
@@ -62,97 +64,144 @@ def _copy_samplesheet(self):
# Copy the original samplesheet locally.
# Copy again if already done as there might have been changes to the samplesheet
try:
- shutil.copy(ssname, os.path.join(self.run_dir, '{}.csv'.format(self.flowcell_id)))
+ shutil.copy(ssname, os.path.join(self.run_dir, f"{self.flowcell_id}.csv"))
ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
except:
- raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir))
+ raise RuntimeError(
+ f"unable to copy file {ssname} to destination {self.run_dir}"
+ )
# This sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
# to be used it needs some editing.
# This will contain the samplesheet with all the renaiming to be used with bcl2fastq
- samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet_copy.csv')
+ samplesheet_dest = os.path.join(self.run_dir, "SampleSheet_copy.csv")
# Check that the samplesheet is not already present. In this case go the next step
if os.path.exists(samplesheet_dest):
- logger.info('SampleSheet_copy.csv found ... overwriting it')
+ logger.info("SampleSheet_copy.csv found ... overwriting it")
try:
- with open(samplesheet_dest, 'w') as fcd:
- fcd.write(self._generate_clean_samplesheet(ssparser,
- indexfile,
- fields_to_remove=None,
- rename_samples=True,
- rename_qPCR_suffix = True,
- fields_qPCR=[ssparser.dfield_snm]))
+ with open(samplesheet_dest, "w") as fcd:
+ fcd.write(
+ self._generate_clean_samplesheet(
+ ssparser,
+ indexfile,
+ fields_to_remove=None,
+ rename_samples=True,
+ rename_qPCR_suffix=True,
+ fields_qPCR=[ssparser.dfield_snm],
+ )
+ )
except Exception as e:
logger.error(e)
return False
- logger.info(('Created SampleSheet_copy.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest)))
+ logger.info(
+ f"Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} "
+ )
# SampleSheet.csv generated
# When demultiplexing SampleSheet.csv is the one I need to use
- self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet_copy.csv'))
- if not self.runParserObj.obj.get('samplesheet_csv'):
- self.runParserObj.obj['samplesheet_csv'] = self.runParserObj.samplesheet.data
+ self.runParserObj.samplesheet = SampleSheetParser(
+ os.path.join(self.run_dir, "SampleSheet_copy.csv")
+ )
+ if not self.runParserObj.obj.get("samplesheet_csv"):
+ self.runParserObj.obj[
+ "samplesheet_csv"
+ ] = self.runParserObj.samplesheet.data
- def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None, rename_samples=True, rename_qPCR_suffix = False, fields_qPCR= None):
+ def _generate_clean_samplesheet(
+ self,
+ ssparser,
+ indexfile,
+ fields_to_remove=None,
+ rename_samples=True,
+ rename_qPCR_suffix=False,
+ fields_qPCR=None,
+ ):
"""Generate a 'clean' samplesheet, the given fields will be removed.
If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name
Will also replace 10X or Smart-seq indicies (e.g. SI-GA-A3 into TGTGCGGG)
Note that the index 2 of 10X or Smart-seq dual indexes will be converted to RC
"""
- output = u''
- compl = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
+ output = ""
+ compl = {"A": "T", "C": "G", "G": "C", "T": "A"}
# Expand the ssparser if there are lanes with 10X or Smart-seq samples
- index_dict_tenX = self._parse_10X_indexes(indexfile['tenX'])
- index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq'])
+ index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"])
+ index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"])
# Replace 10X or Smart-seq indices
for sample in ssparser.data:
- if sample['index'] in index_dict_tenX.keys():
- tenX_index = sample['index']
+ if sample["index"] in index_dict_tenX.keys():
+ tenX_index = sample["index"]
# In the case of 10X dual indexes, replace index and index2
if TENX_DUAL_PAT.findall(tenX_index):
- sample['index'] = index_dict_tenX[tenX_index][0]
- sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_tenX[tenX_index][1].replace(',','').upper() ] ) )
+ sample["index"] = index_dict_tenX[tenX_index][0]
+ sample["index2"] = "".join(
+ reversed(
+ [
+ compl.get(b, b)
+ for b in index_dict_tenX[tenX_index][1]
+ .replace(",", "")
+ .upper()
+ ]
+ )
+ )
# In the case of 10X single indexes, replace the index name with the 4 actual indicies
else:
x = 0
indices_number = len(index_dict_tenX[tenX_index])
while x < indices_number - 1:
new_sample = dict(sample)
- new_sample['index'] = index_dict_tenX[tenX_index][x]
+ new_sample["index"] = index_dict_tenX[tenX_index][x]
ssparser.data.append(new_sample)
x += 1
# Set the original 10X index to the 4th correct index
- sample['index'] = index_dict_tenX[tenX_index][x]
- elif SMARTSEQ_PAT.findall(sample['index']):
+ sample["index"] = index_dict_tenX[tenX_index][x]
+ elif SMARTSEQ_PAT.findall(sample["index"]):
x = 0
- smartseq_index = sample['index'].split('-')[1]
+ smartseq_index = sample["index"].split("-")[1]
indices_number = len(index_dict_smartseq[smartseq_index])
while x < indices_number - 1:
new_sample = dict(sample)
- new_sample['index'] = index_dict_smartseq[smartseq_index][x][0]
- new_sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) )
+ new_sample["index"] = index_dict_smartseq[smartseq_index][x][0]
+ new_sample["index2"] = "".join(
+ reversed(
+ [
+ compl.get(b, b)
+ for b in index_dict_smartseq[smartseq_index][x][1]
+ .replace(",", "")
+ .upper()
+ ]
+ )
+ )
ssparser.data.append(new_sample)
x += 1
- sample['index'] = index_dict_smartseq[smartseq_index][x][0]
- sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) )
+ sample["index"] = index_dict_smartseq[smartseq_index][x][0]
+ sample["index2"] = "".join(
+ reversed(
+ [
+ compl.get(b, b)
+ for b in index_dict_smartseq[smartseq_index][x][1]
+ .replace(",", "")
+ .upper()
+ ]
+ )
+ )
# Sort to get the added indicies from 10x in the right place
# Python 3 doesn't support sorting a list of dicts implicitly. Sort by lane and then Sample_ID
- ssparser.data.sort(key=lambda item: (item.get('Lane'), item.get('Sample_ID')))
+ ssparser.data.sort(key=lambda item: (item.get("Lane"), item.get("Sample_ID")))
if not fields_to_remove:
fields_to_remove = []
# Header
- output += '[Header]{}'.format(os.linesep)
+ output += f"[Header]{os.linesep}"
for field in sorted(ssparser.header):
- output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip())
+ output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
output += os.linesep
# Data
- output += '[Data]{}'.format(os.linesep)
+ output += f"[Data]{os.linesep}"
datafields = []
for field in ssparser.datafields:
if field not in fields_to_remove:
datafields.append(field)
- output += ','.join(datafields)
+ output += ",".join(datafields)
output += os.linesep
for line in ssparser.data:
line_ar = []
@@ -162,16 +211,18 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
try:
if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR:
# Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix
- value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm]))
+ value = re.sub(
+ "__qPCR_$", "", f"Sample_{line[ssparser.dfield_snm]}"
+ )
else:
# Substitute SampleID with SampleName, add Sample_ as prefix
- value ='Sample_{}'.format(line[ssparser.dfield_snm])
+ value = f"Sample_{line[ssparser.dfield_snm]}"
except:
- # Otherwise add Sample_ as prefix
- value = 'Sample_{}'.format(line[ssparser.dfield_sid])
+ # Otherwise add Sample_ as prefix
+ value = f"Sample_{line[ssparser.dfield_sid]}"
elif rename_qPCR_suffix and field in fields_qPCR:
- value = re.sub('__qPCR_$', '', line[field])
+ value = re.sub("__qPCR_$", "", line[field])
line_ar.append(value)
- output += ','.join(line_ar)
+ output += ",".join(line_ar)
output += os.linesep
return output
diff --git a/taca/illumina/NextSeq_Runs.py b/taca/illumina/NextSeq_Runs.py
index bcdf34ff..5785542c 100755
--- a/taca/illumina/NextSeq_Runs.py
+++ b/taca/illumina/NextSeq_Runs.py
@@ -3,7 +3,7 @@
class NextSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(NextSeq_Run, self).__init__( run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
# NextSeq2000 has a different FC ID pattern that ID contains the first letter for position
diff --git a/taca/illumina/NovaSeqXPlus_Runs.py b/taca/illumina/NovaSeqXPlus_Runs.py
index 58b384af..116d7c1a 100644
--- a/taca/illumina/NovaSeqXPlus_Runs.py
+++ b/taca/illumina/NovaSeqXPlus_Runs.py
@@ -3,7 +3,7 @@
class NovaSeqXPlus_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(NovaSeqXPlus_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
self._copy_samplesheet()
diff --git a/taca/illumina/NovaSeq_Runs.py b/taca/illumina/NovaSeq_Runs.py
index 670b1fa9..52a7e162 100644
--- a/taca/illumina/NovaSeq_Runs.py
+++ b/taca/illumina/NovaSeq_Runs.py
@@ -3,7 +3,7 @@
class NovaSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(NovaSeq_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
self._copy_samplesheet()
diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py
index f5ab583b..c562fc2f 100644
--- a/taca/illumina/Runs.py
+++ b/taca/illumina/Runs.py
@@ -1,46 +1,53 @@
-import os
-import re
import csv
-import logging
-import subprocess
-import shutil
import glob
import json
-
+import logging
+import os
+import re
+import shutil
+import subprocess
from datetime import datetime
+from flowcell_parser.classes import LaneBarcodeParser, RunParser, SampleSheetParser
+
from taca.utils import misc
from taca.utils.misc import send_mail
-from flowcell_parser.classes import RunParser, LaneBarcodeParser, SampleSheetParser
logger = logging.getLogger(__name__)
-class Run(object):
- """ Defines an Illumina run
- """
+
+class Run:
+ """Defines an Illumina run"""
def __init__(self, run_dir, software, configuration):
if not os.path.exists(run_dir):
- raise RuntimeError("Could not locate run directory {}".format(run_dir))
-
- if 'analysis_server' not in configuration or \
- 'bcl2fastq' not in configuration or \
- 'bclconvert' not in configuration or \
- 'samplesheets_dir' not in configuration:
- raise RuntimeError("configuration missing required entries "
- "(analysis_server, bcl2fastq, bclconvert, samplesheets_dir)")
- if not os.path.exists(os.path.join(run_dir, 'runParameters.xml')) \
- and os.path.exists(os.path.join(run_dir, 'RunParameters.xml')):
+ raise RuntimeError(f"Could not locate run directory {run_dir}")
+
+ if (
+ "analysis_server" not in configuration
+ or "bcl2fastq" not in configuration
+ or "bclconvert" not in configuration
+ or "samplesheets_dir" not in configuration
+ ):
+ raise RuntimeError(
+ "configuration missing required entries "
+ "(analysis_server, bcl2fastq, bclconvert, samplesheets_dir)"
+ )
+ if not os.path.exists(
+ os.path.join(run_dir, "runParameters.xml")
+ ) and os.path.exists(os.path.join(run_dir, "RunParameters.xml")):
# In NextSeq runParameters is named RunParameters
logger.warning("Creating link from runParameters.xml to RunParameters.xml")
- os.symlink('RunParameters.xml', os.path.join(run_dir, 'runParameters.xml'))
- elif not os.path.exists(os.path.join(run_dir, 'runParameters.xml')):
- raise RuntimeError("Could not locate runParameters.xml in run directory {}".format(run_dir))
+ os.symlink("RunParameters.xml", os.path.join(run_dir, "runParameters.xml"))
+ elif not os.path.exists(os.path.join(run_dir, "runParameters.xml")):
+ raise RuntimeError(
+ f"Could not locate runParameters.xml in run directory {run_dir}"
+ )
self.run_dir = os.path.abspath(run_dir)
self.software = software
self.id = os.path.basename(os.path.normpath(run_dir))
- pattern = r'(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)'
+ pattern = r"(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)"
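+        # <date>_<instrument>_<counter>_<optional A/B position + flowcell id>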
m = re.match(pattern, self.id)
self.date = m.group(1)
self.instrument = m.group(2)
@@ -63,51 +70,78 @@ def check_run_status(self):
This function checks the status of a run while in progress.
In the case of HiSeq check that all demux have been done and in that case perform aggregation
"""
- dex_status = self.get_run_status()
- if self.software == 'bcl2fastq':
- legacy_path = ''
- elif self.software == 'bclconvert':
- legacy_path = "Reports/{}".format(self.legacy_dir)
+ dex_status = self.get_run_status()
+ if self.software == "bcl2fastq":
+ legacy_path = ""
+ elif self.software == "bclconvert":
+ legacy_path = f"Reports/{self.legacy_dir}"
# Check the status of running demux
# Collect all samplesheets generated before
- samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) # A single digit, this hypothesis should hold for a while
+ samplesheets = glob.glob(
+ os.path.join(self.run_dir, "*_[0-9].csv")
+ ) # A single digit, this hypothesis should hold for a while
all_demux_done = True
for samplesheet in samplesheets:
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
- demux_folder = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))
+ demux_folder = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")
# Check if this job is done
- if os.path.exists(os.path.join(self.run_dir, demux_folder, legacy_path, 'Stats', 'DemultiplexingStats.xml')):
+ if os.path.exists(
+ os.path.join(
+ self.run_dir,
+ demux_folder,
+ legacy_path,
+ "Stats",
+ "DemultiplexingStats.xml",
+ )
+ ):
all_demux_done = all_demux_done and True
- if self.software == 'bcl2fastq':
- demux_log = os.path.join(self.run_dir, "demux_{}_bcl2fastq.err".format(demux_id))
- elif self.software == 'bclconvert':
- demux_log = os.path.join(self.run_dir, "demux_{}_bcl-convert.err".format(demux_id))
+ if self.software == "bcl2fastq":
+ demux_log = os.path.join(
+ self.run_dir, f"demux_{demux_id}_bcl2fastq.err"
+ )
+ elif self.software == "bclconvert":
+ demux_log = os.path.join(
+ self.run_dir, f"demux_{demux_id}_bcl-convert.err"
+ )
else:
raise RuntimeError("Unrecognized software!")
if os.path.isfile(demux_log):
- errors, warnings, error_and_warning_messages = self._check_demux_log(demux_id, demux_log)
+ (
+ errors,
+ warnings,
+ error_and_warning_messages,
+ ) = self._check_demux_log(demux_id, demux_log)
else:
- raise RuntimeError("No demux log file found for sub-demultiplexing {}!".format(demux_id))
- self.demux_summary[demux_id] = {'errors' : errors,
- 'warnings' : warnings,
- 'error_and_warning_messages' : error_and_warning_messages
- }
+ raise RuntimeError(
+ f"No demux log file found for sub-demultiplexing {demux_id}!"
+ )
+ self.demux_summary[demux_id] = {
+ "errors": errors,
+ "warnings": warnings,
+ "error_and_warning_messages": error_and_warning_messages,
+ }
if errors or warnings:
- logger.info("Sub-Demultiplexing in {} completed with {} errors and {} warnings!".format(demux_folder, errors, warnings))
+ logger.info(
+ f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!"
+ )
else:
- logger.info("Sub-Demultiplexing in {} completed without any error or warning.".format(demux_folder))
+ logger.info(
+ f"Sub-Demultiplexing in {demux_folder} completed without any error or warning."
+ )
else:
all_demux_done = all_demux_done and False
- logger.info("Sub-Demultiplexing in {} not completed yet.".format(demux_folder))
+ logger.info(f"Sub-Demultiplexing in {demux_folder} not completed yet.")
# All demux jobs finished and all stats aggregated under Demultiplexing
# Aggreate all the results in the Demultiplexing folder
- if all_demux_done and dex_status!='COMPLETED':
- dex_status = 'COMPLETED'
+ if all_demux_done and dex_status != "COMPLETED":
+ dex_status = "COMPLETED"
self._aggregate_demux_results()
self.runParserObj = RunParser(self.run_dir)
# Rename undetermined if needed
- lanes = misc.return_unique([lanes['Lane'] for lanes in self.runParserObj.samplesheet.data])
+ lanes = misc.return_unique(
+ [lanes["Lane"] for lanes in self.runParserObj.samplesheet.data]
+ )
samples_per_lane = self.get_samples_per_lane()
for lane in lanes:
if self.is_unpooled_lane(lane):
@@ -119,10 +153,10 @@ def _check_demux_log(self, demux_id, demux_log):
This function checks the log files of bcl2fastq/bclconvert
Errors or warnings will be captured and email notifications will be sent
"""
- with open(demux_log, 'r') as demux_log_file:
+ with open(demux_log) as demux_log_file:
demux_log_content = demux_log_file.readlines()
- if self.software == 'bcl2fastq':
- pattern = r'Processing completed with (\d+) errors and (\d+) warnings'
+ if self.software == "bcl2fastq":
+ pattern = r"Processing completed with (\d+) errors and (\d+) warnings"
match = re.search(pattern, demux_log_content[-1])
if match:
errors = int(match.group(1))
@@ -130,21 +164,23 @@ def _check_demux_log(self, demux_id, demux_log):
error_and_warning_messages = []
if errors or warnings:
for line in demux_log_content:
- if 'ERROR' in line or 'WARN' in line:
+ if "ERROR" in line or "WARN" in line:
error_and_warning_messages.append(line)
return errors, warnings, error_and_warning_messages
else:
- raise RuntimeError("Bad format with log file demux_{}_bcl2fastq.err".format(demux_id))
- elif self.software == 'bclconvert':
+ raise RuntimeError(
+ f"Bad format with log file demux_{demux_id}_bcl2fastq.err"
+ )
+ elif self.software == "bclconvert":
errors = 0
warnings = 0
error_and_warning_messages = []
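+            # bcl-convert branch: no summary line is parsed, ERROR/WARNING
+            # occurrences are counted line by line instead.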
for line in demux_log_content:
- if 'ERROR' in line:
+ if "ERROR" in line:
errors += 1
error_and_warning_messages.append(line)
- elif 'WARNING' in line:
- warnnings += 1
+ elif "WARNING" in line:
+ warnings += 1
error_and_warning_messages.append(line)
return errors, warnings, error_and_warning_messages
else:
@@ -170,50 +206,53 @@ def _get_demux_folder(self):
def _get_samplesheet(self):
"""
- Locate and parse the samplesheet for a run. The idea is that there is a folder in
- samplesheet_folders that contains a samplesheet named flowecell_id.csv.
+ Locate and parse the samplesheet for a run. The idea is that there is a folder in
+        samplesheet_folders that contains a samplesheet named flowcell_id.csv.
"""
try:
# Only implemented for some, (e.g. NovaSeqXPlus)
# Will raise AttributeError if not implemented.
current_year = self._current_year()
except AttributeError:
- current_year = '20' + self.id[0:2]
+ current_year = "20" + self.id[0:2]
- samplesheets_dir = os.path.join(self.CONFIG['samplesheets_dir'],
- current_year)
- ssname = os.path.join(samplesheets_dir, '{}.csv'.format(self.flowcell_id))
+ samplesheets_dir = os.path.join(self.CONFIG["samplesheets_dir"], current_year)
+ ssname = os.path.join(samplesheets_dir, f"{self.flowcell_id}.csv")
if os.path.exists(ssname):
return ssname
else:
- raise RuntimeError("not able to find samplesheet {}.csv in {}".format(self.flowcell_id, self.CONFIG['samplesheets_dir']))
+ raise RuntimeError(
+ "not able to find samplesheet {}.csv in {}".format(
+ self.flowcell_id, self.CONFIG["samplesheets_dir"]
+ )
+ )
def _is_demultiplexing_done(self):
- return os.path.exists(os.path.join(self.run_dir,
- self._get_demux_folder(),
- 'Stats',
- 'Stats.json'))
+ return os.path.exists(
+ os.path.join(self.run_dir, self._get_demux_folder(), "Stats", "Stats.json")
+ )
def _is_demultiplexing_started(self):
return os.path.exists(os.path.join(self.run_dir, self._get_demux_folder()))
def _is_sequencing_done(self):
- return os.path.exists(os.path.join(self.run_dir, 'RTAComplete.txt')) and os.path.exists(os.path.join(self.run_dir, 'CopyComplete.txt'))
+ return os.path.exists(
+ os.path.join(self.run_dir, "RTAComplete.txt")
+ ) and os.path.exists(os.path.join(self.run_dir, "CopyComplete.txt"))
def get_run_status(self):
- """ Return the current status of the run.
- """
+ """Return the current status of the run."""
demux_started = self._is_demultiplexing_started()
demux_done = self._is_demultiplexing_done()
sequencing_done = self._is_sequencing_done()
if sequencing_done and demux_done:
- return 'COMPLETED' # run is done, transfer might be ongoing.
+ return "COMPLETED" # run is done, transfer might be ongoing.
elif sequencing_done and demux_started and not demux_done:
- return 'IN_PROGRESS'
+ return "IN_PROGRESS"
elif sequencing_done and not demux_started:
- return 'TO_START'
+ return "TO_START"
elif not sequencing_done:
- return 'SEQUENCING'
+ return "SEQUENCING"
else:
raise RuntimeError("Unexpected status in get_run_status")
@@ -249,66 +288,69 @@ def _compute_base_mask(self):
raise NotImplementedError("Please Implement this method")
def transfer_run(self, t_file, mail_recipients=None):
- """ Transfer a run to the analysis server. Will add group R/W permissions to
- the run directory in the destination server so that the run can be processed
- by any user/account in that group (i.e a functional account...).
- :param str t_file: File where to put the transfer information
+ """Transfer a run to the analysis server. Will add group R/W permissions to
+ the run directory in the destination server so that the run can be processed
+ by any user/account in that group (i.e a functional account...).
+ :param str t_file: File where to put the transfer information
"""
# The option -a implies -o and -g which is not the desired behaviour
- command_line = ['rsync', '-LtDrv']
+ command_line = ["rsync", "-LtDrv"]
# Add R/W permissions to the group
- command_line.append('--chmod=g+rw')
+ command_line.append("--chmod=g+rw")
# This horrible thing here avoids data dup when we use multiple indexes in a lane/FC
command_line.append("--exclude=Demultiplexing_*/*_*")
command_line.append("--include=*/")
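+        # rsync uses the first matching filter rule, so these include rules must
+        # precede the final catch-all "--exclude=*" added below.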
- for to_include in self.CONFIG['analysis_server']['sync']['include']:
- command_line.append("--include={}".format(to_include))
+ for to_include in self.CONFIG["analysis_server"]["sync"]["include"]:
+ command_line.append(f"--include={to_include}")
command_line.extend(["--exclude=*", "--prune-empty-dirs"])
- r_user = self.CONFIG['analysis_server']['user']
- r_host = self.CONFIG['analysis_server']['host']
- r_dir = self.CONFIG['analysis_server']['sync']['data_archive']
- remote = "{}@{}:{}".format(r_user, r_host, r_dir)
+ r_user = self.CONFIG["analysis_server"]["user"]
+ r_host = self.CONFIG["analysis_server"]["host"]
+ r_dir = self.CONFIG["analysis_server"]["sync"]["data_archive"]
+ remote = f"{r_user}@{r_host}:{r_dir}"
command_line.extend([self.run_dir, remote])
# Create temp file indicating that the run is being transferred
try:
- open(os.path.join(self.run_dir, 'transferring'), 'w').close()
- except IOError as e:
- logger.error("Cannot create a file in {}. "
- "Check the run name, and the permissions.".format(self.id))
+ open(os.path.join(self.run_dir, "transferring"), "w").close()
+ except OSError as e:
+ logger.error(
+ f"Cannot create a file in {self.id}. "
+ "Check the run name, and the permissions."
+ )
raise e
- started = ("Started transfer of run {} on {}".format(self.id, datetime.now()))
+ started = f"Started transfer of run {self.id} on {datetime.now()}"
logger.info(started)
# In this particular case we want to capture the exception because we want
# to delete the transfer file
try:
- msge_text="I am about to transfer with this command \n{}".format(command_line)
- logger.info(msge_text)
- misc.call_external_command(command_line, with_log_files=True,
- prefix="", log_dir=self.run_dir)
+ msge_text = f"I am about to transfer with this command \n{command_line}"
+ logger.info(msge_text)
+ misc.call_external_command(
+ command_line, with_log_files=True, prefix="", log_dir=self.run_dir
+ )
except subprocess.CalledProcessError as exception:
- os.remove(os.path.join(self.run_dir, 'transferring'))
- #Send an email notifying that the transfer failed
+ os.remove(os.path.join(self.run_dir, "transferring"))
+ # Send an email notifying that the transfer failed
runname = self.id
- sbt = ("Rsync of run {} failed".format(runname))
- msg= """ Rsync of data for run {run} has failed!
- Raised the following exception: {e}
- """.format(run=runname, e=exception)
+ sbt = f"Rsync of run {runname} failed"
+ msg = f""" Rsync of data for run {runname} has failed!
+ Raised the following exception: {exception}
+ """
if mail_recipients:
send_mail(sbt, msg, mail_recipients)
raise exception
- logger.info('Adding run {} to {}'.format(self.id, t_file))
- with open(t_file, 'a') as tranfer_file:
- tsv_writer = csv.writer(tranfer_file, delimiter='\t')
+ logger.info(f"Adding run {self.id} to {t_file}")
+        with open(t_file, "a") as transfer_file:
+            tsv_writer = csv.writer(transfer_file, delimiter="\t")
tsv_writer.writerow([self.id, str(datetime.now())])
- os.remove(os.path.join(self.run_dir, 'transferring'))
+ os.remove(os.path.join(self.run_dir, "transferring"))
- #Send an email notifying that the transfer was successful
+ # Send an email notifying that the transfer was successful
runname = self.id
- sbt = ("Rsync of data for run {} to the analysis cluster has finished".format(runname))
- msg= """ Rsync of data for run {run} to the analysis cluster has finished!
+ sbt = f"Rsync of data for run {runname} to the analysis cluster has finished"
+ msg = """ Rsync of data for run {run} to the analysis cluster has finished!
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
""".format(run=runname)
@@ -316,52 +358,50 @@ def transfer_run(self, t_file, mail_recipients=None):
send_mail(sbt, msg, mail_recipients)
def archive_run(self, destination):
- """ Move run to the archive folder
- :param str destination: the destination folder
+ """Move run to the archive folder
+ :param str destination: the destination folder
"""
if destination and os.path.isdir(destination):
- logger.info('archiving run {}'.format(self.id))
+ logger.info(f"archiving run {self.id}")
shutil.move(self.run_dir, os.path.join(destination, self.id))
else:
logger.warning("Cannot move run to archive, destination does not exist")
def send_mail(self, sbt, msg, rcp):
- """ Sends mail about run completion
- """
- already_seen = False
+ """Sends mail about run completion"""
runname = self.id
if not sbt:
- sbt = "{}".format(runname)
+ sbt = f"{runname}"
misc.send_mail(sbt, msg, rcp)
def is_transferred(self, transfer_file):
- """ Checks wether a run has been transferred to the analysis server or not.
- Returns true in the case in which the tranfer is finished or ongoing.
- :param str transfer_file: Path to file with information about transferred runs
+        """Checks whether a run has been transferred to the analysis server or not.
+        Returns true in the case in which the transfer is finished or ongoing.
+ :param str transfer_file: Path to file with information about transferred runs
"""
try:
- with open(transfer_file, 'r') as file_handle:
- transfer_file_contents = csv.reader(file_handle, delimiter='\t')
+ with open(transfer_file) as file_handle:
+ transfer_file_contents = csv.reader(file_handle, delimiter="\t")
for row in transfer_file_contents:
# Rows have two columns: run and transfer date
if row[0] == os.path.basename(self.id):
return True
- if os.path.exists(os.path.join(self.run_dir, 'transferring')):
+ if os.path.exists(os.path.join(self.run_dir, "transferring")):
return True
return False
- except IOError:
+ except OSError:
return False
def is_unpooled_lane(self, lane):
"""
- :param lane: lane identifier
- :type lane: string
- :rtype: boolean
- :returns: True if the samplesheet has one entry for that lane, False otherwise
+ :param lane: lane identifier
+ :type lane: string
+ :rtype: boolean
+ :returns: True if the samplesheet has one entry for that lane, False otherwise
"""
count = 0
for l in self.runParserObj.samplesheet.data:
- if l['Lane'] == lane:
+ if l["Lane"] == lane:
count += 1
return count == 1
@@ -375,7 +415,7 @@ def get_samples_per_lane(self):
ss = self.runParserObj.samplesheet
d = {}
for l in ss.data:
- d[l['Lane']] = l[ss.dfield_snm]
+ d[l["Lane"]] = l[ss.dfield_snm]
return d
def _rename_undet(self, lane, samples_per_lane):
@@ -388,25 +428,35 @@ def _rename_undet(self, lane, samples_per_lane):
:param samples_per_lane: lane:sample dict
:type status: dict
"""
- for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, "Undetermined*L0?{}*".format(lane))):
- old_name=os.path.basename(file)
- old_name_comps=old_name.split("_")
- old_name_comps[1]=old_name_comps[0]# replace S0 with Undetermined
- old_name_comps[0]=samples_per_lane[lane]#replace Undetermined with samplename
+ for file in glob.glob(
+ os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*")
+ ):
+ old_name = os.path.basename(file)
+ old_name_comps = old_name.split("_")
+ old_name_comps[1] = old_name_comps[0] # replace S0 with Undetermined
+ old_name_comps[0] = samples_per_lane[
+ lane
+ ] # replace Undetermined with samplename
for index, comp in enumerate(old_name_comps):
- if comp.startswith('L00'):
- old_name_comps[index]=comp.replace('L00','L01')#adds a 1 as the second lane number in order to differentiate undetermined from normal in piper
-
- new_name="_".join(old_name_comps)
- logger.info("Renaming {} to {}".format(file, os.path.join(os.path.dirname(file), new_name)))
+ if comp.startswith("L00"):
+ old_name_comps[index] = comp.replace(
+ "L00", "L01"
+ ) # adds a 1 as the second lane number in order to differentiate undetermined from normal in piper
+
+ new_name = "_".join(old_name_comps)
+ logger.info(
+ f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}"
+ )
os.rename(file, os.path.join(os.path.dirname(file), new_name))
def _classify_lanes(self, samplesheets):
# Prepare a list for lanes with NoIndex samples
noindex_lanes = []
for entry in self.runParserObj.samplesheet.data:
- if entry['index'].upper() == 'NOINDEX' or (entry['index'] == '' and entry['index2'] == ''):
- noindex_lanes.append(entry['Lane'])
+ if entry["index"].upper() == "NOINDEX" or (
+ entry["index"] == "" and entry["index2"] == ""
+ ):
+ noindex_lanes.append(entry["Lane"])
# Prepare a dict with the lane, demux_id and index_length info based on the sub-samplesheets
# This is for the purpose of deciding simple_lanes and complex_lanes, plus we should start with the Stats.json file from which demux_id for each lane
lane_demuxid_indexlength = dict()
@@ -414,10 +464,18 @@ def _classify_lanes(self, samplesheets):
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
ssparser = SampleSheetParser(samplesheet)
for row in ssparser.data:
- if row['Lane'] not in lane_demuxid_indexlength.keys():
- lane_demuxid_indexlength[row['Lane']] = {demux_id: [len(row.get('index','')), len(row.get('index2',''))]}
- elif demux_id not in lane_demuxid_indexlength[row['Lane']].keys():
- lane_demuxid_indexlength[row['Lane']][demux_id] = [len(row.get('index','')), len(row.get('index2',''))]
+ if row["Lane"] not in lane_demuxid_indexlength.keys():
+ lane_demuxid_indexlength[row["Lane"]] = {
+ demux_id: [
+ len(row.get("index", "")),
+ len(row.get("index2", "")),
+ ]
+ }
+ elif demux_id not in lane_demuxid_indexlength[row["Lane"]].keys():
+ lane_demuxid_indexlength[row["Lane"]][demux_id] = [
+ len(row.get("index", "")),
+ len(row.get("index2", "")),
+ ]
else:
pass
@@ -434,7 +492,12 @@ def _classify_lanes(self, samplesheets):
# Dual and longer indexes have higher priority
if 0 in list(complex_lanes[key].values())[0] and 0 not in vv:
complex_lanes[key] = {vk: vv}
- elif (0 in list(complex_lanes[key].values())[0] and 0 in vv) or (0 not in list(complex_lanes[key].values())[0] and 0 not in vv):
+ elif (
+ 0 in list(complex_lanes[key].values())[0] and 0 in vv
+ ) or (
+ 0 not in list(complex_lanes[key].values())[0]
+ and 0 not in vv
+ ):
if sum(vv) > sum(list(complex_lanes[key].values())[0]):
complex_lanes[key] = {vk: vv}
else:
@@ -442,333 +505,622 @@ def _classify_lanes(self, samplesheets):
return noindex_lanes, simple_lanes, complex_lanes
- def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, legacy_path):
+ def _process_noindex_sample_with_fake_index_with_single_demux(
+ self, demux_id, legacy_path
+ ):
demux_folder = os.path.join(self.run_dir, self.demux_dir)
sample_counter = 1
- for entry in sorted(self.runParserObj.samplesheet.data, key=lambda k: k['Lane']):
- lane = entry['Lane']
- project = entry['Sample_Project']
- sample = entry['Sample_ID']
+ for entry in sorted(
+ self.runParserObj.samplesheet.data, key=lambda k: k["Lane"]
+ ):
+ lane = entry["Lane"]
+ project = entry["Sample_Project"]
+ sample = entry["Sample_ID"]
project_dest = os.path.join(demux_folder, project)
if not os.path.exists(project_dest):
os.makedirs(project_dest)
sample_dest = os.path.join(project_dest, sample)
if not os.path.exists(sample_dest):
os.makedirs(sample_dest)
- for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))):
+ for file in glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ f"Undetermined*L0?{lane}*",
+ )
+ ):
old_name = os.path.basename(file)
old_name_comps = old_name.split("_")
- new_name_comps = [sample.replace('Sample_',''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:]
+ new_name_comps = [
+ sample.replace("Sample_", ""),
+ f"S{str(sample_counter)}",
+ ] + old_name_comps[2:]
new_name = "_".join(new_name_comps)
os.symlink(file, os.path.join(sample_dest, new_name))
- logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_',''), old_name, new_name))
+ logger.info(
+ "For undet sample {}, renaming {} to {}".format(
+ sample.replace("Sample_", ""), old_name, new_name
+ )
+ )
sample_counter += 1
# Make a softlink of lane.html
- html_report_lane_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html")
- html_report_lane_dest = os.path.join(demux_folder, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html")
+ html_report_lane_source = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "lane.html",
+ )
+ html_report_lane_dest = os.path.join(
+ demux_folder,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "lane.html",
+ )
if not os.path.isdir(os.path.dirname(html_report_lane_dest)):
os.makedirs(os.path.dirname(html_report_lane_dest))
os.symlink(html_report_lane_source, html_report_lane_dest)
# Modify the laneBarcode.html file
- html_report_laneBarcode = os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
- legacy_path,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "laneBarcode.html"
- )
+ html_report_laneBarcode = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
html_report_laneBarcode_parser = LaneBarcodeParser(html_report_laneBarcode)
lane_project_sample = dict()
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Sample'] != 'Undetermined':
- lane_project_sample[entry['Lane']] = {'Project': entry['Project'],
- 'Sample': entry['Sample']
- }
+ if entry["Sample"] != "Undetermined":
+ lane_project_sample[entry["Lane"]] = {
+ "Project": entry["Project"],
+ "Sample": entry["Sample"],
+ }
for entry in html_report_laneBarcode_parser.sample_data[:]:
- if entry['Sample'] == 'Undetermined':
- entry['Project'] = lane_project_sample[entry['Lane']]['Project']
- entry['Sample'] = lane_project_sample[entry['Lane']]['Sample']
+ if entry["Sample"] == "Undetermined":
+ entry["Project"] = lane_project_sample[entry["Lane"]]["Project"]
+ entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"]
else:
html_report_laneBarcode_parser.sample_data.remove(entry)
- html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data,
- key=lambda k: (k['Lane'].lower(), k['Sample']))
- new_html_report_laneBarcode = os.path.join(demux_folder,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "laneBarcode.html"
- )
+ html_report_laneBarcode_parser.sample_data = sorted(
+ html_report_laneBarcode_parser.sample_data,
+ key=lambda k: (k["Lane"].lower(), k["Sample"]),
+ )
+ new_html_report_laneBarcode = os.path.join(
+ demux_folder,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
_generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser)
if not os.path.exists(os.path.join(demux_folder, "Stats")):
os.makedirs(os.path.join(demux_folder, "Stats"))
# Modify the Stats.json file
- stat_json_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json")
+ stat_json_source = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Stats",
+ "Stats.json",
+ )
stat_json_new = os.path.join(demux_folder, "Stats", "Stats.json")
with open(stat_json_source) as json_data:
data = json.load(json_data)
# Fix the sample stats per lane
- for entry in data['ConversionResults'][:]:
- del entry['DemuxResults'][0]['IndexMetrics']
- entry['DemuxResults'][0].update(entry['Undetermined'])
- del entry['Undetermined']
+ for entry in data["ConversionResults"][:]:
+ del entry["DemuxResults"][0]["IndexMetrics"]
+ entry["DemuxResults"][0].update(entry["Undetermined"])
+ del entry["Undetermined"]
# Reset unknown barcodes list
- for entry in data['UnknownBarcodes'][:]:
- entry['Barcodes'] = {'unknown': 1}
+ for entry in data["UnknownBarcodes"][:]:
+ entry["Barcodes"] = {"unknown": 1}
# Write to a new Stats.json file
- with open(stat_json_new, 'w') as stat_json_new_file:
+ with open(stat_json_new, "w") as stat_json_new_file:
json.dump(data, stat_json_new_file)
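# A minimal, self-contained sketch (toy numbers, hypothetical sample ID) of the
# Stats.json entry rewrite performed above for NoIndex lanes with fake indexes:
entry = {
    "DemuxResults": [
        {"SampleId": "P1_101", "NumberReads": 0, "IndexMetrics": [{"MismatchCounts": {"0": 0}}]}
    ],
    "Undetermined": {"NumberReads": 350000, "Yield": 105000000},
}
del entry["DemuxResults"][0]["IndexMetrics"]  # drop metrics of the fake index
entry["DemuxResults"][0].update(entry["Undetermined"])  # undetermined counts become the sample's counts
del entry["Undetermined"]
# entry == {"DemuxResults": [{"SampleId": "P1_101", "NumberReads": 350000, "Yield": 105000000}]}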
- def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_lanes):
- elements = [element for element in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) ]
+ def _process_simple_lane_with_single_demux(
+ self, demux_id, legacy_path, noindex_lanes
+ ):
+ elements = [
+ element
+ for element in os.listdir(
+ os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")
+ )
+ ]
for element in elements:
- if "Stats" not in element and "Reports" not in element: #skip this folder and treat it differently to take into account the NoIndex case
- source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), element)
+ if (
+ "Stats" not in element and "Reports" not in element
+ ): # skip this folder and treat it differently to take into account the NoIndex case
+ source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", element
+ )
dest = os.path.join(self.run_dir, self.demux_dir, element)
os.symlink(source, dest)
os.makedirs(os.path.join(self.run_dir, self.demux_dir, "Stats"))
# Fetch the lanes that have NoIndex
- statsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "*" ))
+ statsFiles = glob.glob(
+ os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*"
+ )
+ )
for source in statsFiles:
source_name = os.path.split(source)[1]
- if source_name not in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]:
- lane = os.path.splitext(os.path.split(source)[1])[0][-1] #lane
+ if source_name not in [
+ "DemultiplexingStats.xml",
+ "AdapterTrimming.txt",
+ "ConversionStats.xml",
+ "Stats.json",
+ ]:
+ lane = os.path.splitext(os.path.split(source)[1])[0][-1] # lane
if lane not in noindex_lanes:
- dest = os.path.join(self.run_dir, self.demux_dir, "Stats", source_name)
+ dest = os.path.join(
+ self.run_dir, self.demux_dir, "Stats", source_name
+ )
os.symlink(source, dest)
- for file in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]:
- source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", file)
+ for file in [
+ "DemultiplexingStats.xml",
+ "AdapterTrimming.txt",
+ "ConversionStats.xml",
+ "Stats.json",
+ ]:
+ source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file
+ )
dest = os.path.join(self.run_dir, self.demux_dir, "Stats", file)
os.symlink(source, dest)
- source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports")
+ source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports"
+ )
dest = os.path.join(self.run_dir, self.demux_dir, "Reports")
if os.path.exists(dest):
try:
os.rmdir(dest)
- except NotADirectoryError as e:
+ except NotADirectoryError:
os.unlink(dest)
os.symlink(source, dest)
- def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode):
+ def _fix_html_reports_for_complex_lanes(
+ self,
+ demux_folder,
+ index_cycles,
+ complex_lanes,
+ noindex_lanes,
+ html_reports_lane,
+ html_reports_laneBarcode,
+ ):
# Start with the lane
html_report_lane_parser = None
for next_html_report_lane in html_reports_lane:
if html_report_lane_parser is None:
html_report_lane_parser = LaneBarcodeParser(next_html_report_lane)
else:
- lanesInReport = [Lane['Lane'] for Lane in html_report_lane_parser.sample_data]
+ lanesInReport = [
+ Lane["Lane"] for Lane in html_report_lane_parser.sample_data
+ ]
next_html_report_lane_parser = LaneBarcodeParser(next_html_report_lane)
for entry in next_html_report_lane_parser.sample_data:
- if not entry['Lane'] in lanesInReport:
+ if entry["Lane"] not in lanesInReport:
# If this is a new lane not included before
html_report_lane_parser.sample_data.append(entry)
# Now all lanes have been inserted
# NumberReads for total lane cluster/yields and total sample cluster/yields
- NumberReads_Summary = dict()
+ self.NumberReads_Summary = dict()
# The numbers in Flowcell Summary also need to be aggregated if multiple demultiplexing is done
Clusters_Raw = 0
Clusters_PF = 0
Yield_Mbases = 0
for entry in html_report_lane_parser.sample_data:
# Update NumberReads for total lane clusters
- NumberReads_Summary[entry['Lane']] = {'total_lane_cluster': int(entry['PF Clusters'].replace(',', '')),
- 'total_lane_yield': int(entry['Yield (Mbases)'].replace(',', ''))}
- Clusters_Raw += int(int(entry['PF Clusters'].replace(',', '')) / float(entry['% PFClusters']) * 100)
- Clusters_PF += int(entry['PF Clusters'].replace(',', ''))
- Yield_Mbases += int(entry['Yield (Mbases)'].replace(',', ''))
- if entry['Lane'] in complex_lanes.keys():
- entry['% Perfectbarcode'] = None
- entry['% One mismatchbarcode'] = None
+ self.NumberReads_Summary[entry["Lane"]] = {
+ "total_lane_cluster": int(entry["PF Clusters"].replace(",", "")),
+ "total_lane_yield": int(entry["Yield (Mbases)"].replace(",", "")),
+ }
+ Clusters_Raw += int(
+ int(entry["PF Clusters"].replace(",", ""))
+ / float(entry["% PFClusters"])
+ * 100
+ )
+ Clusters_PF += int(entry["PF Clusters"].replace(",", ""))
+ Yield_Mbases += int(entry["Yield (Mbases)"].replace(",", ""))
+ if entry["Lane"] in complex_lanes.keys():
+ entry["% Perfectbarcode"] = None
+ entry["% One mismatchbarcode"] = None
# Update the values in Flowcell Summary
- html_report_lane_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw)
- html_report_lane_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF)
- html_report_lane_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases)
+ html_report_lane_parser.flowcell_data["Clusters (Raw)"] = f"{Clusters_Raw:,}"
+ html_report_lane_parser.flowcell_data["Clusters(PF)"] = f"{Clusters_PF:,}"
+ html_report_lane_parser.flowcell_data["Yield (MBases)"] = f"{Yield_Mbases:,}"
# Add lanes not present in this demux
# Create the new lane.html
- new_html_report_lane_dir = _create_folder_structure(demux_folder, ['Reports', 'html', self.flowcell_id, 'all', 'all', 'all'])
- new_html_report_lane = os.path.join(new_html_report_lane_dir, 'lane.html')
+ new_html_report_lane_dir = _create_folder_structure(
+ demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]
+ )
+ new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html")
_generate_lane_html(new_html_report_lane, html_report_lane_parser)
# Generate the laneBarcode
html_report_laneBarcode_parser = None
for next_html_report_laneBarcode in html_reports_laneBarcode:
if html_report_laneBarcode_parser is None:
- html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode)
+ html_report_laneBarcode_parser = LaneBarcodeParser(
+ next_html_report_laneBarcode
+ )
else:
                # No need to check samples occurring in more than one file as it would be spotted while softlinking
- next_html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode)
+ next_html_report_laneBarcode_parser = LaneBarcodeParser(
+ next_html_report_laneBarcode
+ )
for entry in next_html_report_laneBarcode_parser.sample_data:
html_report_laneBarcode_parser.sample_data.append(entry)
# For complex lanes, set all numbers of undetermined to 0. And only keep one such entry
- constant_keys = ['Lane', 'Barcode sequence', 'Project', 'Sample']
+ constant_keys = ["Lane", "Barcode sequence", "Project", "Sample"]
modified_complex_lanes = []
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Lane'] in list(complex_lanes.keys()) and entry['Project'] in 'default':
- if entry['Lane'] not in modified_complex_lanes:
+ if (
+ entry["Lane"] in list(complex_lanes.keys())
+ and entry["Project"] in "default"
+ ):
+ if entry["Lane"] not in modified_complex_lanes:
for key in entry.keys():
if key not in constant_keys:
- entry[key] = '0'
- modified_complex_lanes.append(entry['Lane'])
+ entry[key] = "0"
+ modified_complex_lanes.append(entry["Lane"])
else:
html_report_laneBarcode_parser.sample_data.remove(entry)
# Update NumberReads for total sample yields
for entry in html_report_laneBarcode_parser.sample_data:
- if 'total_sample_cluster' not in NumberReads_Summary[entry['Lane']].keys():
- NumberReads_Summary[entry['Lane']]['total_sample_cluster'] = 0
- NumberReads_Summary[entry['Lane']]['total_sample_yield'] = 0
- if entry['Project'] != 'default':
- NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', ''))
- NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', ''))
+ if (
+ "total_sample_cluster"
+ not in self.NumberReads_Summary[entry["Lane"]].keys()
+ ):
+ self.NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] = 0
+ self.NumberReads_Summary[entry["Lane"]]["total_sample_yield"] = 0
+ if entry["Project"] != "default":
+ self.NumberReads_Summary[entry["Lane"]][
+ "total_sample_cluster"
+ ] += int(entry["PF Clusters"].replace(",", ""))
+ self.NumberReads_Summary[entry["Lane"]][
+ "total_sample_yield"
+ ] += int(entry["Yield (Mbases)"].replace(",", ""))
else:
- if entry['Project'] != 'default':
- NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', ''))
- NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', ''))
+ if entry["Project"] != "default":
+ self.NumberReads_Summary[entry["Lane"]][
+ "total_sample_cluster"
+ ] += int(entry["PF Clusters"].replace(",", ""))
+ self.NumberReads_Summary[entry["Lane"]][
+ "total_sample_yield"
+ ] += int(entry["Yield (Mbases)"].replace(",", ""))
# Calculate the numbers clusters/yields of undet reads
- for key, value in NumberReads_Summary.items():
- value['undet_cluster'] = value['total_lane_cluster'] - value['total_sample_cluster']
- value['undet_yield'] = value['total_lane_yield'] - value['total_sample_yield']
+ for key, value in self.NumberReads_Summary.items():
+ value["undet_cluster"] = (
+ value["total_lane_cluster"] - value["total_sample_cluster"]
+ )
+ value["undet_yield"] = (
+ value["total_lane_yield"] - value["total_sample_yield"]
+ )
# Update the cluster/yield info of undet for complex lanes
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Project'] == 'default' and entry['Lane'] in complex_lanes.keys():
- entry['PF Clusters'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_cluster'])
- entry['Yield (Mbases)'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_yield'])
+ if entry["Project"] == "default" and entry["Lane"] in complex_lanes.keys():
+ entry["PF Clusters"] = "{:,}".format(
+ self.NumberReads_Summary[entry["Lane"]]["undet_cluster"]
+ )
+ entry["Yield (Mbases)"] = "{:,}".format(
+ self.NumberReads_Summary[entry["Lane"]]["undet_yield"]
+ )
        # Fix the special case where we assign fake indexes for NoIndex samples
if noindex_lanes and index_cycles != [0, 0]:
lane_project_sample = dict()
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined':
- lane_project_sample[entry['Lane']] = {'Project': entry['Project'],
- 'Sample': entry['Sample']}
+ if entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined":
+ lane_project_sample[entry["Lane"]] = {
+ "Project": entry["Project"],
+ "Sample": entry["Sample"],
+ }
for entry in html_report_laneBarcode_parser.sample_data[:]:
- if entry['Lane'] in noindex_lanes and entry['Sample'] == 'Undetermined':
- entry['Project'] = lane_project_sample[entry['Lane']]['Project']
- entry['Sample'] = lane_project_sample[entry['Lane']]['Sample']
- elif entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined':
+ if entry["Lane"] in noindex_lanes and entry["Sample"] == "Undetermined":
+ entry["Project"] = lane_project_sample[entry["Lane"]]["Project"]
+ entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"]
+ elif (
+ entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined"
+ ):
html_report_laneBarcode_parser.sample_data.remove(entry)
# Sort sample_data: first by lane then by sample ID
- html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data,
- key=lambda k: (k['Lane'].lower(), k['Sample']))
+ html_report_laneBarcode_parser.sample_data = sorted(
+ html_report_laneBarcode_parser.sample_data,
+ key=lambda k: (k["Lane"].lower(), k["Sample"]),
+ )
# Update the values in Flowcell Summary
- html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw)
- html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF)
- html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases)
+ html_report_laneBarcode_parser.flowcell_data[
+ "Clusters (Raw)"
+ ] = f"{Clusters_Raw:,}"
+ html_report_laneBarcode_parser.flowcell_data[
+ "Clusters(PF)"
+ ] = f"{Clusters_PF:,}"
+ html_report_laneBarcode_parser.flowcell_data[
+ "Yield (MBases)"
+ ] = f"{Yield_Mbases:,}"
# Generate the new report for laneBarcode.html
- new_html_report_laneBarcode = os.path.join(new_html_report_lane_dir, 'laneBarcode.html')
+ new_html_report_laneBarcode = os.path.join(
+ new_html_report_lane_dir, "laneBarcode.html"
+ )
_generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser)
- return NumberReads_Summary
-
- def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes, NumberReads_Summary):
+ def _fix_demultiplexingstats_xml_dir(
+ self,
+ demux_folder,
+ stats_json,
+ samplesheets,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ ):
        # Create the DemultiplexingStats.xml (it is empty; it is here only to say that demux is done)
- DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ['Stats'])
+ DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ["Stats"])
# For creating DemuxSummary.txt files for complex lanes
DemuxSummaryFiles_complex_lanes = dict()
# Generate the Stats.json
- with open(os.path.join(DemultiplexingStats_xml_dir, 'Stats.json'), 'w') as json_data_cumulative:
+ with open(
+ os.path.join(DemultiplexingStats_xml_dir, "Stats.json"), "w"
+ ) as json_data_cumulative:
stats_list = {}
for stat_json in stats_json:
- demux_id = re.findall('Demultiplexing_([0-9])', stat_json)[0]
+ demux_id = re.findall("Demultiplexing_([0-9])", stat_json)[0]
with open(stat_json) as json_data_partial:
data = json.load(json_data_partial)
if len(stats_list) == 0:
# First time I do this
- stats_list['RunNumber'] = data['RunNumber']
- stats_list['Flowcell'] = data['Flowcell']
- stats_list['RunId'] = data['RunId']
- stats_list['ConversionResults'] = data['ConversionResults']
- stats_list['ReadInfosForLanes'] = data['ReadInfosForLanes']
- stats_list['UnknownBarcodes'] = []
+ stats_list["RunNumber"] = data["RunNumber"]
+ stats_list["Flowcell"] = data["Flowcell"]
+ stats_list["RunId"] = data["RunId"]
+ stats_list["ConversionResults"] = data["ConversionResults"]
+ stats_list["ReadInfosForLanes"] = data["ReadInfosForLanes"]
+ stats_list["UnknownBarcodes"] = []
else:
                # Update only the important fields
- lanes_present_in_stats_json = [entry['LaneNumber'] for entry in stats_list['ConversionResults']]
- for ReadInfosForLanes_lane in data['ReadInfosForLanes']:
- if ReadInfosForLanes_lane['LaneNumber'] not in lanes_present_in_stats_json:
- stats_list['ReadInfosForLanes'].extend([ReadInfosForLanes_lane])
- for ConversionResults_lane in data['ConversionResults']:
- if ConversionResults_lane['LaneNumber'] in lanes_present_in_stats_json and str(ConversionResults_lane['LaneNumber']) in complex_lanes.keys():
+ lanes_present_in_stats_json = [
+ entry["LaneNumber"]
+ for entry in stats_list["ConversionResults"]
+ ]
+ for ReadInfosForLanes_lane in data["ReadInfosForLanes"]:
+ if (
+ ReadInfosForLanes_lane["LaneNumber"]
+ not in lanes_present_in_stats_json
+ ):
+ stats_list["ReadInfosForLanes"].extend(
+ [ReadInfosForLanes_lane]
+ )
+ for ConversionResults_lane in data["ConversionResults"]:
+ if (
+ ConversionResults_lane["LaneNumber"]
+ in lanes_present_in_stats_json
+ and str(ConversionResults_lane["LaneNumber"])
+ in complex_lanes.keys()
+ ):
# For complex lanes, we set all stats to 0, except for read number and yield which will use values from NumberReads_Summary
- ConversionResults_lane['Undetermined']['NumberReads'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_cluster']
- ConversionResults_lane['Undetermined']['Yield'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_yield']*1000000
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['QualityScoreSum'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['TrimmedBases'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['Yield'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['YieldQ30'] = 0
- if len([r for r in self.runParserObj.runinfo.data['Reads'] if r['IsIndexedRead'] == 'N']) == 2:
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['QualityScoreSum'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['TrimmedBases'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['Yield'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['YieldQ30'] = 0
+ ConversionResults_lane["Undetermined"][
+ "NumberReads"
+ ] = self.NumberReads_Summary[
+ str(ConversionResults_lane["LaneNumber"])
+ ]["undet_cluster"]
+ ConversionResults_lane["Undetermined"]["Yield"] = (
+ self.NumberReads_Summary[
+ str(ConversionResults_lane["LaneNumber"])
+ ]["undet_yield"]
+ * 1000000
+ )
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["QualityScoreSum"] = 0
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["TrimmedBases"] = 0
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["Yield"] = 0
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["YieldQ30"] = 0
+ if (
+ len(
+ [
+ r
+ for r in self.runParserObj.runinfo.data[
+ "Reads"
+ ]
+ if r["IsIndexedRead"] == "N"
+ ]
+ )
+ == 2
+ ):
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["QualityScoreSum"] = 0
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["TrimmedBases"] = 0
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["Yield"] = 0
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["YieldQ30"] = 0
# Find the list containing info for this lane #TODO: can lane_to_update be removed?
- lane_to_update = [entry for entry in stats_list['ConversionResults'] if entry['LaneNumber'] == ConversionResults_lane['LaneNumber']][0]
- lane_to_update['DemuxResults'].extend(ConversionResults_lane['DemuxResults'])
- lane_to_update['Undetermined'] = ConversionResults_lane['Undetermined']
+ lane_to_update = [
+ entry
+ for entry in stats_list["ConversionResults"]
+ if entry["LaneNumber"]
+ == ConversionResults_lane["LaneNumber"]
+ ][0]
+ lane_to_update["DemuxResults"].extend(
+ ConversionResults_lane["DemuxResults"]
+ )
+ lane_to_update["Undetermined"] = ConversionResults_lane[
+ "Undetermined"
+ ]
else:
- stats_list['ConversionResults'].extend([ConversionResults_lane])
-
- for unknown_barcode_lane in data['UnknownBarcodes']:
- if str(unknown_barcode_lane['Lane']) in simple_lanes.keys():
- stats_list['UnknownBarcodes'].extend([unknown_barcode_lane])
- elif str(unknown_barcode_lane['Lane']) in complex_lanes.keys():
- if list(complex_lanes[str(unknown_barcode_lane['Lane'])].keys())[0] == demux_id:
+ stats_list["ConversionResults"].extend(
+ [ConversionResults_lane]
+ )
+
+ for unknown_barcode_lane in data["UnknownBarcodes"]:
+ if str(unknown_barcode_lane["Lane"]) in simple_lanes.keys():
+ stats_list["UnknownBarcodes"].extend([unknown_barcode_lane])
+ elif str(unknown_barcode_lane["Lane"]) in complex_lanes.keys():
+ if (
+ list(
+ complex_lanes[
+ str(unknown_barcode_lane["Lane"])
+ ].keys()
+ )[0]
+ == demux_id
+ ):
# First have the list of unknown indexes from the top priority demux run
full_list_unknownbarcodes = unknown_barcode_lane
# Remove the samples involved in the other samplesheets
for samplesheet in samplesheets:
- demux_id_ss = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
+ demux_id_ss = os.path.splitext(
+ os.path.split(samplesheet)[1]
+ )[0].split("_")[1]
if demux_id_ss != demux_id:
ssparser = SampleSheetParser(samplesheet)
- ssparser_data_lane = [row for row in ssparser.data if row['Lane'] == str(unknown_barcode_lane['Lane'])]
+ ssparser_data_lane = [
+ row
+ for row in ssparser.data
+ if row["Lane"]
+ == str(unknown_barcode_lane["Lane"])
+ ]
for row in ssparser_data_lane:
- sample_idx1 = row.get('index','')
- sample_idx2 = row.get('index2','')
- idx_copy = tuple(full_list_unknownbarcodes['Barcodes'].keys())
+ sample_idx1 = row.get("index", "")
+ sample_idx2 = row.get("index2", "")
+ idx_copy = tuple(
+ full_list_unknownbarcodes[
+ "Barcodes"
+ ].keys()
+ )
for idx in idx_copy:
- unknownbarcode_idx1 = idx.split('+')[0] if '+' in idx else idx
- unknownbarcode_idx2 = idx.split('+')[1] if '+' in idx else ''
+ unknownbarcode_idx1 = (
+ idx.split("+")[0]
+ if "+" in idx
+ else idx
+ )
+ unknownbarcode_idx2 = (
+ idx.split("+")[1]
+ if "+" in idx
+ else ""
+ )
if sample_idx1 and sample_idx2:
- comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)]
- comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx2) else sample_idx2[:len(unknownbarcode_idx2)]
- if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)] and comparepart_idx2 == unknownbarcode_idx2[:len(comparepart_idx2)]:
- del full_list_unknownbarcodes['Barcodes'][idx]
+ comparepart_idx1 = (
+ sample_idx1
+ if len(sample_idx1)
+ <= len(unknownbarcode_idx1)
+ else sample_idx1[
+ : len(unknownbarcode_idx1)
+ ]
+ )
+ comparepart_idx2 = (
+ sample_idx2
+ if len(sample_idx2)
+ <= len(unknownbarcode_idx2)
+ else sample_idx2[
+ : len(unknownbarcode_idx2)
+ ]
+ )
+ if (
+ comparepart_idx1
+ == unknownbarcode_idx1[
+ : len(comparepart_idx1)
+ ]
+ and comparepart_idx2
+ == unknownbarcode_idx2[
+ : len(comparepart_idx2)
+ ]
+ ):
+ del full_list_unknownbarcodes[
+ "Barcodes"
+ ][idx]
elif sample_idx1 and not sample_idx2:
- comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)]
- if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)]:
- del full_list_unknownbarcodes['Barcodes'][idx]
+ comparepart_idx1 = (
+ sample_idx1
+ if len(sample_idx1)
+ <= len(unknownbarcode_idx1)
+ else sample_idx1[
+ : len(unknownbarcode_idx1)
+ ]
+ )
+ if (
+ comparepart_idx1
+ == unknownbarcode_idx1[
+ : len(comparepart_idx1)
+ ]
+ ):
+ del full_list_unknownbarcodes[
+ "Barcodes"
+ ][idx]
elif not sample_idx1 and sample_idx2:
- comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx1) else sample_idx2[:len(unknownbarcode_idx1)]
- if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx2)]:
- del full_list_unknownbarcodes['Barcodes'][idx]
- stats_list['UnknownBarcodes'].extend([full_list_unknownbarcodes])
- DemuxSummaryFiles_complex_lanes[str(unknown_barcode_lane['Lane'])] = full_list_unknownbarcodes
+ comparepart_idx2 = (
+ sample_idx2
+ if len(sample_idx2)
+ <= len(unknownbarcode_idx1)
+ else sample_idx2[
+ : len(unknownbarcode_idx1)
+ ]
+ )
+ if (
+ comparepart_idx1
+ == unknownbarcode_idx1[
+ : len(comparepart_idx2)
+ ]
+ ):
+ del full_list_unknownbarcodes[
+ "Barcodes"
+ ][idx]
+ stats_list["UnknownBarcodes"].extend(
+ [full_list_unknownbarcodes]
+ )
+ DemuxSummaryFiles_complex_lanes[
+ str(unknown_barcode_lane["Lane"])
+ ] = full_list_unknownbarcodes
else:
pass
        # Fix the special case where we assign fake indexes for NoIndex samples

if noindex_lanes and index_cycles != [0, 0]:
- for entry in stats_list['ConversionResults'][:]:
- if str(entry['LaneNumber']) in noindex_lanes:
- del entry['DemuxResults'][0]['IndexMetrics']
- entry['DemuxResults'][0].update(entry['Undetermined'])
- del entry['Undetermined']
+ for entry in stats_list["ConversionResults"][:]:
+ if str(entry["LaneNumber"]) in noindex_lanes:
+ del entry["DemuxResults"][0]["IndexMetrics"]
+ entry["DemuxResults"][0].update(entry["Undetermined"])
+ del entry["Undetermined"]
# Reset unknown barcodes list
- for entry in stats_list['UnknownBarcodes'][:]:
- if str(entry['Lane']) in noindex_lanes:
- entry['Barcodes'] = {'unknown': 1}
+ for entry in stats_list["UnknownBarcodes"][:]:
+ if str(entry["Lane"]) in noindex_lanes:
+ entry["Barcodes"] = {"unknown": 1}
# Write the final version of Stats.json file
json.dump(stats_list, json_data_cumulative)
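# Standalone sketch (hypothetical index strings) of the prefix comparison used above to
# decide whether an "unknown" barcode actually belongs to a sample demultiplexed in
# another sub-demultiplexing, and should therefore be dropped from the unknown list:
sample_idx1 = "ACGTACGTAA"  # index taken from another sub-samplesheet
unknownbarcode_idx1 = "ACGTACGT"  # entry in this demux's UnknownBarcodes
comparepart_idx1 = (
    sample_idx1
    if len(sample_idx1) <= len(unknownbarcode_idx1)
    else sample_idx1[: len(unknownbarcode_idx1)]
)
if comparepart_idx1 == unknownbarcode_idx1[: len(comparepart_idx1)]:
    print("prefix match: not really an unknown barcode")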
@@ -776,100 +1128,161 @@ def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheet
# Create DemuxSummary.txt files for complex lanes
if len(DemuxSummaryFiles_complex_lanes) > 0:
for key, value in DemuxSummaryFiles_complex_lanes.items():
- with open(os.path.join(DemultiplexingStats_xml_dir, 'DemuxSummaryF1L{}.txt'.format(key)), 'w') as DemuxSummaryFile:
- DemuxSummaryFile.write('### Most Popular Unknown Index Sequences\n')
- DemuxSummaryFile.write('### Columns: Index_Sequence Hit_Count\n')
- for idx, count in value['Barcodes'].items():
- DemuxSummaryFile.write('{}\t{}\n'.format(idx, count))
-
- open(os.path.join(DemultiplexingStats_xml_dir, 'DemultiplexingStats.xml'), 'a').close()
-
- def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes):
+ with open(
+ os.path.join(
+ DemultiplexingStats_xml_dir, f"DemuxSummaryF1L{key}.txt"
+ ),
+ "w",
+ ) as DemuxSummaryFile:
+ DemuxSummaryFile.write("### Most Popular Unknown Index Sequences\n")
+ DemuxSummaryFile.write("### Columns: Index_Sequence Hit_Count\n")
+ for idx, count in value["Barcodes"].items():
+ DemuxSummaryFile.write(f"{idx}\t{count}\n")
+
+ open(
+ os.path.join(DemultiplexingStats_xml_dir, "DemultiplexingStats.xml"), "a"
+ ).close()
+
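# Sketch only: reading back one of the DemuxSummaryF1L<lane>.txt files written above
# (the path below is hypothetical).
unknown_counts = {}
with open("Demultiplexing/Stats/DemuxSummaryF1L1.txt") as fh:
    for line in fh:
        if line.startswith("###"):  # skip the two header lines
            continue
        idx, count = line.rstrip("\n").split("\t")
        unknown_counts[idx] = int(count)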
+ def _process_demux_with_complex_lanes(
+ self,
+ demux_folder,
+ samplesheets,
+ legacy_path,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ ):
html_reports_lane = []
html_reports_laneBarcode = []
stats_json = []
for samplesheet in samplesheets:
ssparser = SampleSheetParser(samplesheet)
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
- html_report_lane = os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
- legacy_path,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "lane.html"
- )
+ html_report_lane = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "lane.html",
+ )
if os.path.exists(html_report_lane):
html_reports_lane.append(html_report_lane)
else:
- raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_lane))
-
- html_report_laneBarcode = os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
- legacy_path,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "laneBarcode.html"
- )
+ raise RuntimeError(
+ f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing"
+ )
+
+ html_report_laneBarcode = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
if os.path.exists(html_report_laneBarcode):
html_reports_laneBarcode.append(html_report_laneBarcode)
else:
- raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_laneBarcode))
-
- stat_json = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json")
+ raise RuntimeError(
+ f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing"
+ )
+
+ stat_json = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Stats",
+ "Stats.json",
+ )
if os.path.exists(stat_json):
stats_json.append(stat_json)
else:
- raise RuntimeError("Not able to find Stats.json report {}: possible cause is problem in demultiplexing".format(stat_json))
+ raise RuntimeError(
+ f"Not able to find Stats.json report {stat_json}: possible cause is problem in demultiplexing"
+ )
# Aggregate fastq
lanes_samples = dict()
for row in ssparser.data:
- if row['Lane'] not in lanes_samples.keys():
- lanes_samples[row['Lane']] = [row['Sample_Name']]
+ if row["Lane"] not in lanes_samples.keys():
+ lanes_samples[row["Lane"]] = [row["Sample_Name"]]
else:
- lanes_samples[row['Lane']].append(row['Sample_Name'])
+ lanes_samples[row["Lane"]].append(row["Sample_Name"])
            # Special case where we assign fake indexes for NoIndex samples
- if (set(list(lanes_samples.keys())) & set(noindex_lanes)) and index_cycles != [0, 0]:
+ if (
+ set(list(lanes_samples.keys())) & set(noindex_lanes)
+ ) and index_cycles != [0, 0]:
sample_counter = 1
- for entry in sorted(ssparser.data, key=lambda k: k['Lane']):
- lane = entry['Lane']
- project = entry['Sample_Project']
- sample = entry['Sample_ID']
+ for entry in sorted(ssparser.data, key=lambda k: k["Lane"]):
+ lane = entry["Lane"]
+ project = entry["Sample_Project"]
+ sample = entry["Sample_ID"]
project_dest = os.path.join(demux_folder, project)
if not os.path.exists(project_dest):
os.makedirs(project_dest)
sample_dest = os.path.join(project_dest, sample)
if not os.path.exists(sample_dest):
os.makedirs(sample_dest)
- for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))):
+ for file in glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ f"Undetermined*L0?{lane}*",
+ )
+ ):
old_name = os.path.basename(file)
old_name_comps = old_name.split("_")
- new_name_comps = [sample.replace('Sample_', ''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:]
+ new_name_comps = [
+ sample.replace("Sample_", ""),
+ f"S{str(sample_counter)}",
+ ] + old_name_comps[2:]
new_name = "_".join(new_name_comps)
os.symlink(file, os.path.join(sample_dest, new_name))
- logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_', ''), old_name, new_name))
+ logger.info(
+ "For undet sample {}, renaming {} to {}".format(
+ sample.replace("Sample_", ""), old_name, new_name
+ )
+ )
sample_counter += 1
# Ordinary cases
else:
- projects = [project for project in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) if os.path.isdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project))]
+ projects = [
+ project
+ for project in os.listdir(
+ os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")
+ )
+ if os.path.isdir(
+ os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", project
+ )
+ )
+ ]
for project in projects:
if project in "Reports" or project in "Stats":
continue
- project_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project)
+ project_source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", project
+ )
project_dest = os.path.join(demux_folder, project)
if not os.path.exists(project_dest):
                    # There might be projects sequenced with multiple index lengths
os.makedirs(project_dest)
- samples = [sample for sample in os.listdir(project_source) if os.path.isdir(os.path.join(project_source, sample))]
+ samples = [
+ sample
+ for sample in os.listdir(project_source)
+ if os.path.isdir(os.path.join(project_source, sample))
+ ]
for sample in samples:
sample_source = os.path.join(project_source, sample)
sample_dest = os.path.join(project_dest, sample)
@@ -877,13 +1290,31 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
                    # The same sample should never be sequenced with different index lengths;
                    # however, a sample might be pooled in several lanes and therefore sequenced using different samplesheets
os.makedirs(sample_dest)
- fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*"))
+ fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*"))
for fastqfile in fastqfiles:
- os.symlink(fastqfile, os.path.join(sample_dest, os.path.split(fastqfile)[1]))
+ os.symlink(
+ fastqfile,
+ os.path.join(sample_dest, os.path.split(fastqfile)[1]),
+ )
# Copy fastq files for undetermined and the undetermined stats for simple lanes only
lanes_in_sub_samplesheet = []
- header = ['[Header]','[Data]','FCID','Lane', 'Sample_ID', 'Sample_Name', 'Sample_Ref', 'index', 'index2', 'Description', 'Control', 'Recipe', 'Operator', 'Sample_Project']
- with open(samplesheet, mode='r') as sub_samplesheet_file:
+ header = [
+ "[Header]",
+ "[Data]",
+ "FCID",
+ "Lane",
+ "Sample_ID",
+ "Sample_Name",
+ "Sample_Ref",
+ "index",
+ "index2",
+ "Description",
+ "Control",
+ "Recipe",
+ "Operator",
+ "Sample_Project",
+ ]
+ with open(samplesheet) as sub_samplesheet_file:
sub_samplesheet_reader = csv.reader(sub_samplesheet_file)
for row in sub_samplesheet_reader:
if row[0] not in header:
@@ -891,68 +1322,120 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
lanes_in_sub_samplesheet = list(set(lanes_in_sub_samplesheet))
for lane in lanes_in_sub_samplesheet:
if lane in simple_lanes.keys():
- undetermined_fastq_files = glob.glob(os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
- "Undetermined_S0_L00{}*.fastq*".format(lane))) # Contains only simple lanes undetermined
+ undetermined_fastq_files = glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ f"Undetermined_S0_L00{lane}*.fastq*",
+ )
+ ) # Contains only simple lanes undetermined
for fastqfile in undetermined_fastq_files:
- os.symlink(fastqfile, os.path.join(demux_folder, os.path.split(fastqfile)[1]))
- DemuxSummaryFiles = glob.glob(os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
- legacy_path,
- "Stats",
- "*L{}*txt".format(lane)))
+ os.symlink(
+ fastqfile,
+ os.path.join(demux_folder, os.path.split(fastqfile)[1]),
+ )
+ DemuxSummaryFiles = glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Stats",
+ f"*L{lane}*txt",
+ )
+ )
if not os.path.exists(os.path.join(demux_folder, "Stats")):
os.makedirs(os.path.join(demux_folder, "Stats"))
for DemuxSummaryFile in DemuxSummaryFiles:
- os.symlink(DemuxSummaryFile, os.path.join(demux_folder, "Stats", os.path.split(DemuxSummaryFile)[1]))
+ os.symlink(
+ DemuxSummaryFile,
+ os.path.join(
+ demux_folder,
+ "Stats",
+ os.path.split(DemuxSummaryFile)[1],
+ ),
+ )
return html_reports_lane, html_reports_laneBarcode, stats_json
def _aggregate_demux_results_simple_complex(self):
runSetup = self.runParserObj.runinfo.get_read_configuration()
- demux_folder = os.path.join(self.run_dir , self.demux_dir)
+ demux_folder = os.path.join(self.run_dir, self.demux_dir)
samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv"))
- if self.software == 'bcl2fastq':
- legacy_path = ''
- elif self.software == 'bclconvert':
- legacy_path = "Reports/{}".format(self.legacy_dir)
+ if self.software == "bcl2fastq":
+ legacy_path = ""
+ elif self.software == "bclconvert":
+ legacy_path = f"Reports/{self.legacy_dir}"
else:
raise RuntimeError("Unrecognized software!")
index_cycles = [0, 0]
for read in runSetup:
- if read['IsIndexedRead'] == 'Y':
- if int(read['Number']) == 2:
- index_cycles[0] = int(read['NumCycles'])
+ if read["IsIndexedRead"] == "Y":
+ if int(read["Number"]) == 2:
+ index_cycles[0] = int(read["NumCycles"])
else:
- index_cycles[1] = int(read['NumCycles'])
+ index_cycles[1] = int(read["NumCycles"])
# Classify lanes in samplesheets
- (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes(samplesheets)
+ (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes(
+ samplesheets
+ )
# Case with only one sub-demultiplexing
if len(complex_lanes) == 0 and len(samplesheets) == 1:
- demux_id = "0" # in this case this is the only demux dir
+ demux_id = "0" # in this case this is the only demux dir
            # Special case where we assign fake indexes for NoIndex samples
if noindex_lanes and index_cycles != [0, 0]:
# We first softlink the FastQ files of undet as the FastQ files of samples
- self._process_noindex_sample_with_fake_index_with_single_demux(demux_id, legacy_path)
+ self._process_noindex_sample_with_fake_index_with_single_demux(
+ demux_id, legacy_path
+ )
# This is the simple case, Demultiplexing dir is simply a symlink to the only sub-demultiplexing dir
else:
- self._process_simple_lane_with_single_demux(demux_id, legacy_path, noindex_lanes)
+ self._process_simple_lane_with_single_demux(
+ demux_id, legacy_path, noindex_lanes
+ )
return True
# Case with multiple sub-demultiplexings
- (html_reports_lane, html_reports_laneBarcode, stats_json) = self._process_demux_with_complex_lanes(demux_folder, samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes)
+ (
+ html_reports_lane,
+ html_reports_laneBarcode,
+ stats_json,
+ ) = self._process_demux_with_complex_lanes(
+ demux_folder,
+ samplesheets,
+ legacy_path,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ )
# Create the html reports
- NumberReads_Summary = self._fix_html_reports_for_complex_lanes(demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode)
+ self._fix_html_reports_for_complex_lanes(
+ demux_folder,
+ index_cycles,
+ complex_lanes,
+ noindex_lanes,
+ html_reports_lane,
+ html_reports_laneBarcode,
+ )
# Fix contents under the DemultiplexingStats folder
- self._fix_demultiplexingstats_xml_dir(demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes, NumberReads_Summary)
+ self._fix_demultiplexingstats_xml_dir(
+ demux_folder,
+ stats_json,
+ samplesheets,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ )
return True
+
def _create_folder_structure(root, dirs):
"""Creates a fodler stucture rooted in root usinf all dirs listed in dirs (a list)
returns the path to the deepest directory
@@ -964,49 +1447,56 @@ def _create_folder_structure(root, dirs):
os.makedirs(path)
return path
+
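# Example use of _create_folder_structure (hypothetical paths), mirroring how the
# aggregated report directories are built above:
report_dir = _create_folder_structure(
    "/data/run/Demultiplexing", ["Reports", "html", "AB12345", "all", "all", "all"]
)
# report_dir == "/data/run/Demultiplexing/Reports/html/AB12345/all/all/all"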
def _generate_lane_html(html_file, html_report_lane_parser):
- with open(html_file, 'w') as html:
+ with open(html_file, "w") as html:
# HEADER
-        html.write('<html xmlns:bcl2fastq="http://www.illumina.com/bcl2fastq">\n')
-        html.write('<link rel="stylesheet" href="../../../../Report.css" type="text/css">\n')
-        html.write('<body>\n')
-        html.write('<table border="1" ID="ReportTable">\n')
+        html.write(
+            '<html xmlns:bcl2fastq="http://www.illumina.com/bcl2fastq">\n'
+        )
+        html.write(
+            '<link rel="stylesheet" href="../../../../Report.css" type="text/css">\n'
+        )
+        html.write("<body>\n")
+        html.write('<table border="1" ID="ReportTable">\n')
         html.write("<tr>\n")
         lane_keys = sorted(list(html_report_lane_parser.sample_data[0].keys()))
         for key in lane_keys:
-            html.write('<th>{}</th>\n'.format(key))
-        html.write('</tr>\n')
+            html.write(f"<th>{key}</th>\n")
+        html.write("</tr>\n")
         for sample in html_report_lane_parser.sample_data:
-            html.write('<tr>\n')
+            html.write("<tr>\n")
             for key in lane_keys:
-                html.write('<td>{}</td>\n'.format(sample[key]))
-            html.write('</tr>\n')
-        html.write('</table>\n')
+                html.write(f"<td>{sample[key]}</td>\n")
+            html.write("</tr>\n")
+        html.write("</table>\n")
         # FOOTER
-        html.write('</body>\n')
-        html.write('</html>\n')
+        html.write("</body>\n")
+        html.write("</html>\n")
diff --git a/taca/illumina/Standard_Runs.py b/taca/illumina/Standard_Runs.py
index bb718787..7f051d66 100755
--- a/taca/illumina/Standard_Runs.py
+++ b/taca/illumina/Standard_Runs.py
@@ -1,76 +1,88 @@
+import logging
import os
import re
-import logging
from datetime import datetime
-from taca.utils.filesystem import chdir
+from flowcell_parser.classes import SampleSheetParser
+
from taca.illumina.Runs import Run
from taca.utils import misc
-from flowcell_parser.classes import SampleSheetParser
-from io import open
+from taca.utils.filesystem import chdir
logger = logging.getLogger(__name__)
-TENX_SINGLE_PAT = re.compile('SI-(?:GA|NA)-[A-H][1-9][0-2]?')
-TENX_DUAL_PAT = re.compile('SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?')
-SMARTSEQ_PAT = re.compile('SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]')
-IDT_UMI_PAT = re.compile('([ATCG]{4,}N+$)')
-RECIPE_PAT = re.compile('[0-9]+-[0-9]+')
+TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?")
+TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?")
+SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]")
+IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")
+RECIPE_PAT = re.compile("[0-9]+-[0-9]+")
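# Quick sanity check of what the classification patterns above are intended to match
# (example strings only, assuming the module-level patterns defined above are in scope):
assert TENX_SINGLE_PAT.findall("SI-GA-A1")
assert TENX_DUAL_PAT.findall("SI-TT-H2")
assert SMARTSEQ_PAT.findall("SMARTSEQ2-11A")
assert IDT_UMI_PAT.findall("ACGTACGTNNNNNNNN")
assert RECIPE_PAT.findall("151-151")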
class Standard_Run(Run):
-
def __init__(self, run_dir, software, configuration):
- super(Standard_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
+ self._set_sequencer_type()
+ self._set_run_type()
+ self._copy_samplesheet()
def _set_sequencer_type(self):
- self.sequencer_type = ''
+ self.sequencer_type = ""
def _set_run_type(self):
- self.run_type = 'NGI-RUN'
+ self.run_type = "NGI-RUN"
def _copy_samplesheet(self):
- ssname = self._get_samplesheet()
+ ssname = self._get_samplesheet()
ssparser = SampleSheetParser(ssname)
indexfile = dict()
runSetup = self.runParserObj.runinfo.get_read_configuration()
# Loading index files
try:
- indexfile['tenX'] = self.CONFIG[self.software]['tenX_index_path']
+ indexfile["tenX"] = self.CONFIG[self.software]["tenX_index_path"]
except KeyError:
- logger.error('Path to index file (10X) not found in the config file')
+ logger.error("Path to index file (10X) not found in the config file")
raise RuntimeError
try:
- indexfile['smartseq'] = self.CONFIG[self.software]['smartseq_index_path']
+ indexfile["smartseq"] = self.CONFIG[self.software]["smartseq_index_path"]
except KeyError:
- logger.error('Path to index file (Smart-seq) not found in the config file')
+ logger.error("Path to index file (Smart-seq) not found in the config file")
raise RuntimeError
        # The samplesheet needs to be positioned in the FC directory with the name SampleSheet.csv (Illumina default)
        # If this is not the case, create it and take special care of the modifications to be made to the SampleSheet
- samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet.csv')
+ samplesheet_dest = os.path.join(self.run_dir, "SampleSheet.csv")
        # Function that goes through the original sample sheet and checks for sample types
self.sample_table = self._classify_samples(indexfile, ssparser, runSetup)
        # Check whether the samplesheet is already present; if so, go directly to the next step
if not os.path.exists(samplesheet_dest):
try:
- with open(samplesheet_dest, 'w') as fcd:
- fcd.write(self._generate_clean_samplesheet(ssparser,
- indexfile,
- fields_to_remove=None,
- rename_samples=True,
- rename_qPCR_suffix = True,
- fields_qPCR=[ssparser.dfield_snm]))
+ with open(samplesheet_dest, "w") as fcd:
+ fcd.write(
+ self._generate_clean_samplesheet(
+ ssparser,
+ indexfile,
+ fields_to_remove=None,
+ rename_samples=True,
+ rename_qPCR_suffix=True,
+ fields_qPCR=[ssparser.dfield_snm],
+ )
+ )
except Exception as e:
- logger.error('Encountered the following exception {}'.format(e))
+ logger.error(f"Encountered the following exception {e}")
return False
- logger.info(('Created SampleSheet.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest)))
+ logger.info(
+ f"Created SampleSheet.csv for Flowcell {self.id} in {samplesheet_dest} "
+ )
# SampleSheet.csv generated
# When demultiplexing SampleSheet.csv is the one I need to use
# Need to rewrite so that SampleSheet_0.csv is always used.
- self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet.csv'))
- if not self.runParserObj.obj.get('samplesheet_csv'):
- self.runParserObj.obj['samplesheet_csv'] = self.runParserObj.samplesheet.data
+ self.runParserObj.samplesheet = SampleSheetParser(
+ os.path.join(self.run_dir, "SampleSheet.csv")
+ )
+ if not self.runParserObj.obj.get("samplesheet_csv"):
+ self.runParserObj.obj[
+ "samplesheet_csv"
+ ] = self.runParserObj.samplesheet.data
def _parse_10X_indexes(self, indexfile):
"""
@@ -78,9 +90,9 @@ def _parse_10X_indexes(self, indexfile):
Todo: Set it up to take the file from config instead
"""
index_dict = {}
- with open(indexfile, 'r') as f:
+ with open(indexfile) as f:
for line in f:
- line_ = line.rstrip().split(',')
+ line_ = line.rstrip().split(",")
index_dict[line_[0]] = line_[1:5]
return index_dict
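# Illustrative only: a line of the 10X index file is expected to look roughly like the
# one below (key followed by four index sequences; the sequences here are examples):
line = "SI-GA-A1,GGTTTACT,CTAAACGG,TCGGCGTC,AACCGTAA"
fields = line.rstrip().split(",")
example_entry = {fields[0]: fields[1:5]}
# example_entry == {"SI-GA-A1": ["GGTTTACT", "CTAAACGG", "TCGGCGTC", "AACCGTAA"]}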
@@ -90,107 +102,140 @@ def _parse_smartseq_indexes(self, indexfile):
Todo: Set it up to take the file from config instead
"""
index_dict = {}
- with open(indexfile, 'r') as f:
+ with open(indexfile) as f:
for line in f:
- line_ = line.rstrip().split(',')
+ line_ = line.rstrip().split(",")
if index_dict.get(line_[0]):
- index_dict[line_[0]].append((line_[1],line_[2]))
+ index_dict[line_[0]].append((line_[1], line_[2]))
else:
- index_dict.update({line_[0]:[(line_[1],line_[2])]})
+ index_dict.update({line_[0]: [(line_[1], line_[2])]})
return index_dict
def _classify_samples(self, indexfile, ssparser, runSetup):
"""Given an ssparser object, go through all samples and decide sample types."""
sample_table = dict()
- index_dict_tenX = self._parse_10X_indexes(indexfile['tenX'])
- index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq'])
+ index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"])
+ index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"])
index_cycles = [0, 0]
read_cycles = [0, 0]
for read in runSetup:
- if read['IsIndexedRead'] == 'Y':
- if int(read['Number']) == 2:
- index_cycles[0] = int(read['NumCycles'])
+ if read["IsIndexedRead"] == "Y":
+ if int(read["Number"]) == 2:
+ index_cycles[0] = int(read["NumCycles"])
else:
- index_cycles[1] = int(read['NumCycles'])
- elif read['IsIndexedRead'] == 'N':
- if int(read['Number']) == 1:
- read_cycles[0] = int(read['NumCycles'])
+ index_cycles[1] = int(read["NumCycles"])
+ elif read["IsIndexedRead"] == "N":
+ if int(read["Number"]) == 1:
+ read_cycles[0] = int(read["NumCycles"])
else:
- read_cycles[1] = int(read['NumCycles'])
+ read_cycles[1] = int(read["NumCycles"])
for sample in ssparser.data:
- lane = sample['Lane']
- sample_name = sample.get('Sample_Name') or sample.get('SampleName')
+ lane = sample["Lane"]
+ sample_name = sample.get("Sample_Name") or sample.get("SampleName")
umi_length = [0, 0]
read_length = read_cycles
# Read the length of read 1 and read 2 from the field Recipe
- if sample.get('Recipe') and RECIPE_PAT.findall(sample.get('Recipe')):
- ss_read_length = [int(sample.get('Recipe').split('-')[0]), int(sample.get('Recipe').split('-')[1])]
+ if sample.get("Recipe") and RECIPE_PAT.findall(sample.get("Recipe")):
+ ss_read_length = [
+ int(sample.get("Recipe").split("-")[0]),
+ int(sample.get("Recipe").split("-")[1]),
+ ]
else:
ss_read_length = [0, 0]
            # By default, use the read cycles from the sequencing setup. Otherwise use the shorter read length
if ss_read_length != [0, 0]:
read_length = [min(rd) for rd in zip(ss_read_length, read_length)]
# 10X single index
- if TENX_SINGLE_PAT.findall(sample['index']):
- index_length = [len(index_dict_tenX[sample['index']][0]),0]
- sample_type = '10X_SINGLE'
+ if TENX_SINGLE_PAT.findall(sample["index"]):
+ index_length = [len(index_dict_tenX[sample["index"]][0]), 0]
+ sample_type = "10X_SINGLE"
# 10X dual index
- elif TENX_DUAL_PAT.findall(sample['index']):
- index_length = [len(index_dict_tenX[sample['index']][0]),len(index_dict_tenX[sample['index']][1])]
- sample_type = '10X_DUAL'
+ elif TENX_DUAL_PAT.findall(sample["index"]):
+ index_length = [
+ len(index_dict_tenX[sample["index"]][0]),
+ len(index_dict_tenX[sample["index"]][1]),
+ ]
+ sample_type = "10X_DUAL"
# IDT UMI samples
- elif IDT_UMI_PAT.findall(sample['index']) or IDT_UMI_PAT.findall(sample['index2']):
+ elif IDT_UMI_PAT.findall(sample["index"]) or IDT_UMI_PAT.findall(
+ sample["index2"]
+ ):
# Index length after removing "N" part
- index_length = [len(sample['index'].replace('N', '')),
- len(sample['index2'].replace('N', ''))]
- sample_type = 'IDT_UMI'
- umi_length = [sample['index'].upper().count('N'), sample['index2'].upper().count('N')]
+ index_length = [
+ len(sample["index"].replace("N", "")),
+ len(sample["index2"].replace("N", "")),
+ ]
+ sample_type = "IDT_UMI"
+ umi_length = [
+ sample["index"].upper().count("N"),
+ sample["index2"].upper().count("N"),
+ ]
# Smart-seq
- elif SMARTSEQ_PAT.findall(sample['index']):
- smartseq_index = sample['index'].split('-')[1]
- index_length = [len(index_dict_smartseq[smartseq_index][0][0]),len(index_dict_smartseq[smartseq_index][0][1])]
- sample_type = 'SMARTSEQ'
+ elif SMARTSEQ_PAT.findall(sample["index"]):
+ smartseq_index = sample["index"].split("-")[1]
+ index_length = [
+ len(index_dict_smartseq[smartseq_index][0][0]),
+ len(index_dict_smartseq[smartseq_index][0][1]),
+ ]
+ sample_type = "SMARTSEQ"
# No Index case 1. We will write indexes to separate FastQ files
- elif sample['index'].upper() == 'NOINDEX' and index_cycles != [0, 0]:
+ elif sample["index"].upper() == "NOINDEX" and index_cycles != [0, 0]:
index_length = index_cycles
- sample_type = 'NOINDEX'
+ sample_type = "NOINDEX"
            # No Index case 2. Both index 1 and 2 are empty; same NoIndex situation, but the sample is treated as an ordinary sample
- elif sample['index'].upper() == 'NOINDEX' and index_cycles == [0, 0]:
+ elif sample["index"].upper() == "NOINDEX" and index_cycles == [0, 0]:
index_length = [0, 0]
- sample_type = 'ordinary'
+ sample_type = "ordinary"
# Ordinary samples
else:
- index_length = [len(sample['index']),len(sample['index2'])]
+ index_length = [len(sample["index"]), len(sample["index2"])]
                # Short single index (<=8nt)
- if (index_length[0] <= 8 and index_length[1] == 0) or (index_length[0] == 0 and index_length[1] <= 8):
- sample_type = 'short_single_index'
+ if (index_length[0] <= 8 and index_length[1] == 0) or (
+ index_length[0] == 0 and index_length[1] <= 8
+ ):
+ sample_type = "short_single_index"
else:
- sample_type = 'ordinary'
+ sample_type = "ordinary"
# Write in sample table
# {'1': [('101', {'sample_type': 'ordinary', 'index_length': [8, 8]}), ('102', {'sample_type': 'ordinary', 'index_length': [8, 8]})]}
if sample_table.get(lane):
- sample_table[lane].append((sample_name,
- {'sample_type': sample_type,
- 'index_length': index_length,
- 'umi_length': umi_length,
- 'read_length': read_length}))
+ sample_table[lane].append(
+ (
+ sample_name,
+ {
+ "sample_type": sample_type,
+ "index_length": index_length,
+ "umi_length": umi_length,
+ "read_length": read_length,
+ },
+ )
+ )
else:
- sample_table.update({lane:[(sample_name,
- {'sample_type': sample_type,
- 'index_length': index_length,
- 'umi_length': umi_length,
- 'read_length': read_length})]})
+ sample_table.update(
+ {
+ lane: [
+ (
+ sample_name,
+ {
+ "sample_type": sample_type,
+ "index_length": index_length,
+ "umi_length": umi_length,
+ "read_length": read_length,
+ },
+ )
+ ]
+ }
+ )
return sample_table
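# Rough sketch (hypothetical samplesheet rows) of how entries are classified by the logic above:
row_10x = {"Lane": "1", "Sample_Name": "P1_101", "index": "SI-TT-A1", "index2": ""}
row_ord = {"Lane": "1", "Sample_Name": "P1_102", "index": "ACGTACGTAC", "index2": "TGCATGCATG"}
# row_10x -> sample_type "10X_DUAL", index lengths looked up in the 10X index file
# row_ord -> sample_type "ordinary", index_length [10, 10]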
-
def demultiplex_run(self):
"""
- Demultiplex a run:
- - Make sub-samplesheet based on sample classes
- - Decide correct bcl2fastq/bclconvert command parameters based on sample classes
- - run bcl2fastq/bclconvert conversion
+ Demultiplex a run:
+ - Make sub-samplesheet based on sample classes
+ - Decide correct bcl2fastq/bclconvert command parameters based on sample classes
+ - run bcl2fastq/bclconvert conversion
"""
runSetup = self.runParserObj.runinfo.get_read_configuration()
# Check sample types
@@ -198,7 +243,7 @@ def demultiplex_run(self):
for lane, lane_contents in self.sample_table.items():
for sample in lane_contents:
sample_detail = sample[1]
- sample_type = sample_detail['sample_type']
+ sample_type = sample_detail["sample_type"]
if sample_type not in sample_type_list:
sample_type_list.append(sample_type)
@@ -210,21 +255,43 @@ def demultiplex_run(self):
for lane, lane_contents in self.sample_table.items():
for sample in lane_contents:
sample_detail = sample[1]
- sample_type_t = sample_detail['sample_type']
- sample_index_length = sample_detail['index_length']
- sample_umi_length = sample_detail['umi_length']
- sample_read_length = sample_detail['read_length']
+ sample_type_t = sample_detail["sample_type"]
+ sample_index_length = sample_detail["index_length"]
+ sample_umi_length = sample_detail["umi_length"]
+ sample_read_length = sample_detail["read_length"]
if sample_type_t == sample_type:
if lane_table.get(lane):
- if (sample_index_length, sample_umi_length, sample_read_length) not in lane_table[lane]:
- lane_table[lane].append((sample_index_length, sample_umi_length, sample_read_length))
+ if (
+ sample_index_length,
+ sample_umi_length,
+ sample_read_length,
+ ) not in lane_table[lane]:
+ lane_table[lane].append(
+ (
+ sample_index_length,
+ sample_umi_length,
+ sample_read_length,
+ )
+ )
else:
- lane_table.update({lane:[(sample_index_length, sample_umi_length, sample_read_length)]})
+ lane_table.update(
+ {
+ lane: [
+ (
+ sample_index_length,
+ sample_umi_length,
+ sample_read_length,
+ )
+ ]
+ }
+ )
# Determine the number of demux needed for the same sample type
- if self.software == 'bcl2fastq':
- demux_number_with_the_same_sample_type = len(max([v for k, v in lane_table.items()],key=len))
- elif self.software == 'bclconvert':
+ if self.software == "bcl2fastq":
+ demux_number_with_the_same_sample_type = len(
+ max([v for k, v in lane_table.items()], key=len)
+ )
+ elif self.software == "bclconvert":
unique_masks = []
for masks in lane_table.values():
for mask in masks:
@@ -232,33 +299,44 @@ def demultiplex_run(self):
unique_masks.append(mask)
demux_number_with_the_same_sample_type = len(unique_masks)
# Prepare sub-samplesheets, masks and commands
- for i in range(0,demux_number_with_the_same_sample_type):
+ for i in range(0, demux_number_with_the_same_sample_type):
# Prepare sub-samplesheet
# A dictionary with lane and sample IDs to include
samples_to_include = dict()
# A dictionary with lane and index length for generating masks
mask_table = dict()
- if self.software == 'bcl2fastq':
+ if self.software == "bcl2fastq":
for lane, lane_contents in self.sample_table.items():
try:
- (index_length, umi_length, read_length) = lane_table[lane][i]
- mask_table.update({lane: (index_length, umi_length, read_length)})
+ (index_length, umi_length, read_length) = lane_table[lane][
+ i
+ ]
+ mask_table.update(
+ {lane: (index_length, umi_length, read_length)}
+ )
for sample in lane_contents:
sample_name = sample[0]
sample_detail = sample[1]
- sample_type_t = sample_detail['sample_type']
- sample_index_length = sample_detail['index_length']
- sample_umi_length = sample_detail['umi_length']
- sample_read_length = sample_detail['read_length']
- if sample_type_t == sample_type and sample_index_length == index_length and sample_umi_length == umi_length and sample_read_length == read_length:
+ sample_type_t = sample_detail["sample_type"]
+ sample_index_length = sample_detail["index_length"]
+ sample_umi_length = sample_detail["umi_length"]
+ sample_read_length = sample_detail["read_length"]
+ if (
+ sample_type_t == sample_type
+ and sample_index_length == index_length
+ and sample_umi_length == umi_length
+ and sample_read_length == read_length
+ ):
if samples_to_include.get(lane):
samples_to_include[lane].append(sample_name)
else:
- samples_to_include.update({lane:[sample_name]})
- except (KeyError, IndexError) as err:
- logger.info(('No corresponding mask in lane {}. Skip it.'.format(lane)))
+ samples_to_include.update({lane: [sample_name]})
+ except (KeyError, IndexError):
+ logger.info(
+ f"No corresponding mask in lane {lane}. Skip it."
+ )
continue
- elif self.software == 'bclconvert':
+ elif self.software == "bclconvert":
mask = unique_masks[i]
for lane, lane_contents in self.sample_table.items():
if lane_table.get(lane):
@@ -267,17 +345,24 @@ def demultiplex_run(self):
for sample in lane_contents:
sample_name = sample[0]
sample_detail = sample[1]
- sample_type_t = sample_detail['sample_type']
- sample_index_length = sample_detail['index_length']
- sample_umi_length = sample_detail['umi_length']
- sample_read_length = sample_detail['read_length']
- if sample_type_t == sample_type and sample_index_length == mask[0] and sample_umi_length == mask[1] and sample_read_length == mask[2]:
+ sample_type_t = sample_detail["sample_type"]
+ sample_index_length = sample_detail["index_length"]
+ sample_umi_length = sample_detail["umi_length"]
+ sample_read_length = sample_detail["read_length"]
+ if (
+ sample_type_t == sample_type
+ and sample_index_length == mask[0]
+ and sample_umi_length == mask[1]
+ and sample_read_length == mask[2]
+ ):
if samples_to_include.get(lane):
samples_to_include[lane].append(sample_name)
else:
- samples_to_include.update({lane:[sample_name]})
+ samples_to_include.update(
+ {lane: [sample_name]}
+ )
- if self.software == 'bclconvert':
+ if self.software == "bclconvert":
runSetup = self.runParserObj.runinfo.get_read_configuration()
(index_length, umi_length, read_length) = mask
index1_size = int(index_length[0])
@@ -287,37 +372,61 @@ def demultiplex_run(self):
read1_size = int(read_length[0])
read2_size = int(read_length[1])
is_dual_index = False
- if (index1_size != 0 and index2_size != 0) or (index1_size == 0 and index2_size != 0):
+ if (index1_size != 0 and index2_size != 0) or (
+ index1_size == 0 and index2_size != 0
+ ):
is_dual_index = True
- base_mask = self._compute_base_mask(runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size)
+ base_mask = self._compute_base_mask(
+ runSetup,
+ sample_type,
+ index1_size,
+ is_dual_index,
+ index2_size,
+ umi1_size,
+ umi2_size,
+ read1_size,
+ read2_size,
+ )
else:
index1_size = 0
index2_size = 0
base_mask = []
# Make sub-samplesheet
with chdir(self.run_dir):
- samplesheet_dest='SampleSheet_{}.csv'.format(bcl_cmd_counter)
- with open(samplesheet_dest, 'w') as fcd:
- fcd.write(self._generate_samplesheet_subset(self.runParserObj.samplesheet,
- samples_to_include, runSetup, self.software, sample_type, index1_size, index2_size, base_mask, self.CONFIG))
+ samplesheet_dest = f"SampleSheet_{bcl_cmd_counter}.csv"
+ with open(samplesheet_dest, "w") as fcd:
+ fcd.write(
+ self._generate_samplesheet_subset(
+ self.runParserObj.samplesheet,
+ samples_to_include,
+ runSetup,
+ self.software,
+ sample_type,
+ index1_size,
+ index2_size,
+ base_mask,
+ self.CONFIG,
+ )
+ )
# Prepare demultiplexing dir
with chdir(self.run_dir):
# Create Demultiplexing dir, this changes the status to IN_PROGRESS
- if not os.path.exists('Demultiplexing'):
- os.makedirs('Demultiplexing')
+ if not os.path.exists("Demultiplexing"):
+ os.makedirs("Demultiplexing")
# Prepare demultiplexing command
with chdir(self.run_dir):
- cmd = self.generate_bcl_command(sample_type,
- mask_table,
- bcl_cmd_counter)
- misc.call_external_command_detached(cmd,
- with_log_files = True,
- prefix='demux_{}'.format(bcl_cmd_counter))
- logger.info(('BCL to FASTQ conversion and demultiplexing ' \
- 'started for run {} on {}'.format(os.path.basename(self.id),
- datetime.now())))
+ cmd = self.generate_bcl_command(
+ sample_type, mask_table, bcl_cmd_counter
+ )
+ misc.call_external_command_detached(
+ cmd, with_log_files=True, prefix=f"demux_{bcl_cmd_counter}"
+ )
+ logger.info(
+ "BCL to FASTQ conversion and demultiplexing "
+ f"started for run {os.path.basename(self.id)} on {datetime.now()}"
+ )
# Demultiplexing done for one mask type; the script will continue
# working with the next type. The command counter should increase by 1
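The block above first buckets every sample of the requested sample type into lane_table, keyed by lane, with one (index_length, umi_length, read_length) tuple per distinct mask; the number of demultiplexing rounds is then the longest per-lane list for bcl2fastq and the number of globally unique masks for bclconvert. A minimal, self-contained sketch of that grouping, with an invented sample_table and the sample-type filter left out:

from collections import defaultdict

# Invented sample_table in the same shape as the one iterated above:
# {lane: [(sample_name, sample_detail), ...]}
sample_table = {
    "1": [
        ("P1_101", {"index_length": (10, 10), "umi_length": (0, 0), "read_length": (151, 151)}),
        ("P1_102", {"index_length": (8, 8), "umi_length": (0, 0), "read_length": (151, 151)}),
    ],
    "2": [
        ("P2_101", {"index_length": (10, 10), "umi_length": (0, 0), "read_length": (151, 151)}),
    ],
}

lane_table = defaultdict(list)
for lane, lane_contents in sample_table.items():
    for _, detail in lane_contents:
        mask = (detail["index_length"], detail["umi_length"], detail["read_length"])
        if mask not in lane_table[lane]:
            lane_table[lane].append(mask)

# bcl2fastq: one demux round per mask in the lane with the most masks
rounds_bcl2fastq = len(max(lane_table.values(), key=len))

# bclconvert: one demux round per globally unique mask
unique_masks = []
for masks in lane_table.values():
    for mask in masks:
        if mask not in unique_masks:
            unique_masks.append(mask)
rounds_bclconvert = len(unique_masks)

print(rounds_bcl2fastq, rounds_bclconvert)  # 2 2 for the data above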
@@ -333,47 +442,59 @@ def _aggregate_demux_results(self):
def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter):
with chdir(self.run_dir):
# Software
- cl = [self.CONFIG.get(self.software)['bin']]
+ cl = [self.CONFIG.get(self.software)["bin"]]
# Case with bcl2fastq
- if self.software == 'bcl2fastq':
- logger.info('Building a bcl2fastq command')
- per_lane_base_masks = self._generate_per_lane_base_mask(sample_type, mask_table)
+ if self.software == "bcl2fastq":
+ logger.info("Building a bcl2fastq command")
+ per_lane_base_masks = self._generate_per_lane_base_mask(
+ sample_type, mask_table
+ )
# Add the base_mask for each lane
lanes = list(mask_table.keys())
for lane in sorted(lanes):
# Iterate through each lane and add the correct --use-bases-mask for that lane
- base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # Get the base_mask
- base_mask_expr = '{}:'.format(lane) + ','.join(base_mask)
- cl.extend(['--use-bases-mask', base_mask_expr])
+ base_mask = [
+ per_lane_base_masks[lane][bm]["base_mask"]
+ for bm in per_lane_base_masks[lane]
+ ][0] # Get the base_mask
+ base_mask_expr = f"{lane}:" + ",".join(base_mask)
+ cl.extend(["--use-bases-mask", base_mask_expr])
# Case with bclconvert
- elif self.software == 'bclconvert':
- logger.info('Building a bclconvert command')
- cl.extend(['--bcl-input-directory', self.run_dir])
+ elif self.software == "bclconvert":
+ logger.info("Building a bclconvert command")
+ cl.extend(["--bcl-input-directory", self.run_dir])
else:
raise RuntimeError("Unrecognized software!")
# Output dir
- output_dir = os.path.join(self.run_dir, 'Demultiplexing_{}'.format(bcl_cmd_counter))
+ output_dir = os.path.join(self.run_dir, f"Demultiplexing_{bcl_cmd_counter}")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
- cl.extend(['--output-dir', output_dir])
+ cl.extend(["--output-dir", output_dir])
# Samplesheet
- cl.extend(['--sample-sheet', os.path.join(os.path.join(self.run_dir, 'SampleSheet_{}.csv'.format(bcl_cmd_counter)))])
+ cl.extend(
+ [
+ "--sample-sheet",
+ os.path.join(
+ os.path.join(self.run_dir, f"SampleSheet_{bcl_cmd_counter}.csv")
+ ),
+ ]
+ )
# Demux options
cl_options = []
- if 'options' in self.CONFIG.get(self.software):
- if self.CONFIG[self.software]['options'].get('common'):
- for option in self.CONFIG[self.software]['options']['common']:
+ if "options" in self.CONFIG.get(self.software):
+ if self.CONFIG[self.software]["options"].get("common"):
+ for option in self.CONFIG[self.software]["options"]["common"]:
cl_options.extend([option])
- if self.CONFIG[self.software]['options'].get(sample_type):
- for option in self.CONFIG[self.software]['options'][sample_type]:
+ if self.CONFIG[self.software]["options"].get(sample_type):
+ for option in self.CONFIG[self.software]["options"][sample_type]:
cl_options.extend([option])
for option in cl_options:
if isinstance(option, dict):
opt, val = list(option.items())[0]
- if 'output-dir' not in opt:
- cl.extend(['--{}'.format(opt), str(val).lower()])
+ if "output-dir" not in opt:
+ cl.extend([f"--{opt}", str(val).lower()])
else:
- cl.append('--{}'.format(option))
+ cl.append(f"--{option}")
return cl
def _generate_per_lane_base_mask(self, sample_type, mask_table):
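For reference, the option-handling loop at the end of generate_bcl_command turns plain strings into bare flags and single-key dicts into flag/value pairs, deliberately skipping any configured output-dir so it cannot clash with the per-counter Demultiplexing_{i} directory set just above. A standalone sketch of that flattening, with invented option values:

# Invented cl_options; only the flattening logic mirrors the method above.
cl = ["bcl2fastq"]
cl_options = ["no-lane-splitting", {"fastq-compression-level": 4}, {"output-dir": "/ignored"}]
for option in cl_options:
    if isinstance(option, dict):
        opt, val = list(option.items())[0]
        if "output-dir" not in opt:  # the output dir is added separately
            cl.extend([f"--{opt}", str(val).lower()])
    else:
        cl.append(f"--{option}")
print(cl)  # ['bcl2fastq', '--no-lane-splitting', '--fastq-compression-level', '4']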
@@ -405,200 +526,302 @@ def _generate_per_lane_base_mask(self, sample_type, mask_table):
read1_size = lane_contents[2][0]
read2_size = lane_contents[2][1]
is_dual_index = False
- if (index1_size != 0 and index2_size != 0) or (index1_size == 0 and index2_size != 0):
+ if (index1_size != 0 and index2_size != 0) or (
+ index1_size == 0 and index2_size != 0
+ ):
is_dual_index = True
# Compute the basemask
- base_mask = self._compute_base_mask(runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size)
- base_mask_string = ''.join(base_mask)
-
- base_masks[lane][base_mask_string] = {'base_mask':base_mask}
+ base_mask = self._compute_base_mask(
+ runSetup,
+ sample_type,
+ index1_size,
+ is_dual_index,
+ index2_size,
+ umi1_size,
+ umi2_size,
+ read1_size,
+ read2_size,
+ )
+ base_mask_string = "".join(base_mask)
+
+ base_masks[lane][base_mask_string] = {"base_mask": base_mask}
return base_masks
- def _compute_base_mask(self, runSetup, sample_type, index1_size, is_dual_index, index2_size, umi1_size, umi2_size, read1_size, read2_size):
+ def _compute_base_mask(
+ self,
+ runSetup,
+ sample_type,
+ index1_size,
+ is_dual_index,
+ index2_size,
+ umi1_size,
+ umi2_size,
+ read1_size,
+ read2_size,
+ ):
"""
- Assumptions:
- - if runSetup is of size 3, then single index run
- - if runSetup is of size 4, then dual index run
+ Assumptions:
+ - if runSetup is of size 3, then single index run
+ - if runSetup is of size 4, then dual index run
"""
bm = []
- dual_index_run = False
if len(runSetup) > 4:
- raise RuntimeError("when generating base_masks looks like there are" \
- " more than 4 reads in the RunSetup.xml")
+ raise RuntimeError(
+ "when generating base_masks, it looks like there are"
+ " more than 4 reads in the RunSetup.xml"
+ )
for read in runSetup:
- cycles = int(read['NumCycles'])
- if read['IsIndexedRead'] == 'N':
+ cycles = int(read["NumCycles"])
+ if read["IsIndexedRead"] == "N":
# Prepare the base mask for the 1st read
- is_first_read = int(read['Number']) == 1
+ is_first_read = int(read["Number"]) == 1
if is_first_read:
if cycles > read1_size:
r_remainder = cycles - read1_size
if read1_size != 0:
- bm.append('Y' + str(read1_size) + 'N' + str(r_remainder))
+ bm.append("Y" + str(read1_size) + "N" + str(r_remainder))
else:
- bm.append('N' + str(cycles))
+ bm.append("N" + str(cycles))
else:
- bm.append('Y' + str(cycles))
+ bm.append("Y" + str(cycles))
else:
if cycles > read2_size:
r_remainder = cycles - read2_size
if read2_size != 0:
- bm.append('Y' + str(read2_size) + 'N' + str(r_remainder))
+ bm.append("Y" + str(read2_size) + "N" + str(r_remainder))
else:
- bm.append('N' + str(cycles))
+ bm.append("N" + str(cycles))
else:
- bm.append('Y' + str(cycles))
+ bm.append("Y" + str(cycles))
else:
- is_first_index_read = int(read['Number']) == 2
+ is_first_index_read = int(read["Number"]) == 2
# Prepare the base mask for the 1st index read
if is_first_index_read:
# The size of the index of the sample sheet is larger than the
# one specified by RunInfo.xml, something must be wrong
if index1_size > cycles:
- raise RuntimeError("when generating base_masks found index 1 in" \
- " samplesheet larger than the index specifed in RunInfo.xml")
+ raise RuntimeError(
+ "when generating base_masks found index 1 in"
+ " samplesheet larger than the index specified in RunInfo.xml"
+ )
i_remainder = cycles - index1_size
if i_remainder > 0:
- if sample_type == 'IDT_UMI': # Case of IDT UMI
+ if sample_type == "IDT_UMI": # Case of IDT UMI
if umi1_size != 0:
if i_remainder - umi1_size > 0:
- if self.software == 'bcl2fastq':
- bm.append('I' + str(index1_size) + 'Y' + str(umi1_size) + 'N' + str(i_remainder - umi1_size))
- elif self.software == 'bclconvert':
- bm.append('I' + str(index1_size) + 'U' + str(umi1_size) + 'N' + str(i_remainder - umi1_size))
+ if self.software == "bcl2fastq":
+ bm.append(
+ "I"
+ + str(index1_size)
+ + "Y"
+ + str(umi1_size)
+ + "N"
+ + str(i_remainder - umi1_size)
+ )
+ elif self.software == "bclconvert":
+ bm.append(
+ "I"
+ + str(index1_size)
+ + "U"
+ + str(umi1_size)
+ + "N"
+ + str(i_remainder - umi1_size)
+ )
else:
raise RuntimeError("Unrecognized software!")
elif i_remainder - umi1_size == 0:
- if self.software == 'bcl2fastq':
- bm.append('I' + str(index1_size) + 'Y' + str(umi1_size))
- elif self.software == 'bclconvert':
- bm.append('I' + str(index1_size) + 'U' + str(umi1_size))
+ if self.software == "bcl2fastq":
+ bm.append(
+ "I"
+ + str(index1_size)
+ + "Y"
+ + str(umi1_size)
+ )
+ elif self.software == "bclconvert":
+ bm.append(
+ "I"
+ + str(index1_size)
+ + "U"
+ + str(umi1_size)
+ )
else:
raise RuntimeError("Unrecognized software!")
else:
- raise RuntimeError("when generating base_masks for UMI samples" \
- " some UMI1 length is longer than specified in RunInfo.xml")
+ raise RuntimeError(
+ "when generating base_masks for UMI samples,"
+ " a UMI1 length is longer than specified in RunInfo.xml"
+ )
else:
- bm.append('I' + str(index1_size) + 'N' + str(i_remainder))
+ bm.append(
+ "I" + str(index1_size) + "N" + str(i_remainder)
+ )
elif index1_size == 0:
- bm.append('N' + str(cycles)) # Case of NoIndex
+ bm.append("N" + str(cycles)) # Case of NoIndex
else:
- bm.append('I' + str(index1_size) + 'N' + str(i_remainder))
+ bm.append("I" + str(index1_size) + "N" + str(i_remainder))
else:
- bm.append('I' + str(cycles))
+ bm.append("I" + str(cycles))
else:
# The size of the index of the sample sheet is larger than the
# one specified by RunInfo.xml, something must be wrong
if index2_size > cycles:
- raise RuntimeError("when generating base_masks found index 2 in" \
- " samplesheet larger than the index specifed in RunInfo.xml")
+ raise RuntimeError(
+ "when generating base_masks found index 2 in"
+ " samplesheet larger than the index specified in RunInfo.xml"
+ )
# When working on the second read index I need to know if the sample is dual index or not
- if is_dual_index or sample_type == '10X_SINGLE':
- if sample_type == '10X_SINGLE': # Case of 10X single indexes, demultiplex the whole index 2 cycles as FastQ for bcl2fastq. But this has to be ignored for bclconvert
- if self.software == 'bcl2fastq':
- bm.append('Y' + str(cycles))
- elif self.software == 'bclconvert':
- bm.append('N' + str(cycles))
+ if is_dual_index or sample_type == "10X_SINGLE":
+ if (
+ sample_type == "10X_SINGLE"
+ ):  # Case of 10X single indexes: demultiplex all index 2 cycles as FastQ for bcl2fastq, but ignore them for bclconvert
+ if self.software == "bcl2fastq":
+ bm.append("Y" + str(cycles))
+ elif self.software == "bclconvert":
+ bm.append("N" + str(cycles))
else:
raise RuntimeError("Unrecognized software!")
else:
i_remainder = cycles - index2_size
if i_remainder > 0:
- if sample_type == 'IDT_UMI': # Case of IDT UMI
+ if sample_type == "IDT_UMI": # Case of IDT UMI
if umi2_size != 0:
if i_remainder - umi2_size > 0:
- if self.software == 'bcl2fastq':
- bm.append('I' + str(index2_size) + 'Y' + str(umi2_size) + 'N' + str(i_remainder - umi2_size))
- elif self.software == 'bclconvert':
- bm.append('I' + str(index2_size) + 'U' + str(umi2_size) + 'N' + str(i_remainder - umi2_size))
+ if self.software == "bcl2fastq":
+ bm.append(
+ "I"
+ + str(index2_size)
+ + "Y"
+ + str(umi2_size)
+ + "N"
+ + str(i_remainder - umi2_size)
+ )
+ elif self.software == "bclconvert":
+ bm.append(
+ "I"
+ + str(index2_size)
+ + "U"
+ + str(umi2_size)
+ + "N"
+ + str(i_remainder - umi2_size)
+ )
else:
- raise RuntimeError("Unrecognized software!")
+ raise RuntimeError(
+ "Unrecognized software!"
+ )
elif i_remainder - umi2_size == 0:
- if self.software == 'bcl2fastq':
- bm.append('I' + str(index2_size) + 'Y' + str(umi2_size))
- elif self.software == 'bclconvert':
- bm.append('I' + str(index2_size) + 'U' + str(umi2_size))
+ if self.software == "bcl2fastq":
+ bm.append(
+ "I"
+ + str(index2_size)
+ + "Y"
+ + str(umi2_size)
+ )
+ elif self.software == "bclconvert":
+ bm.append(
+ "I"
+ + str(index2_size)
+ + "U"
+ + str(umi2_size)
+ )
else:
- raise RuntimeError("Unrecognized software!")
+ raise RuntimeError(
+ "Unrecognized software!"
+ )
else:
- raise RuntimeError("when generating base_masks for UMI samples" \
- " some UMI2 length is longer than specified in RunInfo.xml")
+ raise RuntimeError(
+ "when generating base_masks for UMI samples,"
+ " a UMI2 length is longer than specified in RunInfo.xml"
+ )
else:
- bm.append('I' + str(index2_size) + 'N' + str(i_remainder))
+ bm.append(
+ "I"
+ + str(index2_size)
+ + "N"
+ + str(i_remainder)
+ )
elif index2_size == 0:
- bm.append('N' + str(cycles))
+ bm.append("N" + str(cycles))
else:
- bm.append('I' + str(index2_size) + 'N' + str(i_remainder))
+ bm.append(
+ "I" + str(index2_size) + "N" + str(i_remainder)
+ )
else:
- bm.append('I' + str(cycles))
+ bm.append("I" + str(cycles))
else:
- # If this sample is not dual index but the run is,
- # then I need to ignore the second index completely
- bm.append('N' + str(cycles))
+ # If this sample is not dual index but the run is,
+ # then I need to ignore the second index completely
+ bm.append("N" + str(cycles))
return bm
-
- def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None, rename_samples=True, rename_qPCR_suffix = False, fields_qPCR= None):
+ def _generate_clean_samplesheet(
+ self,
+ ssparser,
+ indexfile,
+ fields_to_remove=None,
+ rename_samples=True,
+ rename_qPCR_suffix=False,
+ fields_qPCR=None,
+ ):
"""Generate a 'clean' samplesheet, the given fields will be removed.
If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name
Will also replace 10X or Smart-seq indicies (e.g. SI-GA-A3 into TGTGCGGG)
"""
- output = u''
+ output = ""
# Expand the ssparser if there are lanes with 10X or Smart-seq samples
- index_dict_tenX = self._parse_10X_indexes(indexfile['tenX'])
- index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq'])
+ index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"])
+ index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"])
# Replace 10X or Smart-seq indices
for sample in ssparser.data:
- if sample['index'] in index_dict_tenX.keys():
- tenX_index = sample['index']
+ if sample["index"] in index_dict_tenX.keys():
+ tenX_index = sample["index"]
# In the case of 10X dual indexes, replace index and index2
if TENX_DUAL_PAT.findall(tenX_index):
- sample['index'] = index_dict_tenX[tenX_index][0]
- sample['index2'] = index_dict_tenX[tenX_index][1]
+ sample["index"] = index_dict_tenX[tenX_index][0]
+ sample["index2"] = index_dict_tenX[tenX_index][1]
# In the case of 10X single indexes, replace the index name with the 4 actual indices
else:
x = 0
indices_number = len(index_dict_tenX[tenX_index])
while x < indices_number - 1:
new_sample = dict(sample)
- new_sample['index'] = index_dict_tenX[tenX_index][x]
+ new_sample["index"] = index_dict_tenX[tenX_index][x]
ssparser.data.append(new_sample)
x += 1
# Set the original 10X index to the 4th correct index
- sample['index'] = index_dict_tenX[tenX_index][x]
- elif SMARTSEQ_PAT.findall(sample['index']):
+ sample["index"] = index_dict_tenX[tenX_index][x]
+ elif SMARTSEQ_PAT.findall(sample["index"]):
x = 0
- smartseq_index = sample['index'].split('-')[1]
+ smartseq_index = sample["index"].split("-")[1]
indices_number = len(index_dict_smartseq[smartseq_index])
while x < indices_number - 1:
new_sample = dict(sample)
- new_sample['index'] = index_dict_smartseq[smartseq_index][x][0]
- new_sample['index2'] = index_dict_smartseq[smartseq_index][x][1]
+ new_sample["index"] = index_dict_smartseq[smartseq_index][x][0]
+ new_sample["index2"] = index_dict_smartseq[smartseq_index][x][1]
ssparser.data.append(new_sample)
x += 1
- sample['index'] = index_dict_smartseq[smartseq_index][x][0]
- sample['index2'] = index_dict_smartseq[smartseq_index][x][1]
+ sample["index"] = index_dict_smartseq[smartseq_index][x][0]
+ sample["index2"] = index_dict_smartseq[smartseq_index][x][1]
# Sort to get the added indices from 10X in the right place
# Python 3 doesn't support sorting a list of dicts implicitly. Sort by lane and then Sample_ID
- ssparser.data.sort(key=lambda item: (item.get('Lane'), item.get('Sample_ID')))
+ ssparser.data.sort(key=lambda item: (item.get("Lane"), item.get("Sample_ID")))
if not fields_to_remove:
fields_to_remove = []
# Header
- output += '[Header]{}'.format(os.linesep)
+ output += f"[Header]{os.linesep}"
for field in sorted(ssparser.header):
- output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip())
+ output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
output += os.linesep
# Data
- output += '[Data]{}'.format(os.linesep)
+ output += f"[Data]{os.linesep}"
datafields = []
for field in ssparser.datafields:
if field not in fields_to_remove:
datafields.append(field)
- output += ','.join(datafields)
+ output += ",".join(datafields)
output += os.linesep
for line in ssparser.data:
line_ar = []
@@ -608,79 +831,108 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
try:
if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR:
# Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix
- value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm]))
+ value = re.sub(
+ "__qPCR_$", "", f"Sample_{line[ssparser.dfield_snm]}"
+ )
else:
# Substitute SampleID with SampleName, add Sample_ as prefix
- value ='Sample_{}'.format(line[ssparser.dfield_snm])
+ value = f"Sample_{line[ssparser.dfield_snm]}"
except:
- # Otherwise add Sample_ as prefix
- value = 'Sample_{}'.format(line[ssparser.dfield_sid])
+ # Otherwise add Sample_ as prefix
+ value = f"Sample_{line[ssparser.dfield_sid]}"
elif rename_qPCR_suffix and field in fields_qPCR:
- value = re.sub('__qPCR_$', '', line[field])
+ value = re.sub("__qPCR_$", "", line[field])
line_ar.append(value)
- output += ','.join(line_ar)
+ output += ",".join(line_ar)
output += os.linesep
return output
- def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, software, sample_type, index1_size, index2_size, base_mask, CONFIG):
- output = u''
+ def _generate_samplesheet_subset(
+ self,
+ ssparser,
+ samples_to_include,
+ runSetup,
+ software,
+ sample_type,
+ index1_size,
+ index2_size,
+ base_mask,
+ CONFIG,
+ ):
+ output = ""
# Prepare index cycles
index_cycles = [0, 0]
for read in runSetup:
- if read['IsIndexedRead'] == 'Y':
- if int(read['Number']) == 2:
- index_cycles[0] = int(read['NumCycles'])
+ if read["IsIndexedRead"] == "Y":
+ if int(read["Number"]) == 2:
+ index_cycles[0] = int(read["NumCycles"])
else:
- index_cycles[1] = int(read['NumCycles'])
+ index_cycles[1] = int(read["NumCycles"])
# Header
- output += '[Header]{}'.format(os.linesep)
+ output += f"[Header]{os.linesep}"
for field in sorted(ssparser.header):
- output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip())
+ output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
output += os.linesep
# Settings for BCL Convert
- if software == 'bclconvert':
- output += '[Settings]{}'.format(os.linesep)
- output += 'OverrideCycles,{}{}'.format(';'.join(base_mask), os.linesep)
+ if software == "bclconvert":
+ output += f"[Settings]{os.linesep}"
+ output += "OverrideCycles,{}{}".format(";".join(base_mask), os.linesep)
- if CONFIG.get('bclconvert'):
- if CONFIG['bclconvert'].get('settings'):
+ if CONFIG.get("bclconvert"):
+ if CONFIG["bclconvert"].get("settings"):
# Put common settings
- if CONFIG['bclconvert']['settings'].get('common'):
- for setting in CONFIG['bclconvert']['settings']['common']:
+ if CONFIG["bclconvert"]["settings"].get("common"):
+ for setting in CONFIG["bclconvert"]["settings"]["common"]:
for k, v in setting.items():
- output += '{},{}{}'.format(k, v, os.linesep)
+ output += f"{k},{v}{os.linesep}"
# Put special settings:
- if sample_type in CONFIG['bclconvert']['settings'].keys():
- for setting in CONFIG['bclconvert']['settings'][sample_type]:
+ if sample_type in CONFIG["bclconvert"]["settings"].keys():
+ for setting in CONFIG["bclconvert"]["settings"][sample_type]:
for k, v in setting.items():
- if (k == 'BarcodeMismatchesIndex1' and index1_size != 0) or (k == 'BarcodeMismatchesIndex2' and index2_size != 0) or 'BarcodeMismatchesIndex' not in k:
- output += '{},{}{}'.format(k, v, os.linesep)
+ if (
+ (
+ k == "BarcodeMismatchesIndex1"
+ and index1_size != 0
+ )
+ or (
+ k == "BarcodeMismatchesIndex2"
+ and index2_size != 0
+ )
+ or "BarcodeMismatchesIndex" not in k
+ ):
+ output += f"{k},{v}{os.linesep}"
# Data
- output += '[Data]{}'.format(os.linesep)
+ output += f"[Data]{os.linesep}"
datafields = []
for field in ssparser.datafields:
datafields.append(field)
- output += ','.join(datafields)
+ output += ",".join(datafields)
output += os.linesep
for line in ssparser.data:
- sample_name = line.get('Sample_Name') or line.get('SampleName')
- lane = line['Lane']
+ sample_name = line.get("Sample_Name") or line.get("SampleName")
+ lane = line["Lane"]
noindex_flag = False
if lane in samples_to_include.keys():
if sample_name in samples_to_include.get(lane):
line_ar = []
for field in datafields:
# Case with NoIndex
- if field == 'index' and 'NOINDEX' in line['index'].upper():
- line[field] = 'T'*index_cycles[0] if index_cycles[0] !=0 else ''
+ if field == "index" and "NOINDEX" in line["index"].upper():
+ line[field] = (
+ "T" * index_cycles[0] if index_cycles[0] != 0 else ""
+ )
noindex_flag = True
- if field == 'index2' and noindex_flag:
- line[field] = 'A'*index_cycles[1] if index_cycles[1] !=0 else ''
+ if field == "index2" and noindex_flag:
+ line[field] = (
+ "A" * index_cycles[1] if index_cycles[1] != 0 else ""
+ )
noindex_flag = False
# Case of IDT UMI
- if (field == 'index' or field == 'index2') and IDT_UMI_PAT.findall(line[field]):
- line[field] = line[field].replace('N', '')
+ if (
+ field == "index" or field == "index2"
+ ) and IDT_UMI_PAT.findall(line[field]):
+ line[field] = line[field].replace("N", "")
line_ar.append(line[field])
- output += ','.join(line_ar)
+ output += ",".join(line_ar)
output += os.linesep
return output
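_generate_samplesheet_subset rewrites the index fields while copying [Data] rows: a NOINDEX placeholder becomes a poly-T (index) or poly-A (index2) sequence of the run's index cycle lengths, and IDT UMI indexes get their N padding stripped. The sketch below is a simplified, self-contained variant that checks the field value directly instead of carrying the noindex_flag, and the IDT_UMI_PAT used here is an assumed pattern, not necessarily the one defined in the module:

import re

# Assumed pattern for an IDT UMI index: a bases prefix followed by trailing Ns.
IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")

def clean_index(field, value, index_cycles):
    # NOINDEX placeholders are padded to the run's index cycle lengths
    if field == "index" and "NOINDEX" in value.upper():
        return "T" * index_cycles[0] if index_cycles[0] != 0 else ""
    if field == "index2" and "NOINDEX" in value.upper():
        return "A" * index_cycles[1] if index_cycles[1] != 0 else ""
    # IDT UMI indexes lose their N padding
    if field in ("index", "index2") and IDT_UMI_PAT.findall(value):
        return value.replace("N", "")
    return value

print(clean_index("index", "NoIndex", (10, 10)))     # TTTTTTTTTT
print(clean_index("index2", "NOINDEX", (10, 10)))    # AAAAAAAAAA
print(clean_index("index", "ACGTACGTNN", (10, 10)))  # ACGTACGT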
diff --git a/taca/illumina/__init__.py b/taca/illumina/__init__.py
index 14e36756..50a56a43 100644
--- a/taca/illumina/__init__.py
+++ b/taca/illumina/__init__.py
@@ -1,3 +1,3 @@
"""
Runs class to parse and work with illumina flowcells
-"""
\ No newline at end of file
+"""
diff --git a/taca/log/__init__.py b/taca/log/__init__.py
index 0946603e..0ce995d1 100644
--- a/taca/log/__init__.py
+++ b/taca/log/__init__.py
@@ -8,25 +8,28 @@
# Console logger
stream_handler = logging.StreamHandler()
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
stream_handler.setFormatter(formatter)
ROOT_LOG.addHandler(stream_handler)
LOG_LEVELS = {
- 'ERROR': logging.ERROR,
- 'WARN': logging.WARN,
- 'INFO': logging.INFO,
- 'DEBUG': logging.DEBUG
+ "ERROR": logging.ERROR,
+ "WARN": logging.WARN,
+ "INFO": logging.INFO,
+ "DEBUG": logging.DEBUG,
}
-def init_logger_file(log_file, log_level='INFO'):
- """ Append a FileHandler to the root logger.
+
+def init_logger_file(log_file, log_level="INFO"):
+ """Append a FileHandler to the root logger.
:param str log_file: Path to the log file
:param str log_level: Logging level
"""
- ROOT_LOG.handlers=[]
- log_level = LOG_LEVELS[log_level] if log_level in LOG_LEVELS.keys() else logging.INFO
+ ROOT_LOG.handlers = []
+ log_level = (
+ LOG_LEVELS[log_level] if log_level in LOG_LEVELS.keys() else logging.INFO
+ )
ROOT_LOG.setLevel(log_level)
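init_logger_file resets the root logger's handlers, resolves the requested level with a fallback to INFO, and (below the shown hunk) attaches a FileHandler alongside the console handler. A standalone sketch of that pattern, not TACA's actual implementation, with a made-up log path:

import logging

LOG_LEVELS = {"ERROR": logging.ERROR, "WARN": logging.WARN, "INFO": logging.INFO, "DEBUG": logging.DEBUG}

def init_logger_file_sketch(log_file, log_level="INFO"):
    root = logging.getLogger()
    root.handlers = []  # drop any previously attached handlers
    root.setLevel(LOG_LEVELS.get(log_level, logging.INFO))
    handler = logging.FileHandler(log_file)
    handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
    root.addHandler(handler)

init_logger_file_sketch("/tmp/taca_example.log", "DEBUG")
logging.getLogger("example").debug("This line ends up in the log file.")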
diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py
index 5f839058..675edcd2 100644
--- a/taca/nanopore/ONT_run_classes.py
+++ b/taca/nanopore/ONT_run_classes.py
@@ -1,18 +1,18 @@
-import os
-import logging
import csv
-import shutil
import glob
-import re
import json
-import pandas as pd
-import subprocess
+import logging
import os
+import re
+import shutil
+import subprocess
+from datetime import datetime
from typing import Union
-from taca.utils.statusdb import NanoporeRunsConnection
-from datetime import datetime
+import pandas as pd
+
from taca.utils.config import CONFIG
+from taca.utils.statusdb import NanoporeRunsConnection
from taca.utils.transfer import RsyncAgent, RsyncError
logger = logging.getLogger(__name__)
@@ -22,24 +22,27 @@
)
-class ONT_run(object):
+class ONT_run:
"""General Nanopore run.
Expects instantiation from absolute path of run directory on preprocessing server.
"""
def __init__(self, run_abspath: str):
-
# Get paths and names of MinKNOW experiment, sample and run
self.run_name = os.path.basename(run_abspath)
self.run_abspath = run_abspath
+ self.run_type: str | None = (
+ None # This will be defined upon instantiation of a child class
+ )
+
assert re.match(
ONT_RUN_PATTERN, self.run_name
), f"Run {self.run_name} doesn't look like a run dir"
# Parse MinKNOW sample and experiment name
- with open(self.get_file("/run_path.txt"), "r") as stream:
+ with open(self.get_file("/run_path.txt")) as stream:
self.experiment_name, self.sample_name, _ = stream.read().split("/")
# Get info from run name
@@ -122,7 +125,7 @@ def assert_contents(self):
def is_transferred(self) -> bool:
"""Return True if run ID in transfer.tsv, else False."""
- with open(self.transfer_details["transfer_log"], "r") as f:
+ with open(self.transfer_details["transfer_log"]) as f:
return self.run_name in f.read()
# DB update
@@ -159,7 +162,7 @@ def update_db_entry(self, force_update=False):
self.touch_db_entry()
# If the run document is marked as "ongoing" or database is being manually updated
- if self.db.check_run_status(self) == "ongoing" or force_update == True:
+ if self.db.check_run_status(self) == "ongoing" or force_update is True:
logger.info(
f"{self.run_name}: Run exists in the database with run status: {self.db.check_run_status(self)}."
)
@@ -185,7 +188,6 @@ def update_db_entry(self, force_update=False):
)
def parse_pore_activity(self, db_update):
-
logger.info(f"{self.run_name}: Parsing pore activity...")
pore_activity = {}
@@ -230,7 +232,7 @@ def parse_minknow_json(self, db_update):
logger.info(f"{self.run_name}:Parsing report JSON...")
- dict_json_report = json.load(open(self.get_file("/report*.json"), "r"))
+ dict_json_report = json.load(open(self.get_file("/report*.json")))
# Initialize return dict
parsed_data = {}
@@ -257,7 +259,10 @@ def parse_minknow_json(self, db_update):
# -- Run output subsection
seq_metadata_trimmed["acquisition_output"] = []
for section in seq_metadata["acquisition_output"]:
- if section["type"] in ["AllData", "SplitByBarcode"]:
+ if "type" not in section.keys() or section["type"] in [
+ "AllData",
+ "SplitByBarcode",
+ ]:
seq_metadata_trimmed["acquisition_output"].append(section)
# -- Read length subsection
@@ -282,11 +287,13 @@ def copy_metadata(self):
"**/bam*/***",
"**/fast5*/***",
"**/fastq*/***",
+ "**/pod5*/***",
# Any files found elsewhere
"*.bam*",
"*.bai*",
"*.fast5*",
"*.fastq*",
+ "*.pod5*",
]
exclude_patterns_quoted = ["'" + pattern + "'" for pattern in exclude_patterns]
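The exclude patterns are shell-quoted, which suggests they are interpolated into a shell command further down (outside this hunk). A hedged sketch of how such patterns could be folded into rsync --exclude flags; the source and destination paths are placeholders:

# Placeholder paths; only the flag assembly is illustrated here.
exclude_patterns = ["**/bam*/***", "**/fastq*/***", "**/pod5*/***", "*.bam*", "*.fastq*", "*.pod5*"]
exclude_patterns_quoted = ["'" + pattern + "'" for pattern in exclude_patterns]

rsync_cmd = (
    "rsync -rv "
    + " ".join(f"--exclude={p}" for p in exclude_patterns_quoted)
    + " /data/runs/RUN_X/ user@ngi-internal:/metadata/RUN_X/"
)
print(rsync_cmd)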
@@ -299,7 +306,6 @@ def copy_metadata(self):
)
def copy_html_report(self):
-
logger.info(f"{self.run_name}: Transferring .html report to ngi-internal...")
# Transfer the MinKNOW .html report file to ngi-internal, renaming it to the full run ID. Requires password-free SSH access.
@@ -352,10 +358,10 @@ def update_transfer_log(self):
with open(self.transfer_details["transfer_log"], "a") as f:
tsv_writer = csv.writer(f, delimiter="\t")
tsv_writer.writerow([self.run_name, str(datetime.now())])
- except IOError:
+ except OSError:
msg = f"{self.run_name}: Could not update the transfer logfile {self.transfer_details['transfer_log']}"
logger.error(msg)
- raise IOError(msg)
+ raise OSError(msg)
# Archive run
@@ -372,7 +378,7 @@ class ONT_user_run(ONT_run):
def __init__(self, run_abspath: str):
self.run_type = "user_run"
- super(ONT_user_run, self).__init__(run_abspath)
+ super().__init__(run_abspath)
class ONT_qc_run(ONT_run):
@@ -380,7 +386,7 @@ class ONT_qc_run(ONT_run):
def __init__(self, run_abspath: str):
self.run_type = "qc_run"
- super(ONT_qc_run, self).__init__(run_abspath)
+ super().__init__(run_abspath)
# Get Anglerfish attributes from run
self.anglerfish_done_abspath = f"{self.run_abspath}/.anglerfish_done"
@@ -404,7 +410,7 @@ def get_anglerfish_exit_code(self) -> Union[int, None]:
Return exit code or None.
"""
if os.path.exists(self.anglerfish_done_abspath):
- return int(open(self.anglerfish_done_abspath, "r").read())
+ return int(open(self.anglerfish_done_abspath).read())
else:
return None
@@ -413,7 +419,7 @@ def get_anglerfish_pid(self) -> Union[str, None]:
Return process ID or None."""
if os.path.exists(self.anglerfish_ongoing_abspath):
- return str(open(self.anglerfish_ongoing_abspath, "r").read())
+ return str(open(self.anglerfish_ongoing_abspath).read())
else:
return None
@@ -458,12 +464,32 @@ def fetch_anglerfish_samplesheet(self) -> bool:
f"{self.run_name}: Error occured when copying anglerfish samplesheet to run dir."
)
+ def has_fastq_output(self) -> bool:
+ """Check whether run has fastq output."""
+
+ reads_dir = os.path.join(self.run_abspath, "fastq_pass")
+
+ return os.path.exists(reads_dir)
+
+ def has_barcode_dirs(self) -> bool:
+ barcode_dir_pattern = r"barcode\d{2}"
+
+ for dir in os.listdir(os.path.join(self.run_abspath, "fastq_pass")):
+ if re.search(barcode_dir_pattern, dir):
+ return True
+
+ return False
+
def run_anglerfish(self):
"""Run Anglerfish as subprocess within it's own Conda environment.
Dump files to indicate ongoing and finished processes.
"""
+ timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S")
+
+ # "anglerfish_run*" is the dir pattern recognized by the LIMS script parsing the results
anglerfish_run_name = "anglerfish_run"
+
n_threads = 2 # This could possibly be changed
anglerfish_command = [
@@ -473,9 +499,18 @@ def run_anglerfish(self):
f"--run_name {anglerfish_run_name}",
f"--threads {n_threads}",
"--lenient",
- "--ont_barcodes",
"--skip_demux",
]
+ if self.has_barcode_dirs():
+ anglerfish_command.append("--barcoding")
+
+ # Create dir to trace TACA executing Anglerfish as a subprocess
+ taca_anglerfish_run_dir = f"taca_anglerfish_run_{timestamp}"
+ os.mkdir(taca_anglerfish_run_dir)
+ # Copy samplesheet used for traceability
+ shutil.copy(self.anglerfish_samplesheet, f"{taca_anglerfish_run_dir}/")
+ # Create files to dump subprocess std
+ stderr_relpath = f"{taca_anglerfish_run_dir}/stderr.txt"
full_command = [
# Dump subprocess PID into 'run-ongoing'-indicator file.
@@ -484,20 +519,29 @@ def run_anglerfish(self):
"conda run -n anglerfish " + " ".join(anglerfish_command),
# Dump Anglerfish exit code into file
f"echo $? > {self.anglerfish_done_abspath}",
- # Copy the Anglerfish samplesheet used to start the run into the run dir, for traceability
- # (The correct anglerfish run dir is identified by it being younger than the "run-ongoing" file)
- f"new_runs=$(find . -type d -name 'anglerfish_run*' -newer {self.anglerfish_ongoing_abspath})",
- f"if [[ $(echo '${{new_runs}}' | wc -l) -eq 1 ]] ; then cp {self.anglerfish_samplesheet} ${{new_runs}}/ ; fi",
- # Regardless of exit status: Remove 'run-ongoing' file.
+ # Move run to subdir
+ # 1) Find the latest Anglerfish run dir (younger than the 'run-ongoing' file)
+ f'find {self.run_abspath} -name "anglerfish_run*" -type d -newer {self.run_abspath}/.anglerfish_ongoing '
+ # 2) Move the Anglerfish run dir into the TACA Anglerfish run dir
+ + "-exec mv \{\} "
+ + f"{self.run_abspath}/{taca_anglerfish_run_dir}/ \; "
+ # 3) Only do this once
+ + "-quit",
+ # Remove 'run-ongoing' file.
f"rm {self.anglerfish_ongoing_abspath}",
]
+ with open(f"{taca_anglerfish_run_dir}/command.sh", "w") as stream:
+ stream.write("\n".join(full_command))
+
# Start Anglerfish subprocess
- process = subprocess.Popen(
- "; ".join(full_command),
- shell=True,
- cwd=self.run_abspath,
- )
+ with open(stderr_relpath, "w") as stderr:
+ process = subprocess.Popen(
+ f"bash {taca_anglerfish_run_dir}/command.sh",
+ shell=True,
+ cwd=self.run_abspath,
+ stderr=stderr,
+ )
logger.info(
f"{self.run_name}: Anglerfish subprocess started with process ID {process.pid}."
)
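The new tracing approach writes the shell steps to a command.sh inside a timestamped taca_anglerfish_run_* directory and launches that script detached, with stderr captured to a file next to it. A stripped-down, runnable sketch of the same pattern; the paths and the echoed command are placeholders, not the real Anglerfish invocation:

import os
import subprocess
from datetime import datetime

run_dir = "/tmp"  # placeholder for the run directory
timestamp = datetime.now().strftime("%Y_%m_%d_%H%M%S")
trace_dir = os.path.join(run_dir, f"taca_anglerfish_run_{timestamp}")
os.mkdir(trace_dir)

full_command = [
    f"echo $$ > {trace_dir}/.ongoing",              # dump the subprocess PID
    "echo 'stand-in for the anglerfish call'",      # placeholder command
    f"echo $? > {trace_dir}/.done",                 # dump the exit code
    f"rm {trace_dir}/.ongoing",                     # clear the 'run-ongoing' marker
]
with open(os.path.join(trace_dir, "command.sh"), "w") as stream:
    stream.write("\n".join(full_command))

with open(os.path.join(trace_dir, "stderr.txt"), "w") as stderr:
    process = subprocess.Popen(
        f"bash {trace_dir}/command.sh", shell=True, cwd=run_dir, stderr=stderr
    )
print(f"started with PID {process.pid}")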
diff --git a/taca/nanopore/__init__.py b/taca/nanopore/__init__.py
index c8b7802c..5063a460 100644
--- a/taca/nanopore/__init__.py
+++ b/taca/nanopore/__init__.py
@@ -1,3 +1,3 @@
"""
Classes to parse and work with ONT data
-"""
\ No newline at end of file
+"""
diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py
index 75c2d56d..978701aa 100644
--- a/taca/nanopore/instrument_transfer.py
+++ b/taca/nanopore/instrument_transfer.py
@@ -2,14 +2,14 @@
"""
__version__ = "1.0.13"
+import argparse
import logging
import os
import re
import shutil
-import argparse
import subprocess
-from glob import glob
from datetime import datetime as dt
+from glob import glob
def main(args):
@@ -48,19 +48,18 @@ def main(args):
# Iterate over runs
for run_path in run_paths:
-
logging.info(f"Handling {run_path}...")
if run_path.split(os.sep)[-2][0:3] == "QC_":
# For QC runs, the sample name should start with "QC_"
- logging.info(f"Run categorized as QC.")
+ logging.info("Run categorized as QC.")
rsync_dest = args.dest_dir_qc
else:
rsync_dest = args.dest_dir
- logging.info(f"Dumping run path...")
+ logging.info("Dumping run path...")
dump_path(run_path)
- logging.info(f"Dumping QC and MUX history...")
+ logging.info("Dumping QC and MUX history...")
dump_pore_count_history(run_path, pore_counts)
if not sequencing_finished(run_path):
@@ -96,7 +95,7 @@ def write_finished_indicator(run_path):
open(new_file, "w").close()
-def sync_to_storage(run_dir, destination, log):
+def sync_to_storage(run_dir: str, destination: str, rsync_log: str):
"""Sync the run to storage using rsync.
Skip if rsync is already running on the run."""
@@ -104,7 +103,7 @@ def sync_to_storage(run_dir, destination, log):
"run-one",
"rsync",
"-rvu",
- "--log-file=" + log,
+ "--log-file=" + rsync_log,
run_dir,
destination,
]
@@ -115,17 +114,19 @@ def sync_to_storage(run_dir, destination, log):
)
-def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log: list[str]):
+def final_sync_to_storage(
+ run_dir: str, destination: str, archive_dir: str, rsync_log: str
+):
"""Do a final sync of the run to storage, then archive it.
Skip if rsync is already running on the run."""
- logging.info("Performing a final sync of {} to storage".format(run_dir))
+ logging.info(f"Performing a final sync of {run_dir} to storage")
command = [
"run-one",
"rsync",
"-rvu",
- "--log-file=" + log,
+ "--log-file=" + rsync_log,
run_dir,
destination,
]
@@ -140,9 +141,7 @@ def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log:
archive_finished_run(run_dir, archive_dir)
else:
logging.info(
- "Previous rsync might be running still. Skipping {} for now.".format(
- run_dir
- )
+ f"Previous rsync might be running still. Skipping {run_dir} for now."
)
return
@@ -155,7 +154,7 @@ def archive_finished_run(run_dir: str, archive_dir: str):
sample_dir = os.path.dirname(run_dir)
exp_dir = os.path.dirname(sample_dir)
- run_name = os.path.basename(run_dir)
+ os.path.basename(run_dir)
sample_name = os.path.basename(sample_dir)
exp_name = os.path.basename(exp_dir)
@@ -214,9 +213,9 @@ def parse_position_logs(minknow_logs_dir: str) -> list:
for row in "ABCDEFGH":
positions.append(col + row)
- entries = []
+ headers = []
+ header: dict | None = None
for position in positions:
-
log_files = glob(
os.path.join(minknow_logs_dir, position, "control_server_log-*.txt")
)
@@ -227,32 +226,35 @@ def parse_position_logs(minknow_logs_dir: str) -> list:
for log_file in log_files:
with open(log_file) as stream:
lines = stream.readlines()
- for i in range(0, len(lines)):
- line = lines[i]
- if line[0:4] != " ":
+
+ # Iterate across log lines
+ for line in lines:
+ if not line[0:4] == " ":
# Line is log header
split_header = line.split(" ")
timestamp = " ".join(split_header[0:2])
category = " ".join(split_header[2:])
- entry = {
+ header = {
"position": position,
"timestamp": timestamp.strip(),
"category": category.strip(),
}
- entries.append(entry)
- else:
+ headers.append(header)
+
+ elif header:
# Line is log body
- if "body" not in entry:
- entry["body"] = {}
+ if "body" not in header.keys():
+ body: dict = {}
+ header["body"] = body
key = line.split(": ")[0].strip()
val = ": ".join(line.split(": ")[1:]).strip()
- entry["body"][key] = val
+ header["body"][key] = val
- entries.sort(key=lambda x: x["timestamp"])
- logging.info(f"Parsed {len(entries)} log entries.")
+ headers.sort(key=lambda x: x["timestamp"])
+ logging.info(f"Parsed {len(headers)} log entries.")
- return entries
+ return headers
def get_pore_counts(position_logs: list) -> list:
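The refactored parser treats any non-indented line as a log header (timestamp plus category) and folds the indented key: value lines that follow into that header's body dict. A self-contained sketch of that header/body parsing with made-up log lines; the exact field separators in real MinKNOW control_server logs are an assumption here:

# Invented log lines in the assumed header/body layout.
lines = [
    "2024-01-01 12:00:00.000    INFO: mux_scan_result (user_messages)\n",
    "    flow_cell_id: PAO12345\n",
    "    total_pores: 5123\n",
    "2024-01-01 13:00:00.000    INFO: platform_qc.report (user_messages)\n",
    "    flow_cell_id: PAO12345\n",
    "    passed: true\n",
]

headers = []
header = None
for line in lines:
    if line[0:4] != "    ":
        # Header line: timestamp followed by the category
        split_header = line.split("    ")
        timestamp, category = split_header[0], "    ".join(split_header[1:])
        header = {"position": "1A", "timestamp": timestamp.strip(), "category": category.strip(), "body": {}}
        headers.append(header)
    elif header:
        # Body line: indented key: value pair attached to the latest header
        key = line.split(": ")[0].strip()
        val = ": ".join(line.split(": ")[1:]).strip()
        header["body"][key] = val

print(headers[0]["body"]["total_pores"])  # 5123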
@@ -260,7 +262,6 @@ def get_pore_counts(position_logs: list) -> list:
pore_counts = []
for entry in position_logs:
-
if "INFO: platform_qc.report (user_messages)" in entry["category"]:
type = "qc"
elif "INFO: mux_scan_result (user_messages)" in entry["category"]:
@@ -269,7 +270,6 @@ def get_pore_counts(position_logs: list) -> list:
type = "other"
if type in ["qc", "mux"]:
-
new_entry = {
"flow_cell_id": entry["body"]["flow_cell_id"],
"timestamp": entry["timestamp"],
@@ -329,6 +329,7 @@ def dump_pore_count_history(run: str, pore_counts: list) -> str:
return new_file_path
+
# BEGIN_EXCLUDE
if __name__ == "__main__":
# This is clunky but should be fine since it will only ever run as a cronjob
@@ -367,4 +368,4 @@ def dump_pore_count_history(run: str, pore_counts: list) -> str:
args = parser.parse_args()
main(args)
-# END_EXCLUDE
\ No newline at end of file
+# END_EXCLUDE
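The QC check near the top of main() leans on the MinKNOW directory layout experiment/sample/run, so run_path.split(os.sep)[-2] is the sample directory and QC runs are recognized by a sample name starting with "QC_". A small sketch of that categorization under the assumed layout; the example paths are invented:

import os

def categorize(run_path: str) -> str:
    # Assumes .../<experiment>/<sample>/<run>; the [-2] component is the sample dir.
    sample_dir = run_path.split(os.sep)[-2]
    return "qc" if sample_dir[0:3] == "QC_" else "user"

print(categorize("/data/exp_001/QC_FLOWCELL123/20240101_1200_1A_PAO12345_abcdef12"))  # qc
print(categorize("/data/exp_001/P12345_sample/20240101_1200_1A_PAO12345_abcdef12"))   # user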
diff --git a/taca/server_status/cli.py b/taca/server_status/cli.py
index 723410df..1833035f 100644
--- a/taca/server_status/cli.py
+++ b/taca/server_status/cli.py
@@ -1,41 +1,46 @@
-import click
import logging
+import click
+
+from taca.server_status import (
+ cronjobs as cj,  # aliased to avoid a name clash with the command, which would otherwise raise an exception
+)
from taca.server_status import server_status as status
from taca.utils.config import CONFIG
-from taca.server_status import cronjobs as cj # to avoid similar names with command, otherwise exception
-@click.group(name='server_status')
+@click.group(name="server_status")
def server_status():
- """ Monitor server status """
+ """Monitor server status"""
+
# server status subcommands
@server_status.command()
-@click.option('--statusdb', is_flag=True, help="Update the statusdb")
+@click.option("--statusdb", is_flag=True, help="Update the statusdb")
def nases(statusdb):
- """ Checks the available space on all the nases
- """
- if not CONFIG.get('server_status', ''):
+ """Checks the available space on all the nases"""
+ if not CONFIG.get("server_status", ""):
logging.warning("Configuration missing required entries: server_status")
disk_space = status.get_nases_disk_space()
if statusdb:
- status.update_status_db(disk_space, server_type='nas')
+ status.update_status_db(disk_space, server_type="nas")
+
@server_status.command()
def cronjobs():
- """ Monitors cronjobs and updates statusdb
- """
+ """Monitors cronjobs and updates statusdb"""
cj.update_cronjob_db()
+
@server_status.command()
def monitor_promethion():
- """ Checks the status of PromethION and if ngi-nas is mounted
- """
- if not CONFIG.get('promethion_status', ''):
+ """Checks the status of the PromethION and whether ngi-nas is mounted"""
+ if not CONFIG.get("promethion_status", ""):
logging.warning("Configuration missing required entries: server_status")
promethion_status = status.check_promethion_status()
if promethion_status:
logging.info("No issues encountered with the PromethION")
else:
- logging.warning("An issue with the PromethION was encountered. Operator has been notified by email.")
\ No newline at end of file
+ logging.warning(
+ "An issue with the PromethION was encountered. Operator has been notified by email."
+ )
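Each subcommand above follows the same guard pattern: warn if the required config section is missing, run the check, and optionally push the result to statusdb. A standalone sketch of that pattern with stand-in helpers; the CONFIG dict and both functions below are placeholders, not TACA's:

import logging

CONFIG = {"server_status": {"servers": {}}}  # invented minimal config

def get_nases_disk_space():
    return {}  # stand-in for the ssh/df collection loop

def update_status_db(data, server_type=None):
    logging.info("Would push %d entries as %s", len(data), server_type)

def nases(statusdb: bool):
    if not CONFIG.get("server_status", ""):
        logging.warning("Configuration missing required entries: server_status")
    disk_space = get_nases_disk_space()
    if statusdb:
        update_status_db(disk_space, server_type="nas")

nases(statusdb=True)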
diff --git a/taca/server_status/cronjobs.py b/taca/server_status/cronjobs.py
index 9b808bd8..1f1605c4 100644
--- a/taca/server_status/cronjobs.py
+++ b/taca/server_status/cronjobs.py
@@ -1,77 +1,84 @@
+import datetime
+import getpass
import logging
import platform
-import getpass
-import datetime
from crontab import CronTab
+
from taca.utils import statusdb
from taca.utils.config import CONFIG
+
def _parse_crontab():
result = {}
user = getpass.getuser()
- logging.info('Getting crontab for user {}'.format(user))
+ logging.info(f"Getting crontab for user {user}")
try:
crontab = CronTab(user=user)
except Exception as e:
- logging.error('Cannot get a crontab for user: {}'.format(user))
+ logging.error(f"Cannot get a crontab for user: {user}")
logging.error(e.message)
else:
result[user] = []
for job in crontab.crons:
# this is for special syntax like @monthly or @reboot
- special_syntax = str(job).split()[0] if str(job).startswith('@') else ''
- result[user].append({'Command': job.command,
- 'Comment': job.comment,
- 'Enabled': job.enabled,
- 'Minute': str(job.minutes),
- 'Hour': str(job.hours),
- 'Day of month' : str(job.dom),
- 'Month': str(job.month),
- 'Day of week': str(job.dow),
- 'Special syntax': special_syntax})
+ special_syntax = str(job).split()[0] if str(job).startswith("@") else ""
+ result[user].append(
+ {
+ "Command": job.command,
+ "Comment": job.comment,
+ "Enabled": job.enabled,
+ "Minute": str(job.minutes),
+ "Hour": str(job.hours),
+ "Day of month": str(job.dom),
+ "Month": str(job.month),
+ "Day of week": str(job.dow),
+ "Special syntax": special_syntax,
+ }
+ )
return result
def update_cronjob_db():
- server = platform.node().split('.')[0]
+ server = platform.node().split(".")[0]
timestamp = datetime.datetime.now()
# parse results
result = _parse_crontab()
# connect to db
- statusdb_conf = CONFIG.get('statusdb')
- logging.info('Connecting to database: {}'.format(CONFIG.get('statusdb', {}).get('url')))
+ statusdb_conf = CONFIG.get("statusdb")
+ logging.info(
+ "Connecting to database: {}".format(CONFIG.get("statusdb", {}).get("url"))
+ )
try:
couch_connection = statusdb.StatusdbSession(statusdb_conf).connection
except Exception as e:
logging.error(e.message)
else:
# update document
- crontab_db = couch_connection['cronjobs']
- view = crontab_db.view('server/alias')
+ crontab_db = couch_connection["cronjobs"]
+ view = crontab_db.view("server/alias")
# to be safe
doc = {}
# create doc if not exist
if not view[server].rows:
- logging.info('Creating a document')
+ logging.info("Creating a document")
doc = {
- 'users': {user: cronjobs for user, cronjobs in result.items()},
- 'Last updated': str(timestamp),
- 'server': server,
+ "users": {user: cronjobs for user, cronjobs in result.items()},
+ "Last updated": str(timestamp),
+ "server": server,
}
# else: get existing doc
for row in view[server]:
- logging.info('Updating the document')
+ logging.info("Updating the document")
doc = crontab_db.get(row.value)
- doc['users'].update(result)
- doc['Last updated'] = str(timestamp)
+ doc["users"].update(result)
+ doc["Last updated"] = str(timestamp)
if doc:
try:
crontab_db.save(doc)
except Exception as e:
logging.error(e.message)
else:
- logging.info('{} has been successfully updated'.format(server))
+ logging.info(f"{server} has been successfully updated")
else:
- logging.warning('Document has not been created/updated')
-
+ logging.warning("Document has not been created/updated")
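update_cronjob_db assembles one document per server, with a users mapping of parsed crontab entries and a Last updated timestamp, before saving it to the cronjobs database. An illustration of that document shape with invented values:

import datetime

# All values below are invented; only the key layout mirrors the code above.
doc = {
    "server": "preproc1",
    "Last updated": str(datetime.datetime(2024, 1, 31, 12, 0, 0)),
    "users": {
        "funk_user": [
            {
                "Command": "taca server_status cronjobs",
                "Comment": "update cronjob db",
                "Enabled": True,
                "Minute": "0",
                "Hour": "*/4",
                "Day of month": "*",
                "Month": "*",
                "Day of week": "*",
                "Special syntax": "",
            }
        ]
    },
}
print(sorted(doc["users"]["funk_user"][0].keys()))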
diff --git a/taca/server_status/server_status.py b/taca/server_status/server_status.py
index a03a107a..3431da31 100644
--- a/taca/server_status/server_status.py
+++ b/taca/server_status/server_status.py
@@ -1,6 +1,6 @@
-import subprocess
-import logging
import datetime
+import logging
+import subprocess
from taca.utils import statusdb
from taca.utils.config import CONFIG
@@ -9,41 +9,42 @@
def get_nases_disk_space():
result = {}
- config = CONFIG['server_status']
- servers = config.get('servers', dict())
+ config = CONFIG["server_status"]
+ servers = config.get("servers", dict())
for server_url, path in servers.items():
-
# Get command
- command = '{command} {path}'.format(command=config['command'], path=path)
+ command = "{command} {path}".format(command=config["command"], path=path)
# If localhost, don't connect to ssh
- if server_url == 'localhost':
+ if server_url == "localhost":
command = command.split()
else:
- if 'promethion' in server_url:
- user = 'prom'
+ if "promethion" in server_url:
+ user = "prom"
else:
- user = config['user']
+ user = config["user"]
# Connect via ssh to server and execute the command
- command = ['ssh', '-t', '{}@{}'.format(user, server_url), command]
+ command = ["ssh", "-t", f"{user}@{server_url}", command]
result[server_url] = _run_cmd(command)
# Storage systems are mouted locally, e.g. ngi-nas
- for storage_system, path in config.get('storage_systems', {}).items():
+ for storage_system, path in config.get("storage_systems", {}).items():
# Get command
- command = '{command} {path}'.format(command=config['command'], path=path)
+ command = "{command} {path}".format(command=config["command"], path=path)
result[storage_system] = _run_cmd(command.split())
return result
+
def _run_cmd(command):
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output = proc.stdout.read().decode("utf-8")
return _parse_output(output)
-def _parse_output(output): # for nases
+
+def _parse_output(output): # for nases
# command = df -h /home
# output = Filesystem Size Used Avail Use% Mounted on
# /dev/mapper/VGStor-lv_illumina
@@ -59,39 +60,40 @@ def _parse_output(output): # for nases
disk_size = output[-5]
filesystem = output[-6]
- available_percentage = str(100 - int(used_percentage.replace('%',''))) + '%'
+ available_percentage = str(100 - int(used_percentage.replace("%", ""))) + "%"
result = {
- 'disk_size': disk_size,
- 'space_used': space_used,
- 'space_available': space_available,
- 'used_percentage': used_percentage,
- 'available_percentage': available_percentage,
- 'mounted_on': mounted_on,
- 'filesystem': filesystem
+ "disk_size": disk_size,
+ "space_used": space_used,
+ "space_available": space_available,
+ "used_percentage": used_percentage,
+ "available_percentage": available_percentage,
+ "mounted_on": mounted_on,
+ "filesystem": filesystem,
}
except:
# Sometimes it fails for whatever reason because Popen does not return what it is supposed to
result = {
- 'disk_size': 'NaN',
- 'space_used': 'NaN',
- 'space_available': 'NaN',
- 'used_percentage': 'NaN',
- 'available_percentage': 'NaN',
- 'mounted_on': 'NaN',
- 'filesystem': 'NaN'
+ "disk_size": "NaN",
+ "space_used": "NaN",
+ "space_available": "NaN",
+ "used_percentage": "NaN",
+ "available_percentage": "NaN",
+ "mounted_on": "NaN",
+ "filesystem": "NaN",
}
- logging.error('Can not parse the output: {}'.format(output))
+ logging.error(f"Cannot parse the output: {output}")
return result
+
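_parse_output splits the whole df -h output into tokens and reads the interesting fields from the end, which also copes with the wrapped filesystem line shown in the comments above. A self-contained sketch with an invented df output; the mapping of the first few fields mirrors the df column order and is inferred from the surrounding code:

# Invented `df -h` output in the two-line format hinted at above.
output = """Filesystem            Size  Used Avail Use% Mounted on
/dev/mapper/VGStor-lv_illumina
                       24T   12T   13T  49% /srv/illumina
"""

tokens = output.split()
mounted_on = tokens[-1]
used_percentage = tokens[-2]
space_available = tokens[-3]
space_used = tokens[-4]
disk_size = tokens[-5]
filesystem = tokens[-6]
available_percentage = str(100 - int(used_percentage.replace("%", ""))) + "%"

print(disk_size, space_used, space_available, used_percentage, available_percentage, mounted_on)
# 24T 12T 13T 49% 51% /srv/illumina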
def update_status_db(data, server_type=None):
- """ Pushed the data to status db.
+ """Push the data to status db.
data can be from nases
server_type should be 'nas'.
"""
- db_config = CONFIG.get('statusdb')
+ db_config = CONFIG.get("statusdb")
if db_config is None:
logging.error('"statusdb" must be present in the config file!')
raise RuntimeError('"statusdb" must be present in the config file!')
@@ -101,14 +103,14 @@ def update_status_db(data, server_type=None):
logging.error(e.message)
raise
- db = couch_connection['server_status']
- logging.info('Connection established')
- for key in data.keys(): # data is dict of dicts
- server = data[key] # data[key] is dictionary (the command output)
- server['name'] = key # key is nas url
+ db = couch_connection["server_status"]
+ logging.info("Connection established")
+ for key in data.keys(): # data is dict of dicts
+ server = data[key] # data[key] is dictionary (the command output)
+ server["name"] = key # key is nas url
# datetime.datetime(2015, 11, 18, 9, 54, 33, 473189) is not JSON serializable
- server['time'] = datetime.datetime.now().isoformat()
- server['server_type'] = server_type or 'unknown'
+ server["time"] = datetime.datetime.now().isoformat()
+ server["server_type"] = server_type or "unknown"
try:
db.save(server)
@@ -116,27 +118,30 @@ def update_status_db(data, server_type=None):
logging.error(e.message)
raise
else:
- logging.info('{}: Server status has been updated'.format(key))
+ logging.info(f"{key}: Server status has been updated")
+
def check_promethion_status():
- config = CONFIG.get('promethion_status')
- server = config.get('server')
- path = config.get('path')
- command = config.get('command')
- command_to_run = f'{command} {path}'
- user = config.get('user')
+ config = CONFIG.get("promethion_status")
+ server = config.get("server")
+ path = config.get("path")
+ command = config.get("command")
+ command_to_run = f"{command} {path}"
+ user = config.get("user")
try:
- subprocess.run(['ssh', '-t', f'{user}@{server}', command_to_run],
- check=True)
+ subprocess.run(["ssh", "-t", f"{user}@{server}", command_to_run], check=True)
except subprocess.CalledProcessError:
_send_promethion_warning_email()
return False
return True
+
def _send_promethion_warning_email():
- email_recipients = CONFIG.get('mail').get('recipients')
- email_subject = ('An issue with the PromethION has been detected.')
- email_message = ('An issue with the PromethION has been detected. '
- 'Please investigate and consider pausing the transfer cronjob on preproc1')
- send_mail(email_subject, email_message, email_recipients)
\ No newline at end of file
+ email_recipients = CONFIG.get("mail").get("recipients")
+ email_subject = "An issue with the PromethION has been detected."
+ email_message = (
+ "An issue with the PromethION has been detected. "
+ "Please investigate and consider pausing the transfer cronjob on preproc1"
+ )
+ send_mail(email_subject, email_message, email_recipients)
diff --git a/taca/testing/cli.py b/taca/testing/cli.py
index 63b89a35..2abcea9e 100644
--- a/taca/testing/cli.py
+++ b/taca/testing/cli.py
@@ -1,67 +1,93 @@
-
""" CLI for the testing commands
"""
-from __future__ import print_function
import os
+
import click
+
import taca.testing.create_uppmax_like_env as createupp
-@click.group(name='uppmax_env')
+
+@click.group(name="uppmax_env")
def uppmax_env():
- """ Create a local set of folders that resembles the uppmax-ngi env. Creates config file for ngi_pipeline, taca, and taca ngi-pipeline. Only a minimal taca config is needed (statusdb and log)
- The condig file (in general saved in variable NGI_CONFIG needs to looks something similar to:
+ """Create a local set of folders that resembles the uppmax-ngi env. Creates config files for ngi_pipeline, taca, and taca ngi-pipeline. Only a minimal taca config is needed (statusdb and log).
+ The config file (in general saved in the variable NGI_CONFIG) needs to look something like:
- \b
- environment:
- project_id: ngi1234 #CAN BE ANYTHING
- ngi_scripts_dir: /Users/vezzi/opt/ngi_pipeline/scripts #CAN BE ANYTHING
- conda_env: TACA #CAN BE ANYTHING
- flowcell_inbox:
- - /Users/vezzi/opt/uppmax_env/incoming/ #NEEDS TO EXISTS
- analysis:
- best_practice_analysis:
- whole_genome_reseq:
- analysis_engine: ngi_pipeline.engines.piper_ngi
- IGN:
- analysis_engine: ngi_pipeline.engines.piper_ngi
+ \b
+ environment:
+ project_id: ngi1234 #CAN BE ANYTHING
+ ngi_scripts_dir: /Users/vezzi/opt/ngi_pipeline/scripts #CAN BE ANYTHING
+ conda_env: TACA #CAN BE ANYTHING
+ flowcell_inbox:
+ - /Users/vezzi/opt/uppmax_env/incoming/ #NEEDS TO EXIST
+ analysis:
+ best_practice_analysis:
+ whole_genome_reseq:
+ analysis_engine: ngi_pipeline.engines.piper_ngi
+ IGN:
+ analysis_engine: ngi_pipeline.engines.piper_ngi
- qc:
+ qc:
- analysis_engine: ngi_pipeline.engines.qc_ngi
+ analysis_engine: ngi_pipeline.engines.qc_ngi
- base_root: /Users/vezzi/opt/ #NEEDS TO EXISTS
- sthlm_root: uppmax_env #NEEDS TO EXISTS
- top_dir: nobackup/NGI #NEEDS TO EXISTS
- upps_root: nothing #CAN BE ANYTHING
- logging:
- log_file: "/Users/vezzi/opt/log/ngi_pipeline.log" #NEEDS TO BE REAL
+ base_root: /Users/vezzi/opt/ #NEEDS TO EXIST
+ sthlm_root: uppmax_env #NEEDS TO EXIST
+ top_dir: nobackup/NGI #NEEDS TO EXIST
+ upps_root: nothing #CAN BE ANYTHING
+ logging:
+ log_file: "/Users/vezzi/opt/log/ngi_pipeline.log" #NEEDS TO BE REAL
- \b
- The requested project will be divided into the following sets:
- - 2/3 will be selected among the projects with application equeal to 'WG re-seq'. These will be divided up in:
- - 1/4: closed more than 3 months ago
- - 1/4: closed more than 1 month ago, less than 3 months
- - 1/4: closed less than 1 month ago
- - 1/4: open
- - 1/3 will be selected amonf the projects with application different from 'WG re-seq':
- - 1/4: closed more than 3 months ago
- - 1/4: closed more than 1 month ago, less than 3 months
- - 1/4: closed less than 1 month ago
- - 1/4: open
+ \b
+ The requested project will be divided into the following sets:
+ - 2/3 will be selected among the projects with application equal to 'WG re-seq'. These will be divided up into:
+ - 1/4: closed more than 3 months ago
+ - 1/4: closed more than 1 month ago, less than 3 months
+ - 1/4: closed less than 1 month ago
+ - 1/4: open
+ - 1/3 will be selected among the projects with application different from 'WG re-seq':
+ - 1/4: closed more than 3 months ago
+ - 1/4: closed more than 1 month ago, less than 3 months
+ - 1/4: closed less than 1 month ago
+ - 1/4: open
- """
+ """
pass
-@uppmax_env.command()
-@click.option('-p', '--projects', type=int, default=30, help='number of projects to be extracted from statusdb')
-@click.option('-nc', '--ngi-config', type=str, default=os.environ.get('NGI_CONFIG') , help='path to ngi configuration file (expected in variable NGI_CONFIG)')
-@click.option('-fq1', '--fastq_1', type=click.Path(exists=True, dir_okay=False), default=None , help='Path to fastq file for read 1')
-@click.option('-fq2', '--fastq_2', type=click.Path(exists=True, dir_okay=False), default=None , help='Path to fastq file for read 2')
+@uppmax_env.command()
+@click.option(
+ "-p",
+ "--projects",
+ type=int,
+ default=30,
+ help="number of projects to be extracted from statusdb",
+)
+@click.option(
+ "-nc",
+ "--ngi-config",
+ type=str,
+ default=os.environ.get("NGI_CONFIG"),
+ help="path to ngi configuration file (expected in variable NGI_CONFIG)",
+)
+@click.option(
+ "-fq1",
+ "--fastq_1",
+ type=click.Path(exists=True, dir_okay=False),
+ default=None,
+ help="Path to fastq file for read 1",
+)
+@click.option(
+ "-fq2",
+ "--fastq_2",
+ type=click.Path(exists=True, dir_okay=False),
+ default=None,
+ help="Path to fastq file for read 2",
+)
def create(projects, ngi_config, fastq_1, fastq_2):
- """creates a uppmax like env
- """
- if (fastq_1 is None and fastq_2 is not None) or (fastq_1 is not None and fastq_2 is None):
+ """creates a uppmax like env"""
+ if (fastq_1 is None and fastq_2 is not None) or (
+ fastq_1 is not None and fastq_2 is None
+ ):
print("ERROR: either both fastq_1 and fastq_2 are specified or none of them")
return 1
if fastq_1 is not None:
@@ -71,11 +97,13 @@ def create(projects, ngi_config, fastq_1, fastq_2):
if which("ngi_pipeline_start.py"):
createupp.create(projects, ngi_config, fastq_1, fastq_2)
else:
- print("ERROR: ngi_pipeline_start.py needs to be available and properly installed")
+ print(
+ "ERROR: ngi_pipeline_start.py needs to be available and properly installed"
+ )
def which(file):
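+    """Return True if file is found in any directory on PATH, otherwise False."""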
for path in os.environ["PATH"].split(os.pathsep):
if os.path.exists(os.path.join(path, file)):
- return True
+ return True
return False
diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py
index e4852b42..d3c4a615 100644
--- a/taca/testing/create_uppmax_like_env.py
+++ b/taca/testing/create_uppmax_like_env.py
@@ -1,140 +1,169 @@
""" Load and parse configuration file."""
-from __future__ import print_function
+import datetime
import logging
import os
-import datetime
import random
import subprocess
+import sys
from dateutil.relativedelta import relativedelta
-from taca.utils.config import CONFIG
+
from taca.utils import config as conf
from taca.utils import filesystem as fs
from taca.utils import statusdb
-from io import open
-
+from taca.utils.config import CONFIG
logger = logging.getLogger(__name__)
def create_version_report(path):
# Creates the version_report.txt file for runs processed with ngi_pipeline
- with open(os.path.join(path, 'version_report.txt'), 'w') as VERSION_REPORT:
- VERSION_REPORT.write(u'******\n')
- VERSION_REPORT.write(u'README\n')
- VERSION_REPORT.write(u'******\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. The pipeline used was Piper, see below for more information.\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'The versions of programs and references used:\n')
- VERSION_REPORT.write(u'piper: unknown\n')
- VERSION_REPORT.write(u'bwa: 0.7.12\n')
- VERSION_REPORT.write(u'samtools: 0.1.19\n')
- VERSION_REPORT.write(u'qualimap: v2.2\n')
- VERSION_REPORT.write(u'snpEff: 4.1\n')
- VERSION_REPORT.write(u'snpEff reference: GRCh37.75\n')
- VERSION_REPORT.write(u'gatk: 3.3-0-geee94ec\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'reference: human_g1k_v37.fasta\n')
- VERSION_REPORT.write(u'db_snp: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'hapmap: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'omni: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'1000G_indels: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n')
- VERSION_REPORT.write(u'indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'piper\n')
- VERSION_REPORT.write(u'-----\n')
- VERSION_REPORT.write(u'Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n')
+ with open(os.path.join(path, "version_report.txt"), "w") as VERSION_REPORT:
+ VERSION_REPORT.write("******\n")
+ VERSION_REPORT.write("README\n")
+ VERSION_REPORT.write("******\n")
+ VERSION_REPORT.write("\n")
+ VERSION_REPORT.write(
+ "Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. The pipeline used was Piper, see below for more information.\n"
+ )
+ VERSION_REPORT.write("\n")
+ VERSION_REPORT.write("The versions of programs and references used:\n")
+ VERSION_REPORT.write("piper: unknown\n")
+ VERSION_REPORT.write("bwa: 0.7.12\n")
+ VERSION_REPORT.write("samtools: 0.1.19\n")
+ VERSION_REPORT.write("qualimap: v2.2\n")
+ VERSION_REPORT.write("snpEff: 4.1\n")
+ VERSION_REPORT.write("snpEff reference: GRCh37.75\n")
+ VERSION_REPORT.write("gatk: 3.3-0-geee94ec\n")
+ VERSION_REPORT.write("\n")
+ VERSION_REPORT.write("reference: human_g1k_v37.fasta\n")
+ VERSION_REPORT.write("db_snp: gatk-bundle/2.8\n")
+ VERSION_REPORT.write("hapmap: gatk-bundle/2.8\n")
+ VERSION_REPORT.write("omni: gatk-bundle/2.8\n")
+ VERSION_REPORT.write("1000G_indels: gatk-bundle/2.8\n")
+ VERSION_REPORT.write(
+ "Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n"
+ )
+ VERSION_REPORT.write("\n")
+ VERSION_REPORT.write(
+ "indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n"
+ )
+ VERSION_REPORT.write(
+ "indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n"
+ )
+ VERSION_REPORT.write("\n")
+ VERSION_REPORT.write("piper\n")
+ VERSION_REPORT.write("-----\n")
+ VERSION_REPORT.write(
+ "Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n"
+ )
-def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None ):
+
+def create_FC(incoming_dir, run_name, samplesheet, fastq_1=None, fastq_2=None):
# Create something like 160217_ST-E00201_0063_AHJHNYCCXX
path_to_fc = os.path.join(incoming_dir, run_name)
if os.path.exists(path_to_fc):
# This FC exists, skip it
return
fs.create_folder(path_to_fc)
- fs.touch(os.path.join(path_to_fc, 'RTAComplete.txt'))
+ fs.touch(os.path.join(path_to_fc, "RTAComplete.txt"))
# Create folder Demultiplexing
- fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing'))
+ fs.create_folder(os.path.join(path_to_fc, "Demultiplexing"))
# Create folder Demultiplexing/Reports
- fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', 'Reports'))
+ fs.create_folder(os.path.join(path_to_fc, "Demultiplexing", "Reports"))
# Create folder Demultiplexing/Stats
- fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', 'Stats'))
+ fs.create_folder(os.path.join(path_to_fc, "Demultiplexing", "Stats"))
# Memorise SampleSheet stats
header = []
for key in samplesheet[0]:
header.append(key)
counter = 1
- current_lane = ''
+ current_lane = ""
for line in samplesheet:
- project_name = line.get('Sample_Project', line.get('Project', ''))
- lane = line['Lane']
- if current_lane == '':
+ project_name = line.get("Sample_Project", line.get("Project", ""))
+ lane = line["Lane"]
+ if current_lane == "":
current_lane = lane
elif current_lane != lane:
counter = 1
current_lane = lane
- sample_id = line.get('SampleID', line.get('Sample_ID', ''))
- sample_name = line.get('SampleName', line.get('Sample_Name', ''))
+ sample_id = line.get("SampleID", line.get("Sample_ID", ""))
+ sample_name = line.get("SampleName", line.get("Sample_Name", ""))
# Create dir structure
- fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', project_name, sample_id))
+ fs.create_folder(
+ os.path.join(path_to_fc, "Demultiplexing", project_name, sample_id)
+ )
# Now create the data
- fastq_1_dest = '{}_S{}_L00{}_R1_001.fastq.gz'.format(sample_name, counter, lane)
- fastq_2_dest = '{}_S{}_L00{}_R2_001.fastq.gz'.format(sample_name, counter, lane)
+ fastq_1_dest = f"{sample_name}_S{counter}_L00{lane}_R1_001.fastq.gz"
+ fastq_2_dest = f"{sample_name}_S{counter}_L00{lane}_R2_001.fastq.gz"
counter += 1
if fastq_1 is None:
- fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name,
- sample_id, fastq_1_dest))
- fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name,
- sample_id, fastq_2_dest))
+ fs.touch(
+ os.path.join(
+ path_to_fc, "Demultiplexing", project_name, sample_id, fastq_1_dest
+ )
+ )
+ fs.touch(
+ os.path.join(
+ path_to_fc, "Demultiplexing", project_name, sample_id, fastq_2_dest
+ )
+ )
else:
- fs.do_symlink(fastq_1, os.path.join(path_to_fc, 'Demultiplexing',
- project_name, sample_id, fastq_1_dest))
- fs.do_symlink(fastq_2, os.path.join(path_to_fc, 'Demultiplexing',
- project_name, sample_id, fastq_2_dest))
+ fs.do_symlink(
+ fastq_1,
+ os.path.join(
+ path_to_fc, "Demultiplexing", project_name, sample_id, fastq_1_dest
+ ),
+ )
+ fs.do_symlink(
+ fastq_2,
+ os.path.join(
+ path_to_fc, "Demultiplexing", project_name, sample_id, fastq_2_dest
+ ),
+ )
- with open(os.path.join(path_to_fc, 'SampleSheet.csv'), 'w') as Samplesheet_file:
- Samplesheet_file.write(u'[Header]\n')
- Samplesheet_file.write(u'Date,2016-03-29\n')
- Samplesheet_file.write(u'Investigator Name,Christian Natanaelsson\n')
- Samplesheet_file.write(u'[Data]\n')
+ with open(os.path.join(path_to_fc, "SampleSheet.csv"), "w") as Samplesheet_file:
+ Samplesheet_file.write("[Header]\n")
+ Samplesheet_file.write("Date,2016-03-29\n")
+ Samplesheet_file.write("Investigator Name,Christian Natanaelsson\n")
+ Samplesheet_file.write("[Data]\n")
for key in header:
- Samplesheet_file.write(u'{},'.format(key))
- Samplesheet_file.write(u'\n')
+ Samplesheet_file.write(f"{key},")
+ Samplesheet_file.write("\n")
for line in samplesheet:
for key in header:
- Samplesheet_file.write(u'{},'.format(line[key]))
- Samplesheet_file.write(u'\n')
+ Samplesheet_file.write(f"{line[key]},")
+ Samplesheet_file.write("\n")
+
def create_uppmax_env(ngi_config):
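+    """Create the uppmax-like folder structure described in ngi_config and return a dict of the relevant paths."""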
paths = {}
- if 'analysis' not in ngi_config:
- sys.exit('ERROR: analysis must be a field of NGI_CONFIG.')
+ if "analysis" not in ngi_config:
+ sys.exit("ERROR: analysis must be a field of NGI_CONFIG.")
try:
- base_root = ngi_config['analysis']['base_root']
- paths['base_root'] = base_root
- sthlm_root = ngi_config['analysis']['sthlm_root']
- paths['sthlm_root'] = sthlm_root
- top_dir = ngi_config['analysis']['top_dir']
- paths['top_dir'] = top_dir
+ base_root = ngi_config["analysis"]["base_root"]
+ paths["base_root"] = base_root
+ sthlm_root = ngi_config["analysis"]["sthlm_root"]
+ paths["sthlm_root"] = sthlm_root
+ top_dir = ngi_config["analysis"]["top_dir"]
+ paths["top_dir"] = top_dir
except KeyError as e:
- raise SystemExit('Config file is missing the key {}, make sure it have all required information'.format(str(e)))
- if 'environment' not in ngi_config:
- sys.exit('ERROR: environment must be a field of NGI_CONFIG.')
+ raise SystemExit(
+ f"Config file is missing the key {str(e)}, make sure it have all required information"
+ )
+ if "environment" not in ngi_config:
+ sys.exit("ERROR: environment must be a field of NGI_CONFIG.")
try:
# Get base root
- flowcell_inboxes = ngi_config['environment']['flowcell_inbox']
- flowcell_inbox = flowcell_inboxes[0] # I assume there is only one
- paths['flowcell_inbox'] = flowcell_inbox
+ flowcell_inboxes = ngi_config["environment"]["flowcell_inbox"]
+ flowcell_inbox = flowcell_inboxes[0] # I assume there is only one
+ paths["flowcell_inbox"] = flowcell_inbox
except ValueError as e:
- sys.exit('key error, flowcell_inbox not found in "{}": {}'.format(ngi_config, e))
+ sys.exit(f'key error, flowcell_inbox not found in "{ngi_config}": {e}')
# Now I need to create the folders for this
if not os.path.exists(base_root):
- sys.exit('base_root needs to exists: {}'.format(base_root))
+ sys.exit(f"base_root needs to exists: {base_root}")
fs.create_folder(flowcell_inbox)
if sthlm_root is None:
path_to_analysis = os.path.join(base_root, top_dir)
@@ -143,72 +172,89 @@ def create_uppmax_env(ngi_config):
fs.create_folder(path_to_analysis)
return paths
+
def produce_analysis_qc_ngi(ngi_config, project_id):
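+    """Create an empty qc_ngi analysis skeleton (fastqc and fastq_screen folders) for every sample folder in the project's DATA directory."""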
- analysis_dir = os.path.join(ngi_config['analysis']['base_root'],
- ngi_config['analysis']['sthlm_root'],
- ngi_config['analysis']['top_dir'],
- 'ANALYSIS', project_id)
- data_dir = os.path.join(ngi_config['analysis']['base_root'],
- ngi_config['analysis']['sthlm_root'],
- ngi_config['analysis']['top_dir'],
- 'DATA', project_id)
+ analysis_dir = os.path.join(
+ ngi_config["analysis"]["base_root"],
+ ngi_config["analysis"]["sthlm_root"],
+ ngi_config["analysis"]["top_dir"],
+ "ANALYSIS",
+ project_id,
+ )
+ data_dir = os.path.join(
+ ngi_config["analysis"]["base_root"],
+ ngi_config["analysis"]["sthlm_root"],
+ ngi_config["analysis"]["top_dir"],
+ "DATA",
+ project_id,
+ )
- qc_ngi_dir = os.path.join(analysis_dir, 'qc_ngi')
+ qc_ngi_dir = os.path.join(analysis_dir, "qc_ngi")
fs.create_folder(qc_ngi_dir)
for sample_id in os.listdir(data_dir):
sample_dir_qc = os.path.join(qc_ngi_dir, sample_id)
fs.create_folder(sample_dir_qc)
- fastqc_dir = os.path.join(sample_dir_qc, 'fastqc')
+ fastqc_dir = os.path.join(sample_dir_qc, "fastqc")
fs.create_folder(fastqc_dir)
- fastq_screen_dir = os.path.join(sample_dir_qc, 'fastq_screen')
+ fastq_screen_dir = os.path.join(sample_dir_qc, "fastq_screen")
fs.create_folder(fastq_screen_dir)
# Do not create more than this...
+
def produce_analysis_piper(ngi_config, project_id):
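+    """Create a piper_ngi analysis skeleton with placeholder BAM/VCF files and a version report."""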
# Create piper_ngi
- analysis_dir = os.path.join(ngi_config['analysis']['base_root'],
- ngi_config['analysis']['sthlm_root'],
- ngi_config['analysis']['top_dir'],
- 'ANALYSIS', project_id)
- data_dir = os.path.join(ngi_config['analysis']['base_root'],
- ngi_config['analysis']['sthlm_root'],
- ngi_config['analysis']['top_dir'],
- 'DATA', project_id)
+ analysis_dir = os.path.join(
+ ngi_config["analysis"]["base_root"],
+ ngi_config["analysis"]["sthlm_root"],
+ ngi_config["analysis"]["top_dir"],
+ "ANALYSIS",
+ project_id,
+ )
+ data_dir = os.path.join(
+ ngi_config["analysis"]["base_root"],
+ ngi_config["analysis"]["sthlm_root"],
+ ngi_config["analysis"]["top_dir"],
+ "DATA",
+ project_id,
+ )
- piper_ngi_dir = os.path.join(analysis_dir, 'piper_ngi')
+ piper_ngi_dir = os.path.join(analysis_dir, "piper_ngi")
fs.create_folder(piper_ngi_dir)
- piper_dirs = ['01_raw_alignments',
- '02_preliminary_alignment_qc',
- '03_genotype_concordance',
- '04_merged_aligments',
- '05_processed_alignments',
- '06_final_alignment_qc',
- '07_variant_calls',
- '08_misc']
+ piper_dirs = [
+ "01_raw_alignments",
+ "02_preliminary_alignment_qc",
+ "03_genotype_concordance",
+ "04_merged_aligments",
+ "05_processed_alignments",
+ "06_final_alignment_qc",
+ "07_variant_calls",
+ "08_misc",
+ ]
for piper_dir in piper_dirs:
- current_dir = os.path.join(piper_ngi_dir, piper_dir)
+ current_dir = os.path.join(piper_ngi_dir, piper_dir)
fs.create_folder(current_dir)
- if piper_dir == '05_processed_alignments':
+ if piper_dir == "05_processed_alignments":
for sample_id in os.listdir(data_dir):
- bam_file = '{}.clean.dedup.bam'.format(sample_id)
+ bam_file = f"{sample_id}.clean.dedup.bam"
fs.touch(os.path.join(current_dir, bam_file))
- if piper_dir == '07_variant_calls':
+ if piper_dir == "07_variant_calls":
for sample_id in os.listdir(data_dir):
- vcf_file = '{}.clean.dedup.recal.bam.raw.indel.vcf.gz'.format(sample_id)
+ vcf_file = f"{sample_id}.clean.dedup.recal.bam.raw.indel.vcf.gz"
fs.touch(os.path.join(current_dir, vcf_file))
- current_dir = os.path.join(piper_ngi_dir, 'sbatch')
+ current_dir = os.path.join(piper_ngi_dir, "sbatch")
fs.create_folder(current_dir)
- current_dir = os.path.join(piper_ngi_dir, 'setup_xml_files')
+ current_dir = os.path.join(piper_ngi_dir, "setup_xml_files")
fs.create_folder(current_dir)
- current_dir = os.path.join(piper_ngi_dir, 'logs')
+ current_dir = os.path.join(piper_ngi_dir, "logs")
fs.create_folder(current_dir)
create_version_report(current_dir)
+
def select_random_projects(projects_in, num_proj, application, projects_out, label):
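+    """Randomly pick num_proj projects whose application matches ('other' means anything but 'WG re-seq') and append [project_id, label] pairs to projects_out."""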
chosen_projects = 0
- iterations = 0 # Safe guard to avoid infinite loops
- application_not_in_other = ['WG re-seq']
- while chosen_projects != num_proj and iterations < 4*len(projects_in):
+ iterations = 0 # Safe guard to avoid infinite loops
+ application_not_in_other = ["WG re-seq"]
+ while chosen_projects != num_proj and iterations < 4 * len(projects_in):
iterations += 1
selected_proj = random.choice(list(projects_in.keys()))
# Check if I have already picked up this element
@@ -217,139 +263,190 @@ def select_random_projects(projects_in, num_proj, application, projects_out, lab
if selected_proj == project_pair[0]:
already_chosen = True
if already_chosen:
- continue # I am reprocessing an element I already saw. I skip it. iterations will avoid infinite loops
+            continue  # I am reprocessing an element I already saw, so I skip it; the iterations counter avoids infinite loops
proj_value = projects_in[selected_proj]
- if application == 'other':
+ if application == "other":
# In this case everything except 'WG re-seq'
- if proj_value['application'] not in application_not_in_other:
+ if proj_value["application"] not in application_not_in_other:
# I select this one
projects_out.append([selected_proj, label])
chosen_projects += 1
- elif application == proj_value['application']:
+ elif application == proj_value["application"]:
# I select this one
projects_out.append([selected_proj, label])
chosen_projects += 1
+
def create(projects, ngi_config_file, fastq_1, fastq_2):
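+    """Select a mix of projects from statusdb, recreate their flowcells in the local inbox and produce mock analysis folders."""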
- statusdb_conf = CONFIG.get('statusdb')
+ statusdb_conf = CONFIG.get("statusdb")
if statusdb_conf is None:
- logger.error('No statusdb field in taca configuration file')
+ logger.error("No statusdb field in taca configuration file")
return 1
- if 'dev' not in statusdb_conf['url']:
- logger.error('url for status db is {}, but dev must be specified in this case'.format(statusdb_conf['url']))
+ if "dev" not in statusdb_conf["url"]:
+ logger.error(
+ "url for status db is {}, but dev must be specified in this case".format(
+ statusdb_conf["url"]
+ )
+ )
couch_connection = statusdb.StatusdbSession(statusdb_conf).connection
- projectsDB = couch_connection['projects']
- project_summary = projectsDB.view('project/summary')
+ projectsDB = couch_connection["projects"]
+ project_summary = projectsDB.view("project/summary")
projects_closed_more_than_three_months = {}
projects_closed_more_than_one_month_less_than_three = {}
projects_closed_less_than_one_month = {}
projects_opened = {}
current_date = datetime.datetime.today()
- date_limit_one_year = current_date - relativedelta(months=6) #yes yes I know.. but in this way i am sure all data in in xflocell_db
+ date_limit_one_year = current_date - relativedelta(
+ months=6
+    )  # yes yes I know... but this way I am sure all data is in the x_flowcell db
date_limit_one_month = current_date - relativedelta(months=1)
date_limit_three_month = current_date - relativedelta(months=3)
for row in project_summary:
- project_id = row['key'][1]
- project_status = row['key'][0]
- if 'application' not in row['value']:
+ project_id = row["key"][1]
+ project_status = row["key"][0]
+ if "application" not in row["value"]:
continue
- if row['value']['no_samples'] > 50:
- continue # Skip large projects
- application = row['value']['application']
- if project_status == 'closed':
- if 'close_date' in row['value']:
- close_date = datetime.datetime.strptime(row['value']['close_date'], '%Y-%m-%d')
- if close_date > date_limit_one_year: # If the project has been closed after the date limit
+ if row["value"]["no_samples"] > 50:
+ continue # Skip large projects
+ application = row["value"]["application"]
+ if project_status == "closed":
+ if "close_date" in row["value"]:
+ close_date = datetime.datetime.strptime(
+ row["value"]["close_date"], "%Y-%m-%d"
+ )
+ if (
+ close_date > date_limit_one_year
+ ): # If the project has been closed after the date limit
if close_date >= date_limit_one_month:
- projects_closed_less_than_one_month[project_id] = {'project_name': row['value']['project_name'],
- 'application': application,
- 'no_samples': row['value']['no_samples']}
- elif close_date < date_limit_one_month and close_date >= date_limit_three_month:
- projects_closed_more_than_one_month_less_than_three[project_id] = {'project_name': row['value']['project_name'],
- 'application': application,
- 'no_samples': row['value']['no_samples']}
+ projects_closed_less_than_one_month[project_id] = {
+ "project_name": row["value"]["project_name"],
+ "application": application,
+ "no_samples": row["value"]["no_samples"],
+ }
+ elif (
+ close_date < date_limit_one_month
+ and close_date >= date_limit_three_month
+ ):
+ projects_closed_more_than_one_month_less_than_three[
+ project_id
+ ] = {
+ "project_name": row["value"]["project_name"],
+ "application": application,
+ "no_samples": row["value"]["no_samples"],
+ }
elif close_date < date_limit_three_month:
- projects_closed_more_than_three_months[project_id] = {'project_name': row['value']['project_name'],
- 'application': application,
- 'no_samples': row['value']['no_samples']}
- elif project_status == 'open':
- if 'lanes_sequenced' in row['value'] and row['value']['lanes_sequenced'] > 0:
- projects_opened[project_id] = {'project_name': row['value']['project_name'],
- 'application': application,
- 'no_samples': row['value']['no_samples']}
+ projects_closed_more_than_three_months[project_id] = {
+ "project_name": row["value"]["project_name"],
+ "application": application,
+ "no_samples": row["value"]["no_samples"],
+ }
+ elif project_status == "open":
+ if (
+ "lanes_sequenced" in row["value"]
+ and row["value"]["lanes_sequenced"] > 0
+ ):
+ projects_opened[project_id] = {
+ "project_name": row["value"]["project_name"],
+ "application": application,
+ "no_samples": row["value"]["no_samples"],
+ }
else:
- print('status {}'.format(project_status))
+ print(f"status {project_status}")
## Now I can parse the x_flowcell db to check what I can and cannot use
- whole_genome_projects = int(2*projects/3)
+ whole_genome_projects = int(2 * projects / 3)
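+    # Two thirds of the requested projects should have the 'WG re-seq' application, split across the four categories below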
projects_to_reproduce = []
- select_random_projects(projects_closed_more_than_three_months,
- whole_genome_projects/4+1,
- 'WG re-seq',
- projects_to_reproduce,
- 'WGreseq_tot_closed')
- select_random_projects(projects_closed_more_than_one_month_less_than_three,
- whole_genome_projects/4+1,
- 'WG re-seq',
- projects_to_reproduce,
- 'WGreseq_closed_clean_no_del')
- select_random_projects(projects_closed_less_than_one_month,
- whole_genome_projects/4+1,
- 'WG re-seq',
- projects_to_reproduce,
- 'WGreseq_closed_no_clean')
- select_random_projects(projects_opened,
- whole_genome_projects/4+1,
- 'WG re-seq',
- projects_to_reproduce,
- 'WGreseq_open')
+ select_random_projects(
+ projects_closed_more_than_three_months,
+ whole_genome_projects / 4 + 1,
+ "WG re-seq",
+ projects_to_reproduce,
+ "WGreseq_tot_closed",
+ )
+ select_random_projects(
+ projects_closed_more_than_one_month_less_than_three,
+ whole_genome_projects / 4 + 1,
+ "WG re-seq",
+ projects_to_reproduce,
+ "WGreseq_closed_clean_no_del",
+ )
+ select_random_projects(
+ projects_closed_less_than_one_month,
+ whole_genome_projects / 4 + 1,
+ "WG re-seq",
+ projects_to_reproduce,
+ "WGreseq_closed_no_clean",
+ )
+ select_random_projects(
+ projects_opened,
+ whole_genome_projects / 4 + 1,
+ "WG re-seq",
+ projects_to_reproduce,
+ "WGreseq_open",
+ )
- other_projects = int(projects/3)
- select_random_projects(projects_closed_more_than_three_months,
- other_projects/4+1,
- 'other',
- projects_to_reproduce,
- 'noWGreseq_tot_closed')
- select_random_projects(projects_closed_more_than_one_month_less_than_three,
- other_projects/4+1,
- 'other',
- projects_to_reproduce,
- 'noWGreseq_closed_clean_no_del')
- select_random_projects(projects_closed_less_than_one_month,
- other_projects/4+1,
- 'other',
- projects_to_reproduce,
- 'noWGreseq_closed_no_clean')
- select_random_projects(projects_opened,
- other_projects/4+1,
- 'other',
- projects_to_reproduce,
- 'noWGreseq_open')
+ other_projects = int(projects / 3)
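+    # The remaining third is drawn from projects with any other application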
+ select_random_projects(
+ projects_closed_more_than_three_months,
+ other_projects / 4 + 1,
+ "other",
+ projects_to_reproduce,
+ "noWGreseq_tot_closed",
+ )
+ select_random_projects(
+ projects_closed_more_than_one_month_less_than_three,
+ other_projects / 4 + 1,
+ "other",
+ projects_to_reproduce,
+ "noWGreseq_closed_clean_no_del",
+ )
+ select_random_projects(
+ projects_closed_less_than_one_month,
+ other_projects / 4 + 1,
+ "other",
+ projects_to_reproduce,
+ "noWGreseq_closed_no_clean",
+ )
+ select_random_projects(
+ projects_opened,
+ other_projects / 4 + 1,
+ "other",
+ projects_to_reproduce,
+ "noWGreseq_open",
+ )
# Create ngi_pipeline enviroment
- print('#NGI_CONFIG varaible is {}. This variable needs to be in the .bashrc file'.format(ngi_config_file))
- print('NGI_CONFIG={}'.format(ngi_config_file))
+ print(
+ f"#NGI_CONFIG varaible is {ngi_config_file}. This variable needs to be in the .bashrc file"
+ )
+ print(f"NGI_CONFIG={ngi_config_file}")
try:
ngi_config = conf.load_config(ngi_config_file)
- except IOError as e:
- print('ERROR: {}'.format(e.message))
+ except OSError as e:
+ print(f"ERROR: {e.message}")
# Create uppmax env
paths = create_uppmax_env(ngi_config)
- print('#Going to reproduce {} projects (if this number is different from the one you specified.... trust me... do not worry'.format(len(projects_to_reproduce)))
+ print(
+ f"#Going to reproduce {len(projects_to_reproduce)} projects (if this number is different from the one you specified.... trust me... do not worry"
+ )
# Scan over x_flowcell and reproduce FCs
- flowcellDB = couch_connection['x_flowcells']
+ flowcellDB = couch_connection["x_flowcells"]
reproduced_projects = {}
for fc_doc in flowcellDB:
try:
- samplesheet_csv = flowcellDB[fc_doc]['samplesheet_csv']
+ samplesheet_csv = flowcellDB[fc_doc]["samplesheet_csv"]
except KeyError:
- continue # Parse only FC that have a samplesheet
+ continue # Parse only FC that have a samplesheet
# Check if this FC contains one of the projects I need to replicate.
projects_in_FC = set()
- if 'SampleName' in samplesheet_csv[0]:
- projects_in_FC = set([line['SampleName'].split('_')[0] for line in samplesheet_csv])
+ if "SampleName" in samplesheet_csv[0]:
+ projects_in_FC = set(
+ [line["SampleName"].split("_")[0] for line in samplesheet_csv]
+ )
else:
- projects_in_FC = set([line['Sample_Name'].split('_')[0] for line in samplesheet_csv])
+ projects_in_FC = set(
+ [line["Sample_Name"].split("_")[0] for line in samplesheet_csv]
+ )
found = False
for project_pair in projects_to_reproduce:
project = project_pair[0]
@@ -357,31 +454,46 @@ def create(projects, ngi_config_file, fastq_1, fastq_2):
# This FC needs to be created
if not found:
# Create the FC only the first time I see a project belonging to it
- create_FC(paths['flowcell_inbox'] , flowcellDB[fc_doc]['RunInfo']['Id'], samplesheet_csv, fastq_1, fastq_2)
+ create_FC(
+ paths["flowcell_inbox"],
+ flowcellDB[fc_doc]["RunInfo"]["Id"],
+ samplesheet_csv,
+ fastq_1,
+ fastq_2,
+ )
found = True
# But I keep track of all projects-run I need to organise
if project not in reproduced_projects:
reproduced_projects[project] = []
- reproduced_projects[project].append(flowcellDB[fc_doc]['RunInfo']['Id'])
- print('#Reproduced {} project (if the numbers diffear do not worry, most likely we selected projects without runs)'.format(len(reproduced_projects)))
+ reproduced_projects[project].append(flowcellDB[fc_doc]["RunInfo"]["Id"])
+ print(
+ f"#Reproduced {len(reproduced_projects)} project (if the numbers diffear do not worry, most likely we selected projects without runs)"
+ )
for project in projects_to_reproduce:
if project[0] in reproduced_projects:
- print('# {}: {}'.format(project[0], project[1]))
+ print(f"# {project[0]}: {project[1]}")
# Need to output the command to organise
to_be_deleted = []
for project in reproduced_projects:
for FC in reproduced_projects[project]:
- print('Running: ngi_pipeline_start.py organize flowcell {} -p {}'.format(FC, project))
- with open('ngi_pipeline_local.logs', 'w') as NGILOGS:
- return_value = subprocess.call(['ngi_pipeline_start.py',
- 'organize',
- 'flowcell',
- '{}'.format(FC),
- '-p',
- '{}'.format(project)],
- stdout=NGILOGS, stderr=NGILOGS)
+ print(f"Running: ngi_pipeline_start.py organize flowcell {FC} -p {project}")
+ with open("ngi_pipeline_local.logs", "w") as NGILOGS:
+ return_value = subprocess.call(
+ [
+ "ngi_pipeline_start.py",
+ "organize",
+ "flowcell",
+ f"{FC}",
+ "-p",
+ f"{project}",
+ ],
+ stdout=NGILOGS,
+ stderr=NGILOGS,
+ )
if return_value > 0:
- print('#project {} not organised: have a look to the logs, but most likely this projec is not in charon'.format(project))
+ print(
+ f"#project {project} not organised: have a look to the logs, but most likely this projec is not in charon"
+ )
if project not in to_be_deleted:
to_be_deleted.append(project)
@@ -390,13 +502,15 @@ def create(projects, ngi_config_file, fastq_1, fastq_2):
# Create ANALYSIS --
for project in projects_to_reproduce:
- if project[0] in reproduced_projects: # Only for projects that I know I have organised
+ if (
+ project[0] in reproduced_projects
+ ): # Only for projects that I know I have organised
produce_analysis_qc_ngi(ngi_config, project[0])
- if project[1].startswith('WGreseq'):
+ if project[1].startswith("WGreseq"):
produce_analysis_piper(ngi_config, project[0])
# Store in a file the results
- with open('projects.txt', 'w') as PROJECTS:
+ with open("projects.txt", "w") as PROJECTS:
for project in projects_to_reproduce:
if project[0] in reproduced_projects:
- PROJECTS.write(u'{}:{}\n'.format(project[0], project[1]))
+ PROJECTS.write(f"{project[0]}:{project[1]}\n")
diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py
index 47da90a9..33ea19b5 100644
--- a/taca/utils/bioinfo_tab.py
+++ b/taca/utils/bioinfo_tab.py
@@ -1,13 +1,14 @@
-import os
+import datetime
import glob
-import re
import logging
-import datetime
+import os
+import re
+from collections import OrderedDict, defaultdict
+
+from flowcell_parser.classes import RunParametersParser, SampleSheetParser
-from taca.utils.config import CONFIG
from taca.utils import statusdb
-from flowcell_parser.classes import SampleSheetParser, RunParametersParser
-from collections import defaultdict, OrderedDict
+from taca.utils.config import CONFIG
from taca.utils.misc import send_mail
logger = logging.getLogger(__name__)
@@ -15,8 +16,9 @@
class Tree(defaultdict):
"""Constructor for a search tree."""
+
def __init__(self, value=None):
- super(Tree, self).__init__(Tree)
+ super().__init__(Tree)
self.value = value
@@ -25,31 +27,36 @@ def collect_runs():
found_runs = []
# Pattern explained:
# 6-8Digits_(maybe ST-)AnythingLetterornumberNumber_Number_AorBLetterornumberordash
- rundir_re = re.compile('\d{6,8}_[ST-]*\w+\d+_\d+_[AB]?[A-Z0-9\-]+')
- for data_dir in CONFIG['bioinfo_tab']['data_dirs']:
+    rundir_re = re.compile(r"\d{6,8}_[ST-]*\w+\d+_\d+_[AB]?[A-Z0-9\-]+")
+ for data_dir in CONFIG["bioinfo_tab"]["data_dirs"]:
if os.path.exists(data_dir):
- potential_run_dirs = glob.glob(os.path.join(data_dir, '*'))
+ potential_run_dirs = glob.glob(os.path.join(data_dir, "*"))
for run_dir in potential_run_dirs:
- if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir):
+ if rundir_re.match(
+ os.path.basename(os.path.abspath(run_dir))
+ ) and os.path.isdir(run_dir):
found_runs.append(os.path.basename(run_dir))
- logger.info('Working on {}'.format(run_dir))
+ logger.info(f"Working on {run_dir}")
update_statusdb(run_dir)
- nosync_data_dir = os.path.join(data_dir, 'nosync')
- potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, '*'))
+ nosync_data_dir = os.path.join(data_dir, "nosync")
+ potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, "*"))
for run_dir in potential_nosync_run_dirs:
- if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir):
+ if rundir_re.match(
+ os.path.basename(os.path.abspath(run_dir))
+ ) and os.path.isdir(run_dir):
update_statusdb(run_dir)
+
def update_statusdb(run_dir):
"""Gets status for a project."""
# Fetch individual fields
project_info = get_ss_projects(run_dir)
run_id = os.path.basename(os.path.abspath(run_dir))
- statusdb_conf = CONFIG.get('statusdb')
+ statusdb_conf = CONFIG.get("statusdb")
couch_connection = statusdb.StatusdbSession(statusdb_conf).connection
valueskey = datetime.datetime.now().isoformat()
- db = couch_connection['bioinfo_analysis']
- view = db.view('latest_data/sample_id')
+ db = couch_connection["bioinfo_analysis"]
+ view = db.view("latest_data/sample_id")
# Construction and sending of individual records, if samplesheet is incorrectly formatted the loop is skipped
if project_info:
for flowcell in project_info:
@@ -58,14 +65,20 @@ def update_statusdb(run_dir):
for project in project_info[flowcell][lane][sample]:
project_info[flowcell][lane][sample].value = get_status(run_dir)
sample_status = project_info[flowcell][lane][sample].value
- obj = {'run_id': run_id,
- 'project_id': project,
- 'flowcell': flowcell,
- 'lane': lane,
- 'sample': sample,
- 'status': sample_status,
- 'values': {valueskey: {'user': 'taca',
- 'sample_status': sample_status}}}
+ obj = {
+ "run_id": run_id,
+ "project_id": project,
+ "flowcell": flowcell,
+ "lane": lane,
+ "sample": sample,
+ "status": sample_status,
+ "values": {
+ valueskey: {
+ "user": "taca",
+ "sample_status": sample_status,
+ }
+ },
+ }
# If entry exists, append to existing
# Special if case to handle lanes written as int, can be safely removed when old lanes
# is no longer stored as int
@@ -74,151 +87,193 @@ def update_statusdb(run_dir):
if len(view[[project, run_id, lane, sample]].rows) >= 1:
remote_id = view[[project, run_id, lane, sample]].rows[0].id
lane = str(lane)
- remote_doc = db[remote_id]['values']
- remote_status = db[remote_id]['status']
+ remote_doc = db[remote_id]["values"]
+ remote_status = db[remote_id]["status"]
# Only updates the listed statuses
- if remote_status in ['New', 'ERROR', 'Sequencing', 'Demultiplexing'] and sample_status != remote_status:
+ if (
+ remote_status
+ in ["New", "ERROR", "Sequencing", "Demultiplexing"]
+ and sample_status != remote_status
+ ):
# Appends old entry to new. Essentially merges the two
for k, v in remote_doc.items():
- obj['values'][k] = v
- logger.info('Updating {} {} {} {} {} as {}'.format(run_id,
- project,
- flowcell,
- lane,
- sample,
- sample_status))
+ obj["values"][k] = v
+ logger.info(
+ "Updating {} {} {} {} {} as {}".format(
+ run_id,
+ project,
+ flowcell,
+ lane,
+ sample,
+ sample_status,
+ )
+ )
# Sorts timestamps
- obj['values'] = OrderedDict(sorted(obj['values'].items(), key=lambda k_v: k_v[0], reverse=True))
+ obj["values"] = OrderedDict(
+ sorted(
+ obj["values"].items(),
+ key=lambda k_v: k_v[0],
+ reverse=True,
+ )
+ )
# Update record cluster
- obj['_rev'] = db[remote_id].rev
- obj['_id'] = remote_id
+ obj["_rev"] = db[remote_id].rev
+ obj["_id"] = remote_id
db.save(obj)
# Creates new entry
else:
- logger.info('Creating {} {} {} {} {} as {}'.format(run_id,
- project,
- flowcell,
- lane,
- sample,
- sample_status))
+ logger.info(
+ "Creating {} {} {} {} {} as {}".format(
+ run_id,
+ project,
+ flowcell,
+ lane,
+ sample,
+ sample_status,
+ )
+ )
# Creates record
db.save(obj)
# Sets FC error flag
- if not project_info[flowcell].value == None:
- if (('Failed' in project_info[flowcell].value and 'Failed' not in sample_status)
- or ('Failed' in sample_status and 'Failed' not in project_info[flowcell].value)):
- project_info[flowcell].value = 'Ambiguous'
+ if project_info[flowcell].value is not None:
+ if (
+ "Failed" in project_info[flowcell].value
+ and "Failed" not in sample_status
+ ) or (
+ "Failed" in sample_status
+ and "Failed" not in project_info[flowcell].value
+ ):
+ project_info[flowcell].value = "Ambiguous"
else:
project_info[flowcell].value = sample_status
# Checks if a flowcell needs partial re-doing
# Email error per flowcell
- if not project_info[flowcell].value == None:
- if 'Ambiguous' in project_info[flowcell].value:
- error_emailer('failed_run', run_id)
+ if project_info[flowcell].value is not None:
+ if "Ambiguous" in project_info[flowcell].value:
+ error_emailer("failed_run", run_id)
+
def get_status(run_dir):
"""Gets status of a sample run, based on flowcell info (folder structure)."""
# Default state, should never occur
- status = 'ERROR'
- xten_dmux_folder = os.path.join(run_dir, 'Demultiplexing')
- unaligned_folder = glob.glob(os.path.join(run_dir, 'Unaligned_*'))
- nosync_pattern = re.compile('nosync')
+ status = "ERROR"
+ xten_dmux_folder = os.path.join(run_dir, "Demultiplexing")
+ unaligned_folder = glob.glob(os.path.join(run_dir, "Unaligned_*"))
+ nosync_pattern = re.compile("nosync")
# If we're in a nosync folder
if nosync_pattern.search(run_dir):
- status = 'New'
+ status = "New"
# If demux folder exist (or similar)
- elif (os.path.exists(xten_dmux_folder) or unaligned_folder):
- status = 'Demultiplexing'
+ elif os.path.exists(xten_dmux_folder) or unaligned_folder:
+ status = "Demultiplexing"
# If RTAcomplete doesn't exist
- elif not (os.path.exists(os.path.join(run_dir, 'RTAComplete.txt'))):
- status = 'Sequencing'
+ elif not (os.path.exists(os.path.join(run_dir, "RTAComplete.txt"))):
+ status = "Sequencing"
return status
+
def get_ss_projects(run_dir):
"""Fetches project, FC, lane & sample (sample-run) status for a given folder."""
proj_tree = Tree()
- lane_pattern = re.compile('^([1-8]{1,2})$')
- sample_proj_pattern = re.compile('^((P[0-9]{3,5})_[0-9]{3,5})')
+ lane_pattern = re.compile("^([1-8]{1,2})$")
+ sample_proj_pattern = re.compile("^((P[0-9]{3,5})_[0-9]{3,5})")
run_name = os.path.basename(os.path.abspath(run_dir))
- run_date = run_name.split('_')[0]
+ run_date = run_name.split("_")[0]
if len(run_date) == 6:
- current_year = '20' + run_date[0:2]
- elif len(run_name.split('_')[0]) == 8: # NovaSeqXPlus case
+ current_year = "20" + run_date[0:2]
+ elif len(run_name.split("_")[0]) == 8: # NovaSeqXPlus case
current_year = run_date[0:4]
- run_name_components = run_name.split('_')
- if 'VH' in run_name_components[1]:
+ run_name_components = run_name.split("_")
+ if "VH" in run_name_components[1]:
FCID = run_name_components[3]
else:
FCID = run_name_components[3][1:]
miseq = False
# FIXME: this check breaks if the system is case insensitive
- if os.path.exists(os.path.join(run_dir, 'runParameters.xml')):
- run_parameters_file = 'runParameters.xml'
- elif os.path.exists(os.path.join(run_dir, 'RunParameters.xml')):
- run_parameters_file = 'RunParameters.xml'
+ if os.path.exists(os.path.join(run_dir, "runParameters.xml")):
+ run_parameters_file = "runParameters.xml"
+ elif os.path.exists(os.path.join(run_dir, "RunParameters.xml")):
+ run_parameters_file = "RunParameters.xml"
else:
- logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run_dir))
+ logger.error(
+ f"Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run_dir}"
+ )
return []
rp = RunParametersParser(os.path.join(run_dir, run_parameters_file))
- if 'Setup' in rp.data['RunParameters']:
- runtype = rp.data['RunParameters']['Setup'].get('Flowcell', '')
+ if "Setup" in rp.data["RunParameters"]:
+ runtype = rp.data["RunParameters"]["Setup"].get("Flowcell", "")
if not runtype:
- logger.warn('Parsing runParameters to fetch instrument type, '
- 'not found Flowcell information in it. Using ApplicationName')
- runtype = rp.data['RunParameters']['Setup'].get('ApplicationName', '')
- elif 'InstrumentType' in rp.data['RunParameters']:
- runtype = rp.data['RunParameters'].get('InstrumentType')
+ logger.warn(
+ "Parsing runParameters to fetch instrument type, "
+ "not found Flowcell information in it. Using ApplicationName"
+ )
+ runtype = rp.data["RunParameters"]["Setup"].get("ApplicationName", "")
+ elif "InstrumentType" in rp.data["RunParameters"]:
+ runtype = rp.data["RunParameters"].get("InstrumentType")
else:
- runtype = rp.data['RunParameters'].get('Application')
+ runtype = rp.data["RunParameters"].get("Application")
if not runtype:
- logger.warn("Couldn't find 'Application', could be NextSeq. Trying 'ApplicationName'")
- runtype = rp.data['RunParameters'].get('ApplicationName', '')
+ logger.warn(
+ "Couldn't find 'Application', could be NextSeq. Trying 'ApplicationName'"
+ )
+ runtype = rp.data["RunParameters"].get("ApplicationName", "")
# Miseq case
- if 'MiSeq' in runtype:
- if os.path.exists(os.path.join(run_dir, 'Data', 'Intensities', 'BaseCalls', 'SampleSheet.csv')):
- FCID_samplesheet_origin = os.path.join(run_dir, 'Data', 'Intensities', 'BaseCalls', 'SampleSheet.csv')
- elif os.path.exists(os.path.join(run_dir, 'SampleSheet.csv')):
- FCID_samplesheet_origin = os.path.join(run_dir, 'SampleSheet.csv')
+ if "MiSeq" in runtype:
+ if os.path.exists(
+ os.path.join(run_dir, "Data", "Intensities", "BaseCalls", "SampleSheet.csv")
+ ):
+ FCID_samplesheet_origin = os.path.join(
+ run_dir, "Data", "Intensities", "BaseCalls", "SampleSheet.csv"
+ )
+ elif os.path.exists(os.path.join(run_dir, "SampleSheet.csv")):
+ FCID_samplesheet_origin = os.path.join(run_dir, "SampleSheet.csv")
else:
- logger.warn('No samplesheet found for {}'.format(run_dir))
+ logger.warn(f"No samplesheet found for {run_dir}")
miseq = True
lanes = str(1)
# Pattern is a bit more rigid since we're no longer also checking for lanes
- sample_proj_pattern=re.compile('^((P[0-9]{3,5})_[0-9]{3,5})$')
+ sample_proj_pattern = re.compile("^((P[0-9]{3,5})_[0-9]{3,5})$")
data = parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=True)
# HiSeq X case
- elif 'HiSeq X' in runtype:
- FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['xten_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ elif "HiSeq X" in runtype:
+ FCID_samplesheet_origin = os.path.join(
+ CONFIG["bioinfo_tab"]["xten_samplesheets"], current_year, f"{FCID}.csv"
+ )
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
# HiSeq 2500 case
- elif 'HiSeq' in runtype or 'TruSeq' in runtype:
- FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['hiseq_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ elif "HiSeq" in runtype or "TruSeq" in runtype:
+ FCID_samplesheet_origin = os.path.join(
+ CONFIG["bioinfo_tab"]["hiseq_samplesheets"], current_year, f"{FCID}.csv"
+ )
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
- elif 'NovaSeqXPlus' in runtype:
- FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseqxplus_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ elif "NovaSeqXPlus" in runtype:
+ FCID_samplesheet_origin = os.path.join(
+ CONFIG["bioinfo_tab"]["novaseqxplus_samplesheets"],
+ current_year,
+ f"{FCID}.csv",
+ )
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
# NovaSeq 6000 case
- elif 'NovaSeq' in runtype:
- FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseq_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ elif "NovaSeq" in runtype:
+ FCID_samplesheet_origin = os.path.join(
+ CONFIG["bioinfo_tab"]["novaseq_samplesheets"], current_year, f"{FCID}.csv"
+ )
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
# NextSeq Case
- elif 'NextSeq' in runtype:
- FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['nextseq_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ elif "NextSeq" in runtype:
+ FCID_samplesheet_origin = os.path.join(
+ CONFIG["bioinfo_tab"]["nextseq_samplesheets"], current_year, f"{FCID}.csv"
+ )
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
else:
- logger.warn('Cannot locate the samplesheet for run {}'.format(run_dir))
+ logger.warn(f"Cannot locate the samplesheet for run {run_dir}")
return []
# If samplesheet is empty, don't bother going through it
if data == []:
- return data
+ return data
proj_n_sample = False
lane = False
@@ -244,87 +299,110 @@ def get_ss_projects(run_dir):
lane = False
if list(proj_tree.keys()) == []:
- logger.info('INCORRECTLY FORMATTED SAMPLESHEET, CHECK {}'.format(run_name))
+ logger.info(f"INCORRECTLY FORMATTED SAMPLESHEET, CHECK {run_name}")
return proj_tree
+
def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False):
"""Parses a samplesheet with SampleSheetParser
- :param FCID_samplesheet_origin sample sheet path
+    :param FCID_samplesheet_origin: sample sheet path
"""
data = []
try:
ss_reader = SampleSheetParser(FCID_samplesheet_origin)
data = ss_reader.data
except:
- logger.warn('Cannot initialize SampleSheetParser for {}. Most likely due to poor comma separation'.format(run_dir))
+ logger.warn(
+ f"Cannot initialize SampleSheetParser for {run_dir}. Most likely due to poor comma separation"
+ )
return []
if is_miseq:
- if not 'Description' in ss_reader.header or not \
- ('Production' in ss_reader.header['Description'] or 'Application' in ss_reader.header['Description']):
- logger.warn('Run {} not labelled as production or application. Disregarding it.'.format(run_dir))
+ if "Description" not in ss_reader.header or not (
+ "Production" in ss_reader.header["Description"]
+ or "Application" in ss_reader.header["Description"]
+ ):
+ logger.warn(
+ f"Run {run_dir} not labelled as production or application. Disregarding it."
+ )
# Skip this run
return []
return data
+
def error_emailer(flag, info):
"""Sends a custom error e-mail
:param flag e-mail state
:param info variable that describes the record in some way
"""
- recipients = CONFIG['mail']['recipients']
+ recipients = CONFIG["mail"]["recipients"]
# Failed_run: Samplesheet for a given project couldn't be found
- body = 'TACA has encountered an issue that might be worth investigating\n'
- body += 'The offending entry is: '
+ body = "TACA has encountered an issue that might be worth investigating\n"
+ body += "The offending entry is: "
body += info
- body += '\n\nSincerely, TACA'
+ body += "\n\nSincerely, TACA"
- if (flag == 'no_samplesheet'):
- subject='ERROR, Samplesheet error'
- elif (flag == "failed_run"):
- subject='WARNING, Reinitialization of partially failed FC'
- elif (flag == 'weird_samplesheet'):
- subject='ERROR, Incorrectly formatted samplesheet'
+ if flag == "no_samplesheet":
+ subject = "ERROR, Samplesheet error"
+ elif flag == "failed_run":
+ subject = "WARNING, Reinitialization of partially failed FC"
+ elif flag == "weird_samplesheet":
+ subject = "ERROR, Incorrectly formatted samplesheet"
hour_now = datetime.datetime.now().hour
if hour_now == 7 or hour_now == 12 or hour_now == 16:
send_mail(subject, body, recipients)
+
def fail_run(runid, project):
"""Updates status of specified run or project-run to Failed."""
- statusdb_conf = CONFIG.get('statusdb')
- logger.info('Connecting to status db: {}'.format(statusdb_conf.get('url')))
+ statusdb_conf = CONFIG.get("statusdb")
+ logger.info("Connecting to status db: {}".format(statusdb_conf.get("url")))
try:
status_db = statusdb.StatusdbSession(statusdb_conf).connection
except Exception as e:
- logger.error('Can not connect to status_db: https://{}:*****@{}'.format(
- statusdb_conf.get('username'),
- statusdb_conf.get('url')))
+ logger.error(
+ "Can not connect to status_db: https://{}:*****@{}".format(
+ statusdb_conf.get("username"), statusdb_conf.get("url")
+ )
+ )
logger.error(e)
raise e
- bioinfo_db = status_db['bioinfo_analysis']
+ bioinfo_db = status_db["bioinfo_analysis"]
if project is not None:
- view = bioinfo_db.view('full_doc/pj_run_to_doc')
+ view = bioinfo_db.view("full_doc/pj_run_to_doc")
rows = view[[project, runid]].rows
- logger.info('Updating status of {} objects with flowcell_id: {} and project_id {}'.format(len(rows), runid, project))
+ logger.info(
+ f"Updating status of {len(rows)} objects with flowcell_id: {runid} and project_id {project}"
+ )
else:
- view = bioinfo_db.view('full_doc/run_id_to_doc')
+ view = bioinfo_db.view("full_doc/run_id_to_doc")
rows = view[[runid]].rows
- logger.info('Updating status of {} objects with flowcell_id: {}'.format(len(rows), runid))
+ logger.info(f"Updating status of {len(rows)} objects with flowcell_id: {runid}")
new_timestamp = datetime.datetime.now().isoformat()
updated = 0
for row in rows:
- if row.value['status'] != 'Failed':
- row.value['values'][new_timestamp] = {'sample_status' : 'Failed', 'user': 'taca'}
- row.value['status'] = 'Failed'
+ if row.value["status"] != "Failed":
+ row.value["values"][new_timestamp] = {
+ "sample_status": "Failed",
+ "user": "taca",
+ }
+ row.value["status"] = "Failed"
try:
bioinfo_db.save(row.value)
updated += 1
except Exception as e:
- logger.error('Cannot update object project-sample-run-lane: {}-{}-{}-{}'.format(row.value.get('project_id'), row.value.get('sample'), row.value.get('run_id'), row.value.get('lane')))
+ logger.error(
+ "Cannot update object project-sample-run-lane: {}-{}-{}-{}".format(
+ row.value.get("project_id"),
+ row.value.get("sample"),
+ row.value.get("run_id"),
+ row.value.get("lane"),
+ )
+ )
logger.error(e)
raise e
- logger.info('Successfully updated {} objects'.format(updated))
+ logger.info(f"Successfully updated {updated} objects")
diff --git a/taca/utils/cli.py b/taca/utils/cli.py
index bbfdb819..3bef6eef 100644
--- a/taca/utils/cli.py
+++ b/taca/utils/cli.py
@@ -1,27 +1,34 @@
"""CLI for the bioinfo subcommand."""
import click
+
import taca.utils.bioinfo_tab as bt
-@click.group(name='bioinfo_deliveries')
+
+@click.group(name="bioinfo_deliveries")
def bioinfo_deliveries():
"""Update statusdb with information about FC entry point."""
pass
+
# bioinfo subcommands
@bioinfo_deliveries.command()
-@click.argument('rundir')
+@click.argument("rundir")
def updaterun(rundir):
"""Saves the bioinfo data to statusdb."""
bt.update_statusdb(rundir)
+
@bioinfo_deliveries.command()
def update():
"""Saves the bioinfo data of everything that can be found to statusdb."""
bt.collect_runs()
-@bioinfo_deliveries.command(name='fail_run')
-@click.argument('runid')
-@click.option('-p','--project', is_flag=False, help='Fail run for the specified project')
+
+@bioinfo_deliveries.command(name="fail_run")
+@click.argument("runid")
+@click.option(
+ "-p", "--project", is_flag=False, help="Fail run for the specified project"
+)
def fail_run(runid, project=None):
"""Updates the status of the specified run to 'Failed'.
Example of RUNID: 170113_ST-E00269_0163_BHCVH7ALXX"""
diff --git a/taca/utils/config.py b/taca/utils/config.py
index 74b8876f..e2710ba7 100644
--- a/taca/utils/config.py
+++ b/taca/utils/config.py
@@ -1,21 +1,23 @@
"""Load and parse configuration file."""
+
import yaml
-from io import open
CONFIG = {}
+
def load_config(config_file):
"""Loads a configuration file."""
config = {}
try:
- with open(config_file, 'r') as f:
+ with open(config_file) as f:
content = yaml.load(f, Loader=yaml.FullLoader)
config.update(content)
return content
- except IOError as e:
- e.message = 'Could not open configuration file "{}".'.format(config_file)
+ except OSError as e:
+ e.message = f'Could not open configuration file "{config_file}".'
raise e
+
def load_yaml_config(config_file):
"""Load YAML config file
@@ -26,10 +28,10 @@ def load_yaml_config(config_file):
:raises IOError: If the config file cannot be opened.
"""
try:
- with open(config_file, 'r') as f:
+ with open(config_file) as f:
content = yaml.load(f, Loader=yaml.FullLoader)
CONFIG.update(content)
return content
- except IOError as e:
- e.message = 'Could not open configuration file "{}".'.format(config_file)
+ except OSError as e:
+ e.message = f'Could not open configuration file "{config_file}".'
raise e
diff --git a/taca/utils/filesystem.py b/taca/utils/filesystem.py
index f1db6968..a001615e 100644
--- a/taca/utils/filesystem.py
+++ b/taca/utils/filesystem.py
@@ -3,8 +3,9 @@
import os
import shutil
-RUN_RE = '^\d{6,8}_[a-zA-Z\d\-]+_\d{2,}_[AB0][A-Z\d\-]+$'
-RUN_RE_ONT = '^(\d{8})_(\d{4})_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$'
+RUN_RE = "^\d{6,8}_[a-zA-Z\d\-]+_\d{2,}_[AB0][A-Z\d\-]+$"
+RUN_RE_ONT = "^(\d{8})_(\d{4})_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$"
+
@contextlib.contextmanager
def chdir(new_dir):
@@ -16,28 +17,32 @@ def chdir(new_dir):
finally:
os.chdir(cur_dir)
+
def create_folder(target_folder):
- """ Ensure that a folder exists and create it if it doesn't, including any
- parent folders, as necessary.
+ """Ensure that a folder exists and create it if it doesn't, including any
+ parent folders, as necessary.
- :param target_folder: the target folder
- :returns: True if the folder exists or was created, False if the folder
- does not exists and could not be created
+ :param target_folder: the target folder
+ :returns: True if the folder exists or was created, False if the folder
+        does not exist and could not be created
"""
try:
os.makedirs(target_folder)
- except OSError as e:
+ except OSError:
pass
return os.path.exists(target_folder)
+
def touch(file):
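+    """Create an empty file at the given path, truncating it if it already exists."""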
- open(file, 'w').close()
+ open(file, "w").close()
+
def do_symlink(src_file, dst_file):
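+    """Symlink the resolved path of src_file to dst_file unless dst_file already exists as a file."""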
link_f = os.symlink
if not os.path.isfile(dst_file):
link_f(os.path.realpath(src_file), dst_file)
+
def do_copy(src_path, dst_path):
# copies folder structure and files (recursively)
# if symlinks, will copy content, not the links
diff --git a/taca/utils/misc.py b/taca/utils/misc.py
index 3f9bec6a..a180bcfd 100755
--- a/taca/utils/misc.py
+++ b/taca/utils/misc.py
@@ -5,12 +5,11 @@
import smtplib
import subprocess
import sys
-
from datetime import datetime
from email.mime.text import MIMEText
+
from taca.utils import statusdb
-from io import open
-from six.moves import input
+
def send_mail(subject, content, receiver):
"""Sends an email.
@@ -20,17 +19,18 @@ def send_mail(subject, content, receiver):
:param str receiver: Address to send the email
"""
if not receiver:
- raise SystemExit('No receiver was given to send mail')
+ raise SystemExit("No receiver was given to send mail")
msg = MIMEText(content)
- msg['Subject'] = 'TACA - {}'.format(subject)
- msg['From'] = 'TACA@scilifelab.se'
- msg['to'] = receiver
+ msg["Subject"] = f"TACA - {subject}"
+ msg["From"] = "TACA@scilifelab.se"
+ msg["to"] = receiver
- s = smtplib.SMTP('localhost')
- s.sendmail('TACA', [receiver], msg.as_string())
+ s = smtplib.SMTP("localhost")
+ s.sendmail("TACA", [receiver], msg.as_string())
s.quit()
-def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''):
+
+def call_external_command(cl, with_log_files=False, prefix=None, log_dir=""):
"""Executes an external command.
:param string cl: Command line to be executed (command + options and parameters)
@@ -38,59 +38,60 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''):
:param string prefix: the prefix to add to log file
:param string log_dir: where to write the log file (to avoid problems with rights)
"""
- if type(cl) == str:
- cl = cl.split(' ')
+    if isinstance(cl, str):
+ cl = cl.split(" ")
logFile = os.path.basename(cl[0])
stdout = sys.stdout
stderr = sys.stderr
if with_log_files:
if prefix:
- logFile = '{}_{}'.format(prefix, logFile)
+ logFile = f"{prefix}_{logFile}"
# Create log dir if it didn't exist in CWD
if log_dir and not os.path.exists(log_dir):
os.mkdir(log_dir)
logFile = os.path.join(log_dir, logFile)
- stdout = open(logFile + '.out', 'a')
- stderr = open(logFile + '.err', 'a')
- started = 'Started command {} on {}'.format(' '.join(cl), datetime.now())
- stdout.write(started + u'\n')
- stdout.write(''.join(['=']*len(cl)) + u'\n')
+ stdout = open(logFile + ".out", "a")
+ stderr = open(logFile + ".err", "a")
+ started = "Started command {} on {}".format(" ".join(cl), datetime.now())
+ stdout.write(started + "\n")
+ stdout.write("".join(["="] * len(cl)) + "\n")
try:
subprocess.check_call(cl, stdout=stdout, stderr=stderr)
except subprocess.CalledProcessError as e:
- e.message = 'The command {} failed.'.format(' '.join(cl))
+ e.message = "The command {} failed.".format(" ".join(cl))
raise e
finally:
if with_log_files:
stdout.close()
stderr.close()
+
def call_external_command_detached(cl, with_log_files=False, prefix=None):
"""Executes an external command.
:param string cl: Command line to be executed (command + options and parameters)
:param bool with_log_files: Create log files for stdout and stderr
"""
- if type(cl) == str:
- cl = cl.split(' ')
+    if isinstance(cl, str):
+ cl = cl.split(" ")
command = os.path.basename(cl[0])
stdout = sys.stdout
stderr = sys.stderr
if with_log_files:
if prefix:
- command = '{}_{}'.format(prefix, command)
- stdout = open(command + '.out', 'a')
- stderr = open(command + '.err', 'a')
- started = 'Started command {} on {}'.format(' '.join(cl), datetime.now())
- stdout.write(started + u'\n')
- stdout.write(''.join(['=']*len(cl)) + u'\n')
+ command = f"{prefix}_{command}"
+ stdout = open(command + ".out", "a")
+ stderr = open(command + ".err", "a")
+ started = "Started command {} on {}".format(" ".join(cl), datetime.now())
+ stdout.write(started + "\n")
+ stdout.write("".join(["="] * len(cl)) + "\n")
try:
p_handle = subprocess.Popen(cl, stdout=stdout, stderr=stderr)
except subprocess.CalledProcessError as e:
- e.message = 'The command {} failed.'.format(' '.join(cl))
+ e.message = "The command {} failed.".format(" ".join(cl))
raise e
finally:
if with_log_files:
@@ -98,6 +99,7 @@ def call_external_command_detached(cl, with_log_files=False, prefix=None):
stderr.close()
return p_handle
+
def to_seconds(days=None, hours=None):
"""Convert given day/hours to seconds and return.
@@ -116,7 +118,8 @@ def to_seconds(days=None, hours=None):
# 1 hour == 60*60 seconds --> 3600
return 3600 * hours
-def hashfile(afile, hasher='sha1', blocksize=65536):
+
+def hashfile(afile, hasher="sha1", blocksize=65536):
"""Calculate the hash digest of a file with the specified algorithm and
return it.
@@ -130,14 +133,15 @@ def hashfile(afile, hasher='sha1', blocksize=65536):
if not os.path.isfile(afile):
return None
hashobj = hashlib.new(hasher)
- with open(afile,'rb') as fh:
+ with open(afile, "rb") as fh:
buf = fh.read(blocksize)
while len(buf) > 0:
hashobj.update(buf)
buf = fh.read(blocksize)
return hashobj.hexdigest()
-def query_yes_no(question, default='yes', force=False):
+
+def query_yes_no(question, default="yes", force=False):
"""Ask a yes/no question via raw_input() and return their answer.
"question" is a string that is presented to the user. "default"
     is the presumed answer if the user just hits <Enter>. It must be
@@ -150,14 +154,13 @@ def query_yes_no(question, default='yes', force=False):
:param force: set answer to default
:returns: yes or no
"""
- valid = {'yes': True, 'y': True, 'ye': True,
- 'no': False, 'n': False}
- if default == None:
- prompt = ' [y/n] '
- elif default == 'yes':
- prompt = ' [Y/n] '
- elif default == 'no':
- prompt = ' [y/N] '
+ valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
+ if default is None:
+ prompt = " [y/n] "
+ elif default == "yes":
+ prompt = " [Y/n] "
+ elif default == "no":
+ prompt = " [y/N] "
else:
raise ValueError('invalid default answer: "%s"' % default)
@@ -166,56 +169,60 @@ def query_yes_no(question, default='yes', force=False):
if not force:
choice = input().lower()
else:
- choice = 'yes'
- if default is not None and choice == '':
+ choice = "yes"
+ if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
- sys.stdout.write('Please respond with "yes" or "no" '\
- '(or "y" or "n").\n')
+ sys.stdout.write('Please respond with "yes" or "no" ' '(or "y" or "n").\n')
+
def return_unique(seq):
seen = set()
seen_add = seen.add
- return [ x for x in seq if not (x in seen or seen_add(x))]
+ return [x for x in seq if not (x in seen or seen_add(x))]
+
def run_is_demuxed(run, couch_info=None, seq_run_type=None):
- """
+ """
For ONT runs:
check that .sync_finished exists, which is created by TACA when the sync is finalized. Since demux is done on the sequencers
in parallel to sequencing, the presence of this file also implies that demux is done.
-
+
For Illumina runs:
Check in StatusDB 'x_flowcells' database if the given run has an entry which means it was
     demultiplexed (as TACA only creates a document upon successful demultiplexing)
:param dict couch_info: a dict with 'statusDB' info
"""
- if seq_run_type in ['promethion', 'minion']:
+ if seq_run_type in ["promethion", "minion"]:
if os.path.exists(os.path.join(run.abs_path, ".sync_finished")):
return True
else:
return False
else:
if not couch_info:
- raise SystemExit('To check for demultiplexing is enabled in config file but no "statusDB" info was given')
- run_terms = run.name.split('_')
+            raise SystemExit(
+                'Checking for demultiplexing is enabled in the config file, but no "statusDB" info was given'
+            )
+ run_terms = run.name.split("_")
run_date = run_terms[0]
- if len(run_date)>6:
+ if len(run_date) > 6:
run_date = run_date[2:]
run_fc = run_terms[-1]
- run_name = '{}_{}'.format(run_date, run_fc)
+ run_name = f"{run_date}_{run_fc}"
try:
couch_connection = statusdb.StatusdbSession(couch_info).connection
- fc_db = couch_connection[couch_info['xten_db']]
- for fc in fc_db.view('names/name', reduce=False, descending=True):
+ fc_db = couch_connection[couch_info["xten_db"]]
+ for fc in fc_db.view("names/name", reduce=False, descending=True):
if fc.key != run_name:
continue
fc_doc = fc_db.get(fc.id)
- if not fc_doc or not fc_doc.get('illumina', {}).get('Demultiplex_Stats', {}):
+ if not fc_doc or not fc_doc.get("illumina", {}).get(
+ "Demultiplex_Stats", {}
+ ):
return False
return True
except Exception as e:
raise e
-
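A short sketch of how the refactored helpers in taca/utils/misc.py are typically exercised; the command line, prefix, and paths are illustrative assumptions.

from taca.utils.misc import call_external_command, hashfile, to_seconds

# The command may be given either as a whitespace-separated string or as a list;
# with_log_files appends stdout/stderr to <log_dir>/<prefix>_<command>.out/.err.
call_external_command("ls -l /tmp", with_log_files=True, prefix="demo", log_dir="logs")

# to_seconds converts the given number of hours (or days) to seconds.
assert to_seconds(hours=3) == 3 * 3600

# hashfile returns None for a missing file, otherwise the hex digest computed
# with the requested algorithm.
digest = hashfile("logs/demo_ls.out", hasher="sha1")
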
diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py
index 3ae4d291..939e0606 100644
--- a/taca/utils/statusdb.py
+++ b/taca/utils/statusdb.py
@@ -1,24 +1,26 @@
"""Classes for handling connection to StatusDB."""
-import couchdb
-import logging
import csv
-
+import logging
from datetime import datetime
+
+import couchdb
+
logger = logging.getLogger(__name__)
-class StatusdbSession(object):
+
+class StatusdbSession:
"""Wrapper class for couchdb."""
+
def __init__(self, config, db=None):
- user = config.get('username')
- password = config.get('password')
- url = config.get('url')
- url_string = 'https://{}:{}@{}'.format(user, password, url)
- display_url_string = 'https://{}:{}@{}'.format(user, '*********', url)
+ user = config.get("username")
+ password = config.get("password")
+ url = config.get("url")
+ url_string = f"https://{user}:{password}@{url}"
+ display_url_string = "https://{}:{}@{}".format(user, "*********", url)
self.connection = couchdb.Server(url=url_string)
if not self.connection:
- raise Exception('Couchdb connection failed for url {}'.format(display_url_string))
+ raise Exception(f"Couchdb connection failed for url {display_url_string}")
if db:
self.db_connection = self.connection[db]
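To make the expected shape of the configuration explicit, a small sketch of opening a session; the credentials and URL below are placeholders.

from taca.utils.statusdb import StatusdbSession

statusdb_config = {
    "username": "taca_user",
    "password": "********",
    "url": "statusdb.example.com:5984",
}

# Passing db="x_flowcells" exposes that database as db_connection in addition
# to the raw couchdb.Server held in .connection.
session = StatusdbSession(statusdb_config, db="x_flowcells")
name_to_id = {
    row.key: row.id for row in session.db_connection.view("names/name", reduce=False)
}
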
@@ -40,9 +42,11 @@ def save_db_doc(self, doc, db=None):
db = db or self.db
db.save(doc)
except Exception as e:
- raise Exception('Failed saving document due to {}'.format(e))
+ raise Exception(f"Failed saving document due to {e}")
- def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format='%Y-%m-%d'):
+ def get_project_flowcell(
+ self, project_id, open_date="2015-01-01", date_format="%Y-%m-%d"
+ ):
"""From information available in flowcell db connection,
collect the flowcell this project was sequenced.
@@ -53,68 +57,91 @@ def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format='
try:
open_date = datetime.strptime(open_date, date_format)
except:
- open_date = datetime.strptime('2015-01-01', '%Y-%m-%d')
+ open_date = datetime.strptime("2015-01-01", "%Y-%m-%d")
project_flowcells = {}
- date_sorted_fcs = sorted(list(self.proj_list.keys()), key=lambda k: datetime.strptime(k.split('_')[0], '%y%m%d'), reverse=True)
+ date_sorted_fcs = sorted(
+ list(self.proj_list.keys()),
+ key=lambda k: datetime.strptime(k.split("_")[0], "%y%m%d"),
+ reverse=True,
+ )
for fc in date_sorted_fcs:
- fc_date, fc_name = fc.split('_')
- if datetime.strptime(fc_date,'%y%m%d') < open_date:
+ fc_date, fc_name = fc.split("_")
+ if datetime.strptime(fc_date, "%y%m%d") < open_date:
break
- if project_id in self.proj_list[fc] and fc_name not in project_flowcells.keys():
- project_flowcells[fc_name] = {'name':fc_name,'run_name':fc, 'date':fc_date, 'db':self.db.name}
+ if (
+ project_id in self.proj_list[fc]
+ and fc_name not in project_flowcells.keys()
+ ):
+ project_flowcells[fc_name] = {
+ "name": fc_name,
+ "run_name": fc,
+ "date": fc_date,
+ "db": self.db.name,
+ }
return project_flowcells
+
class ProjectSummaryConnection(StatusdbSession):
- def __init__(self, config, dbname='projects'):
- super(ProjectSummaryConnection, self).__init__(config)
+ def __init__(self, config, dbname="projects"):
+ super().__init__(config)
self.db = self.connection[dbname]
- self.name_view = {k.key: k.id for k in self.db.view('project/project_name', reduce=False)}
- self.id_view = {k.key: k.id for k in self.db.view('project/project_id', reduce=False)}
+ self.name_view = {
+ k.key: k.id for k in self.db.view("project/project_name", reduce=False)
+ }
+ self.id_view = {
+ k.key: k.id for k in self.db.view("project/project_id", reduce=False)
+ }
class FlowcellRunMetricsConnection(StatusdbSession):
- def __init__(self, config, dbname='flowcells'):
- super(FlowcellRunMetricsConnection, self).__init__(config)
+ def __init__(self, config, dbname="flowcells"):
+ super().__init__(config)
self.db = self.connection[dbname]
- self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)}
- self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key}
+ self.name_view = {k.key: k.id for k in self.db.view("names/name", reduce=False)}
+ self.proj_list = {
+ k.key: k.value
+ for k in self.db.view("names/project_ids_list", reduce=False)
+ if k.key
+ }
class X_FlowcellRunMetricsConnection(StatusdbSession):
- def __init__(self, config, dbname='x_flowcells'):
- super(X_FlowcellRunMetricsConnection, self).__init__(config)
+ def __init__(self, config, dbname="x_flowcells"):
+ super().__init__(config)
self.db = self.connection[dbname]
- self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)}
- self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key}
+ self.name_view = {k.key: k.id for k in self.db.view("names/name", reduce=False)}
+ self.proj_list = {
+ k.key: k.value
+ for k in self.db.view("names/project_ids_list", reduce=False)
+ if k.key
+ }
class NanoporeRunsConnection(StatusdbSession):
-
- def __init__(self, config, dbname='nanopore_runs'):
- super(NanoporeRunsConnection, self).__init__(config)
+ def __init__(self, config, dbname="nanopore_runs"):
+ super().__init__(config)
self.db = self.connection[dbname]
def check_run_exists(self, ont_run) -> bool:
- view_names = self.db.view('names/name')
+ view_names = self.db.view("names/name")
if len(view_names[ont_run.run_name].rows) > 0:
return True
else:
return False
-
+
def check_run_status(self, ont_run) -> str:
- view_all_stats = self.db.view('names/name')
+ view_all_stats = self.db.view("names/name")
doc_id = view_all_stats[ont_run.run_name].rows[0].id
return self.db[doc_id]["run_status"]
def create_ongoing_run(
self, ont_run, run_path_file: str, pore_count_history_file: str
):
-
- run_path = open(run_path_file, "r").read().strip()
+ run_path = open(run_path_file).read().strip()
pore_counts = []
- with open(pore_count_history_file, "r") as stream:
+ with open(pore_count_history_file) as stream:
for line in csv.DictReader(stream):
pore_counts.append(line)
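The pore-count history is parsed with csv.DictReader, so every line becomes a dict keyed by the CSV header; a small illustration follows, with column names that are assumptions since the real header is not shown here.

import csv
from io import StringIO

example = StringIO("flow_cell_id,timestamp,pore_count\nPAO12345,2024-01-31 12:00,8544\n")
rows = list(csv.DictReader(example))
# rows == [{"flow_cell_id": "PAO12345", "timestamp": "2024-01-31 12:00", "pore_count": "8544"}]
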
@@ -130,7 +157,7 @@ def create_ongoing_run(
)
def finish_ongoing_run(self, ont_run, dict_json: dict):
- view_names = self.db.view('names/name')
+ view_names = self.db.view("names/name")
doc_id = view_names[ont_run.run_name].rows[0].id
doc = self.db[doc_id]
@@ -140,23 +167,23 @@ def finish_ongoing_run(self, ont_run, dict_json: dict):
def update_doc(db, obj, over_write_db_entry=False):
- view = db.view('info/name')
- if len(view[obj['name']].rows) == 1:
- remote_doc = view[obj['name']].rows[0].value
- doc_id = remote_doc.pop('_id')
- doc_rev = remote_doc.pop('_rev')
+ view = db.view("info/name")
+ if len(view[obj["name"]].rows) == 1:
+ remote_doc = view[obj["name"]].rows[0].value
+ doc_id = remote_doc.pop("_id")
+ doc_rev = remote_doc.pop("_rev")
if remote_doc != obj:
if not over_write_db_entry:
obj = merge_dicts(obj, remote_doc)
- obj['_id'] = doc_id
- obj['_rev'] = doc_rev
+ obj["_id"] = doc_id
+ obj["_rev"] = doc_rev
db[doc_id] = obj
- logger.info('Updating {}'.format(obj['name']))
- elif len(view[obj['name']].rows) == 0:
+ logger.info("Updating {}".format(obj["name"]))
+ elif len(view[obj["name"]].rows) == 0:
db.save(obj)
- logger.info('Saving {}'.format(obj['name']))
+ logger.info("Saving {}".format(obj["name"]))
else:
- logger.warn('More than one row with name {} found'.format(obj['name']))
+        logger.warning("More than one row with name {} found".format(obj["name"]))
def merge_dicts(d1, d2):
@@ -166,12 +193,14 @@ def merge_dicts(d1, d2):
for key in d2:
if key in d1:
if isinstance(d1[key], dict) and isinstance(d2[key], dict):
- merge(d1[key], d2[key])
+ merge_dicts(d1[key], d2[key])
elif d1[key] == d2[key]:
pass # same leaf value
else:
- logger.debug('Values for key {key} in d1 and d2 differ, '
- 'using the value of d1'.format(key=key))
+ logger.debug(
+ f"Values for key {key} in d1 and d2 differ, "
+ "using the value of d1"
+ )
else:
d1[key] = d2[key]
return d1
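A brief illustration of the merge semantics implemented by merge_dicts; some of the field names and values are made up for the example.

from taca.utils.statusdb import merge_dicts

new_doc = {"run_status": "finished", "illumina": {"Demultiplex_Stats": {"lanes": 2}}}
old_doc = {"run_status": "ongoing", "illumina": {"run_setup": "2x151"}}

# Nested dicts are merged recursively; on conflicting leaf values d1 wins, and
# d1 (new_doc) is updated in place and returned.
merged = merge_dicts(new_doc, old_doc)
assert merged["run_status"] == "finished"
assert merged["illumina"] == {"Demultiplex_Stats": {"lanes": 2}, "run_setup": "2x151"}
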
diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py
index 34e6b314..693f0725 100644
--- a/taca/utils/transfer.py
+++ b/taca/utils/transfer.py
@@ -6,109 +6,108 @@
import subprocess
from taca.utils.filesystem import create_folder
-from taca.utils.misc import hashfile, call_external_command
-from io import open
+from taca.utils.misc import call_external_command, hashfile
logger = logging.getLogger(__name__)
-class TransferAgent(object):
+class TransferAgent:
"""
- (Abstract) superclass representing an Agent that performs file transfers.
- Agents implementing specific methods for transferring files should extend
- this and implement the transfer() method.
+ (Abstract) superclass representing an Agent that performs file transfers.
+ Agents implementing specific methods for transferring files should extend
+ this and implement the transfer() method.
"""
- def __init__(
- self,
- src_path=None,
- dest_path=None,
- opts={},
- **kwargs):
- """ Creates an agent instance
- :param string src_path: the file or folder that should be transferred
- :param string dest_path: the destination file or folder
- :param bool validate: whether to validate the transferred files
- :param opts: options that will be passed to the transfer command
+
+ def __init__(self, src_path=None, dest_path=None, opts={}, **kwargs):
+ """Creates an agent instance
+ :param string src_path: the file or folder that should be transferred
+ :param string dest_path: the destination file or folder
+ :param bool validate: whether to validate the transferred files
+ :param opts: options that will be passed to the transfer command
"""
self.src_path = src_path
self.dest_path = dest_path
- self.validate = kwargs.get('validate', False)
+ self.validate = kwargs.get("validate", False)
self.cmdopts = opts
def __str__(self):
return type(self).__name__
def format_options(self):
- """ Format the options dictionary stored in this instance's cmdopts
- attribute and return the formatted options as a list of strings.
- A key in the dictionary represents the option name. If
- the corresponding value is None, the option will be assumed to
- represent a flag. If the value is a list, the option will be given
- multiple times.
+ """Format the options dictionary stored in this instance's cmdopts
+ attribute and return the formatted options as a list of strings.
+ A key in the dictionary represents the option name. If
+ the corresponding value is None, the option will be assumed to
+ represent a flag. If the value is a list, the option will be given
+ multiple times.
- For example:
+ For example:
- opts = {'opt1': None, 'opt2': 'val1', 'opt3': ['val2','val3']}
+        opts = {'--opt1': None, '--opt2': 'val1', '--opt3': ['val2','val3']}
- will be expanded to:
+ will be expanded to:
- ['--opt1','--opt2=val1','--opt3=val2','--opt3=val3']
+ ['--opt1','--opt2=val1','--opt3=val2','--opt3=val3']
- :returns: List of formatted options as strings
+ :returns: List of formatted options as strings
"""
cmdopts = []
for param, val in self.cmdopts.items():
if val is None:
cmdopts.append(param)
else:
- if type(val) == str:
+                if isinstance(val, str):
val = [val]
for v in val:
- cmdopts.append('{}={}'.format(param,v))
+ cmdopts.append(f"{param}={v}")
return cmdopts
def transfer(self):
"""Abstract method, should be implemented by subclasses."""
- raise NotImplementedError('This method should be implemented by subclass')
+ raise NotImplementedError("This method should be implemented by subclass")
def validate_src_path(self):
"""Validates that the src_path attribute of the Agent instance.
- :raises transfer.TransferError: if src_path is not valid
+ :raises transfer.TransferError: if src_path is not valid
"""
if self.src_path is None:
raise TransferError(
- msg='src_path cannot be None',
+ msg="src_path cannot be None",
src_path=self.src_path,
- dest_path=self.dest_path)
+ dest_path=self.dest_path,
+ )
if not os.path.exists(self.src_path):
raise TransferError(
- msg='src_path "{}" does not exist'.format(self.src_path),
+ msg=f'src_path "{self.src_path}" does not exist',
src_path=self.src_path,
- dest_path=self.dest_path)
+ dest_path=self.dest_path,
+ )
def validate_dest_path(self):
"""Validates that the dest_path attribute of the Agent instance.
- :raises transfer.TransferError: if dest_path is not valid
+ :raises transfer.TransferError: if dest_path is not valid
"""
if self.dest_path is None:
raise TransferError(
- msg='dest_path cannot be None',
+ msg="dest_path cannot be None",
src_path=self.src_path,
- dest_path=self.dest_path)
+ dest_path=self.dest_path,
+ )
def validate_transfer(self):
"""Abstract method, should be implemented by subclasses."""
- raise NotImplementedError('This method should be implemented by subclass')
+ raise NotImplementedError("This method should be implemented by subclass")
class RsyncAgent(TransferAgent):
"""An agent that knows how to perform an rsync transfer locally or
- between hosts. If supplied with a checksum file, the transfer can
- be validated on the receiving side.
+ between hosts. If supplied with a checksum file, the transfer can
+ be validated on the receiving side.
"""
- CMD = 'rsync'
+
+ CMD = "rsync"
DEFAULT_OPTS = {
- '-a': None,
+ "-a": None,
}
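The option expansion described in the format_options docstring can be sketched as follows; the option names and paths are illustrative.

from taca.utils.transfer import TransferAgent

# Keys are passed through verbatim, so they should include their leading
# dashes, as DEFAULT_OPTS does with "-a".
agent = TransferAgent(
    src_path="/data/run1",
    dest_path="/archive/run1",
    opts={"--opt1": None, "--opt2": "val1", "--opt3": ["val2", "val3"]},
)
assert agent.format_options() == ["--opt1", "--opt2=val1", "--opt3=val2", "--opt3=val3"]
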
def __init__(
@@ -120,68 +119,65 @@ def __init__(
validate=True,
digestfile=None,
opts=None,
- **kwargs):
+ **kwargs,
+ ):
"""Creates an RsyncAgent instance
- :param string src_path: the file or folder that should be transferred
- :param string dest_path: the destination file or folder
- :param string remote_host: the remote host to transfer to.
- If None, the transfer will be on the local filesystem
- :param string remote_user: the remote user to connect with.
- If None, the local user will be used
- :param bool validate: whether to validate the transferred files
- using a supplied file with checksums
- :param string digestfile: a file with checksums for the files to be
- transferred. Must be specified if validate is True. The checksum
- algorithm will be inferred from the extension of the digest file
- :param opts: options that will be passed to the rsync command
+ :param string src_path: the file or folder that should be transferred
+ :param string dest_path: the destination file or folder
+ :param string remote_host: the remote host to transfer to.
+ If None, the transfer will be on the local filesystem
+ :param string remote_user: the remote user to connect with.
+ If None, the local user will be used
+ :param bool validate: whether to validate the transferred files
+ using a supplied file with checksums
+ :param string digestfile: a file with checksums for the files to be
+ transferred. Must be specified if validate is True. The checksum
+ algorithm will be inferred from the extension of the digest file
+ :param opts: options that will be passed to the rsync command
"""
- super(RsyncAgent, self).__init__(
+ super().__init__(
src_path=src_path,
dest_path=dest_path,
opts=opts or self.DEFAULT_OPTS,
validate=validate,
- **kwargs)
+ **kwargs,
+ )
self.remote_host = remote_host
self.remote_user = remote_user
self.digestfile = digestfile
def transfer(self, transfer_log=None):
"""Execute the transfer as set up by this instance and, if requested,
- validate the transfer.
+ validate the transfer.
- :param string transfer_log: path prefix to log files where stderr
- and stdout streams will be directed if this option is specified
- :returns True on success, False if the validation failed
- :raises transfer.TransferError: if src_path or dest_path were not valid
- :raises transfer.RsyncError: if the rsync command did not exit successfully
+ :param string transfer_log: path prefix to log files where stderr
+ and stdout streams will be directed if this option is specified
+        :returns: True on success, False if the validation failed
+ :raises transfer.TransferError: if src_path or dest_path were not valid
+ :raises transfer.RsyncError: if the rsync command did not exit successfully
"""
self.validate_src_path()
self.validate_dest_path()
- command = [self.CMD] + self.format_options() + [self.src_path, self.remote_path()]
+ command = (
+ [self.CMD] + self.format_options() + [self.src_path, self.remote_path()]
+ )
try:
call_external_command(
- command,
- with_log_files=(transfer_log is not None),
- prefix=transfer_log)
+ command, with_log_files=(transfer_log is not None), prefix=transfer_log
+ )
except subprocess.CalledProcessError as e:
raise RsyncError(e)
return (not self.validate) or self.validate_transfer()
def remote_path(self):
"""Construct the remote path according to what has been specified.
- :returns: the remote path string on the form
- [remote_user]@[remote_host]:[dest_path]
+        :returns: the remote path string of the form
+ [remote_user]@[remote_host]:[dest_path]
"""
- return '{}{}{}'.format(
- '{}@'.format(self.remote_user) \
- if self.remote_user is not None \
- else '',
- '{}:'.format(self.remote_host) \
- if self.remote_host is not None \
- else '',
- self.dest_path \
- if self.dest_path is not None \
- else ''
+ return "{}{}{}".format(
+ f"{self.remote_user}@" if self.remote_user is not None else "",
+ f"{self.remote_host}:" if self.remote_host is not None else "",
+ self.dest_path if self.dest_path is not None else "",
)
def validate_dest_path(self):
@@ -193,63 +189,60 @@ def validate_dest_path(self):
"""
if self.dest_path is None and self.remote_host is None:
raise TransferError(
- msg='dest_path and remote_host cannot both be None',
- src_path=self.src_path)
+ msg="dest_path and remote_host cannot both be None",
+ src_path=self.src_path,
+ )
if self.remote_user is not None and self.remote_host is None:
raise TransferError(
- msg='dest_path cannot be None if remote_user is not None',
- src_path=self.src_path)
+                msg="remote_host cannot be None if remote_user is not None",
+ src_path=self.src_path,
+ )
def validate_transfer(self):
"""Validate the transferred files by computing checksums and comparing
- to the pre-computed checksums, supplied in the digestfile attribute
- of this Agent instance. The hash algorithm is inferred from the file
- extension of the digestfile. The paths of the files to check are
- assumed to be relative to the location of the digestfile.
+ to the pre-computed checksums, supplied in the digestfile attribute
+ of this Agent instance. The hash algorithm is inferred from the file
+ extension of the digestfile. The paths of the files to check are
+ assumed to be relative to the location of the digestfile.
- Currently not implemented for remote transfers.
+ Currently not implemented for remote transfers.
- :returns: False if any checksum does not match, or if a file does
- not exist. True otherwise.
- :raises transfer.RsyncValidationError: if the digestfile was not
- supplied
+ :returns: False if any checksum does not match, or if a file does
+ not exist. True otherwise.
+ :raises transfer.RsyncValidationError: if the digestfile was not
+ supplied
"""
if self.remote_host is not None:
- raise NotImplementedError('Validation on remote host not implemented')
+ raise NotImplementedError("Validation on remote host not implemented")
try:
with open(self.digestfile) as fh:
- hasher = self.digestfile.split('.')[-1]
+ hasher = self.digestfile.split(".")[-1]
dpath = os.path.dirname(self.digestfile)
for line in fh:
- digest,fpath = line.split()
- tfile = os.path.join(dpath,fpath)
+ digest, fpath = line.split()
+ tfile = os.path.join(dpath, fpath)
if not os.path.exists(tfile) or digest != hashfile(
- tfile,
- hasher=hasher):
+ tfile, hasher=hasher
+ ):
return False
- except TypeError as e:
+ except TypeError:
raise RsyncValidationError(
- 'no digest file specified',
- self.src_path,
- self.dest_path)
+ "no digest file specified", self.src_path, self.dest_path
+ )
return True
class SymlinkAgent(TransferAgent):
-
def __init__(self, src_path, dest_path, overwrite=True, relative=True, **kwargs):
"""Creates an SymlinkAgent instance for creating symlinks.
- :param string src_path: the file or folder that should be symlinked
- :param string dest_path: the destination symlink
- :param bool overwrite: if true, the destination file or folder will
- be overwritten if it already exists
- :param bool relative: if true, the destination symlink will be relative
+ :param string src_path: the file or folder that should be symlinked
+ :param string dest_path: the destination symlink
+ :param bool overwrite: if true, the destination file or folder will
+ be overwritten if it already exists
+ :param bool relative: if true, the destination symlink will be relative
"""
- super(SymlinkAgent,self).__init__(
- src_path=src_path,
- dest_path=dest_path,
- **kwargs)
+ super().__init__(src_path=src_path, dest_path=dest_path, **kwargs)
self.overwrite = overwrite
self.relative = relative
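A minimal sketch of the symlink agent configured above; the paths are illustrative.

from taca.utils.transfer import SymlinkAgent

# With relative=True (the default) the link target is stored relative to the
# directory holding the link; with overwrite=False an existing, non-matching
# destination is left untouched and transfer() returns False.
agent = SymlinkAgent(
    src_path="/data/run1/report.html",
    dest_path="/srv/www/reports/run1.html",
    overwrite=False,
)
if agent.transfer():
    print("symlink in place and pointing at the source")
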
@@ -268,22 +261,23 @@ def transfer(self):
# If the existing target is a symlink that points to the
# source, we're all good
if self.validate_transfer():
- logger.debug('target exists and points to the correct '
- 'source path: "{}"'.format(self.src_path))
+ logger.debug(
+ "target exists and points to the correct "
+ f'source path: "{self.src_path}"'
+ )
return True
# If we are not overwriting, return False
if not self.overwrite:
- logger.debug('target "{}" exists and will not be '
- 'overwritten'.format(self.dest_path))
+ logger.debug(
+ f'target "{self.dest_path}" exists and will not be ' "overwritten"
+ )
return False
# If the target is a mount, let's not mess with it
if os.path.ismount(self.dest_path):
- raise SymlinkError('target exists and is a mount')
+ raise SymlinkError("target exists and is a mount")
# If the target is a link or a file, we remove it
- if os.path.islink(self.dest_path) or \
- os.path.isfile(self.dest_path):
- logger.debug('removing existing target file "{}"'
- .format(self.dest_path))
+ if os.path.islink(self.dest_path) or os.path.isfile(self.dest_path):
+ logger.debug(f'removing existing target file "{self.dest_path}"')
try:
os.unlink(self.dest_path)
except OSError as e:
@@ -291,44 +285,55 @@ def transfer(self):
# If the target is a directory, we remove it and
# everything underneath
elif os.path.isdir(self.dest_path):
- logger.debug('removing existing target folder "{}"'
- .format(self.dest_path))
+ logger.debug(f'removing existing target folder "{self.dest_path}"')
try:
shutil.rmtree(self.dest_path)
except OSError as e:
raise SymlinkError(e)
# If it's something else, let's bail out
else:
- raise SymlinkError('target exists and will not be overwritten')
+ raise SymlinkError("target exists and will not be overwritten")
if not create_folder(os.path.dirname(self.dest_path)):
- raise SymlinkError('failed to create target folder hierarchy')
+ raise SymlinkError("failed to create target folder hierarchy")
try:
# If we should create a relative symlink, determine the relative path
os.symlink(
- os.path.relpath(self.src_path,os.path.dirname(self.dest_path)) \
- if self.relative else self.src_path,
- self.dest_path)
+ os.path.relpath(self.src_path, os.path.dirname(self.dest_path))
+ if self.relative
+ else self.src_path,
+ self.dest_path,
+ )
except OSError as e:
raise SymlinkError(e)
return (not self.validate) or self.validate_transfer()
def validate_transfer(self):
"""Validates the symlinked files by verifying that the dest_path was
- created, is a link and resolves to the same file as src_path.
+ created, is a link and resolves to the same file as src_path.
- :returns: True if link is valid, False otherwise
+ :returns: True if link is valid, False otherwise
"""
- return os.path.exists(self.dest_path) and \
- os.path.islink(self.dest_path) and \
- os.path.samefile(self.src_path, self.dest_path)
+ return (
+ os.path.exists(self.dest_path)
+ and os.path.islink(self.dest_path)
+ and os.path.samefile(self.src_path, self.dest_path)
+ )
class TransferError(Exception):
def __init__(self, msg, src_path=None, dest_path=None):
- super(TransferError, self).__init__(msg)
+ super().__init__(msg)
self.src_path = src_path
self.dest_path = dest_path
-class SymlinkError(TransferError): pass
-class RsyncError(TransferError): pass
-class RsyncValidationError(TransferError): pass
+
+class SymlinkError(TransferError):
+ pass
+
+
+class RsyncError(TransferError):
+ pass
+
+
+class RsyncValidationError(TransferError):
+ pass
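Tying the agents together, a sketch of a validated local rsync transfer following the digest-file contract described in validate_transfer; the paths and the checksum line are illustrative.

from taca.utils.transfer import RsyncAgent, RsyncError

# The digest file sits next to the transferred files and holds lines of
# "<hexdigest>  <path relative to the digest file>", for example:
#   3a52ce780950d4d969792a2559cd519d7ee8c727  RunInfo.xml
# The hash algorithm (here sha1) is inferred from the file extension.
agent = RsyncAgent(
    "/data/run1",
    dest_path="/archive/",
    validate=True,
    digestfile="/archive/run1/run1.sha1",
)
try:
    ok = agent.transfer(transfer_log="/archive/logs/run1_rsync")
except RsyncError:
    ok = False
# ok is True only if rsync exited cleanly and every checksum in the digest file matched.
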
diff --git a/tests/data/Stats.json b/tests/data/Stats.json
index 5090f4ac..1d2e2bd1 100644
--- a/tests/data/Stats.json
+++ b/tests/data/Stats.json
@@ -1,75 +1,75 @@
{
- "RunNumber":131,
- "Flowcell":"FCIDXX",
- "RunId":"141124_ST-COMPLEX1_01_AFCIDXX",
- "ConversionResults":[
- {
- "LaneNumber":1,
- "DemuxResults":[
+ "RunNumber": 131,
+ "Flowcell": "FCIDXX",
+ "RunId": "141124_ST-COMPLEX1_01_AFCIDXX",
+ "ConversionResults": [
+ {
+ "LaneNumber": 1,
+ "DemuxResults": [
+ {
+ "SampleId": "Sample_P12345_1001",
+ "SampleName": "P12345_1001",
+ "NumberReads": 494288265,
+ "Yield": 58820303535,
+ "ReadMetrics": [
{
- "SampleId":"Sample_P12345_1001",
- "SampleName":"P12345_1001",
- "NumberReads":494288265,
- "Yield":58820303535,
- "ReadMetrics":[
- {
- "ReadNumber":1,
- "Yield":13840071420,
- "YieldQ30":13329609381,
- "QualityScoreSum":503672520160,
- "TrimmedBases":0
- }
- ]
+ "ReadNumber": 1,
+ "Yield": 13840071420,
+ "YieldQ30": 13329609381,
+ "QualityScoreSum": 503672520160,
+ "TrimmedBases": 0
}
- ],
- "Undetermined":{
- "NumberReads":17709745,
- "Yield":2036620675,
- "ReadMetrics":[
- {
- "ReadNumber":1,
- "Yield":885487250,
- "YieldQ30":680049984,
- "QualityScoreSum":28815661398,
- "TrimmedBases":0
- },
- {
- "ReadNumber":2,
- "Yield":283355920,
- "YieldQ30":179655904,
- "QualityScoreSum":8324058259,
- "TrimmedBases":0
- }
- ]
- }
+ ]
+ }
+ ],
+ "Undetermined": {
+ "NumberReads": 17709745,
+ "Yield": 2036620675,
+ "ReadMetrics": [
+ {
+ "ReadNumber": 1,
+ "Yield": 885487250,
+ "YieldQ30": 680049984,
+ "QualityScoreSum": 28815661398,
+ "TrimmedBases": 0
+ },
+ {
+ "ReadNumber": 2,
+ "Yield": 283355920,
+ "YieldQ30": 179655904,
+ "QualityScoreSum": 8324058259,
+ "TrimmedBases": 0
+ }
+ ]
}
- ],
- "ReadInfosForLanes":[
- {
- "LaneNumber":1,
- "ReadInfos":[
- {
- "Number":1,
- "NumCycles":28,
- "IsIndexedRead":"false"
- }
- ]
+ }
+ ],
+ "ReadInfosForLanes": [
+ {
+ "LaneNumber": 1,
+ "ReadInfos": [
+ {
+ "Number": 1,
+ "NumCycles": 28,
+ "IsIndexedRead": "false"
+ }
+ ]
+ }
+ ],
+ "UnknownBarcodes": [
+ {
+ "Lane": 1,
+ "Barcodes": {
+ "GGGGGGGG": 3203920,
+ "CCCTAACA": 290420
}
- ],
- "UnknownBarcodes":[
- {
- "Lane":1,
- "Barcodes":{
- "GGGGGGGG":3203920,
- "CCCTAACA":290420
- }
- },
- {
- "Lane":2,
- "Barcodes":{
- "GGGGGGGG":3075440,
- "CCCTAACA":296260
- }
+ },
+ {
+ "Lane": 2,
+ "Barcodes": {
+ "GGGGGGGG": 3075440,
+ "CCCTAACA": 296260
}
- ]
+ }
+ ]
}
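For reference, a short sketch of how a fixture like this is typically loaded and inspected in a test; only the standard json module is assumed.

import json

with open("tests/data/Stats.json") as fh:
    stats = json.load(fh)

# Summarise per-lane read counts using the fields shown in the fixture above.
for lane in stats["ConversionResults"]:
    sample_reads = sum(entry["NumberReads"] for entry in lane["DemuxResults"])
    undetermined = lane["Undetermined"]["NumberReads"]
    print(lane["LaneNumber"], sample_reads, undetermined)
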
diff --git a/tests/data/lane.html b/tests/data/lane.html
index 0079244b..435f6b29 100644
--- a/tests/data/lane.html
+++ b/tests/data/lane.html
@@ -1,85 +1,92 @@
-
-
-