From 132fe5f32de1f458657128e1d1b6ee79c0b46503 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Wed, 17 Jan 2024 14:13:58 +0100
Subject: [PATCH 01/44] Add lots of CI files
---
.editorconfig | 12 +++
.git-blame-ignore-revs | 1 +
.github/workflows/check-log.yml | 26 +++++++
.github/workflows/lint-code.yml | 130 ++++++++++++++++++++++++++++++++
.gitignore | 2 +
.pre-commit-config.yaml | 19 +++++
pyproject.toml | 27 +++++++
requirements-dev.txt | 5 ++
8 files changed, 222 insertions(+)
create mode 100644 .editorconfig
create mode 100644 .git-blame-ignore-revs
create mode 100644 .github/workflows/check-log.yml
create mode 100644 .github/workflows/lint-code.yml
create mode 100644 .pre-commit-config.yaml
create mode 100644 pyproject.toml
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000..70c7a9a8
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+indent_size = 4
+indent_style = space
+
+[*.{md,yml,yaml,cff}]
+indent_size = 2
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 00000000..a99e5d13
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1 @@
+# Start adding here
diff --git a/.github/workflows/check-log.yml b/.github/workflows/check-log.yml
new file mode 100644
index 00000000..1447daba
--- /dev/null
+++ b/.github/workflows/check-log.yml
@@ -0,0 +1,26 @@
+name: Check VERSIONLOG.md has been updated
+on: [pull_request]
+
+jobs:
+ check-versionlog:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout PR
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0 # Fetch all history for all branches and tags
+
+ - name: Check for VERSIONLOG.md changes
+ id: versionlog_check
+ # 1) Find the common ancestor between the current HEAD and the base branch
+ # 2) Then see if the versionlog has been updated in the PR since it diverged
+ # from the common ancestor
+ run: |
+ PR_BASE_SHA=$(git merge-base HEAD ${{ github.event.pull_request.base.sha }})
+ FILE_CHANGED=$(git diff --name-only $PR_BASE_SHA HEAD | grep 'VERSIONLOG.md' || true)
+ if [ -n "$FILE_CHANGED" ]; then
+ echo "VERSIONLOG.MD has been changed."
+ else
+ echo "VERSIONLOG.MD has NOT been changed."
+ exit 1 # Fail the workflow if no changes in VERSIONLOG.MD
+ fi
diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml
new file mode 100644
index 00000000..73fb6f97
--- /dev/null
+++ b/.github/workflows/lint-code.yml
@@ -0,0 +1,130 @@
+name: Lint code
+on: [push, pull_request]
+
+jobs:
+ # Use ruff to check for code style violations
+ ruff-check:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install ruff
+ - name: ruff --> Check for style violations
+ # Configured in pyproject.toml
+ run: ruff check .
+
+ # Use ruff to check code formatting
+ ruff-format:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install ruff
+ - name: ruff --> Check code formatting
+ run: ruff format --check .
+
+ # Use mypy for static type checking
+ mypy-check:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install mypy
+ # Start by installing type stubs
+ - name: mypy --> Install stubs
+ run: echo -e "y" | mypy --install-types **/*.py || exit 0
+ - name: mypy --> Static type checking
+ # Configured in pyproject.toml
+ run: mypy **/*.py
+
+ # Use pipreqs to check for missing dependencies
+ pipreqs-check:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+
+ - name: Install pipreqs
+ run: pip install pipreqs
+
+ - name: Install requirements
+ run: pip install -r requirements.txt
+
+ - name: Run pipreqs
+ run: pipreqs --savepath pipreqs.txt
+
+ - name: Compare requirements
+ run: |
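+ # Compares the committed requirements.txt against the pipreqs.txt generated above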
+ # Extract and sort package names
+ awk '{print $1}' requirements.txt | sort -u > requirements.txt.compare
+ awk -F'==' '{print $1}' pipreqs.txt | sort -u > pipreqs.txt.compare
+
+ # Compare package lists
+ if cmp -s requirements.txt.compare pipreqs.txt.compare
+ then
+ echo "Requirements are the same"
+ exit 0
+ else
+ echo "Requirements are different"
+ exit 1
+ fi
+
+ # Use Prettier to check various file formats
+ prettier:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+ - name: Setup node
+ uses: actions/setup-node@v4
+ with:
+ node-version: "20"
+
+ - name: Install Prettier
+ run: npm install -g prettier
+
+ - name: Run Prettier --check
+ run: prettier --check .
+
+ # Use editorconfig to check all remaining file formats
+ editorconfig:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Setup node
+ uses: actions/setup-node@v4
+ with:
+ node-version: "20"
+
+ - name: Install editorconfig-checker
+ run: npm install -g editorconfig-checker
+
+ - name: editorconfig --> Lint files
+ run: editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html')
diff --git a/.gitignore b/.gitignore
index eb7ce2ba..91b6d2ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,5 @@ _build
.benchmarks
.coverage
__pycache__
+.pytest_cache
+.vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..1c09ed2d
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,19 @@
+# .pre-commit-config.yaml
+repos:
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.1.6
+ hooks:
+ - id: ruff
+ - id: ruff-format
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: "v1.7.1"
+ hooks:
+ - id: mypy
+ - repo: https://github.com/pre-commit/mirrors-prettier
+ rev: "v4.0.0-alpha.8"
+ hooks:
+ - id: prettier
+ - repo: https://github.com/editorconfig-checker/editorconfig-checker.python
+ rev: "2.7.2"
+ hooks:
+ - id: editorconfig-checker
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..17ba1fbc
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+title = "taca"
+
+
+[tool.ruff.lint]
+select = [
+ # Ruff default rules
+ # ------------------------------
+ "E4", # pycodestyle Imports
+ "E7", # pycodestyle Statements
+ "E9", # pycodestyle Runtime
+ "F", # Pyflakes
+
+ # Additional rules
+ # ------------------------------------------------------
+ "I", # isort Best-practice sorting of imports
+ "UP", # pyupgrade Make sure syntax is up-to-date
+]
+ignore = [
+ "E402", # Module level import not at top of file
+ "E722", # Do not use bare 'except'
+ "E741", # Ambiguous variable name
+]
+
+
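+# Lenient settings: silence errors from unresolved imports and skip analysis of imported modules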
+[tool.mypy]
+ignore_missing_imports = true
+follow_imports = 'skip'
diff --git a/requirements-dev.txt b/requirements-dev.txt
index af58407f..9118bd64 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -5,3 +5,8 @@ mock
sphinx
sphinx-rtd-theme
pytest
+ipython
+ipdb
+ruff
+mypy
+pipreqs
From 8f1f5d3ee27cacf53d82788420ec947cbd378f2a Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Wed, 17 Jan 2024 14:23:07 +0100
Subject: [PATCH 02/44] update outdated readme
---
README.md | 103 +++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 87 insertions(+), 16 deletions(-)
diff --git a/README.md b/README.md
index 50ce07c3..776051d6 100644
--- a/README.md
+++ b/README.md
@@ -4,34 +4,105 @@
-## Tool for the Automation of Cleanup and Analyses
+# Tool for the Automation of Cleanup and Analyses
[](http://badge.fury.io/py/taca)
[](https://travis-ci.org/SciLifeLab/TACA)
[](https://readthedocs.org/projects/taca/?badge=latest)
[](https://codecov.io/gh/scilifelab/taca)
-This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://portal.scilifelab.se/genomics/) in Stockholm, Sweden.
+This package contains several tools for projects and data management in the [National Genomics Infrastructure](https://ngisweden.scilifelab.se/) in Stockholm, Sweden.
-### Install for development
-You can install your own fork of taca in for instance a local conda environment for development. Provided you have conda installed:
+## Installation
+
+Inside the repo, run `pip install .`
+
+## Development
+
+Run `pip install -r requirements-dev.txt` to install packages used for development and `pip install -e .` to make the installation editable.
+
+### Automated linting
+
+This repo is configured for automated linting. Linter parameters are defined in `pyproject.toml`.
+
+As of now, we use:
+
+- [ruff](https://docs.astral.sh/ruff/) to perform automated formatting and a variety of lint checks.
+ - Run with `ruff check .` and `ruff format .`
+- [mypy](https://mypy.readthedocs.io/en/stable/) for static type checking and to prevent contradictory type annotations.
+ - Run with `mypy **/*.py`
+- [pipreqs](https://github.com/bndr/pipreqs) to check that the requirement files are up-to-date with the code.
+
+ - This is run as a custom Bash script in GitHub Actions, which only compares the lists of package names.
+
+ ```
+ # Extract and sort package names
+ awk '{print $1}' requirements.txt | sort -u > requirements.txt.compare
+ awk -F'==' '{print $1}' pipreqs.txt | sort -u > pipreqs.txt.compare
+
+ # Compare package lists
+ if cmp -s requirements.txt.compare pipreqs.txt.compare
+ then
+ echo "Requirements are the same"
+ exit 0
+ else
+ echo "Requirements are different"
+ exit 1
+ fi
+ ```
+
+- [prettier](https://prettier.io/) to format common languages.
+ - Run with `prettier --check .` (or `prettier --write .` to apply the formatting)
+- [editorconfig-checker](https://github.com/editorconfig-checker/editorconfig-checker) to enforce `.editorconfig` rules for all files not covered by the tools above.
+ - Run with
+ ```
+ editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html')
+ ```
+
+#### [GitHub Actions](https://docs.github.com/en/actions)
+
+Configured in `.github/workflows/lint-code.yml`. Runs on every push and pull request, but will not change code or prevent merges.
+
+#### [Pre-commit](https://pre-commit.com/)
+
+Will prevent local commits that fail linting checks. Configured in `.pre-commit-config.yaml`.
+
+To set up pre-commit checking:
+
+1. Run `pip install pre-commit`
+2. Navigate to the repo root
+3. Run `pre-commit install`
+
+This can be disabled with `pre-commit uninstall`.
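+
+The hooks can also be run manually against the entire repository, e.g. right after installing them:
+
+```
+pre-commit run --all-files
+```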
+
+#### VS Code automation
+
+To enable automated linting in VS Code, go to the user `settings.json` and include the following lines:
```
-# clone the repo
-git clone https://github.com//TACA.git
+"[python]": {
+ "editor.defaultFormatter": "charliermarsh.ruff",
+}
+```
-# create an environment
-conda create -n taca_dev python=2.7
-conda activate taca_dev
+This will run `ruff`-mediated formatting with the same parameters as GitHub Actions and pre-commit every time VS Code is used to format code in the repository.
-# install TACA and dependencies for developoment
-cd TACA
-python setup.py develop
-pip install -r ./requirements-dev.txt
+To run formatting on save, include the lines:
-# Check that tests pass:
-cd tests && nosetests -v -s
```
+"[python]": {
+ "editor.formatOnSave": true,
+}
+```
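+
+Both snippets target the same `"[python]"` key, so when using both they should be merged into a single block:
+
+```
+"[python]": {
+ "editor.defaultFormatter": "charliermarsh.ruff",
+ "editor.formatOnSave": true,
+}
+```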
+
+### Git blame suppression
+
+When a non-invasive tool is used to tidy up a lot of code, it is useful to suppress the Git blame for that particular commit, so the original author can still be traced.
+
+To do this, add the hash of the commit containing the changes to `.git-blame-ignore-revs`, headed by an explanatory comment.
+
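+For local `git blame` to respect this file (Git 2.23 or newer), it can be configured once per clone:
+
+```
+git config blame.ignoreRevsFile .git-blame-ignore-revs
+```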
+
+### Deliver command
There is also a [plugin for the deliver command](https://github.com/SciLifeLab/taca-ngi-pipeline). To install this in the same development environment:
@@ -43,7 +114,7 @@ python setup.py develop
pip install -r ./requirements-dev.txt
# add required config files and env for taca delivery plugin
-echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
+echo "foo:bar" >> ~/.ngipipeline/ngi_config.yaml
mkdir ~/.taca && cp tests/data/taca_test_cfg.yaml ~/.taca/taca.yaml
export CHARON_BASE_URL="http://tracking.database.org"
export CHARON_API_TOKEN="charonapitokengoeshere"
From b9ee704ad4da26790e539b8fe1d39aa71f831ef1 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Wed, 17 Jan 2024 14:25:39 +0100
Subject: [PATCH 03/44] ruff check safe fixes
---
doc/conf.py | 26 ++--
setup.py | 11 +-
taca/analysis/analysis.py | 84 +++++++------
taca/analysis/analysis_nanopore.py | 10 +-
taca/backup/backup.py | 72 +++++------
taca/backup/cli.py | 2 +
taca/cleanup/cleanup.py | 72 +++++------
taca/cleanup/cli.py | 2 +
taca/cli.py | 6 +-
taca/illumina/MiSeq_Runs.py | 24 ++--
taca/illumina/Runs.py | 160 ++++++++++++-------------
taca/illumina/Standard_Runs.py | 64 +++++-----
taca/nanopore/ONT_run_classes.py | 32 ++---
taca/nanopore/instrument_transfer.py | 16 ++-
taca/server_status/cli.py | 7 +-
taca/server_status/cronjobs.py | 12 +-
taca/server_status/server_status.py | 10 +-
taca/testing/cli.py | 4 +-
taca/testing/create_uppmax_like_env.py | 121 ++++++++++---------
taca/utils/bioinfo_tab.py | 45 +++----
taca/utils/cli.py | 2 +
taca/utils/config.py | 11 +-
taca/utils/filesystem.py | 2 +-
taca/utils/misc.py | 22 ++--
taca/utils/statusdb.py | 22 ++--
taca/utils/transfer.py | 28 ++---
tests/test_analysis.py | 6 +-
tests/test_analysis_nanopore.py | 7 +-
tests/test_backup.py | 6 +-
tests/test_cleanup.py | 2 +-
tests/test_illumina.py | 24 ++--
tests/test_instrument_transfer.py | 22 ++--
tests/test_nanopore.py | 6 +-
tests/test_server_status.py | 13 +-
tests/test_utils.py | 20 ++--
35 files changed, 489 insertions(+), 484 deletions(-)
diff --git a/doc/conf.py b/doc/conf.py
index cb58a377..5c1d130e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
#
# TACA documentation build configuration file, created by
# sphinx-quickstart on Wed Sep 17 12:39:41 2014.
@@ -12,7 +11,6 @@
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
@@ -49,8 +47,8 @@
master_doc = 'index'
# General information about the project.
-project = u'TACA'
-copyright = u'2014, Guillermo Carrasco'
+project = 'TACA'
+copyright = '2014, Guillermo Carrasco'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -207,8 +205,8 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- ('index', 'TACA.tex', u'TACA Documentation',
- u'Guillermo Carrasco', 'manual'),
+ ('index', 'TACA.tex', 'TACA Documentation',
+ 'Guillermo Carrasco', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
@@ -237,8 +235,8 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- ('index', 'taca', u'TACA Documentation',
- [u'Guillermo Carrasco'], 1)
+ ('index', 'taca', 'TACA Documentation',
+ ['Guillermo Carrasco'], 1)
]
# If true, show URL addresses after external links.
@@ -251,8 +249,8 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- ('index', 'TACA', u'TACA Documentation',
- u'Guillermo Carrasco', 'TACA', 'One line description of project.',
+ ('index', 'TACA', 'TACA Documentation',
+ 'Guillermo Carrasco', 'TACA', 'One line description of project.',
'Miscellaneous'),
]
@@ -272,10 +270,10 @@
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
-epub_title = u'TACA'
-epub_author = u'Guillermo Carrasco'
-epub_publisher = u'Guillermo Carrasco'
-epub_copyright = u'2014, Guillermo Carrasco'
+epub_title = 'TACA'
+epub_author = 'Guillermo Carrasco'
+epub_publisher = 'Guillermo Carrasco'
+epub_copyright = '2014, Guillermo Carrasco'
# The basename for the epub file. It defaults to the project name.
#epub_basename = u'TACA'
diff --git a/setup.py b/setup.py
index cc05b49c..d8962c06 100644
--- a/setup.py
+++ b/setup.py
@@ -1,21 +1,20 @@
-from setuptools import setup, find_packages
import glob
-import os
-import sys
+from io import open
+
+from setuptools import find_packages, setup
from taca import __version__
-from io import open
try:
with open("requirements.txt", "r") as f:
install_requires = [x.strip() for x in f.readlines()]
-except IOError:
+except OSError:
install_requires = []
try:
with open("dependency_links.txt", "r") as f:
dependency_links = [x.strip() for x in f.readlines()]
-except IOError:
+except OSError:
dependency_links = []
diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py
index c817b064..2ef4aafd 100755
--- a/taca/analysis/analysis.py
+++ b/taca/analysis/analysis.py
@@ -2,22 +2,20 @@
import glob
import logging
import os
-import sys
import subprocess
+import sys
+from io import open
+from shutil import copyfile, copytree
+
+from flowcell_parser.classes import RunParametersParser
-from shutil import copyfile
-from shutil import copytree
-from taca.illumina.Standard_Runs import Standard_Run
from taca.illumina.MiSeq_Runs import MiSeq_Run
from taca.illumina.NextSeq_Runs import NextSeq_Run
from taca.illumina.NovaSeq_Runs import NovaSeq_Run
from taca.illumina.NovaSeqXPlus_Runs import NovaSeqXPlus_Run
+from taca.utils import statusdb
from taca.utils.config import CONFIG
from taca.utils.transfer import RsyncAgent
-from taca.utils import statusdb
-
-from flowcell_parser.classes import RunParametersParser
-from io import open
logger = logging.getLogger(__name__)
@@ -37,15 +35,15 @@ def get_runObj(run, software):
elif os.path.exists(os.path.join(run, 'RunParameters.xml')):
run_parameters_file = 'RunParameters.xml'
else:
- logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run))
+ logger.error(f'Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}')
return
run_parameters_path = os.path.join(run, run_parameters_file)
try:
run_parameters = RunParametersParser(run_parameters_path)
except OSError:
- logger.warn('Problems parsing the runParameters.xml file at {}. '
- 'This is quite unexpected. please archive the run {} manually'.format(run_parameters_path, run))
+ logger.warn(f'Problems parsing the runParameters.xml file at {run_parameters_path}. '
+ f'This is quite unexpected. please archive the run {run} manually')
else:
# Do a case by case test because there are so many version of RunParameters that there is no real other way
runtype = run_parameters.data['RunParameters'].get('InstrumentType',
@@ -110,8 +108,8 @@ def _upload_to_statusdb(run):
try:
PFclusters = parser.obj['Undetermined'][lane]['unknown']
except KeyError:
- logger.error('While taking extra care of lane {} of NoIndex type ' \
- 'I found out that not all values were available'.format(lane))
+ logger.error(f'While taking extra care of lane {lane} of NoIndex type ' \
+ 'I found out that not all values were available')
continue
# In Lanes_stats fix the lane yield
parser.obj['illumina']['Demultiplex_Stats']['Lanes_stats'][int(lane) - 1]['PF Clusters'] = str(PFclusters)
@@ -122,9 +120,9 @@ def _upload_to_statusdb(run):
updated += 1
sample['PF Clusters'] = str(PFclusters)
if updated != 1:
- logger.error('While taking extra care of lane {} of NoIndex type '
+ logger.error(f'While taking extra care of lane {lane} of NoIndex type '
'I updated more than once the barcode_lane. '
- 'This is too much to continue so I will fail.'.format(lane))
+ 'This is too much to continue so I will fail.')
os.sys.exit()
# If I am here it means I changed the HTML representation to something
# else to accomodate the wired things we do
@@ -144,7 +142,7 @@ def transfer_run(run_dir):
mail_recipients = CONFIG.get('mail', {}).get('recipients')
if runObj is None:
mail_recipients = CONFIG.get('mail', {}).get('recipients')
- logger.error('Trying to force a transfer of run {} but the sequencer was not recognized.'.format(run_dir))
+ logger.error(f'Trying to force a transfer of run {run_dir} but the sequencer was not recognized.')
else:
runObj.transfer_run(os.path.join('nosync', CONFIG['analysis']['status_dir'], 'transfer.tsv'), mail_recipients)
@@ -170,7 +168,7 @@ def transfer_runfolder(run_dir, pid, exclude_lane):
try:
with open(new_sample_sheet, 'w') as nss:
nss.write(extract_project_samplesheet(original_sample_sheet, pid_list))
- except IOError as e:
+ except OSError as e:
logger.error('An error occured while parsing the samplesheet. '
'Please check the sample sheet and try again.')
raise e
@@ -185,14 +183,14 @@ def transfer_runfolder(run_dir, pid, exclude_lane):
dir_for_excluding_lane = []
lane_to_exclude = exclude_lane.split(',')
for lane in lane_to_exclude:
- if os.path.isdir('{}/{}/Thumbnail_Images/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Thumbnail_Images/L00{}'.format(lane)])
- if os.path.isdir('{}/{}/Images/Focus/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Images/Focus/L00{}'.format(lane)])
- if os.path.isdir('{}/{}/Data/Intensities/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/L00{}'.format(lane)])
- if os.path.isdir('{}/{}/Data/Intensities/BaseCalls/L00{}'.format(run_dir_path, dir_name, lane)):
- dir_for_excluding_lane.extend(['--exclude', 'Data/Intensities/BaseCalls/L00{}'.format(lane)])
+ if os.path.isdir(f'{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}'):
+ dir_for_excluding_lane.extend(['--exclude', f'Thumbnail_Images/L00{lane}'])
+ if os.path.isdir(f'{run_dir_path}/{dir_name}/Images/Focus/L00{lane}'):
+ dir_for_excluding_lane.extend(['--exclude', f'Images/Focus/L00{lane}'])
+ if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}'):
+ dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/L00{lane}'])
+ if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}'):
+ dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/BaseCalls/L00{lane}'])
try:
exclude_options_for_tar = ['--exclude', 'Demultiplexing*',
@@ -244,7 +242,7 @@ def transfer_runfolder(run_dir, pid, exclude_lane):
os.remove(new_sample_sheet)
os.remove(archive)
os.remove(md5file)
- except IOError as e:
+ except OSError as e:
logger.error('Was not able to delete all temporary files')
raise e
return
@@ -271,32 +269,32 @@ def _process(run):
:param taca.illumina.Run run: Run to be processed and transferred
"""
- logger.info('Checking run {}'.format(run.id))
+ logger.info(f'Checking run {run.id}')
transfer_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv')
if run.is_transferred(transfer_file): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot
- logger.info('Run {} already transferred to analysis server, skipping it'.format(run.id))
+ logger.info(f'Run {run.id} already transferred to analysis server, skipping it')
return
if run.get_run_status() == 'SEQUENCING':
- logger.info('Run {} is not finished yet'.format(run.id))
+ logger.info(f'Run {run.id} is not finished yet')
if 'statusdb' in CONFIG:
_upload_to_statusdb(run)
elif run.get_run_status() == 'TO_START':
if run.get_run_type() == 'NON-NGI-RUN':
# For now MiSeq specific case. Process only NGI-run, skip all the others (PhD student runs)
- logger.warn('Run {} marked as {}, '
+ logger.warn(f'Run {run.id} marked as {run.get_run_type()}, '
'TACA will skip this and move the run to '
- 'no-sync directory'.format(run.id, run.get_run_type()))
+ 'no-sync directory')
if 'storage' in CONFIG:
run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
return
- logger.info(('Starting BCL to FASTQ conversion and demultiplexing for run {}'.format(run.id)))
+ logger.info(f'Starting BCL to FASTQ conversion and demultiplexing for run {run.id}')
if 'statusdb' in CONFIG:
_upload_to_statusdb(run)
run.demultiplex_run()
elif run.get_run_status() == 'IN_PROGRESS':
- logger.info(('BCL conversion and demultiplexing process in '
- 'progress for run {}, skipping it'.format(run.id)))
+ logger.info('BCL conversion and demultiplexing process in '
+ f'progress for run {run.id}, skipping it')
# Upload to statusDB if applies
if 'statusdb' in CONFIG:
_upload_to_statusdb(run)
@@ -307,7 +305,7 @@ def _process(run):
# a cycle take the last if out of the elif
if run.get_run_status() == 'COMPLETED':
run.check_run_status()
- logger.info(('Preprocessing of run {} is finished, transferring it'.format(run.id)))
+ logger.info(f'Preprocessing of run {run.id} is finished, transferring it')
# Upload to statusDB if applies
if 'statusdb' in CONFIG:
_upload_to_statusdb(run)
@@ -317,10 +315,10 @@ def _process(run):
demux_summary_message.append("Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(demux_id, demux_log['errors'], demux_log['warnings']))
demux_summary_message.append("\n".join(demux_log['error_and_warning_messages'][:5]))
if len(demux_log['error_and_warning_messages'])>5:
- demux_summary_message.append("...... Only the first 5 errors or warnings are displayed for Demultiplexing_{}.".format(demux_id))
+ demux_summary_message.append(f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}.")
# Notify with a mail run completion and stats uploaded
if demux_summary_message:
- sbt = ("{} Demultiplexing Completed with ERRORs or WARNINGS!".format(run.id))
+ sbt = (f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!")
msg = """The run {run} has been demultiplexed with errors or warnings!
{errors_warnings}
@@ -331,7 +329,7 @@ def _process(run):
""".format(errors_warnings='\n'.join(demux_summary_message), run=run.id)
else:
- sbt = ("{} Demultiplexing Completed!".format(run.id))
+ sbt = (f"{run.id} Demultiplexing Completed!")
msg = """The run {run} has been demultiplexed without any error or warning.
The Run will be transferred to the analysis cluster for further analysis.
@@ -345,7 +343,7 @@ def _process(run):
if 'mfs_path' in CONFIG['analysis']:
try:
mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id)
- logger.info('Copying demultiplex stats, InterOp metadata and XML files for run {} to {}'.format(run.id, mfs_dest))
+ logger.info(f'Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}')
if not os.path.exists(mfs_dest):
os.mkdir(mfs_dest)
demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports',
@@ -364,7 +362,7 @@ def _process(run):
if os.path.exists(interop_src):
copytree(interop_src, os.path.join(mfs_dest, 'InterOp'), dirs_exist_ok=True)
except:
- logger.warn('Could not copy demultiplex stats, InterOp metadata or XML files for run {}'.format(run.id))
+ logger.warn(f'Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}')
# Transfer to analysis server if flag is True
if run.transfer_to_analysis_server:
@@ -383,7 +381,7 @@ def _process(run):
# Determine the run type
runObj = get_runObj(run, software)
if not runObj:
- raise RuntimeError("Unrecognized instrument type or incorrect run folder {}".format(run))
+ raise RuntimeError(f"Unrecognized instrument type or incorrect run folder {run}")
else:
_process(runObj)
else:
@@ -394,12 +392,12 @@ def _process(run):
for _run in runs:
runObj = get_runObj(_run, software)
if not runObj:
- logger.warning('Unrecognized instrument type or incorrect run folder {}'.format(run))
+ logger.warning(f'Unrecognized instrument type or incorrect run folder {run}')
else:
try:
_process(runObj)
except:
# This function might throw and exception,
# it is better to continue processing other runs
- logger.warning('There was an error processing the run {}'.format(run))
+ logger.warning(f'There was an error processing the run {run}')
pass
diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py
index 74e4c3ef..9141551a 100644
--- a/taca/analysis/analysis_nanopore.py
+++ b/taca/analysis/analysis_nanopore.py
@@ -1,17 +1,17 @@
"""Nanopore analysis methods for TACA."""
-import os
import logging
+import os
import re
import traceback
-from taca.utils.config import CONFIG
-from taca.utils.misc import send_mail
from taca.nanopore.ONT_run_classes import (
+ ONT_RUN_PATTERN,
+ ONT_qc_run,
ONT_run,
ONT_user_run,
- ONT_qc_run,
- ONT_RUN_PATTERN,
)
+from taca.utils.config import CONFIG
+from taca.utils.misc import send_mail
logger = logging.getLogger(__name__)
diff --git a/taca/backup/backup.py b/taca/backup/backup.py
index 037b1ea6..88a4188f 100644
--- a/taca/backup/backup.py
+++ b/taca/backup/backup.py
@@ -1,31 +1,31 @@
"""Backup methods and utilities."""
+import csv
import logging
import os
import re
import shutil
import subprocess as sp
import time
-import csv
-
from datetime import datetime
-from taca.utils.config import CONFIG
-from taca.utils import statusdb, filesystem, misc
from io import open
+from taca.utils import filesystem, misc, statusdb
+from taca.utils.config import CONFIG
+
logger = logging.getLogger(__name__)
-class run_vars(object):
+class run_vars:
"""A simple variable storage class."""
def __init__(self, run, archive_path):
self.abs_path = os.path.abspath(run)
self.path, self.name = os.path.split(self.abs_path)
self.name = self.name.split('.', 1)[0]
self.zip = os.path.join(archive_path, f'{self.name}.tar.gz')
- self.key = '{}.key'.format(self.name)
- self.key_encrypted = '{}.key.gpg'.format(self.name)
+ self.key = f'{self.name}.key'
+ self.key_encrypted = f'{self.name}.key.gpg'
self.zip_encrypted = os.path.join(archive_path, f'{self.name}.tar.gz.gpg')
-class backup_utils(object):
+class backup_utils:
"""A class object with main utility methods related to backing up."""
def __init__(self, run=None):
@@ -49,7 +49,7 @@ def fetch_config_info(self):
self.copy_complete_indicator = CONFIG.get('storage', {}).get('copy_complete_indicator', 'CopyComplete.txt')
self.archive_log_location = CONFIG['backup']['archive_log']
except KeyError as e:
- logger.error('Config file is missing the key {}, make sure it have all required information'.format(str(e)))
+ logger.error(f'Config file is missing the key {str(e)}, make sure it have all required information')
raise SystemExit
def collect_runs(self, ext=None, filter_by_ext=False):
@@ -60,14 +60,14 @@ def collect_runs(self, ext=None, filter_by_ext=False):
archive_path = self.archive_dirs[run_type]
run = run_vars(self.run, archive_path)
if not (re.match(filesystem.RUN_RE, run.name) or re.match(filesystem.RUN_RE_ONT, run.name)):
- logger.error('Given run {} did not match a FC pattern'.format(self.run))
+ logger.error(f'Given run {self.run} did not match a FC pattern')
raise SystemExit
if self._is_ready_to_archive(run, ext):
self.runs.append(run)
else:
for adir in self.archive_dirs.values():
if not os.path.isdir(adir):
- logger.warn('Path {} does not exist or it is not a directory'.format(adir))
+ logger.warn(f'Path {adir} does not exist or it is not a directory')
continue
for item in os.listdir(adir):
if filter_by_ext and not item.endswith(ext):
@@ -103,11 +103,11 @@ def avail_disk_space(self, path, run):
df_out, df_err = df_proc.communicate()
available_size = int(df_out.strip().decode("utf-8").split('\n')[-1].strip().split()[3])/1024/1024
except Exception as e:
- logger.error('Evaluation of disk space failed with error {}'.format(e))
+ logger.error(f'Evaluation of disk space failed with error {e}')
raise SystemExit
if available_size < required_size:
- e_msg = 'Required space for encryption is {}GB, but only {}GB available'.format(required_size, available_size)
- subjt = 'Low space for encryption - {}'.format(self.host_name)
+ e_msg = f'Required space for encryption is {required_size}GB, but only {available_size}GB available'
+ subjt = f'Low space for encryption - {self.host_name}'
logger.error(e_msg)
misc.send_mail(subjt, e_msg, self.mail_recipients)
raise SystemExit
@@ -146,7 +146,7 @@ def _get_run_type(self, run):
else:
run_type = ''
except:
- logger.warn('Could not fetch run type for run {}'.format(run))
+ logger.warn(f'Could not fetch run type for run {run}')
return run_type
def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_failed=False, tmp_files=[]):
@@ -194,7 +194,7 @@ def _check_status(self, cmd, status, err_msg, mail_failed, files_to_remove=[]):
if status != 0:
self._clean_tmp_files(files_to_remove)
if mail_failed:
- subjt = 'Command call failed - {}'.format(self.host_name)
+ subjt = f'Command call failed - {self.host_name}'
e_msg = 'Called cmd: {}\n\nError msg: {}'.format(' '.join(cmd), err_msg)
misc.send_mail(subjt, e_msg, self.mail_recipients)
logger.error('Command "{}" failed with the error "{}"'.format(' '.join(cmd),err_msg))
@@ -215,7 +215,7 @@ def _log_pdc_statusdb(self, run):
run_date = run_vals[0][2:]
else:
run_date = run_vals[0]
- run_fc = '{}_{}'.format(run_date, run_vals[-1])
+ run_fc = f'{run_date}_{run_vals[-1]}'
couch_connection = statusdb.StatusdbSession(self.couch_info).connection
db = couch_connection[self.couch_info['db']]
fc_names = {e.key:e.id for e in db.view('names/name', reduce=False)}
@@ -223,9 +223,9 @@ def _log_pdc_statusdb(self, run):
doc = db.get(d_id)
doc['pdc_archived'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
db.save(doc)
- logger.info('Logged "pdc_archived" timestamp for fc {} in statusdb doc "{}"'.format(run, d_id))
+ logger.info(f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"')
except:
- logger.warn('Not able to log "pdc_archived" timestamp for run {}'.format(run))
+ logger.warn(f'Not able to log "pdc_archived" timestamp for run {run}')
def _is_ready_to_archive(self, run, ext):
"""Check if the run to be encrypted has finished sequencing and has been copied completely to nas"""
@@ -258,7 +258,7 @@ def _move_run_to_archived(self, run):
run_type = self._get_run_type(run.name)
archived_path = self.archived_dirs[run_type]
if os.path.isdir(archived_path):
- logger.info('Moving run {} to the archived folder'.format(run.name))
+ logger.info(f'Moving run {run.name} to the archived folder')
shutil.move(run.name, archived_path)
else:
logger.warning("Cannot move run to archived, destination does not exist")
@@ -306,8 +306,8 @@ def encrypt_runs(cls, run, force):
continue
# Remove encrypted file if already exists
if os.path.exists(run.zip_encrypted):
- logger.warn((f'Removing already existing encrypted file for run {run.name}, this is a precaution '
- 'to make sure the file was encrypted with correct key file'))
+ logger.warn(f'Removing already existing encrypted file for run {run.name}, this is a precaution '
+ 'to make sure the file was encrypted with correct key file')
bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted])
# Generate random key to use as pasphrase
if not bk._call_commands(cmd1='gpg --gen-random 1 256', out_file=run.key, tmp_files=tmp_files):
@@ -356,41 +356,41 @@ def pdc_put(cls, run):
"""Archive the collected runs to PDC."""
bk = cls(run)
bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True)
- logger.info('In total, found {} run(s) to send PDC'.format(len(bk.runs)))
+ logger.info(f'In total, found {len(bk.runs)} run(s) to send PDC')
for run in bk.runs:
- run.flag = '{}.archiving'.format(run.name)
+ run.flag = f'{run.name}.archiving'
run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
if run.path not in bk.archive_dirs.values():
- logger.error(('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate '
- 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name)))
+ logger.error('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate '
+ 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name))
continue
if not os.path.exists(run.dst_key_encrypted):
- logger.error('Encrypted key file {} is not found for file {}, skipping it'.format(run.dst_key_encrypted, run.zip_encrypted))
+ logger.error(f'Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it')
continue
with filesystem.chdir(run.path):
#skip run if being encrypted
- if os.path.exists('{}.encrypting'.format(run.name)):
- logger.warn('Run {} is currently being encrypted, so skipping now'.format(run.name))
+ if os.path.exists(f'{run.name}.encrypting'):
+ logger.warn(f'Run {run.name} is currently being encrypted, so skipping now')
continue
# skip run if already ongoing
if os.path.exists(run.flag):
- logger.warn('Run {} is already being archived, so skipping now'.format(run.name))
+ logger.warn(f'Run {run.name} is already being archived, so skipping now')
continue
if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False):
- logger.warn('Seems like files related to run {} already exist in PDC, check and cleanup'.format(run.name))
+ logger.warn(f'Seems like files related to run {run.name} already exist in PDC, check and cleanup')
continue
flag = open(run.flag, 'w').close()
- logger.info('Sending file {} to PDC'.format(run.zip_encrypted))
- if bk._call_commands(cmd1='dsmc archive {}'.format(run.zip_encrypted), tmp_files=[run.flag]):
+ logger.info(f'Sending file {run.zip_encrypted} to PDC')
+ if bk._call_commands(cmd1=f'dsmc archive {run.zip_encrypted}', tmp_files=[run.flag]):
time.sleep(15) # give some time just in case 'dsmc' needs to settle
- if bk._call_commands(cmd1='dsmc archive {}'.format(run.dst_key_encrypted), tmp_files=[run.flag]):
+ if bk._call_commands(cmd1=f'dsmc archive {run.dst_key_encrypted}', tmp_files=[run.flag]):
time.sleep(5) # give some time just in case 'dsmc' needs to settle
if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted):
- logger.info('Successfully sent file {} to PDC, moving file locally from {} to archived folder'.format(run.zip_encrypted, run.path))
+ logger.info(f'Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to archived folder')
bk.log_archived_run(run.zip_encrypted)
if bk.couch_info:
bk._log_pdc_statusdb(run.name)
bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag])
bk._move_run_to_archived(run)
continue
- logger.warn('Sending file {} to PDC failed'.format(run.zip_encrypted))
+ logger.warn(f'Sending file {run.zip_encrypted} to PDC failed')
diff --git a/taca/backup/cli.py b/taca/backup/cli.py
index 07cce810..89128002 100644
--- a/taca/backup/cli.py
+++ b/taca/backup/cli.py
@@ -1,7 +1,9 @@
"""CLI for the backup subcommand."""
import click
+
from taca.backup.backup import backup_utils as bkut
+
@click.group()
@click.pass_context
def backup(ctx):
diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py
index 07600870..80ef1905 100644
--- a/taca/cleanup/cleanup.py
+++ b/taca/cleanup/cleanup.py
@@ -2,16 +2,16 @@
import logging
import os
import re
-
from collections import defaultdict
from datetime import datetime
from glob import glob
-
-from taca.utils.config import CONFIG, load_config
-from taca.utils import filesystem, misc, statusdb
from io import open
+
from six.moves import map
+from taca.utils import filesystem, misc, statusdb
+from taca.utils.config import CONFIG, load_config
+
logger = logging.getLogger(__name__)
def cleanup_miarka(days_fastq, days_analysis,
@@ -59,9 +59,9 @@ def cleanup_miarka(days_fastq, days_analysis,
if date:
date = datetime.strptime(date, '%Y-%m-%d')
except KeyError as e:
- logger.error('Config file is missing the key {}, make sure it has all required information'.format(str(e)))
+ logger.error(f'Config file is missing the key {str(e)}, make sure it has all required information')
raise SystemExit
- except ValueError as e:
+ except ValueError:
logger.error('Date given with "--date" option is not in required format, see help for more info')
raise SystemExit
@@ -101,7 +101,7 @@ def cleanup_miarka(days_fastq, days_analysis,
fc_abs_path = os.path.join(flowcell_dir, fc)
with filesystem.chdir(fc_abs_path):
if not os.path.exists(flowcell_project_source):
- logger.warn('Flowcell {} does not contain a "{}" directory'.format(fc, flowcell_project_source))
+ logger.warn(f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory')
continue
projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
@@ -113,7 +113,7 @@ def cleanup_miarka(days_fastq, days_analysis,
continue
fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files))
if fc_undet_files:
- logger.info('All projects was cleaned for FC {}, found {} undeterminded files'.format(fc, len(fc_undet_files)))
+ logger.info(f'All projects was cleaned for FC {fc}, found {len(fc_undet_files)} undeterminded files')
all_undet_files.extend(list(map(os.path.abspath, fc_undet_files)))
if all_undet_files:
undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
@@ -142,7 +142,7 @@ def cleanup_miarka(days_fastq, days_analysis,
fc_abs_path = os.path.join(flowcell_dir, fc)
with filesystem.chdir(fc_abs_path):
if not os.path.exists(flowcell_project_source):
- logger.warn('Flowcell {} do not contain a "{}" direcotry'.format(fc, flowcell_project_source))
+ logger.warn(f'Flowcell {fc} do not contain a "{flowcell_project_source}" direcotry')
continue
projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \
@@ -201,7 +201,7 @@ def cleanup_miarka(days_fastq, days_analysis,
_def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])]))
raise SystemExit
- logger.info('Initial list is built with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list)))
+ logger.info(f'Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}')
if misc.query_yes_no('Interactively filter projects for cleanup ?', default='yes'):
filtered_project, proj_count = ([], 0)
#go through complied project list and remove files
@@ -209,15 +209,15 @@ def cleanup_miarka(days_fastq, days_analysis,
proj_count += 1
if not misc.query_yes_no('{}Delete files for this project ({}/{})'.format(get_proj_meta_info(info, days_fastq),
proj_count, len(project_clean_list)), default='no'):
- logger.info('Will not remove files for project {}'.format(proj))
+ logger.info(f'Will not remove files for project {proj}')
filtered_project.append(proj)
# remove projects that were decided not to delete
map(project_clean_list.pop, filtered_project)
- logger.info('Removed {}/{} projects from initial list'.format(len(filtered_project), proj_count))
+ logger.info(f'Removed {len(filtered_project)}/{proj_count} projects from initial list')
if not project_clean_list:
logger.info('There are no projects to clean after filtering')
return
- logger.info('Final list is created with {} projects {}'.format(len(project_clean_list), get_files_size_text(project_clean_list)))
+ logger.info(f'Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}')
if not misc.query_yes_no('Proceed with cleanup ?', default='no'):
logger.info('Aborting cleanup')
return
@@ -226,21 +226,21 @@ def cleanup_miarka(days_fastq, days_analysis,
for proj, info in project_clean_list.items():
fastq_info = info.get('fastq_to_remove')
if fastq_info and isinstance(fastq_info, dict):
- logger.info('Cleaning fastq files for project {}'.format(proj))
+ logger.info(f'Cleaning fastq files for project {proj}')
fastq_fc = fastq_info.get('flowcells', {})
removed_fc = []
for fc, fc_info in fastq_fc.items():
proj_fc_root = fc_info['proj_root']
- logger.info('Removing fastq files from {}'.format(proj_fc_root))
+ logger.info(f'Removing fastq files from {proj_fc_root}')
if not dry_run:
if _remove_files(fc_info['fq_files']):
- logger.info('Removed fastq files from FC {} for project {}, marking it as cleaned'.format(fc, proj))
+ logger.info(f'Removed fastq files from FC {fc} for project {proj}, marking it as cleaned')
_touch_cleaned(proj_fc_root)
removed_fc.append(fc)
if len(fastq_fc) == len(removed_fc):
try:
proj_data_root = fastq_info['proj_data']['proj_data_root']
- logger.info('All flowcells cleaned for this project, marking it as cleaned in {}'.format(proj_data_root))
+ logger.info(f'All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}')
_touch_cleaned(proj_data_root)
except:
pass
@@ -248,18 +248,18 @@ def cleanup_miarka(days_fastq, days_analysis,
analysis_info = info.get('analysis_to_remove')
if analysis_info and isinstance(analysis_info, dict):
proj_analysis_root = analysis_info['proj_analysis_root']
- logger.info('cleaning analysis data for project {}'.format(proj))
+ logger.info(f'cleaning analysis data for project {proj}')
removed_qc = []
for qc, files in analysis_info['analysis_files'].items():
- logger.info('Removing files of "{}" from {}'.format(qc, proj_analysis_root))
+ logger.info(f'Removing files of "{qc}" from {proj_analysis_root}')
if not dry_run:
if _remove_files(files):
removed_qc.append(qc)
else:
- logger.warn('Could not remove some files in qc directory "{}"'.format(qc))
+ logger.warn(f'Could not remove some files in qc directory "{qc}"')
map(analysis_info['analysis_files'].pop, removed_qc)
if len(analysis_info['analysis_files']) == 0:
- logger.info('Removed analysis data for project {}, marking it cleaned'.format(proj))
+ logger.info(f'Removed analysis data for project {proj}, marking it cleaned')
_touch_cleaned(proj_analysis_root)
@@ -273,7 +273,7 @@ def get_closed_proj_info(prj, pdoc, tdate=None):
if not tdate:
tdate = datetime.today()
if not pdoc:
- logger.warn('Seems like project {} does not have a proper statusdb document, skipping it'.format(prj))
+ logger.warn(f'Seems like project {prj} does not have a proper statusdb document, skipping it')
elif 'close_date' in pdoc:
closed_date = pdoc['close_date']
try:
@@ -348,9 +348,9 @@ def get_proj_meta_info(info, days_fastq):
template = '\n'
def _get_template_string(h, v):
try:
- v = '{}: {}\n'.format(h, v)
+ v = f'{h}: {v}\n'
except:
- v = '{}: Problem getting this'.format(h)
+ v = f'{h}: Problem getting this'
return v
template += _get_template_string('Project overview', info.get('name'))
template += _get_template_string('Project ID', info.get('pid'))
@@ -367,13 +367,13 @@ def _get_template_string(h, v):
elif isinstance(analysis_info, dict):
f_stat = []
for qc_type, files in analysis_info['analysis_files'].items():
- f_stat.append('{} ({} files)'.format(qc_type, len(files)))
+ f_stat.append(f'{qc_type} ({len(files)} files)')
template += 'Project analyzed: {}\n'.format(', '.join(f_stat))
# set fastq info based upon what we have
fq_info = info.get('fastq_to_remove')
if isinstance(fq_info, str) and fq_info == "young":
- template += 'Project been closed less than {} days, so will not remove any fastq files\n'.format(days_fastq)
+ template += f'Project been closed less than {days_fastq} days, so will not remove any fastq files\n'
elif isinstance(fq_info, dict):
proj_fq_info = fq_info.get('proj_data')
if not proj_fq_info:
@@ -385,7 +385,7 @@ def _get_template_string(h, v):
fc_fq_info = fq_info.get('flowcells', {})
fc_num = len(fc_fq_info.keys())
fc_files = sum(map(len, [fc_info.get('fq_files', [])for fc_info in fc_fq_info.values()]))
- template += 'Flowcells: There are {} FC with total {} fastq files\n'.format(fc_num, fc_files)
+ template += f'Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n'
template += 'Estimated data size: {}\n'.format(_def_get_size_unit(info.get('fastq_size',0) + info.get('fastq_size', 0)))
return template
@@ -394,8 +394,8 @@ def get_files_size_text(plist):
"""Get project list dict and give back string with overll sizes."""
fsize = _def_get_size_unit(sum([i.get('fastq_size',0) for i in plist.values()]))
asize = _def_get_size_unit(sum([i.get('analysis_size',0) for i in plist.values()]))
- return '({f}{s}{a}) '.format(f = '~{} fastq data'.format(fsize) if fsize else '',
- a = '~{} analysis data'.format(asize) if asize else '',
+ return '({f}{s}{a}) '.format(f = f'~{fsize} fastq data' if fsize else '',
+ a = f'~{asize} analysis data' if asize else '',
s = ' and ' if fsize and asize else '')
def _def_get_size_unit(s):
@@ -405,15 +405,15 @@ def _def_get_size_unit(s):
gb = mb * 1000
tb = gb * 1000
if s > tb:
- s = '~{}tb'.format(int(s/tb))
+ s = f'~{int(s/tb)}tb'
elif s > gb:
- s = '~{}gb'.format(int(s/gb))
+ s = f'~{int(s/gb)}gb'
elif s > mb:
- s = '~{}mb'.format(int(s/mb))
+ s = f'~{int(s/mb)}mb'
elif s > kb:
- s = '~{}kb'.format(int(s/kb))
+ s = f'~{int(s/kb)}kb'
elif s > 0:
- s = '~{}b'.format(int(s/b))
+ s = f'~{int(s/b)}b'
return str(s)
def _remove_files(files):
@@ -423,7 +423,7 @@ def _remove_files(files):
try:
os.remove(fl)
except Exception as e:
- logger.warn('Could not remove file {} due to "{}"'.format(fl, e.message))
+ logger.warn(f'Could not remove file {fl} due to "{e.message}"')
status = False
return status
@@ -432,4 +432,4 @@ def _touch_cleaned(path):
try:
open(os.path.join(path, 'cleaned'), 'w').close()
except Exception as e:
- logger.warn('Could not create "cleaned" file in path {} due to "{}"'.format(path, e.message))
+ logger.warn(f'Could not create "cleaned" file in path {path} due to "{e.message}"')
diff --git a/taca/cleanup/cli.py b/taca/cleanup/cli.py
index 65abaf50..6410567b 100644
--- a/taca/cleanup/cli.py
+++ b/taca/cleanup/cli.py
@@ -1,8 +1,10 @@
"""CLI for the storage subcommand."""
import click
+
from taca.cleanup import cleanup as cln
from taca.utils import misc
+
@click.group()
@click.pass_context
@click.option('--status_db_config',
diff --git a/taca/cli.py b/taca/cli.py
index 1c78dabc..ad8d59b6 100644
--- a/taca/cli.py
+++ b/taca/cli.py
@@ -1,10 +1,10 @@
-# -*- coding: utf-8 -*-
import logging
import os
-from pkg_resources import iter_entry_points
+
import click
-import taca.log
+from pkg_resources import iter_entry_points
+import taca.log
from taca import __version__
from taca.utils import config as conf
diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py
index b90d734b..f6585801 100644
--- a/taca/illumina/MiSeq_Runs.py
+++ b/taca/illumina/MiSeq_Runs.py
@@ -1,8 +1,10 @@
+import logging
import os
import re
import shutil
-import logging
+
from flowcell_parser.classes import SampleSheetParser
+
from taca.illumina.Standard_Runs import Standard_Run
logger = logging.getLogger(__name__)
@@ -63,10 +65,10 @@ def _copy_samplesheet(self):
# Copy the original samplesheet locally.
# Copy again if already done as there might have been changes to the samplesheet
try:
- shutil.copy(ssname, os.path.join(self.run_dir, '{}.csv'.format(self.flowcell_id)))
+ shutil.copy(ssname, os.path.join(self.run_dir, f'{self.flowcell_id}.csv'))
ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
except:
- raise RuntimeError("unable to copy file {} to destination {}".format(ssname, self.run_dir))
+ raise RuntimeError(f"unable to copy file {ssname} to destination {self.run_dir}")
# This sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
# to be used it needs some editing.
@@ -86,7 +88,7 @@ def _copy_samplesheet(self):
except Exception as e:
logger.error(e)
return False
- logger.info(('Created SampleSheet_copy.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest)))
+ logger.info(f'Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} ')
# SampleSheet.csv generated
# When demultiplexing SampleSheet.csv is the one I need to use
self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet_copy.csv'))
@@ -99,7 +101,7 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
Will also replace 10X or Smart-seq indicies (e.g. SI-GA-A3 into TGTGCGGG)
Note that the index 2 of 10X or Smart-seq dual indexes will be converted to RC
"""
- output = u''
+ output = ''
compl = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
# Expand the ssparser if there are lanes with 10X or Smart-seq samples
index_dict_tenX = self._parse_10X_indexes(indexfile['tenX'])
@@ -143,12 +145,12 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
if not fields_to_remove:
fields_to_remove = []
# Header
- output += '[Header]{}'.format(os.linesep)
+ output += f'[Header]{os.linesep}'
for field in sorted(ssparser.header):
- output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip())
+ output += f'{field.rstrip()},{ssparser.header[field].rstrip()}'
output += os.linesep
# Data
- output += '[Data]{}'.format(os.linesep)
+ output += f'[Data]{os.linesep}'
datafields = []
for field in ssparser.datafields:
if field not in fields_to_remove:
@@ -163,13 +165,13 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
try:
if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR:
# Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix
- value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm]))
+ value = re.sub('__qPCR_$', '', f'Sample_{line[ssparser.dfield_snm]}')
else:
# Substitute SampleID with SampleName, add Sample_ as prefix
- value ='Sample_{}'.format(line[ssparser.dfield_snm])
+ value =f'Sample_{line[ssparser.dfield_snm]}'
except:
# Otherwise add Sample_ as prefix
- value = 'Sample_{}'.format(line[ssparser.dfield_sid])
+ value = f'Sample_{line[ssparser.dfield_sid]}'
elif rename_qPCR_suffix and field in fields_qPCR:
value = re.sub('__qPCR_$', '', line[field])
line_ar.append(value)
diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py
index e479e800..fa4618c8 100644
--- a/taca/illumina/Runs.py
+++ b/taca/illumina/Runs.py
@@ -1,27 +1,27 @@
-import os
-import re
import csv
-import logging
-import subprocess
-import shutil
import glob
import json
-
+import logging
+import os
+import re
+import shutil
+import subprocess
from datetime import datetime
+from flowcell_parser.classes import LaneBarcodeParser, RunParser, SampleSheetParser
+
from taca.utils import misc
from taca.utils.misc import send_mail
-from flowcell_parser.classes import RunParser, LaneBarcodeParser, SampleSheetParser
logger = logging.getLogger(__name__)
-class Run(object):
+class Run:
""" Defines an Illumina run
"""
def __init__(self, run_dir, software, configuration):
if not os.path.exists(run_dir):
- raise RuntimeError("Could not locate run directory {}".format(run_dir))
+ raise RuntimeError(f"Could not locate run directory {run_dir}")
if 'analysis_server' not in configuration or \
'bcl2fastq' not in configuration or \
@@ -35,7 +35,7 @@ def __init__(self, run_dir, software, configuration):
logger.warning("Creating link from runParameters.xml to RunParameters.xml")
os.symlink('RunParameters.xml', os.path.join(run_dir, 'runParameters.xml'))
elif not os.path.exists(os.path.join(run_dir, 'runParameters.xml')):
- raise RuntimeError("Could not locate runParameters.xml in run directory {}".format(run_dir))
+ raise RuntimeError(f"Could not locate runParameters.xml in run directory {run_dir}")
self.run_dir = os.path.abspath(run_dir)
self.software = software
@@ -67,38 +67,38 @@ def check_run_status(self):
if self.software == 'bcl2fastq':
legacy_path = ''
elif self.software == 'bclconvert':
- legacy_path = "Reports/{}".format(self.legacy_dir)
+ legacy_path = f"Reports/{self.legacy_dir}"
# Check the status of running demux
# Collect all samplesheets generated before
samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) # A single digit, this hypothesis should hold for a while
all_demux_done = True
for samplesheet in samplesheets:
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
- demux_folder = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))
+ demux_folder = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")
# Check if this job is done
if os.path.exists(os.path.join(self.run_dir, demux_folder, legacy_path, 'Stats', 'DemultiplexingStats.xml')):
all_demux_done = all_demux_done and True
if self.software == 'bcl2fastq':
- demux_log = os.path.join(self.run_dir, "demux_{}_bcl2fastq.err".format(demux_id))
+ demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl2fastq.err")
elif self.software == 'bclconvert':
- demux_log = os.path.join(self.run_dir, "demux_{}_bcl-convert.err".format(demux_id))
+ demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl-convert.err")
else:
raise RuntimeError("Unrecognized software!")
if os.path.isfile(demux_log):
errors, warnings, error_and_warning_messages = self._check_demux_log(demux_id, demux_log)
else:
- raise RuntimeError("No demux log file found for sub-demultiplexing {}!".format(demux_id))
+ raise RuntimeError(f"No demux log file found for sub-demultiplexing {demux_id}!")
self.demux_summary[demux_id] = {'errors' : errors,
'warnings' : warnings,
'error_and_warning_messages' : error_and_warning_messages
}
if errors or warnings:
- logger.info("Sub-Demultiplexing in {} completed with {} errors and {} warnings!".format(demux_folder, errors, warnings))
+ logger.info(f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!")
else:
- logger.info("Sub-Demultiplexing in {} completed without any error or warning.".format(demux_folder))
+ logger.info(f"Sub-Demultiplexing in {demux_folder} completed without any error or warning.")
else:
all_demux_done = all_demux_done and False
- logger.info("Sub-Demultiplexing in {} not completed yet.".format(demux_folder))
+ logger.info(f"Sub-Demultiplexing in {demux_folder} not completed yet.")
# All demux jobs finished and all stats aggregated under Demultiplexing
# Aggreate all the results in the Demultiplexing folder
@@ -119,7 +119,7 @@ def _check_demux_log(self, demux_id, demux_log):
This function checks the log files of bcl2fastq/bclconvert
Errors or warnings will be captured and email notifications will be sent
"""
- with open(demux_log, 'r') as demux_log_file:
+ with open(demux_log) as demux_log_file:
demux_log_content = demux_log_file.readlines()
if self.software == 'bcl2fastq':
pattern = r'Processing completed with (\d+) errors and (\d+) warnings'
@@ -134,7 +134,7 @@ def _check_demux_log(self, demux_id, demux_log):
error_and_warning_messages.append(line)
return errors, warnings, error_and_warning_messages
else:
- raise RuntimeError("Bad format with log file demux_{}_bcl2fastq.err".format(demux_id))
+ raise RuntimeError(f"Bad format with log file demux_{demux_id}_bcl2fastq.err")
elif self.software == 'bclconvert':
errors = 0
warnings = 0
@@ -182,7 +182,7 @@ def _get_samplesheet(self):
samplesheets_dir = os.path.join(self.CONFIG['samplesheets_dir'],
current_year)
- ssname = os.path.join(samplesheets_dir, '{}.csv'.format(self.flowcell_id))
+ ssname = os.path.join(samplesheets_dir, f'{self.flowcell_id}.csv')
if os.path.exists(ssname):
return ssname
else:
@@ -262,27 +262,27 @@ def transfer_run(self, t_file, mail_recipients=None):
command_line.append("--exclude=Demultiplexing_*/*_*")
command_line.append("--include=*/")
for to_include in self.CONFIG['analysis_server']['sync']['include']:
- command_line.append("--include={}".format(to_include))
+ command_line.append(f"--include={to_include}")
command_line.extend(["--exclude=*", "--prune-empty-dirs"])
r_user = self.CONFIG['analysis_server']['user']
r_host = self.CONFIG['analysis_server']['host']
r_dir = self.CONFIG['analysis_server']['sync']['data_archive']
- remote = "{}@{}:{}".format(r_user, r_host, r_dir)
+ remote = f"{r_user}@{r_host}:{r_dir}"
command_line.extend([self.run_dir, remote])
# Create temp file indicating that the run is being transferred
try:
open(os.path.join(self.run_dir, 'transferring'), 'w').close()
- except IOError as e:
- logger.error("Cannot create a file in {}. "
- "Check the run name, and the permissions.".format(self.id))
+ except OSError as e:
+ logger.error(f"Cannot create a file in {self.id}. "
+ "Check the run name, and the permissions.")
raise e
- started = ("Started transfer of run {} on {}".format(self.id, datetime.now()))
+ started = (f"Started transfer of run {self.id} on {datetime.now()}")
logger.info(started)
# In this particular case we want to capture the exception because we want
# to delete the transfer file
try:
- msge_text="I am about to transfer with this command \n{}".format(command_line)
+ msge_text=f"I am about to transfer with this command \n{command_line}"
logger.info(msge_text)
misc.call_external_command(command_line, with_log_files=True,
prefix="", log_dir=self.run_dir)
@@ -290,16 +290,16 @@ def transfer_run(self, t_file, mail_recipients=None):
os.remove(os.path.join(self.run_dir, 'transferring'))
#Send an email notifying that the transfer failed
runname = self.id
- sbt = ("Rsync of run {} failed".format(runname))
- msg= """ Rsync of data for run {run} has failed!
- Raised the following exception: {e}
- """.format(run=runname, e=exception)
+ sbt = (f"Rsync of run {runname} failed")
+ msg= f""" Rsync of data for run {runname} has failed!
+ Raised the following exception: {exception}
+ """
if mail_recipients:
send_mail(sbt, msg, mail_recipients)
raise exception
- logger.info('Adding run {} to {}'.format(self.id, t_file))
+ logger.info(f'Adding run {self.id} to {t_file}')
with open(t_file, 'a') as tranfer_file:
tsv_writer = csv.writer(tranfer_file, delimiter='\t')
tsv_writer.writerow([self.id, str(datetime.now())])
@@ -307,7 +307,7 @@ def transfer_run(self, t_file, mail_recipients=None):
#Send an email notifying that the transfer was successful
runname = self.id
- sbt = ("Rsync of data for run {} to the analysis cluster has finished".format(runname))
+ sbt = (f"Rsync of data for run {runname} to the analysis cluster has finished")
msg= """ Rsync of data for run {run} to the analysis cluster has finished!
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
@@ -320,7 +320,7 @@ def archive_run(self, destination):
:param str destination: the destination folder
"""
if destination and os.path.isdir(destination):
- logger.info('archiving run {}'.format(self.id))
+ logger.info(f'archiving run {self.id}')
shutil.move(self.run_dir, os.path.join(destination, self.id))
else:
logger.warning("Cannot move run to archive, destination does not exist")
@@ -331,7 +331,7 @@ def send_mail(self, sbt, msg, rcp):
already_seen = False
runname = self.id
if not sbt:
- sbt = "{}".format(runname)
+ sbt = f"{runname}"
misc.send_mail(sbt, msg, rcp)
def is_transferred(self, transfer_file):
@@ -340,7 +340,7 @@ def is_transferred(self, transfer_file):
:param str transfer_file: Path to file with information about transferred runs
"""
try:
- with open(transfer_file, 'r') as file_handle:
+ with open(transfer_file) as file_handle:
transfer_file_contents = csv.reader(file_handle, delimiter='\t')
for row in transfer_file_contents:
# Rows have two columns: run and transfer date
@@ -349,7 +349,7 @@ def is_transferred(self, transfer_file):
if os.path.exists(os.path.join(self.run_dir, 'transferring')):
return True
return False
- except IOError:
+ except OSError:
return False
def is_unpooled_lane(self, lane):
@@ -388,7 +388,7 @@ def _rename_undet(self, lane, samples_per_lane):
:param samples_per_lane: lane:sample dict
:type status: dict
"""
- for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, "Undetermined*L0?{}*".format(lane))):
+ for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*")):
old_name=os.path.basename(file)
old_name_comps=old_name.split("_")
old_name_comps[1]=old_name_comps[0]# replace S0 with Undetermined
@@ -398,7 +398,7 @@ def _rename_undet(self, lane, samples_per_lane):
old_name_comps[index]=comp.replace('L00','L01')#adds a 1 as the second lane number in order to differentiate undetermined from normal in piper
new_name="_".join(old_name_comps)
- logger.info("Renaming {} to {}".format(file, os.path.join(os.path.dirname(file), new_name)))
+ logger.info(f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}")
os.rename(file, os.path.join(os.path.dirname(file), new_name))
def _classify_lanes(self, samplesheets):
@@ -455,16 +455,16 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le
sample_dest = os.path.join(project_dest, sample)
if not os.path.exists(sample_dest):
os.makedirs(sample_dest)
- for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))):
+ for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")):
old_name = os.path.basename(file)
old_name_comps = old_name.split("_")
- new_name_comps = [sample.replace('Sample_',''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:]
+ new_name_comps = [sample.replace('Sample_',''), f'S{str(sample_counter)}'] + old_name_comps[2:]
new_name = "_".join(new_name_comps)
os.symlink(file, os.path.join(sample_dest, new_name))
logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_',''), old_name, new_name))
sample_counter += 1
# Make a softlink of lane.html
- html_report_lane_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html")
+ html_report_lane_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html")
html_report_lane_dest = os.path.join(demux_folder, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html")
if not os.path.isdir(os.path.dirname(html_report_lane_dest)):
os.makedirs(os.path.dirname(html_report_lane_dest))
@@ -472,7 +472,7 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le
# Modify the laneBarcode.html file
html_report_laneBarcode = os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
+ f"Demultiplexing_{demux_id}",
legacy_path,
"Reports",
"html",
@@ -511,7 +511,7 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le
if not os.path.exists(os.path.join(demux_folder, "Stats")):
os.makedirs(os.path.join(demux_folder, "Stats"))
# Modify the Stats.json file
- stat_json_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json")
+ stat_json_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json")
stat_json_new = os.path.join(demux_folder, "Stats", "Stats.json")
with open(stat_json_source) as json_data:
data = json.load(json_data)
@@ -528,15 +528,15 @@ def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, le
json.dump(data, stat_json_new_file)
def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_lanes):
- elements = [element for element in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) ]
+ elements = [element for element in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) ]
for element in elements:
if "Stats" not in element and "Reports" not in element: #skip this folder and treat it differently to take into account the NoIndex case
- source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), element)
+ source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", element)
dest = os.path.join(self.run_dir, self.demux_dir, element)
os.symlink(source, dest)
os.makedirs(os.path.join(self.run_dir, self.demux_dir, "Stats"))
# Fetch the lanes that have NoIndex
- statsFiles = glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "*" ))
+ statsFiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*" ))
for source in statsFiles:
source_name = os.path.split(source)[1]
if source_name not in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]:
@@ -545,15 +545,15 @@ def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_
dest = os.path.join(self.run_dir, self.demux_dir, "Stats", source_name)
os.symlink(source, dest)
for file in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]:
- source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", file)
+ source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file)
dest = os.path.join(self.run_dir, self.demux_dir, "Stats", file)
os.symlink(source, dest)
- source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Reports")
+ source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports")
dest = os.path.join(self.run_dir, self.demux_dir, "Reports")
if os.path.exists(dest):
try:
os.rmdir(dest)
- except NotADirectoryError as e:
+ except NotADirectoryError:
os.unlink(dest)
os.symlink(source, dest)
@@ -567,7 +567,7 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple
lanesInReport = [Lane['Lane'] for Lane in html_report_lane_parser.sample_data]
next_html_report_lane_parser = LaneBarcodeParser(next_html_report_lane)
for entry in next_html_report_lane_parser.sample_data:
- if not entry['Lane'] in lanesInReport:
+ if entry['Lane'] not in lanesInReport:
# If this is a new lane not included before
html_report_lane_parser.sample_data.append(entry)
# Now all lanes have been inserted
@@ -589,9 +589,9 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple
entry['% Perfectbarcode'] = None
entry['% One mismatchbarcode'] = None
# Update the values in Flowcell Summary
- html_report_lane_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw)
- html_report_lane_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF)
- html_report_lane_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases)
+ html_report_lane_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}'
+ html_report_lane_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}'
+ html_report_lane_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}'
# Add lanes not present in this demux
# Create the new lane.html
new_html_report_lane_dir = _create_folder_structure(demux_folder, ['Reports', 'html', self.flowcell_id, 'all', 'all', 'all'])
@@ -664,9 +664,9 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple
key=lambda k: (k['Lane'].lower(), k['Sample']))
# Update the values in Flowcell Summary
- html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = '{:,}'.format(Clusters_Raw)
- html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = '{:,}'.format(Clusters_PF)
- html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = '{:,}'.format(Yield_Mbases)
+ html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}'
+ html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}'
+ html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}'
# Generate the new report for laneBarcode.html
new_html_report_laneBarcode = os.path.join(new_html_report_lane_dir, 'laneBarcode.html')
_generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser)
@@ -774,11 +774,11 @@ def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheet
# Create DemuxSummary.txt files for complex lanes
if len(DemuxSummaryFiles_complex_lanes) > 0:
for key, value in DemuxSummaryFiles_complex_lanes.items():
- with open(os.path.join(DemultiplexingStats_xml_dir, 'DemuxSummaryF1L{}.txt'.format(key)), 'w') as DemuxSummaryFile:
+ with open(os.path.join(DemultiplexingStats_xml_dir, f'DemuxSummaryF1L{key}.txt'), 'w') as DemuxSummaryFile:
DemuxSummaryFile.write('### Most Popular Unknown Index Sequences\n')
DemuxSummaryFile.write('### Columns: Index_Sequence Hit_Count\n')
for idx, count in value['Barcodes'].items():
- DemuxSummaryFile.write('{}\t{}\n'.format(idx, count))
+ DemuxSummaryFile.write(f'{idx}\t{count}\n')
open(os.path.join(DemultiplexingStats_xml_dir, 'DemultiplexingStats.xml'), 'a').close()
@@ -790,7 +790,7 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
ssparser = SampleSheetParser(samplesheet)
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
html_report_lane = os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
+ f"Demultiplexing_{demux_id}",
legacy_path,
"Reports",
"html",
@@ -803,10 +803,10 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
if os.path.exists(html_report_lane):
html_reports_lane.append(html_report_lane)
else:
- raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_lane))
+ raise RuntimeError(f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing")
html_report_laneBarcode = os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
+ f"Demultiplexing_{demux_id}",
legacy_path,
"Reports",
"html",
@@ -819,13 +819,13 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
if os.path.exists(html_report_laneBarcode):
html_reports_laneBarcode.append(html_report_laneBarcode)
else:
- raise RuntimeError("Not able to find html report {}: possible cause is problem in demultiplexing".format(html_report_laneBarcode))
+ raise RuntimeError(f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing")
- stat_json = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), legacy_path, "Stats", "Stats.json")
+ stat_json = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json")
if os.path.exists(stat_json):
stats_json.append(stat_json)
else:
- raise RuntimeError("Not able to find Stats.json report {}: possible cause is problem in demultiplexing".format(stat_json))
+ raise RuntimeError(f"Not able to find Stats.json report {stat_json}: possible cause is problem in demultiplexing")
# Aggregate fastq
lanes_samples = dict()
@@ -848,21 +848,21 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
sample_dest = os.path.join(project_dest, sample)
if not os.path.exists(sample_dest):
os.makedirs(sample_dest)
- for file in glob.glob(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), "Undetermined*L0?{}*".format(lane))):
+ for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")):
old_name = os.path.basename(file)
old_name_comps = old_name.split("_")
- new_name_comps = [sample.replace('Sample_', ''), 'S{}'.format(str(sample_counter))] + old_name_comps[2:]
+ new_name_comps = [sample.replace('Sample_', ''), f'S{str(sample_counter)}'] + old_name_comps[2:]
new_name = "_".join(new_name_comps)
os.symlink(file, os.path.join(sample_dest, new_name))
logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_', ''), old_name, new_name))
sample_counter += 1
# Ordinary cases
else:
- projects = [project for project in os.listdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id))) if os.path.isdir(os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project))]
+ projects = [project for project in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) if os.path.isdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project))]
for project in projects:
if project in "Reports" or project in "Stats":
continue
- project_source = os.path.join(self.run_dir, "Demultiplexing_{}".format(demux_id), project)
+ project_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project)
project_dest = os.path.join(demux_folder, project)
if not os.path.exists(project_dest):
# There might be project seqeunced with multiple index lengths
@@ -881,7 +881,7 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
# Copy fastq files for undetermined and the undetermined stats for simple lanes only
lanes_in_sub_samplesheet = []
header = ['[Header]','[Data]','FCID','Lane', 'Sample_ID', 'Sample_Name', 'Sample_Ref', 'index', 'index2', 'Description', 'Control', 'Recipe', 'Operator', 'Sample_Project']
- with open(samplesheet, mode='r') as sub_samplesheet_file:
+ with open(samplesheet) as sub_samplesheet_file:
sub_samplesheet_reader = csv.reader(sub_samplesheet_file)
for row in sub_samplesheet_reader:
if row[0] not in header:
@@ -890,15 +890,15 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
for lane in lanes_in_sub_samplesheet:
if lane in simple_lanes.keys():
undetermined_fastq_files = glob.glob(os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
- "Undetermined_S0_L00{}*.fastq*".format(lane))) # Contains only simple lanes undetermined
+ f"Demultiplexing_{demux_id}",
+ f"Undetermined_S0_L00{lane}*.fastq*")) # Contains only simple lanes undetermined
for fastqfile in undetermined_fastq_files:
os.symlink(fastqfile, os.path.join(demux_folder, os.path.split(fastqfile)[1]))
DemuxSummaryFiles = glob.glob(os.path.join(self.run_dir,
- "Demultiplexing_{}".format(demux_id),
+ f"Demultiplexing_{demux_id}",
legacy_path,
"Stats",
- "*L{}*txt".format(lane)))
+ f"*L{lane}*txt"))
if not os.path.exists(os.path.join(demux_folder, "Stats")):
os.makedirs(os.path.join(demux_folder, "Stats"))
for DemuxSummaryFile in DemuxSummaryFiles:
@@ -913,7 +913,7 @@ def _aggregate_demux_results_simple_complex(self):
if self.software == 'bcl2fastq':
legacy_path = ''
elif self.software == 'bclconvert':
- legacy_path = "Reports/{}".format(self.legacy_dir)
+ legacy_path = f"Reports/{self.legacy_dir}"
else:
raise RuntimeError("Unrecognized software!")
@@ -982,11 +982,11 @@ def _generate_lane_html(html_file, html_report_lane_parser):
             html.write('<table>\n')
             fc_keys = sorted(list(html_report_lane_parser.flowcell_data.keys()))
             for key in fc_keys:
-                html.write('<th>{}</th>\n'.format(key))
+                html.write(f'<th>{key}</th>\n')
             lane_keys = sorted(list(html_report_lane_parser.sample_data[0].keys()))
             for key in lane_keys:
-                html.write('<th>{}</th>\n'.format(key))
+                html.write(f'<th>{key}</th>\n')
             html.write('</tr>\n')
             for sample in html_report_lane_parser.sample_data:
                 html.write('<tr>\n')
                 for key in lane_keys:
-                    html.write('<td>{}</td>\n'.format(sample[key]))
+                    html.write(f'<td>{sample[key]}</td>\n')
                 html.write('</tr>\n')
             html.write('</table>\n')
             html.write('\n')
# FOOTER
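The taca/illumina/Runs.py hunks above are mechanical: every str.format() call is replaced by an equivalent f-string, open(path, 'r') becomes open(path) since read mode is the default, and the legacy object base on class Run is dropped. A minimal, self-contained sketch of the interpolation rewrite (the demux_id value is made up purely for illustration, not taken from the patch):

# Minimal sketch: the same file name built the old way and the new way.
demux_id = 1  # hypothetical value, for illustration only
old_style = "demux_{}_bcl2fastq.err".format(demux_id)
new_style = f"demux_{demux_id}_bcl2fastq.err"
assert old_style == new_style  # the rewrite is purely cosmetic
print(new_style)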
diff --git a/taca/illumina/Standard_Runs.py b/taca/illumina/Standard_Runs.py
index ca331115..f2699426 100755
--- a/taca/illumina/Standard_Runs.py
+++ b/taca/illumina/Standard_Runs.py
@@ -1,13 +1,14 @@
+import logging
import os
import re
-import logging
from datetime import datetime
+from io import open
+
+from flowcell_parser.classes import SampleSheetParser
-from taca.utils.filesystem import chdir
from taca.illumina.Runs import Run
from taca.utils import misc
-from flowcell_parser.classes import SampleSheetParser
-from io import open
+from taca.utils.filesystem import chdir
logger = logging.getLogger(__name__)
@@ -64,9 +65,9 @@ def _copy_samplesheet(self):
rename_qPCR_suffix = True,
fields_qPCR=[ssparser.dfield_snm]))
except Exception as e:
- logger.error('Encountered the following exception {}'.format(e))
+ logger.error(f'Encountered the following exception {e}')
return False
- logger.info(('Created SampleSheet.csv for Flowcell {} in {} '.format(self.id, samplesheet_dest)))
+ logger.info(f'Created SampleSheet.csv for Flowcell {self.id} in {samplesheet_dest} ')
# SampleSheet.csv generated
# When demultiplexing SampleSheet.csv is the one I need to use
@@ -258,8 +259,8 @@ def demultiplex_run(self):
samples_to_include[lane].append(sample_name)
else:
samples_to_include.update({lane:[sample_name]})
- except (KeyError, IndexError) as err:
- logger.info(('No corresponding mask in lane {}. Skip it.'.format(lane)))
+ except (KeyError, IndexError):
+ logger.info(f'No corresponding mask in lane {lane}. Skip it.')
continue
elif self.software == 'bclconvert':
mask = unique_masks[i]
@@ -299,7 +300,7 @@ def demultiplex_run(self):
base_mask = []
# Make sub-samplesheet
with chdir(self.run_dir):
- samplesheet_dest='SampleSheet_{}.csv'.format(bcl_cmd_counter)
+ samplesheet_dest=f'SampleSheet_{bcl_cmd_counter}.csv'
with open(samplesheet_dest, 'w') as fcd:
fcd.write(self._generate_samplesheet_subset(self.runParserObj.samplesheet,
samples_to_include, runSetup, self.software, sample_type, index1_size, index2_size, base_mask, self.CONFIG))
@@ -317,10 +318,9 @@ def demultiplex_run(self):
bcl_cmd_counter)
misc.call_external_command_detached(cmd,
with_log_files = True,
- prefix='demux_{}'.format(bcl_cmd_counter))
- logger.info(('BCL to FASTQ conversion and demultiplexing ' \
- 'started for run {} on {}'.format(os.path.basename(self.id),
- datetime.now())))
+ prefix=f'demux_{bcl_cmd_counter}')
+ logger.info('BCL to FASTQ conversion and demultiplexing ' \
+ f'started for run {os.path.basename(self.id)} on {datetime.now()}')
# Demutiplexing done for one mask type and scripts will continue
# Working with the next type. Command counter should increase by 1
@@ -346,7 +346,7 @@ def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter):
for lane in sorted(lanes):
# Iterate thorugh each lane and add the correct --use-bases-mask for that lane
base_mask = [per_lane_base_masks[lane][bm]['base_mask'] for bm in per_lane_base_masks[lane]][0] # Get the base_mask
- base_mask_expr = '{}:'.format(lane) + ','.join(base_mask)
+ base_mask_expr = f'{lane}:' + ','.join(base_mask)
cl.extend(['--use-bases-mask', base_mask_expr])
# Case with bclconvert
elif self.software == 'bclconvert':
@@ -355,12 +355,12 @@ def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter):
else:
raise RuntimeError("Unrecognized software!")
# Output dir
- output_dir = os.path.join(self.run_dir, 'Demultiplexing_{}'.format(bcl_cmd_counter))
+ output_dir = os.path.join(self.run_dir, f'Demultiplexing_{bcl_cmd_counter}')
if not os.path.exists(output_dir):
os.makedirs(output_dir)
cl.extend(['--output-dir', output_dir])
# Samplesheet
- cl.extend(['--sample-sheet', os.path.join(os.path.join(self.run_dir, 'SampleSheet_{}.csv'.format(bcl_cmd_counter)))])
+ cl.extend(['--sample-sheet', os.path.join(os.path.join(self.run_dir, f'SampleSheet_{bcl_cmd_counter}.csv'))])
# Demux options
cl_options = []
if 'options' in self.CONFIG.get(self.software):
@@ -374,9 +374,9 @@ def generate_bcl_command(self, sample_type, mask_table, bcl_cmd_counter):
if isinstance(option, dict):
opt, val = list(option.items())[0]
if 'output-dir' not in opt:
- cl.extend(['--{}'.format(opt), str(val).lower()])
+ cl.extend([f'--{opt}', str(val).lower()])
else:
- cl.append('--{}'.format(option))
+ cl.append(f'--{option}')
return cl
def _generate_per_lane_base_mask(self, sample_type, mask_table):
@@ -548,7 +548,7 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name
Will also replace 10X or Smart-seq indicies (e.g. SI-GA-A3 into TGTGCGGG)
"""
- output = u''
+ output = ''
# Expand the ssparser if there are lanes with 10X or Smart-seq samples
index_dict_tenX = self._parse_10X_indexes(indexfile['tenX'])
index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq'])
@@ -591,12 +591,12 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
if not fields_to_remove:
fields_to_remove = []
# Header
- output += '[Header]{}'.format(os.linesep)
+ output += f'[Header]{os.linesep}'
for field in sorted(ssparser.header):
- output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip())
+ output += f'{field.rstrip()},{ssparser.header[field].rstrip()}'
output += os.linesep
# Data
- output += '[Data]{}'.format(os.linesep)
+ output += f'[Data]{os.linesep}'
datafields = []
for field in ssparser.datafields:
if field not in fields_to_remove:
@@ -611,13 +611,13 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
try:
if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR:
# Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix
- value = re.sub('__qPCR_$', '', 'Sample_{}'.format(line[ssparser.dfield_snm]))
+ value = re.sub('__qPCR_$', '', f'Sample_{line[ssparser.dfield_snm]}')
else:
# Substitute SampleID with SampleName, add Sample_ as prefix
- value ='Sample_{}'.format(line[ssparser.dfield_snm])
+ value =f'Sample_{line[ssparser.dfield_snm]}'
except:
# Otherwise add Sample_ as prefix
- value = 'Sample_{}'.format(line[ssparser.dfield_sid])
+ value = f'Sample_{line[ssparser.dfield_sid]}'
elif rename_qPCR_suffix and field in fields_qPCR:
value = re.sub('__qPCR_$', '', line[field])
line_ar.append(value)
@@ -626,7 +626,7 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
return output
def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, software, sample_type, index1_size, index2_size, base_mask, CONFIG):
- output = u''
+ output = ''
# Prepare index cycles
index_cycles = [0, 0]
for read in runSetup:
@@ -636,13 +636,13 @@ def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, s
else:
index_cycles[1] = int(read['NumCycles'])
# Header
- output += '[Header]{}'.format(os.linesep)
+ output += f'[Header]{os.linesep}'
for field in sorted(ssparser.header):
- output += '{},{}'.format(field.rstrip(), ssparser.header[field].rstrip())
+ output += f'{field.rstrip()},{ssparser.header[field].rstrip()}'
output += os.linesep
# Settings for BCL Convert
if software == 'bclconvert':
- output += '[Settings]{}'.format(os.linesep)
+ output += f'[Settings]{os.linesep}'
output += 'OverrideCycles,{}{}'.format(';'.join(base_mask), os.linesep)
if CONFIG.get('bclconvert'):
@@ -651,15 +651,15 @@ def _generate_samplesheet_subset(self, ssparser, samples_to_include, runSetup, s
if CONFIG['bclconvert']['settings'].get('common'):
for setting in CONFIG['bclconvert']['settings']['common']:
for k, v in setting.items():
- output += '{},{}{}'.format(k, v, os.linesep)
+ output += f'{k},{v}{os.linesep}'
# Put special settings:
if sample_type in CONFIG['bclconvert']['settings'].keys():
for setting in CONFIG['bclconvert']['settings'][sample_type]:
for k, v in setting.items():
if (k == 'BarcodeMismatchesIndex1' and index1_size != 0) or (k == 'BarcodeMismatchesIndex2' and index2_size != 0) or 'BarcodeMismatchesIndex' not in k:
- output += '{},{}{}'.format(k, v, os.linesep)
+ output += f'{k},{v}{os.linesep}'
# Data
- output += '[Data]{}'.format(os.linesep)
+ output += f'[Data]{os.linesep}'
datafields = []
for field in ssparser.datafields:
datafields.append(field)
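For context on the generate_bcl_command() hunks in taca/illumina/Standard_Runs.py above, here is a rough, self-contained sketch of how such an argument list is assembled with the new f-strings. The run directory, counter and base-mask values are invented for illustration and this is not the module's actual code path:

import os

# Hypothetical inputs, for illustration only
run_dir = "/data/240117_A00187_0001_AHXXXXXXX"
bcl_cmd_counter = 0
lane = "1"
base_mask = ["Y151", "I8N2", "I8N2", "Y151"]

cl = ["bcl2fastq"]
# Per-lane base mask, e.g. "1:Y151,I8N2,I8N2,Y151"
cl.extend(["--use-bases-mask", f"{lane}:" + ",".join(base_mask)])
cl.extend(["--output-dir", os.path.join(run_dir, f"Demultiplexing_{bcl_cmd_counter}")])
cl.extend(["--sample-sheet", os.path.join(run_dir, f"SampleSheet_{bcl_cmd_counter}.csv")])
print(" ".join(cl))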
diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py
index 5f839058..a6743f04 100644
--- a/taca/nanopore/ONT_run_classes.py
+++ b/taca/nanopore/ONT_run_classes.py
@@ -1,18 +1,18 @@
-import os
-import logging
import csv
-import shutil
import glob
-import re
import json
-import pandas as pd
-import subprocess
+import logging
import os
+import re
+import shutil
+import subprocess
+from datetime import datetime
from typing import Union
-from taca.utils.statusdb import NanoporeRunsConnection
-from datetime import datetime
+import pandas as pd
+
from taca.utils.config import CONFIG
+from taca.utils.statusdb import NanoporeRunsConnection
from taca.utils.transfer import RsyncAgent, RsyncError
logger = logging.getLogger(__name__)
@@ -22,7 +22,7 @@
)
-class ONT_run(object):
+class ONT_run:
"""General Nanopore run.
Expects instantiation from absolute path of run directory on preprocessing server.
@@ -39,7 +39,7 @@ def __init__(self, run_abspath: str):
), f"Run {self.run_name} doesn't look like a run dir"
# Parse MinKNOW sample and experiment name
- with open(self.get_file("/run_path.txt"), "r") as stream:
+ with open(self.get_file("/run_path.txt")) as stream:
self.experiment_name, self.sample_name, _ = stream.read().split("/")
# Get info from run name
@@ -122,7 +122,7 @@ def assert_contents(self):
def is_transferred(self) -> bool:
"""Return True if run ID in transfer.tsv, else False."""
- with open(self.transfer_details["transfer_log"], "r") as f:
+ with open(self.transfer_details["transfer_log"]) as f:
return self.run_name in f.read()
# DB update
@@ -230,7 +230,7 @@ def parse_minknow_json(self, db_update):
logger.info(f"{self.run_name}:Parsing report JSON...")
- dict_json_report = json.load(open(self.get_file("/report*.json"), "r"))
+ dict_json_report = json.load(open(self.get_file("/report*.json")))
# Initialize return dict
parsed_data = {}
@@ -352,10 +352,10 @@ def update_transfer_log(self):
with open(self.transfer_details["transfer_log"], "a") as f:
tsv_writer = csv.writer(f, delimiter="\t")
tsv_writer.writerow([self.run_name, str(datetime.now())])
- except IOError:
+ except OSError:
msg = f"{self.run_name}: Could not update the transfer logfile {self.transfer_details['transfer_log']}"
logger.error(msg)
- raise IOError(msg)
+ raise OSError(msg)
# Archive run
@@ -404,7 +404,7 @@ def get_anglerfish_exit_code(self) -> Union[int, None]:
Return exit code or None.
"""
if os.path.exists(self.anglerfish_done_abspath):
- return int(open(self.anglerfish_done_abspath, "r").read())
+ return int(open(self.anglerfish_done_abspath).read())
else:
return None
@@ -413,7 +413,7 @@ def get_anglerfish_pid(self) -> Union[str, None]:
Return process ID or None."""
if os.path.exists(self.anglerfish_ongoing_abspath):
- return str(open(self.anglerfish_ongoing_abspath, "r").read())
+ return str(open(self.anglerfish_ongoing_abspath).read())
else:
return None
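The IOError to OSError renames in taca/nanopore/ONT_run_classes.py (and in the other files touched by this patch) are behaviour-preserving: since Python 3.3, IOError is an alias of OSError. A minimal check, as a sketch only:

# Sketch only: IOError is an alias of OSError on Python 3.3+, so renaming the
# exceptions in these hunks does not change which errors are caught.
assert IOError is OSError

try:
    raise IOError("disk trouble")   # old spelling
except OSError as err:              # new spelling catches it all the same
    print(f"caught: {err}")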
diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py
index 75c2d56d..130a88c8 100644
--- a/taca/nanopore/instrument_transfer.py
+++ b/taca/nanopore/instrument_transfer.py
@@ -2,14 +2,14 @@
"""
__version__ = "1.0.13"
+import argparse
import logging
import os
import re
import shutil
-import argparse
import subprocess
-from glob import glob
from datetime import datetime as dt
+from glob import glob
def main(args):
@@ -53,14 +53,14 @@ def main(args):
if run_path.split(os.sep)[-2][0:3] == "QC_":
# For QC runs, the sample name should start with "QC_"
- logging.info(f"Run categorized as QC.")
+ logging.info("Run categorized as QC.")
rsync_dest = args.dest_dir_qc
else:
rsync_dest = args.dest_dir
- logging.info(f"Dumping run path...")
+ logging.info("Dumping run path...")
dump_path(run_path)
- logging.info(f"Dumping QC and MUX history...")
+ logging.info("Dumping QC and MUX history...")
dump_pore_count_history(run_path, pore_counts)
if not sequencing_finished(run_path):
@@ -119,7 +119,7 @@ def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log:
"""Do a final sync of the run to storage, then archive it.
Skip if rsync is already running on the run."""
- logging.info("Performing a final sync of {} to storage".format(run_dir))
+ logging.info(f"Performing a final sync of {run_dir} to storage")
command = [
"run-one",
@@ -140,9 +140,7 @@ def final_sync_to_storage(run_dir: str, destination: str, archive_dir: str, log:
archive_finished_run(run_dir, archive_dir)
else:
logging.info(
- "Previous rsync might be running still. Skipping {} for now.".format(
- run_dir
- )
+ f"Previous rsync might be running still. Skipping {run_dir} for now."
)
return
diff --git a/taca/server_status/cli.py b/taca/server_status/cli.py
index 723410df..4b786fc1 100644
--- a/taca/server_status/cli.py
+++ b/taca/server_status/cli.py
@@ -1,9 +1,12 @@
-import click
import logging
+import click
+
+from taca.server_status import (
+ cronjobs as cj, # to avoid similar names with command, otherwise exception
+)
from taca.server_status import server_status as status
from taca.utils.config import CONFIG
-from taca.server_status import cronjobs as cj # to avoid similar names with command, otherwise exception
@click.group(name='server_status')
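The import shuffling in this and the surrounding files follows the conventional three-group layout (standard library, then third-party, then first-party), alphabetized within each group and separated by blank lines; the parenthesized form above simply keeps the inline comment attached to its import. Schematically (a sketch assuming click and the taca package are importable, as they are for this repo):

# Standard library
import logging
import os

# Third-party
import click

# First-party
from taca.utils.config import CONFIG

logger = logging.getLogger(__name__)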
diff --git a/taca/server_status/cronjobs.py b/taca/server_status/cronjobs.py
index 9b808bd8..80fd59fc 100644
--- a/taca/server_status/cronjobs.py
+++ b/taca/server_status/cronjobs.py
@@ -1,20 +1,22 @@
+import datetime
+import getpass
import logging
import platform
-import getpass
-import datetime
from crontab import CronTab
+
from taca.utils import statusdb
from taca.utils.config import CONFIG
+
def _parse_crontab():
result = {}
user = getpass.getuser()
- logging.info('Getting crontab for user {}'.format(user))
+ logging.info(f'Getting crontab for user {user}')
try:
crontab = CronTab(user=user)
except Exception as e:
- logging.error('Cannot get a crontab for user: {}'.format(user))
+ logging.error(f'Cannot get a crontab for user: {user}')
logging.error(e.message)
else:
result[user] = []
@@ -71,7 +73,7 @@ def update_cronjob_db():
except Exception as e:
logging.error(e.message)
else:
- logging.info('{} has been successfully updated'.format(server))
+ logging.info(f'{server} has been successfully updated')
else:
logging.warning('Document has not been created/updated')
diff --git a/taca/server_status/server_status.py b/taca/server_status/server_status.py
index a03a107a..36b6f27a 100644
--- a/taca/server_status/server_status.py
+++ b/taca/server_status/server_status.py
@@ -1,6 +1,6 @@
-import subprocess
-import logging
import datetime
+import logging
+import subprocess
from taca.utils import statusdb
from taca.utils.config import CONFIG
@@ -26,7 +26,7 @@ def get_nases_disk_space():
else:
user = config['user']
# Connect via ssh to server and execute the command
- command = ['ssh', '-t', '{}@{}'.format(user, server_url), command]
+ command = ['ssh', '-t', f'{user}@{server_url}', command]
result[server_url] = _run_cmd(command)
@@ -81,7 +81,7 @@ def _parse_output(output): # for nases
'mounted_on': 'NaN',
'filesystem': 'NaN'
}
- logging.error('Can not parse the output: {}'.format(output))
+ logging.error(f'Can not parse the output: {output}')
return result
@@ -116,7 +116,7 @@ def update_status_db(data, server_type=None):
logging.error(e.message)
raise
else:
- logging.info('{}: Server status has been updated'.format(key))
+ logging.info(f'{key}: Server status has been updated')
def check_promethion_status():
config = CONFIG.get('promethion_status')
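The get_nases_disk_space() hunk above only changes how the ssh target is interpolated; the command is still built as an argument list. A self-contained illustration with made-up user, host and remote command values:

# Illustration only: the ssh invocation is assembled as a list, with the
# user/host pair interpolated by the new f-string; these values are made up.
user = "funk_user"
server_url = "nas.example.org"
remote_cmd = "df -h /data"

command = ["ssh", "-t", f"{user}@{server_url}", remote_cmd]
print(" ".join(command))  # inspect the command without connecting anywhere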
diff --git a/taca/testing/cli.py b/taca/testing/cli.py
index 63b89a35..4856b75d 100644
--- a/taca/testing/cli.py
+++ b/taca/testing/cli.py
@@ -1,11 +1,13 @@
""" CLI for the testing commands
"""
-from __future__ import print_function
import os
+
import click
+
import taca.testing.create_uppmax_like_env as createupp
+
@click.group(name='uppmax_env')
def uppmax_env():
""" Create a local set of folders that resembles the uppmax-ngi env. Creates config file for ngi_pipeline, taca, and taca ngi-pipeline. Only a minimal taca config is needed (statusdb and log)
diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py
index e4852b42..9e936857 100644
--- a/taca/testing/create_uppmax_like_env.py
+++ b/taca/testing/create_uppmax_like_env.py
@@ -1,19 +1,18 @@
""" Load and parse configuration file."""
-from __future__ import print_function
+import datetime
import logging
import os
-import datetime
import random
import subprocess
+from io import open
from dateutil.relativedelta import relativedelta
-from taca.utils.config import CONFIG
+
from taca.utils import config as conf
from taca.utils import filesystem as fs
from taca.utils import statusdb
-from io import open
-
+from taca.utils.config import CONFIG
logger = logging.getLogger(__name__)
@@ -21,34 +20,34 @@
def create_version_report(path):
# Creates the file version_report.txt for stuff run ngi_pipeline
with open(os.path.join(path, 'version_report.txt'), 'w') as VERSION_REPORT:
- VERSION_REPORT.write(u'******\n')
- VERSION_REPORT.write(u'README\n')
- VERSION_REPORT.write(u'******\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. The pipeline used was Piper, see below for more information.\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'The versions of programs and references used:\n')
- VERSION_REPORT.write(u'piper: unknown\n')
- VERSION_REPORT.write(u'bwa: 0.7.12\n')
- VERSION_REPORT.write(u'samtools: 0.1.19\n')
- VERSION_REPORT.write(u'qualimap: v2.2\n')
- VERSION_REPORT.write(u'snpEff: 4.1\n')
- VERSION_REPORT.write(u'snpEff reference: GRCh37.75\n')
- VERSION_REPORT.write(u'gatk: 3.3-0-geee94ec\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'reference: human_g1k_v37.fasta\n')
- VERSION_REPORT.write(u'db_snp: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'hapmap: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'omni: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'1000G_indels: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n')
- VERSION_REPORT.write(u'indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n')
- VERSION_REPORT.write(u'\n')
- VERSION_REPORT.write(u'piper\n')
- VERSION_REPORT.write(u'-----\n')
- VERSION_REPORT.write(u'Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n')
+ VERSION_REPORT.write('******\n')
+ VERSION_REPORT.write('README\n')
+ VERSION_REPORT.write('******\n')
+ VERSION_REPORT.write('\n')
+ VERSION_REPORT.write('Data has been aligned to to the reference using bwa. The raw alignments have then been deduplicated, recalibrated and cleaned using GATK. Quality control information was gathered using Qualimap. SNVs and indels have been called using the HaplotypeCaller. These variants were then funcionally annotated using snpEff. The pipeline used was Piper, see below for more information.\n')
+ VERSION_REPORT.write('\n')
+ VERSION_REPORT.write('The versions of programs and references used:\n')
+ VERSION_REPORT.write('piper: unknown\n')
+ VERSION_REPORT.write('bwa: 0.7.12\n')
+ VERSION_REPORT.write('samtools: 0.1.19\n')
+ VERSION_REPORT.write('qualimap: v2.2\n')
+ VERSION_REPORT.write('snpEff: 4.1\n')
+ VERSION_REPORT.write('snpEff reference: GRCh37.75\n')
+ VERSION_REPORT.write('gatk: 3.3-0-geee94ec\n')
+ VERSION_REPORT.write('\n')
+ VERSION_REPORT.write('reference: human_g1k_v37.fasta\n')
+ VERSION_REPORT.write('db_snp: gatk-bundle/2.8\n')
+ VERSION_REPORT.write('hapmap: gatk-bundle/2.8\n')
+ VERSION_REPORT.write('omni: gatk-bundle/2.8\n')
+ VERSION_REPORT.write('1000G_indels: gatk-bundle/2.8\n')
+ VERSION_REPORT.write('Mills_and_1000G_golden_standard_indels: gatk-bundle/2.8\n')
+ VERSION_REPORT.write('\n')
+ VERSION_REPORT.write('indel resource file: {Mills_and_1000G_gold_standard.indels.b37.vcf version: gatk-bundle/2.8}\n')
+ VERSION_REPORT.write('indel resource file: {1000G_phase1.indels.b37.vcf version: gatk-bundle/2.8}\n')
+ VERSION_REPORT.write('\n')
+ VERSION_REPORT.write('piper\n')
+ VERSION_REPORT.write('-----\n')
+ VERSION_REPORT.write('Piper is a pipeline system developed and maintained at the National Genomics Infrastructure build on top of GATK Queue. For more information and the source code visit: www.github.com/NationalGenomicsInfrastructure/piper\n')
def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None ):
# Create something like 160217_ST-E00201_0063_AHJHNYCCXX
@@ -83,8 +82,8 @@ def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None
# Create dir structure
fs.create_folder(os.path.join(path_to_fc, 'Demultiplexing', project_name, sample_id))
# Now create the data
- fastq_1_dest = '{}_S{}_L00{}_R1_001.fastq.gz'.format(sample_name, counter, lane)
- fastq_2_dest = '{}_S{}_L00{}_R2_001.fastq.gz'.format(sample_name, counter, lane)
+ fastq_1_dest = f'{sample_name}_S{counter}_L00{lane}_R1_001.fastq.gz'
+ fastq_2_dest = f'{sample_name}_S{counter}_L00{lane}_R2_001.fastq.gz'
counter += 1
if fastq_1 is None:
fs.touch(os.path.join(path_to_fc, 'Demultiplexing', project_name,
@@ -98,17 +97,17 @@ def create_FC(incoming_dir, run_name, samplesheet, fastq_1 = None, fastq_2=None
project_name, sample_id, fastq_2_dest))
with open(os.path.join(path_to_fc, 'SampleSheet.csv'), 'w') as Samplesheet_file:
- Samplesheet_file.write(u'[Header]\n')
- Samplesheet_file.write(u'Date,2016-03-29\n')
- Samplesheet_file.write(u'Investigator Name,Christian Natanaelsson\n')
- Samplesheet_file.write(u'[Data]\n')
+ Samplesheet_file.write('[Header]\n')
+ Samplesheet_file.write('Date,2016-03-29\n')
+ Samplesheet_file.write('Investigator Name,Christian Natanaelsson\n')
+ Samplesheet_file.write('[Data]\n')
for key in header:
- Samplesheet_file.write(u'{},'.format(key))
- Samplesheet_file.write(u'\n')
+ Samplesheet_file.write(f'{key},')
+ Samplesheet_file.write('\n')
for line in samplesheet:
for key in header:
- Samplesheet_file.write(u'{},'.format(line[key]))
- Samplesheet_file.write(u'\n')
+ Samplesheet_file.write(f'{line[key]},')
+ Samplesheet_file.write('\n')
def create_uppmax_env(ngi_config):
paths = {}
@@ -122,7 +121,7 @@ def create_uppmax_env(ngi_config):
top_dir = ngi_config['analysis']['top_dir']
paths['top_dir'] = top_dir
except KeyError as e:
- raise SystemExit('Config file is missing the key {}, make sure it have all required information'.format(str(e)))
+ raise SystemExit(f'Config file is missing the key {str(e)}, make sure it have all required information')
if 'environment' not in ngi_config:
sys.exit('ERROR: environment must be a field of NGI_CONFIG.')
try:
@@ -131,10 +130,10 @@ def create_uppmax_env(ngi_config):
flowcell_inbox = flowcell_inboxes[0] # I assume there is only one
paths['flowcell_inbox'] = flowcell_inbox
except ValueError as e:
- sys.exit('key error, flowcell_inbox not found in "{}": {}'.format(ngi_config, e))
+ sys.exit(f'key error, flowcell_inbox not found in "{ngi_config}": {e}')
# Now I need to create the folders for this
if not os.path.exists(base_root):
- sys.exit('base_root needs to exists: {}'.format(base_root))
+ sys.exit(f'base_root needs to exists: {base_root}')
fs.create_folder(flowcell_inbox)
if sthlm_root is None:
path_to_analysis = os.path.join(base_root, top_dir)
@@ -190,11 +189,11 @@ def produce_analysis_piper(ngi_config, project_id):
fs.create_folder(current_dir)
if piper_dir == '05_processed_alignments':
for sample_id in os.listdir(data_dir):
- bam_file = '{}.clean.dedup.bam'.format(sample_id)
+ bam_file = f'{sample_id}.clean.dedup.bam'
fs.touch(os.path.join(current_dir, bam_file))
if piper_dir == '07_variant_calls':
for sample_id in os.listdir(data_dir):
- vcf_file = '{}.clean.dedup.recal.bam.raw.indel.vcf.gz'.format(sample_id)
+ vcf_file = f'{sample_id}.clean.dedup.recal.bam.raw.indel.vcf.gz'
fs.touch(os.path.join(current_dir, vcf_file))
current_dir = os.path.join(piper_ngi_dir, 'sbatch')
fs.create_folder(current_dir)
@@ -278,7 +277,7 @@ def create(projects, ngi_config_file, fastq_1, fastq_2):
'application': application,
'no_samples': row['value']['no_samples']}
else:
- print('status {}'.format(project_status))
+ print(f'status {project_status}')
## Now I can parse the x_flowcell db to check what I can and cannot use
whole_genome_projects = int(2*projects/3)
projects_to_reproduce = []
@@ -326,16 +325,16 @@ def create(projects, ngi_config_file, fastq_1, fastq_2):
'noWGreseq_open')
# Create ngi_pipeline enviroment
- print('#NGI_CONFIG varaible is {}. This variable needs to be in the .bashrc file'.format(ngi_config_file))
- print('NGI_CONFIG={}'.format(ngi_config_file))
+ print(f'#NGI_CONFIG varaible is {ngi_config_file}. This variable needs to be in the .bashrc file')
+ print(f'NGI_CONFIG={ngi_config_file}')
try:
ngi_config = conf.load_config(ngi_config_file)
- except IOError as e:
- print('ERROR: {}'.format(e.message))
+ except OSError as e:
+ print(f'ERROR: {e.message}')
# Create uppmax env
paths = create_uppmax_env(ngi_config)
- print('#Going to reproduce {} projects (if this number is different from the one you specified.... trust me... do not worry'.format(len(projects_to_reproduce)))
+ print(f'#Going to reproduce {len(projects_to_reproduce)} projects (if this number is different from the one you specified.... trust me... do not worry')
# Scan over x_flowcell and reproduce FCs
flowcellDB = couch_connection['x_flowcells']
reproduced_projects = {}
@@ -363,25 +362,25 @@ def create(projects, ngi_config_file, fastq_1, fastq_2):
if project not in reproduced_projects:
reproduced_projects[project] = []
reproduced_projects[project].append(flowcellDB[fc_doc]['RunInfo']['Id'])
- print('#Reproduced {} project (if the numbers diffear do not worry, most likely we selected projects without runs)'.format(len(reproduced_projects)))
+ print(f'#Reproduced {len(reproduced_projects)} project (if the numbers diffear do not worry, most likely we selected projects without runs)')
for project in projects_to_reproduce:
if project[0] in reproduced_projects:
- print('# {}: {}'.format(project[0], project[1]))
+ print(f'# {project[0]}: {project[1]}')
# Need to output the command to organise
to_be_deleted = []
for project in reproduced_projects:
for FC in reproduced_projects[project]:
- print('Running: ngi_pipeline_start.py organize flowcell {} -p {}'.format(FC, project))
+ print(f'Running: ngi_pipeline_start.py organize flowcell {FC} -p {project}')
with open('ngi_pipeline_local.logs', 'w') as NGILOGS:
return_value = subprocess.call(['ngi_pipeline_start.py',
'organize',
'flowcell',
- '{}'.format(FC),
+ f'{FC}',
'-p',
- '{}'.format(project)],
+ f'{project}'],
stdout=NGILOGS, stderr=NGILOGS)
if return_value > 0:
- print('#project {} not organised: have a look to the logs, but most likely this projec is not in charon'.format(project))
+ print(f'#project {project} not organised: have a look to the logs, but most likely this projec is not in charon')
if project not in to_be_deleted:
to_be_deleted.append(project)
@@ -399,4 +398,4 @@ def create(projects, ngi_config_file, fastq_1, fastq_2):
with open('projects.txt', 'w') as PROJECTS:
for project in projects_to_reproduce:
if project[0] in reproduced_projects:
- PROJECTS.write(u'{}:{}\n'.format(project[0], project[1]))
+ PROJECTS.write(f'{project[0]}:{project[1]}\n')
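The create_uppmax_like_env.py hunks above mostly strip u-prefixes from string literals, which is a no-op on Python 3, where every str literal is already unicode; the retained `from io import open` is likewise redundant, since the builtin open() returns the same io classes. A minimal sketch:

import io

# Sketch only: the u-prefix changes nothing on Python 3.
assert u"piper\n" == "piper\n"

# The builtin open() already yields io objects, so importing open from io
# has no practical effect any more.
with open(__file__) as fh:
    assert isinstance(fh, io.TextIOWrapper)
print("u-prefix and io.open are both redundant on Python 3")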
diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py
index 47da90a9..8ec36614 100644
--- a/taca/utils/bioinfo_tab.py
+++ b/taca/utils/bioinfo_tab.py
@@ -1,13 +1,14 @@
-import os
+import datetime
import glob
-import re
import logging
-import datetime
+import os
+import re
+from collections import OrderedDict, defaultdict
+
+from flowcell_parser.classes import RunParametersParser, SampleSheetParser
-from taca.utils.config import CONFIG
from taca.utils import statusdb
-from flowcell_parser.classes import SampleSheetParser, RunParametersParser
-from collections import defaultdict, OrderedDict
+from taca.utils.config import CONFIG
from taca.utils.misc import send_mail
logger = logging.getLogger(__name__)
@@ -32,7 +33,7 @@ def collect_runs():
for run_dir in potential_run_dirs:
if rundir_re.match(os.path.basename(os.path.abspath(run_dir))) and os.path.isdir(run_dir):
found_runs.append(os.path.basename(run_dir))
- logger.info('Working on {}'.format(run_dir))
+ logger.info(f'Working on {run_dir}')
update_statusdb(run_dir)
nosync_data_dir = os.path.join(data_dir, 'nosync')
potential_nosync_run_dirs = glob.glob(os.path.join(nosync_data_dir, '*'))
@@ -158,7 +159,7 @@ def get_ss_projects(run_dir):
elif os.path.exists(os.path.join(run_dir, 'RunParameters.xml')):
run_parameters_file = 'RunParameters.xml'
else:
- logger.error('Cannot find RunParameters.xml or runParameters.xml in the run folder for run {}'.format(run_dir))
+ logger.error(f'Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run_dir}')
return []
rp = RunParametersParser(os.path.join(run_dir, run_parameters_file))
if 'Setup' in rp.data['RunParameters']:
@@ -182,7 +183,7 @@ def get_ss_projects(run_dir):
elif os.path.exists(os.path.join(run_dir, 'SampleSheet.csv')):
FCID_samplesheet_origin = os.path.join(run_dir, 'SampleSheet.csv')
else:
- logger.warn('No samplesheet found for {}'.format(run_dir))
+ logger.warn(f'No samplesheet found for {run_dir}')
miseq = True
lanes = str(1)
# Pattern is a bit more rigid since we're no longer also checking for lanes
@@ -191,29 +192,29 @@ def get_ss_projects(run_dir):
# HiSeq X case
elif 'HiSeq X' in runtype:
FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['xten_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ current_year, f'{FCID}.csv')
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
# HiSeq 2500 case
elif 'HiSeq' in runtype or 'TruSeq' in runtype:
FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['hiseq_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ current_year, f'{FCID}.csv')
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
elif 'NovaSeqXPlus' in runtype:
FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseqxplus_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ current_year, f'{FCID}.csv')
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
# NovaSeq 6000 case
elif 'NovaSeq' in runtype:
FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['novaseq_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ current_year, f'{FCID}.csv')
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
# NextSeq Case
elif 'NextSeq' in runtype:
FCID_samplesheet_origin = os.path.join(CONFIG['bioinfo_tab']['nextseq_samplesheets'],
- current_year, '{}.csv'.format(FCID))
+ current_year, f'{FCID}.csv')
data = parse_samplesheet(FCID_samplesheet_origin, run_dir)
else:
- logger.warn('Cannot locate the samplesheet for run {}'.format(run_dir))
+ logger.warn(f'Cannot locate the samplesheet for run {run_dir}')
return []
# If samplesheet is empty, don't bother going through it
@@ -244,7 +245,7 @@ def get_ss_projects(run_dir):
lane = False
if list(proj_tree.keys()) == []:
- logger.info('INCORRECTLY FORMATTED SAMPLESHEET, CHECK {}'.format(run_name))
+ logger.info(f'INCORRECTLY FORMATTED SAMPLESHEET, CHECK {run_name}')
return proj_tree
def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False):
@@ -256,13 +257,13 @@ def parse_samplesheet(FCID_samplesheet_origin, run_dir, is_miseq=False):
ss_reader = SampleSheetParser(FCID_samplesheet_origin)
data = ss_reader.data
except:
- logger.warn('Cannot initialize SampleSheetParser for {}. Most likely due to poor comma separation'.format(run_dir))
+ logger.warn(f'Cannot initialize SampleSheetParser for {run_dir}. Most likely due to poor comma separation')
return []
if is_miseq:
- if not 'Description' in ss_reader.header or not \
+ if 'Description' not in ss_reader.header or not \
('Production' in ss_reader.header['Description'] or 'Application' in ss_reader.header['Description']):
- logger.warn('Run {} not labelled as production or application. Disregarding it.'.format(run_dir))
+ logger.warn(f'Run {run_dir} not labelled as production or application. Disregarding it.')
# Skip this run
return []
return data
@@ -308,11 +309,11 @@ def fail_run(runid, project):
if project is not None:
view = bioinfo_db.view('full_doc/pj_run_to_doc')
rows = view[[project, runid]].rows
- logger.info('Updating status of {} objects with flowcell_id: {} and project_id {}'.format(len(rows), runid, project))
+ logger.info(f'Updating status of {len(rows)} objects with flowcell_id: {runid} and project_id {project}')
else:
view = bioinfo_db.view('full_doc/run_id_to_doc')
rows = view[[runid]].rows
- logger.info('Updating status of {} objects with flowcell_id: {}'.format(len(rows), runid))
+ logger.info(f'Updating status of {len(rows)} objects with flowcell_id: {runid}')
new_timestamp = datetime.datetime.now().isoformat()
updated = 0
@@ -327,4 +328,4 @@ def fail_run(runid, project):
logger.error('Cannot update object project-sample-run-lane: {}-{}-{}-{}'.format(row.value.get('project_id'), row.value.get('sample'), row.value.get('run_id'), row.value.get('lane')))
logger.error(e)
raise e
- logger.info('Successfully updated {} objects'.format(updated))
+ logger.info(f'Successfully updated {updated} objects')
diff --git a/taca/utils/cli.py b/taca/utils/cli.py
index bbfdb819..4fa3eafa 100644
--- a/taca/utils/cli.py
+++ b/taca/utils/cli.py
@@ -1,7 +1,9 @@
"""CLI for the bioinfo subcommand."""
import click
+
import taca.utils.bioinfo_tab as bt
+
@click.group(name='bioinfo_deliveries')
def bioinfo_deliveries():
"""Update statusdb with information about FC entry point."""
diff --git a/taca/utils/config.py b/taca/utils/config.py
index 74b8876f..004d163a 100644
--- a/taca/utils/config.py
+++ b/taca/utils/config.py
@@ -1,7 +1,8 @@
"""Load and parse configuration file."""
-import yaml
from io import open
+import yaml
+
CONFIG = {}
def load_config(config_file):
@@ -12,8 +13,8 @@ def load_config(config_file):
content = yaml.load(f, Loader=yaml.FullLoader)
config.update(content)
return content
- except IOError as e:
- e.message = 'Could not open configuration file "{}".'.format(config_file)
+ except OSError as e:
+ e.message = f'Could not open configuration file "{config_file}".'
raise e
def load_yaml_config(config_file):
@@ -30,6 +31,6 @@ def load_yaml_config(config_file):
content = yaml.load(f, Loader=yaml.FullLoader)
CONFIG.update(content)
return content
- except IOError as e:
- e.message = 'Could not open configuration file "{}".'.format(config_file)
+ except OSError as e:
+ e.message = f'Could not open configuration file "{config_file}".'
raise e
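The `IOError` to `OSError` swap above is behaviour-preserving: since Python 3.3, `IOError` is merely an alias of `OSError`. A quick standalone check (the file path is made up):

```python
# The alias means both spellings name the same exception class.
assert IOError is OSError

try:
    open("definitely/not/a/real/config.yaml")
except OSError as exc:  # also catches everything formerly raised as IOError
    print(f'Could not open configuration file: {exc}')
```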
diff --git a/taca/utils/filesystem.py b/taca/utils/filesystem.py
index f1db6968..957bf818 100644
--- a/taca/utils/filesystem.py
+++ b/taca/utils/filesystem.py
@@ -26,7 +26,7 @@ def create_folder(target_folder):
"""
try:
os.makedirs(target_folder)
- except OSError as e:
+ except OSError:
pass
return os.path.exists(target_folder)
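Dropping the unused `as e` silences the lint warning without changing behaviour. As an aside, a common alternative to the bare `except OSError: pass` pattern is `exist_ok=True`; the sketch below illustrates that alternative and is not part of the patch:

```python
import os

def create_folder(target_folder: str) -> bool:
    # exist_ok=True swallows only the "already exists" case; unlike the
    # original try/except, other OSErrors (e.g. permissions) still propagate.
    os.makedirs(target_folder, exist_ok=True)
    return os.path.isdir(target_folder)
```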
diff --git a/taca/utils/misc.py b/taca/utils/misc.py
index 3f9bec6a..946723e5 100755
--- a/taca/utils/misc.py
+++ b/taca/utils/misc.py
@@ -5,13 +5,15 @@
import smtplib
import subprocess
import sys
-
from datetime import datetime
from email.mime.text import MIMEText
-from taca.utils import statusdb
from io import open
+
from six.moves import input
+from taca.utils import statusdb
+
+
def send_mail(subject, content, receiver):
"""Sends an email.
@@ -22,7 +24,7 @@ def send_mail(subject, content, receiver):
if not receiver:
raise SystemExit('No receiver was given to send mail')
msg = MIMEText(content)
- msg['Subject'] = 'TACA - {}'.format(subject)
+ msg['Subject'] = f'TACA - {subject}'
msg['From'] = 'TACA@scilifelab.se'
msg['to'] = receiver
@@ -45,7 +47,7 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''):
stderr = sys.stderr
if with_log_files:
if prefix:
- logFile = '{}_{}'.format(prefix, logFile)
+ logFile = f'{prefix}_{logFile}'
# Create log dir if it didn't exist in CWD
if log_dir and not os.path.exists(log_dir):
os.mkdir(log_dir)
@@ -53,8 +55,8 @@ def call_external_command(cl, with_log_files=False, prefix=None, log_dir=''):
stdout = open(logFile + '.out', 'a')
stderr = open(logFile + '.err', 'a')
started = 'Started command {} on {}'.format(' '.join(cl), datetime.now())
- stdout.write(started + u'\n')
- stdout.write(''.join(['=']*len(cl)) + u'\n')
+ stdout.write(started + '\n')
+ stdout.write(''.join(['=']*len(cl)) + '\n')
try:
subprocess.check_call(cl, stdout=stdout, stderr=stderr)
@@ -80,12 +82,12 @@ def call_external_command_detached(cl, with_log_files=False, prefix=None):
if with_log_files:
if prefix:
- command = '{}_{}'.format(prefix, command)
+ command = f'{prefix}_{command}'
stdout = open(command + '.out', 'a')
stderr = open(command + '.err', 'a')
started = 'Started command {} on {}'.format(' '.join(cl), datetime.now())
- stdout.write(started + u'\n')
- stdout.write(''.join(['=']*len(cl)) + u'\n')
+ stdout.write(started + '\n')
+ stdout.write(''.join(['=']*len(cl)) + '\n')
try:
p_handle = subprocess.Popen(cl, stdout=stdout, stderr=stderr)
@@ -205,7 +207,7 @@ def run_is_demuxed(run, couch_info=None, seq_run_type=None):
if len(run_date)>6:
run_date = run_date[2:]
run_fc = run_terms[-1]
- run_name = '{}_{}'.format(run_date, run_fc)
+ run_name = f'{run_date}_{run_fc}'
try:
couch_connection = statusdb.StatusdbSession(couch_info).connection
fc_db = couch_connection[couch_info['xten_db']]
diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py
index 3ae4d291..c02d7ac6 100644
--- a/taca/utils/statusdb.py
+++ b/taca/utils/statusdb.py
@@ -1,24 +1,24 @@
"""Classes for handling connection to StatusDB."""
-import couchdb
-import logging
import csv
-
+import logging
from datetime import datetime
+import couchdb
+
logger = logging.getLogger(__name__)
-class StatusdbSession(object):
+class StatusdbSession:
"""Wrapper class for couchdb."""
def __init__(self, config, db=None):
user = config.get('username')
password = config.get('password')
url = config.get('url')
- url_string = 'https://{}:{}@{}'.format(user, password, url)
+ url_string = f'https://{user}:{password}@{url}'
display_url_string = 'https://{}:{}@{}'.format(user, '*********', url)
self.connection = couchdb.Server(url=url_string)
if not self.connection:
- raise Exception('Couchdb connection failed for url {}'.format(display_url_string))
+ raise Exception(f'Couchdb connection failed for url {display_url_string}')
if db:
self.db_connection = self.connection[db]
@@ -40,7 +40,7 @@ def save_db_doc(self, doc, db=None):
db = db or self.db
db.save(doc)
except Exception as e:
- raise Exception('Failed saving document due to {}'.format(e))
+ raise Exception(f'Failed saving document due to {e}')
def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format='%Y-%m-%d'):
"""From information available in flowcell db connection,
@@ -111,10 +111,10 @@ def create_ongoing_run(
self, ont_run, run_path_file: str, pore_count_history_file: str
):
- run_path = open(run_path_file, "r").read().strip()
+ run_path = open(run_path_file).read().strip()
pore_counts = []
- with open(pore_count_history_file, "r") as stream:
+ with open(pore_count_history_file) as stream:
for line in csv.DictReader(stream):
pore_counts.append(line)
@@ -170,8 +170,8 @@ def merge_dicts(d1, d2):
elif d1[key] == d2[key]:
pass # same leaf value
else:
- logger.debug('Values for key {key} in d1 and d2 differ, '
- 'using the value of d1'.format(key=key))
+ logger.debug(f'Values for key {key} in d1 and d2 differ, '
+ 'using the value of d1')
else:
d1[key] = d2[key]
return d1
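`class StatusdbSession(object)` becoming `class StatusdbSession:` is likewise purely cosmetic in Python 3, where every class is new-style. A small demonstration with throwaway class names:

```python
class WithExplicitBase(object):
    pass

class WithImplicitBase:
    pass

# Both classes inherit from object and have the same method resolution order
# apart from their own names.
assert WithExplicitBase.__mro__[1:] == WithImplicitBase.__mro__[1:] == (object,)
```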
diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py
index 34e6b314..dfefcdea 100644
--- a/taca/utils/transfer.py
+++ b/taca/utils/transfer.py
@@ -4,15 +4,15 @@
import os
import shutil
import subprocess
+from io import open
from taca.utils.filesystem import create_folder
-from taca.utils.misc import hashfile, call_external_command
-from io import open
+from taca.utils.misc import call_external_command, hashfile
logger = logging.getLogger(__name__)
-class TransferAgent(object):
+class TransferAgent:
"""
(Abstract) superclass representing an Agent that performs file transfers.
Agents implementing specific methods for transferring files should extend
@@ -64,7 +64,7 @@ def format_options(self):
if type(val) == str:
val = [val]
for v in val:
- cmdopts.append('{}={}'.format(param,v))
+ cmdopts.append(f'{param}={v}')
return cmdopts
def transfer(self):
@@ -82,7 +82,7 @@ def validate_src_path(self):
dest_path=self.dest_path)
if not os.path.exists(self.src_path):
raise TransferError(
- msg='src_path "{}" does not exist'.format(self.src_path),
+ msg=f'src_path "{self.src_path}" does not exist',
src_path=self.src_path,
dest_path=self.dest_path)
@@ -173,10 +173,10 @@ def remote_path(self):
[remote_user]@[remote_host]:[dest_path]
"""
return '{}{}{}'.format(
- '{}@'.format(self.remote_user) \
+ f'{self.remote_user}@' \
if self.remote_user is not None \
else '',
- '{}:'.format(self.remote_host) \
+ f'{self.remote_host}:' \
if self.remote_host is not None \
else '',
self.dest_path \
@@ -227,7 +227,7 @@ def validate_transfer(self):
tfile,
hasher=hasher):
return False
- except TypeError as e:
+ except TypeError:
raise RsyncValidationError(
'no digest file specified',
self.src_path,
@@ -269,12 +269,12 @@ def transfer(self):
# source, we're all good
if self.validate_transfer():
logger.debug('target exists and points to the correct '
- 'source path: "{}"'.format(self.src_path))
+ f'source path: "{self.src_path}"')
return True
# If we are not overwriting, return False
if not self.overwrite:
- logger.debug('target "{}" exists and will not be '
- 'overwritten'.format(self.dest_path))
+ logger.debug(f'target "{self.dest_path}" exists and will not be '
+ 'overwritten')
return False
# If the target is a mount, let's not mess with it
if os.path.ismount(self.dest_path):
@@ -282,8 +282,7 @@ def transfer(self):
# If the target is a link or a file, we remove it
if os.path.islink(self.dest_path) or \
os.path.isfile(self.dest_path):
- logger.debug('removing existing target file "{}"'
- .format(self.dest_path))
+ logger.debug(f'removing existing target file "{self.dest_path}"')
try:
os.unlink(self.dest_path)
except OSError as e:
@@ -291,8 +290,7 @@ def transfer(self):
# If the target is a directory, we remove it and
# everything underneath
elif os.path.isdir(self.dest_path):
- logger.debug('removing existing target folder "{}"'
- .format(self.dest_path))
+ logger.debug(f'removing existing target folder "{self.dest_path}"')
try:
shutil.rmtree(self.dest_path)
except OSError as e:
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index c3150f1d..173c57dd 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python
+import json
import os
-import tempfile
import shutil
-import json
+import tempfile
import unittest
-import mock
+from unittest import mock
from taca.analysis import analysis as an
from taca.utils import config
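The test modules consistently replace the third-party `mock` package with the standard library's `unittest.mock`, which has shipped since Python 3.3 with essentially the same API. A tiny standalone usage sketch (the document contents are invented):

```python
from unittest import mock

fake_db = mock.MagicMock()
fake_db.save({"run_id": "hypothetical_run", "status": "demultiplexing"})

# Call-recording assertions work exactly as with the old external package.
fake_db.save.assert_called_once_with(
    {"run_id": "hypothetical_run", "status": "demultiplexing"}
)
```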
diff --git a/tests/test_analysis_nanopore.py b/tests/test_analysis_nanopore.py
index 1b3158dc..66688701 100644
--- a/tests/test_analysis_nanopore.py
+++ b/tests/test_analysis_nanopore.py
@@ -1,14 +1,11 @@
#!/usr/bin/env python
import unittest
-import logging
-import mock
-import os
+from unittest import mock
from taca.analysis.analysis_nanopore import *
from taca.nanopore.minion import MinIONqc
from taca.utils import config as conf
-
CONFIG = conf.load_yaml_config('data/taca_test_nanopore_cfg.yaml')
class TestNanoporeAnalysis(unittest.TestCase):
@@ -68,6 +65,6 @@ def test_process_minion_run_fail_analysis(self, mock_mail):
minion_run.qc_run = True
process_minion_qc_run(minion_run)
email_subject = ('Analysis failed for run 20200108_1412_MN19414_AAU648_68125dc2')
- email_message = 'The nanoseq analysis failed for run {}.'.format(minion_run.run_id)
+ email_message = f'The nanoseq analysis failed for run {minion_run.run_id}.'
email_recipients = 'test@test.com'
mock_mail.assert_called_once_with(email_subject, email_message, email_recipients)
diff --git a/tests/test_backup.py b/tests/test_backup.py
index c170f79f..4d3a6bad 100644
--- a/tests/test_backup.py
+++ b/tests/test_backup.py
@@ -1,9 +1,9 @@
#!/usr/bin/env python
-import unittest
-import mock
-import tempfile
import os
import shutil
+import tempfile
+import unittest
+from unittest import mock
from taca.backup import backup
from taca.utils import config as conf
diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py
index d7e04869..2b4f365a 100644
--- a/tests/test_cleanup.py
+++ b/tests/test_cleanup.py
@@ -4,8 +4,8 @@
import shutil
import tempfile
import unittest
-import mock
from datetime import datetime
+from unittest import mock
from taca.cleanup import cleanup
from taca.utils import config as conf
diff --git a/tests/test_illumina.py b/tests/test_illumina.py
index 5bbf323a..e26a48e5 100644
--- a/tests/test_illumina.py
+++ b/tests/test_illumina.py
@@ -1,25 +1,21 @@
#!/usr/bin/env python
+import filecmp
+import json
import os
-import io
import shutil
-import tempfile
-import unittest
-import csv
-import json
-import mock
-import filecmp
import subprocess
-from datetime import datetime
import sys
+import tempfile
+import unittest
+from unittest import mock
+
+from flowcell_parser.classes import LaneBarcodeParser
from taca.analysis.analysis import *
-from taca.illumina.Runs import Run, _create_folder_structure, _generate_lane_html
-from taca.illumina.Standard_Runs import Standard_Runs, _generate_clean_samplesheet, _classify_samples, parse_10X_indexes, parse_smartseq_indexes, _generate_samplesheet_subset
-from taca.illumina.MiSeq_Runs import MiSeq_Run
-from taca.illumina.NovaSeq_Runs import NovaSeq_Run
from taca.illumina.NextSeq_Runs import NextSeq_Run
-from flowcell_parser.classes import LaneBarcodeParser, SampleSheetParser
+from taca.illumina.NovaSeq_Runs import NovaSeq_Run
+from taca.illumina.Runs import Run, _create_folder_structure, _generate_lane_html
from taca.utils import config as conf
if sys.version_info[0] >= 3:
@@ -146,7 +142,7 @@ def setUpClass(self):
open(os.path.join(completed, 'Demultiplexing', 'Undetermined_S0_L001_R1_001.fastq.gz'), 'w').close()
open(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R1_001.fastq.gz'), 'w').close()
open(os.path.join(complex_run_dir, 'Demultiplexing_0', 'N__One_20_01', 'Sample_P12345_1001', 'P16510_1001_S1_L001_R2_001.fastq.gz'), 'w').close()
- with io.open(os.path.join(completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w', encoding="utf-8") as stats_json:
+ with open(os.path.join(completed, 'Demultiplexing', 'Stats', 'Stats.json'), 'w', encoding="utf-8") as stats_json:
stats_json.write(unicode(json.dumps({'silly': 1}, ensure_ascii=False)))
# Copy transfer file with the completed run
diff --git a/tests/test_instrument_transfer.py b/tests/test_instrument_transfer.py
index 60a1533b..3d9b85fc 100644
--- a/tests/test_instrument_transfer.py
+++ b/tests/test_instrument_transfer.py
@@ -1,10 +1,12 @@
-from taca.nanopore import instrument_transfer
-from unittest.mock import patch, mock_open, call, Mock, MagicMock
-import tempfile
-import pytest
+import json
import os
import re
-import json
+import tempfile
+from unittest.mock import Mock, call, mock_open, patch
+
+import pytest
+
+from taca.nanopore import instrument_transfer
DUMMY_RUN_NAME = "20240112_2342_MN19414_TEST12345_randomhash"
@@ -156,7 +158,7 @@ def test_main(mock_sync, mock_final_sync, setup_test_fixture, finished, qc):
# Check path was dumped
assert os.path.exists(run_path + "/run_path.txt")
- assert open(run_path + "/run_path.txt", "r").read() == "/".join(
+ assert open(run_path + "/run_path.txt").read() == "/".join(
run_path.split("/")[-3:]
)
@@ -179,7 +181,7 @@ def test_main(mock_sync, mock_final_sync, setup_test_fixture, finished, qc):
)
+ "\n"
)
- assert open(run_path + "/pore_count_history.csv", "r").read() == template
+ assert open(run_path + "/pore_count_history.csv").read() == template
def test_sequencing_finished():
@@ -389,7 +391,7 @@ def test_dump_pore_count_history(setup_test_fixture):
run_path = tmp.name + f"/experiment/sample/{DUMMY_RUN_NAME.replace('TEST','FLG')}"
os.makedirs(run_path)
new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts)
- assert open(new_file, "r").read() == ""
+ assert open(new_file).read() == ""
tmp.cleanup()
# Nothing to add, file is present
@@ -398,7 +400,7 @@ def test_dump_pore_count_history(setup_test_fixture):
os.makedirs(run_path)
open(run_path + "/pore_count_history.csv", "w").write("test")
new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts)
- assert open(new_file, "r").read() == "test"
+ assert open(new_file).read() == "test"
tmp.cleanup()
# Something to add
@@ -424,5 +426,5 @@ def test_dump_pore_count_history(setup_test_fixture):
+ "\n"
)
- assert open(new_file, "r").read() == template
+ assert open(new_file).read() == template
tmp.cleanup()
diff --git a/tests/test_nanopore.py b/tests/test_nanopore.py
index cb1e1a15..0220f6de 100644
--- a/tests/test_nanopore.py
+++ b/tests/test_nanopore.py
@@ -1,12 +1,12 @@
#!/usr/bin/env python
-import unittest
-import mock
import filecmp
import os
import subprocess
+import unittest
+from unittest import mock
-from taca.nanopore.ONT_run_classes import ONT_run
from taca.nanopore.minion_run_class import MinIONqc
+from taca.nanopore.ONT_run_classes import ONT_run
from taca.utils import config
CONFIG = config.load_yaml_config("data/taca_test_nanopore_cfg.yaml")
diff --git a/tests/test_server_status.py b/tests/test_server_status.py
index 2d24c83d..781adbe9 100644
--- a/tests/test_server_status.py
+++ b/tests/test_server_status.py
@@ -1,9 +1,10 @@
#!/usr/bin/env python
import unittest
-import mock
+from unittest import mock
+
import crontab
-from taca.server_status import server_status, cronjobs
+from taca.server_status import cronjobs, server_status
from taca.utils import config
CONFIG = config.load_yaml_config('data/taca_test_cfg.yaml')
@@ -62,9 +63,9 @@ def test_parse_crontab(self, mock_getpass, mock_crontab):
mock_crontab.return_value = crontab.CronTab(tab=INITAL_TAB)
mock_getpass.return_value = 'test_user'
expected_crontab = {'test_user':
- [{'Comment': u'First Comment',
+ [{'Comment': 'First Comment',
'Day of month': '*',
- 'Command': u'firstcommand',
+ 'Command': 'firstcommand',
'Hour': '*',
'Day of week': '*',
'Enabled': True,
@@ -83,9 +84,9 @@ def test_parse_crontab(self, mock_getpass, mock_crontab):
def test_update_cronjob_db(self, mock_parser, mock_platform, mock_logging, mock_statusdb):
"""Update couchdb with cronjobs."""
mock_parser.return_value = {'test_user':
- [{'Comment': u'First Comment',
+ [{'Comment': 'First Comment',
'Day of month': '*',
- 'Command': u'firstcommand',
+ 'Command': 'firstcommand',
'Hour': '*',
'Day of week': '*',
'Enabled': True,
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c4f6f2d9..e16abbd0 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,18 +1,18 @@
"""Unit tests for the utils helper functions."""
-import hashlib
-import mock
import os
import shutil
import subprocess
import tempfile
-import unittest
import time
-import couchdb
+import unittest
from collections import defaultdict
-from taca.utils import misc, filesystem, transfer, config, bioinfo_tab, statusdb
+from unittest import mock
+
from six.moves import map
+from taca.utils import bioinfo_tab, config, filesystem, misc, statusdb, transfer
+
class TestMisc(unittest.TestCase):
"""Test class for the misc functions."""
@@ -262,8 +262,8 @@ def setUpClass(self):
self.rootdir = tempfile.mkdtemp(prefix='test_taca_symlink_src')
path = self.rootdir
for n in range(3):
- open(os.path.join(path, 'file{}'.format(n)), 'w').close()
- path = os.path.join(path, 'folder{}'.format(n))
+ open(os.path.join(path, f'file{n}'), 'w').close()
+ path = os.path.join(path, f'folder{n}')
os.mkdir(path)
@classmethod
@@ -424,7 +424,7 @@ def setUpClass(cls):
# create a digest file
def _write_digest(rootdir, fhandle, fpath):
- fhandle.write('{} {}\n'.format(misc.hashfile(fpath), os.path.relpath(fpath, rootdir)))
+ fhandle.write(f'{misc.hashfile(fpath)} {os.path.relpath(fpath, rootdir)}\n')
cls.digestfile = os.path.join(cls.rootdir, 'digestfile.sha1')
with open(cls.digestfile, 'w') as digesth:
@@ -500,12 +500,12 @@ def test_rsync_agent_dest_paths_constructed(self):
'and empty destination host')
self.agent.remote_host = 'localhost'
self.assertEqual(
- 'localhost:{}'.format(self.destdir),
+ f'localhost:{self.destdir}',
self.agent.remote_path(),
'Destination path was not correct for empty remote user')
self.agent.remote_user = 'user'
self.assertEqual(
- 'user@localhost:{}'.format(self.destdir),
+ f'user@localhost:{self.destdir}',
self.agent.remote_path(),
'Destination path was not correct for non-empty remote user')
self.agent.dest_path = None
From 6e54689979d39ae58d91ae119055584cf9aab64a Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Wed, 17 Jan 2024 14:27:33 +0100
Subject: [PATCH 04/44] ruff check unsafe fixes
---
setup.py | 5 ++---
taca/analysis/analysis.py | 1 -
taca/backup/backup.py | 5 ++---
taca/cleanup/cleanup.py | 9 +++------
taca/illumina/MiSeq_Runs.py | 2 +-
taca/illumina/NovaSeqXPlus_Runs.py | 2 +-
taca/illumina/NovaSeq_Runs.py | 2 +-
taca/illumina/Runs.py | 1 -
taca/illumina/Standard_Runs.py | 8 +++-----
taca/nanopore/ONT_run_classes.py | 6 +++---
taca/nanopore/instrument_transfer.py | 2 +-
taca/testing/create_uppmax_like_env.py | 1 -
taca/utils/bioinfo_tab.py | 6 +++---
taca/utils/config.py | 5 ++---
taca/utils/misc.py | 5 +----
taca/utils/statusdb.py | 8 ++++----
taca/utils/transfer.py | 7 +++----
tests/test_analysis_nanopore.py | 3 ---
tests/test_backup.py | 2 +-
tests/test_utils.py | 8 +++-----
20 files changed, 34 insertions(+), 54 deletions(-)
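Ruff's "unsafe" fixes are the ones it only applies when explicitly asked for (e.g. `ruff check --fix --unsafe-fixes`), generally because they delete or rewrite code in ways a human should double-check. Several hunks below drop assignments to locals that are never read (rule F841); a hypothetical before/after sketch of that kind of fix:

```python
def create_flag_before(path: str) -> None:
    flag = open(path, "w").close()  # F841: `flag` is assigned but never used


def create_flag_after(path: str) -> None:
    # The fix keeps the call (and its side effect of creating the file)
    # but drops the unused binding.
    open(path, "w").close()
```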
diff --git a/setup.py b/setup.py
index d8962c06..74f35bed 100644
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,17 @@
import glob
-from io import open
from setuptools import find_packages, setup
from taca import __version__
try:
- with open("requirements.txt", "r") as f:
+ with open("requirements.txt") as f:
install_requires = [x.strip() for x in f.readlines()]
except OSError:
install_requires = []
try:
- with open("dependency_links.txt", "r") as f:
+ with open("dependency_links.txt") as f:
dependency_links = [x.strip() for x in f.readlines()]
except OSError:
dependency_links = []
diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py
index 2ef4aafd..4caf9a48 100755
--- a/taca/analysis/analysis.py
+++ b/taca/analysis/analysis.py
@@ -4,7 +4,6 @@
import os
import subprocess
import sys
-from io import open
from shutil import copyfile, copytree
from flowcell_parser.classes import RunParametersParser
diff --git a/taca/backup/backup.py b/taca/backup/backup.py
index 88a4188f..97fdec7b 100644
--- a/taca/backup/backup.py
+++ b/taca/backup/backup.py
@@ -7,7 +7,6 @@
import subprocess as sp
import time
from datetime import datetime
-from io import open
from taca.utils import filesystem, misc, statusdb
from taca.utils.config import CONFIG
@@ -287,7 +286,7 @@ def encrypt_runs(cls, run, force):
if os.path.exists(run.flag):
logger.warn(f'Run {run.name} is already being encrypted, so skipping now')
continue
- flag = open(run.flag, 'w').close()
+ open(run.flag, 'w').close()
# zip the run directory
if os.path.exists(run.zip):
if os.path.isdir(run.name):
@@ -379,7 +378,7 @@ def pdc_put(cls, run):
if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False):
logger.warn(f'Seems like files related to run {run.name} already exist in PDC, check and cleanup')
continue
- flag = open(run.flag, 'w').close()
+ open(run.flag, 'w').close()
logger.info(f'Sending file {run.zip_encrypted} to PDC')
if bk._call_commands(cmd1=f'dsmc archive {run.zip_encrypted}', tmp_files=[run.flag]):
time.sleep(15) # give some time just in case 'dsmc' needs to settle
diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py
index 80ef1905..498ffcf9 100644
--- a/taca/cleanup/cleanup.py
+++ b/taca/cleanup/cleanup.py
@@ -5,9 +5,6 @@
from collections import defaultdict
from datetime import datetime
from glob import glob
-from io import open
-
-from six.moves import map
from taca.utils import filesystem, misc, statusdb
from taca.utils.config import CONFIG, load_config
@@ -74,7 +71,7 @@ def cleanup_miarka(days_fastq, days_analysis,
exclude_list = []
if exclude_projects:
if os.path.isfile(exclude_projects):
- with open(exclude_projects, 'r') as in_file:
+ with open(exclude_projects) as in_file:
exclude_list.extend([p.strip() for p in in_file.readlines()])
else:
exclude_list.extend(exclude_projects.split(','))
@@ -119,12 +116,12 @@ def cleanup_miarka(days_fastq, days_analysis,
undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
if misc.query_yes_no('In total found {} undetermined files which are {} in size, delete now ?'.format(len(all_undet_files),
undet_size), default='no'):
- removed = _remove_files(all_undet_files)
+ _remove_files(all_undet_files)
return
elif only_analysis:
for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
not os.path.exists(os.path.join(analysis_dir, d, 'cleaned'))]:
- proj_abs_path = os.path.join(analysis_dir, pid)
+ os.path.join(analysis_dir, pid)
proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
if proj_info and proj_info['closed_days'] >= days_analysis:
# move on if this project has to be excluded
diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py
index f6585801..ff7d1095 100644
--- a/taca/illumina/MiSeq_Runs.py
+++ b/taca/illumina/MiSeq_Runs.py
@@ -17,7 +17,7 @@
class MiSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(MiSeq_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
self._get_samplesheet()
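The `super(MiSeq_Run, self)` to `super()` rewrites rely on Python 3's zero-argument form, which fills in the enclosing class and instance automatically. A minimal hypothetical hierarchy showing the equivalence:

```python
class Run:
    def __init__(self, run_dir: str):
        self.run_dir = run_dir


class MiSeq_Run(Run):
    def __init__(self, run_dir: str):
        # Resolves to the same call as super(MiSeq_Run, self).__init__(run_dir)
        super().__init__(run_dir)


assert MiSeq_Run("/data/hypothetical_run").run_dir == "/data/hypothetical_run"
```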
diff --git a/taca/illumina/NovaSeqXPlus_Runs.py b/taca/illumina/NovaSeqXPlus_Runs.py
index 4acff5e4..f3b34b2e 100644
--- a/taca/illumina/NovaSeqXPlus_Runs.py
+++ b/taca/illumina/NovaSeqXPlus_Runs.py
@@ -3,7 +3,7 @@
class NovaSeqXPlus_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(NovaSeqXPlus_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
diff --git a/taca/illumina/NovaSeq_Runs.py b/taca/illumina/NovaSeq_Runs.py
index f6ba71e8..cce59eef 100644
--- a/taca/illumina/NovaSeq_Runs.py
+++ b/taca/illumina/NovaSeq_Runs.py
@@ -3,7 +3,7 @@
class NovaSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(NovaSeq_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py
index fa4618c8..56724ac7 100644
--- a/taca/illumina/Runs.py
+++ b/taca/illumina/Runs.py
@@ -328,7 +328,6 @@ def archive_run(self, destination):
def send_mail(self, sbt, msg, rcp):
""" Sends mail about run completion
"""
- already_seen = False
runname = self.id
if not sbt:
sbt = f"{runname}"
diff --git a/taca/illumina/Standard_Runs.py b/taca/illumina/Standard_Runs.py
index f2699426..98741bb4 100755
--- a/taca/illumina/Standard_Runs.py
+++ b/taca/illumina/Standard_Runs.py
@@ -2,7 +2,6 @@
import os
import re
from datetime import datetime
-from io import open
from flowcell_parser.classes import SampleSheetParser
@@ -22,7 +21,7 @@
class Standard_Run(Run):
def __init__(self, run_dir, software, configuration):
- super(Standard_Run, self).__init__(run_dir, software, configuration)
+ super().__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
self._copy_samplesheet()
@@ -82,7 +81,7 @@ def _parse_10X_indexes(self, indexfile):
Todo: Set it up to take the file from config instead
"""
index_dict = {}
- with open(indexfile, 'r') as f:
+ with open(indexfile) as f:
for line in f:
line_ = line.rstrip().split(',')
index_dict[line_[0]] = line_[1:5]
@@ -94,7 +93,7 @@ def _parse_smartseq_indexes(self, indexfile):
Todo: Set it up to take the file from config instead
"""
index_dict = {}
- with open(indexfile, 'r') as f:
+ with open(indexfile) as f:
for line in f:
line_ = line.rstrip().split(',')
if index_dict.get(line_[0]):
@@ -425,7 +424,6 @@ def _compute_base_mask(self, runSetup, sample_type, index1_size, is_dual_index,
- if runSetup is of size 4, then dual index run
"""
bm = []
- dual_index_run = False
if len(runSetup) > 4:
raise RuntimeError("when generating base_masks looks like there are" \
" more than 4 reads in the RunSetup.xml")
diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py
index a6743f04..dfbdf3b8 100644
--- a/taca/nanopore/ONT_run_classes.py
+++ b/taca/nanopore/ONT_run_classes.py
@@ -159,7 +159,7 @@ def update_db_entry(self, force_update=False):
self.touch_db_entry()
# If the run document is marked as "ongoing" or database is being manually updated
- if self.db.check_run_status(self) == "ongoing" or force_update == True:
+ if self.db.check_run_status(self) == "ongoing" or force_update is True:
logger.info(
f"{self.run_name}: Run exists in the database with run status: {self.db.check_run_status(self)}."
)
@@ -372,7 +372,7 @@ class ONT_user_run(ONT_run):
def __init__(self, run_abspath: str):
self.run_type = "user_run"
- super(ONT_user_run, self).__init__(run_abspath)
+ super().__init__(run_abspath)
class ONT_qc_run(ONT_run):
@@ -380,7 +380,7 @@ class ONT_qc_run(ONT_run):
def __init__(self, run_abspath: str):
self.run_type = "qc_run"
- super(ONT_qc_run, self).__init__(run_abspath)
+ super().__init__(run_abspath)
# Get Anglerfish attributes from run
self.anglerfish_done_abspath = f"{self.run_abspath}/.anglerfish_done"
diff --git a/taca/nanopore/instrument_transfer.py b/taca/nanopore/instrument_transfer.py
index 130a88c8..a7cabdf1 100644
--- a/taca/nanopore/instrument_transfer.py
+++ b/taca/nanopore/instrument_transfer.py
@@ -153,7 +153,7 @@ def archive_finished_run(run_dir: str, archive_dir: str):
sample_dir = os.path.dirname(run_dir)
exp_dir = os.path.dirname(sample_dir)
- run_name = os.path.basename(run_dir)
+ os.path.basename(run_dir)
sample_name = os.path.basename(sample_dir)
exp_name = os.path.basename(exp_dir)
diff --git a/taca/testing/create_uppmax_like_env.py b/taca/testing/create_uppmax_like_env.py
index 9e936857..f0a10ea9 100644
--- a/taca/testing/create_uppmax_like_env.py
+++ b/taca/testing/create_uppmax_like_env.py
@@ -5,7 +5,6 @@
import os
import random
import subprocess
-from io import open
from dateutil.relativedelta import relativedelta
diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py
index 8ec36614..022ac93e 100644
--- a/taca/utils/bioinfo_tab.py
+++ b/taca/utils/bioinfo_tab.py
@@ -17,7 +17,7 @@
class Tree(defaultdict):
"""Constructor for a search tree."""
def __init__(self, value=None):
- super(Tree, self).__init__(Tree)
+ super().__init__(Tree)
self.value = value
@@ -105,7 +105,7 @@ def update_statusdb(run_dir):
# Creates record
db.save(obj)
# Sets FC error flag
- if not project_info[flowcell].value == None:
+ if project_info[flowcell].value is not None:
if (('Failed' in project_info[flowcell].value and 'Failed' not in sample_status)
or ('Failed' in sample_status and 'Failed' not in project_info[flowcell].value)):
project_info[flowcell].value = 'Ambiguous'
@@ -113,7 +113,7 @@ def update_statusdb(run_dir):
project_info[flowcell].value = sample_status
# Checks if a flowcell needs partial re-doing
# Email error per flowcell
- if not project_info[flowcell].value == None:
+ if project_info[flowcell].value is not None:
if 'Ambiguous' in project_info[flowcell].value:
error_emailer('failed_run', run_id)
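Replacing `not x == None` with `x is not None` is the one fix here that can, in principle, change behaviour for objects that override `__eq__`: `==` dispatches to that method, while `is` compares against the `None` singleton directly. A contrived illustration:

```python
class AlwaysEqual:
    def __eq__(self, other):
        return True  # claims equality with everything, including None


value = AlwaysEqual()
assert value == None          # the equality test is fooled by __eq__
assert value is not None      # the identity test is not
```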
diff --git a/taca/utils/config.py b/taca/utils/config.py
index 004d163a..1a6fd6a1 100644
--- a/taca/utils/config.py
+++ b/taca/utils/config.py
@@ -1,5 +1,4 @@
"""Load and parse configuration file."""
-from io import open
import yaml
@@ -9,7 +8,7 @@ def load_config(config_file):
"""Loads a configuration file."""
config = {}
try:
- with open(config_file, 'r') as f:
+ with open(config_file) as f:
content = yaml.load(f, Loader=yaml.FullLoader)
config.update(content)
return content
@@ -27,7 +26,7 @@ def load_yaml_config(config_file):
:raises IOError: If the config file cannot be opened.
"""
try:
- with open(config_file, 'r') as f:
+ with open(config_file) as f:
content = yaml.load(f, Loader=yaml.FullLoader)
CONFIG.update(content)
return content
diff --git a/taca/utils/misc.py b/taca/utils/misc.py
index 946723e5..3a4a1d68 100755
--- a/taca/utils/misc.py
+++ b/taca/utils/misc.py
@@ -7,9 +7,6 @@
import sys
from datetime import datetime
from email.mime.text import MIMEText
-from io import open
-
-from six.moves import input
from taca.utils import statusdb
@@ -154,7 +151,7 @@ def query_yes_no(question, default='yes', force=False):
"""
valid = {'yes': True, 'y': True, 'ye': True,
'no': False, 'n': False}
- if default == None:
+ if default is None:
prompt = ' [y/n] '
elif default == 'yes':
prompt = ' [Y/n] '
diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py
index c02d7ac6..4bbb70a4 100644
--- a/taca/utils/statusdb.py
+++ b/taca/utils/statusdb.py
@@ -67,7 +67,7 @@ def get_project_flowcell(self, project_id, open_date='2015-01-01', date_format='
class ProjectSummaryConnection(StatusdbSession):
def __init__(self, config, dbname='projects'):
- super(ProjectSummaryConnection, self).__init__(config)
+ super().__init__(config)
self.db = self.connection[dbname]
self.name_view = {k.key: k.id for k in self.db.view('project/project_name', reduce=False)}
self.id_view = {k.key: k.id for k in self.db.view('project/project_id', reduce=False)}
@@ -75,7 +75,7 @@ def __init__(self, config, dbname='projects'):
class FlowcellRunMetricsConnection(StatusdbSession):
def __init__(self, config, dbname='flowcells'):
- super(FlowcellRunMetricsConnection, self).__init__(config)
+ super().__init__(config)
self.db = self.connection[dbname]
self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)}
self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key}
@@ -83,7 +83,7 @@ def __init__(self, config, dbname='flowcells'):
class X_FlowcellRunMetricsConnection(StatusdbSession):
def __init__(self, config, dbname='x_flowcells'):
- super(X_FlowcellRunMetricsConnection, self).__init__(config)
+ super().__init__(config)
self.db = self.connection[dbname]
self.name_view = {k.key:k.id for k in self.db.view('names/name', reduce=False)}
self.proj_list = {k.key:k.value for k in self.db.view('names/project_ids_list', reduce=False) if k.key}
@@ -92,7 +92,7 @@ def __init__(self, config, dbname='x_flowcells'):
class NanoporeRunsConnection(StatusdbSession):
def __init__(self, config, dbname='nanopore_runs'):
- super(NanoporeRunsConnection, self).__init__(config)
+ super().__init__(config)
self.db = self.connection[dbname]
def check_run_exists(self, ont_run) -> bool:
diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py
index dfefcdea..2968d0f9 100644
--- a/taca/utils/transfer.py
+++ b/taca/utils/transfer.py
@@ -4,7 +4,6 @@
import os
import shutil
import subprocess
-from io import open
from taca.utils.filesystem import create_folder
from taca.utils.misc import call_external_command, hashfile
@@ -135,7 +134,7 @@ def __init__(
algorithm will be inferred from the extension of the digest file
:param opts: options that will be passed to the rsync command
"""
- super(RsyncAgent, self).__init__(
+ super().__init__(
src_path=src_path,
dest_path=dest_path,
opts=opts or self.DEFAULT_OPTS,
@@ -246,7 +245,7 @@ def __init__(self, src_path, dest_path, overwrite=True, relative=True, **kwargs)
be overwritten if it already exists
:param bool relative: if true, the destination symlink will be relative
"""
- super(SymlinkAgent,self).__init__(
+ super().__init__(
src_path=src_path,
dest_path=dest_path,
**kwargs)
@@ -323,7 +322,7 @@ def validate_transfer(self):
class TransferError(Exception):
def __init__(self, msg, src_path=None, dest_path=None):
- super(TransferError, self).__init__(msg)
+ super().__init__(msg)
self.src_path = src_path
self.dest_path = dest_path
diff --git a/tests/test_analysis_nanopore.py b/tests/test_analysis_nanopore.py
index 66688701..f1b02676 100644
--- a/tests/test_analysis_nanopore.py
+++ b/tests/test_analysis_nanopore.py
@@ -45,9 +45,6 @@ def test_process_minion_run_transfer(self, mock_mail, mock_archive, mock_update,
mock_cp.return_value = True
run_dir = 'data/nanopore_data/run4/done_demuxing/20200104_1412_MN19414_AAU644_68125dc2'
minion_run = MinIONqc(run_dir, 'dummy/path', None)
- email_subject = ('Run successfully processed: 20200104_1412_MN19414_AAU644_68125dc2')
- email_message = 'Run 20200104_1412_MN19414_AAU644_68125dc2 has been analysed, transferred and archived successfully.'
- email_recipients = 'test@test.com'
process_minion_qc_run(minion_run)
expected_calls = [mock.call('Anglerfish successfully processed run 20200104_1412_MN19414_AAU644_68125dc2',
'Anglerfish has successfully finished for run 20200104_1412_MN19414_AAU644_68125dc2. Please finish the QC step in lims.',
diff --git a/tests/test_backup.py b/tests/test_backup.py
index 4d3a6bad..f1addeb9 100644
--- a/tests/test_backup.py
+++ b/tests/test_backup.py
@@ -104,7 +104,7 @@ def test_call_commands_double(self):
os.makedirs(tmp_dir)
cmd1 = 'ls data/nas/miseq.lab'
cmd2 = 'ls data/nas/miseq.lab'
- got_output = backup_object._call_commands(cmd1, cmd2, out_file=tmp_file, mail_failed=False)
+ backup_object._call_commands(cmd1, cmd2, out_file=tmp_file, mail_failed=False)
self.assertTrue(os.path.isfile(tmp_file))
shutil.rmtree(tmp_dir)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e16abbd0..2b52f37d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -9,8 +9,6 @@
from collections import defaultdict
from unittest import mock
-from six.moves import map
-
from taca.utils import bioinfo_tab, config, filesystem, misc, statusdb, transfer
@@ -119,7 +117,7 @@ def test_run_is_demuxed(self, mock_couch):
'username': 'username',
'password': 'pwd',
'db': 'db'}
- is_demultiplexed = misc.run_is_demuxed(run, couch_info=couch_info)
+ misc.run_is_demuxed(run, couch_info=couch_info)
#TODO: should add a check here but not sure how to mock this properly
class TestFilesystem(unittest.TestCase):
@@ -596,7 +594,7 @@ def test_load_yaml_config(self):
{'file': 'data/taca.log'}}
self.assertEqual(expexted_config_data, got_config_data)
with self.assertRaises(IOError):
- missing_config_data = config.load_yaml_config('data/missing_file.yaml)')
+ config.load_yaml_config('data/missing_file.yaml)')
def test_load_config(self):
"""Load a config file."""
@@ -609,7 +607,7 @@ def test_load_config(self):
{'file': 'data/taca.log'}}
self.assertEqual(expexted_config_data, got_config_data)
with self.assertRaises(IOError):
- missing_config_data = config.load_config('data/missing_file.yaml)')
+ config.load_config('data/missing_file.yaml)')
class TestBioinfoTab(unittest.TestCase):
"""Test class for bioinfo_tab."""
From d5330f615b237beadcec22d5422dff3c02aa54ff Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Wed, 17 Jan 2024 14:28:37 +0100
Subject: [PATCH 05/44] ruff format
---
doc/conf.py | 199 ++--
setup.py | 43 +-
taca/analysis/analysis.py | 436 +++++---
taca/analysis/analysis_nanopore.py | 4 -
taca/analysis/cli.py | 57 +-
taca/backup/backup.py | 400 +++++---
taca/backup/cli.py | 58 +-
taca/cleanup/cleanup.py | 518 ++++++----
taca/cleanup/cli.py | 140 ++-
taca/cli.py | 28 +-
taca/illumina/MiSeq_Runs.py | 157 ++-
taca/illumina/NextSeq_Runs.py | 2 +-
taca/illumina/Runs.py | 1295 ++++++++++++++++--------
taca/illumina/Standard_Runs.py | 799 ++++++++++-----
taca/illumina/__init__.py | 2 +-
taca/log/__init__.py | 21 +-
taca/nanopore/ONT_run_classes.py | 3 -
taca/nanopore/__init__.py | 2 +-
taca/nanopore/instrument_transfer.py | 11 +-
taca/server_status/cli.py | 28 +-
taca/server_status/cronjobs.py | 59 +-
taca/server_status/server_status.py | 107 +-
taca/testing/cli.py | 132 ++-
taca/testing/create_uppmax_like_env.py | 557 ++++++----
taca/utils/bioinfo_tab.py | 335 +++---
taca/utils/cli.py | 15 +-
taca/utils/config.py | 2 +
taca/utils/filesystem.py | 21 +-
taca/utils/misc.py | 106 +-
taca/utils/statusdb.py | 115 ++-
taca/utils/transfer.py | 264 ++---
tests/test_analysis.py | 193 ++--
tests/test_analysis_nanopore.py | 87 +-
tests/test_backup.py | 165 +--
tests/test_cleanup.py | 181 ++--
tests/test_illumina.py | 619 +++++++----
tests/test_instrument_transfer.py | 22 +-
tests/test_nanopore.py | 290 ++++--
tests/test_server_status.py | 129 ++-
tests/test_utils.py | 656 ++++++------
40 files changed, 5223 insertions(+), 3035 deletions(-)
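This commit is the output of running the formatter over the whole tree; the diffs below only normalize quoting, wrapping and indentation without touching behaviour. A sketch of reproducing it locally, assuming ruff is installed (both invocations are standard ruff CLI usage):

```python
import subprocess

# Rewrite files in place according to the formatter's style.
subprocess.run(["ruff", "format", "."], check=True)

# The CI-friendly variant: report files that would change, but modify nothing.
subprocess.run(["ruff", "format", "--check", "."], check=True)
```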
diff --git a/doc/conf.py b/doc/conf.py
index 5c1d130e..01abb472 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -16,162 +16,163 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.todo',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.ifconfig',
- 'sphinx.ext.viewcode',
+ "sphinx.ext.autodoc",
+ "sphinx.ext.todo",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.ifconfig",
+ "sphinx.ext.viewcode",
]
# Add any paths that contain templates here, relative to this directory.
-#templates_path = ['_templates']
+# templates_path = ['_templates']
# The suffix of source filenames.
-source_suffix = '.rst'
+source_suffix = ".rst"
# The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = 'TACA'
-copyright = '2014, Guillermo Carrasco'
+project = "TACA"
+copyright = "2014, Guillermo Carrasco"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
-version = '1.0'
+version = "1.0"
# The full version, including alpha/beta/rc tags.
-release = '1.0'
+release = "1.0"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
-#language = None
+# language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
-#today = ''
+# today = ''
# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ["_build"]
# The reST default role (used for this markup: `text`) to use for all
# documents.
-#default_role = None
+# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
-#show_authors = False
+# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'default'
-on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+html_theme = "default"
+on_rtd = os.environ.get("READTHEDOCS", None) == "True"
if not on_rtd:
import sphinx_rtd_theme
- html_theme = 'sphinx_rtd_theme'
+
+ html_theme = "sphinx_rtd_theme"
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
-#html_theme_options = {}
+# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# " v documentation".
-#html_title = None
+# html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
-#html_logo = None
+# html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-#html_favicon = None
+# html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
# If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
# If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
# If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
html_show_copyright = False
@@ -179,68 +180,62 @@
# If true, an OpenSearch description file will be output, and all pages will
# contain a tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
# Output file base name for HTML help builder.
-htmlhelp_basename = 'TACAdoc'
+htmlhelp_basename = "TACAdoc"
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- ('index', 'TACA.tex', 'TACA Documentation',
- 'Guillermo Carrasco', 'manual'),
+ ("index", "TACA.tex", "TACA Documentation", "Guillermo Carrasco", "manual"),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
-#latex_logo = None
+# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
# If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
# If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
# Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
# If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [
- ('index', 'taca', 'TACA Documentation',
- ['Guillermo Carrasco'], 1)
-]
+man_pages = [("index", "taca", "TACA Documentation", ["Guillermo Carrasco"], 1)]
# If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
@@ -249,89 +244,95 @@
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
- ('index', 'TACA', 'TACA Documentation',
- 'Guillermo Carrasco', 'TACA', 'One line description of project.',
- 'Miscellaneous'),
+ (
+ "index",
+ "TACA",
+ "TACA Documentation",
+ "Guillermo Carrasco",
+ "TACA",
+ "One line description of project.",
+ "Miscellaneous",
+ ),
]
# Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
# If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
# -- Options for Epub output ----------------------------------------------
# Bibliographic Dublin Core info.
-epub_title = 'TACA'
-epub_author = 'Guillermo Carrasco'
-epub_publisher = 'Guillermo Carrasco'
-epub_copyright = '2014, Guillermo Carrasco'
+epub_title = "TACA"
+epub_author = "Guillermo Carrasco"
+epub_publisher = "Guillermo Carrasco"
+epub_copyright = "2014, Guillermo Carrasco"
# The basename for the epub file. It defaults to the project name.
-#epub_basename = u'TACA'
+# epub_basename = u'TACA'
# The HTML theme for the epub output. Since the default themes are not optimized
# for small screen space, using the same theme for HTML and epub output is
# usually not wise. This defaults to 'epub', a theme designed to save visual
# space.
-#epub_theme = 'epub'
+# epub_theme = 'epub'
# The language of the text. It defaults to the language option
# or en if the language is not set.
-#epub_language = ''
+# epub_language = ''
# The scheme of the identifier. Typical schemes are ISBN or URL.
-#epub_scheme = ''
+# epub_scheme = ''
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
-#epub_identifier = ''
+# epub_identifier = ''
# A unique identification for the text.
-#epub_uid = ''
+# epub_uid = ''
# A tuple containing the cover image and cover page html template filenames.
-#epub_cover = ()
+# epub_cover = ()
# A sequence of (type, uri, title) tuples for the guide element of content.opf.
-#epub_guide = ()
+# epub_guide = ()
# HTML files that should be inserted before the pages created by sphinx.
# The format is a list of tuples containing the path and title.
-#epub_pre_files = []
+# epub_pre_files = []
# HTML files shat should be inserted after the pages created by sphinx.
# The format is a list of tuples containing the path and title.
-#epub_post_files = []
+# epub_post_files = []
# A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
+epub_exclude_files = ["search.html"]
# The depth of the table of contents in toc.ncx.
-#epub_tocdepth = 3
+# epub_tocdepth = 3
# Allow duplicate toc entries.
-#epub_tocdup = True
+# epub_tocdup = True
# Choose between 'default' and 'includehidden'.
-#epub_tocscope = 'default'
+# epub_tocscope = 'default'
# Fix unsupported image types using the PIL.
-#epub_fix_images = False
+# epub_fix_images = False
# Scale large images.
-#epub_max_image_width = 0
+# epub_max_image_width = 0
# How to display URL addresses: 'footnote', 'no', or 'inline'.
-#epub_show_urls = 'inline'
+# epub_show_urls = 'inline'
# If false, no index is generated.
-#epub_use_index = True
+# epub_use_index = True
diff --git a/setup.py b/setup.py
index 74f35bed..e278a522 100644
--- a/setup.py
+++ b/setup.py
@@ -17,32 +17,33 @@
dependency_links = []
-setup(name='taca',
+setup(
+ name="taca",
version=__version__,
description="Tool for the Automation of Cleanup and Analyses",
- long_description='This package contains a set of functionalities that are '
- 'useful in the day-to-day tasks of bioinformatitians in '
- 'National Genomics Infrastructure in Stockholm, Sweden.',
- keywords='bioinformatics',
- author='NGI-stockholm',
- author_email='ngi_pipeline_operators@scilifelab.se',
- url='http://taca.readthedocs.org/en/latest/',
- license='MIT',
- packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
- scripts=glob.glob('scripts/*.py'),
+ long_description="This package contains a set of functionalities that are "
+ "useful in the day-to-day tasks of bioinformatitians in "
+ "National Genomics Infrastructure in Stockholm, Sweden.",
+ keywords="bioinformatics",
+ author="NGI-stockholm",
+ author_email="ngi_pipeline_operators@scilifelab.se",
+ url="http://taca.readthedocs.org/en/latest/",
+ license="MIT",
+ packages=find_packages(exclude=["ez_setup", "examples", "tests"]),
+ scripts=glob.glob("scripts/*.py"),
include_package_data=True,
zip_safe=False,
entry_points={
- 'console_scripts': ['taca = taca.cli:cli'],
- 'taca.subcommands': [
- 'cleanup = taca.cleanup.cli:cleanup',
- 'analysis = taca.analysis.cli:analysis',
- 'bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries',
- 'server_status = taca.server_status.cli:server_status',
- 'backup = taca.backup.cli:backup',
- 'create_env = taca.testing.cli:uppmax_env'
- ]
+ "console_scripts": ["taca = taca.cli:cli"],
+ "taca.subcommands": [
+ "cleanup = taca.cleanup.cli:cleanup",
+ "analysis = taca.analysis.cli:analysis",
+ "bioinfo_deliveries = taca.utils.cli:bioinfo_deliveries",
+ "server_status = taca.server_status.cli:server_status",
+ "backup = taca.backup.cli:backup",
+ "create_env = taca.testing.cli:uppmax_env",
+ ],
},
install_requires=install_requires,
- dependency_links=dependency_links
+ dependency_links=dependency_links,
)
diff --git a/taca/analysis/analysis.py b/taca/analysis/analysis.py
index 4caf9a48..c615b3f9 100755
--- a/taca/analysis/analysis.py
+++ b/taca/analysis/analysis.py
@@ -29,54 +29,68 @@ def get_runObj(run, software):
None if the sequencer type is unknown of there was an error
"""
- if os.path.exists(os.path.join(run, 'runParameters.xml')):
- run_parameters_file = 'runParameters.xml'
- elif os.path.exists(os.path.join(run, 'RunParameters.xml')):
- run_parameters_file = 'RunParameters.xml'
+ if os.path.exists(os.path.join(run, "runParameters.xml")):
+ run_parameters_file = "runParameters.xml"
+ elif os.path.exists(os.path.join(run, "RunParameters.xml")):
+ run_parameters_file = "RunParameters.xml"
else:
- logger.error(f'Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}')
+ logger.error(
+ f"Cannot find RunParameters.xml or runParameters.xml in the run folder for run {run}"
+ )
return
run_parameters_path = os.path.join(run, run_parameters_file)
try:
run_parameters = RunParametersParser(run_parameters_path)
except OSError:
- logger.warn(f'Problems parsing the runParameters.xml file at {run_parameters_path}. '
- f'This is quite unexpected. please archive the run {run} manually')
+ logger.warn(
+ f"Problems parsing the runParameters.xml file at {run_parameters_path}. "
+ f"This is quite unexpected. please archive the run {run} manually"
+ )
else:
# Do a case by case test because there are so many version of RunParameters that there is no real other way
- runtype = run_parameters.data['RunParameters'].get('InstrumentType',
- run_parameters.data['RunParameters'].get('ApplicationName',
- run_parameters.data['RunParameters'].get('Application',
- '')))
- if 'Setup' in run_parameters.data['RunParameters']:
+ runtype = run_parameters.data["RunParameters"].get(
+ "InstrumentType",
+ run_parameters.data["RunParameters"].get(
+ "ApplicationName",
+ run_parameters.data["RunParameters"].get("Application", ""),
+ ),
+ )
+ if "Setup" in run_parameters.data["RunParameters"]:
# This is the HiSeq2500, MiSeq, and HiSeqX case
try:
# Works for recent control software
- runtype = run_parameters.data['RunParameters']['Setup']['Flowcell']
+ runtype = run_parameters.data["RunParameters"]["Setup"]["Flowcell"]
except KeyError:
# Use this as second resource but print a warning in the logs
- logger.warn('Parsing runParameters to fecth instrument type, '
- 'not found Flowcell information in it. Using ApplicationName')
+ logger.warn(
+ "Parsing runParameters to fecth instrument type, "
+ "not found Flowcell information in it. Using ApplicationName"
+ )
# Here makes sense to use get with default value '' ->
# so that it doesn't raise an exception in the next lines
# (in case ApplicationName is not found, get returns None)
- runtype = run_parameters.data['RunParameters']['Setup'].get('ApplicationName', '')
-
- if 'MiSeq' in runtype:
- return MiSeq_Run(run, software, CONFIG['analysis']['MiSeq'])
- elif 'NextSeq' in runtype:
- return NextSeq_Run(run, software, CONFIG['analysis']['NextSeq'])
- elif 'NovaSeqXPlus' in runtype:
- return NovaSeqXPlus_Run(run, software, CONFIG['analysis']['NovaSeqXPlus'])
- elif 'NovaSeq' in runtype:
- return NovaSeq_Run(run, software, CONFIG['analysis']['NovaSeq'])
+ runtype = run_parameters.data["RunParameters"]["Setup"].get(
+ "ApplicationName", ""
+ )
+
+ if "MiSeq" in runtype:
+ return MiSeq_Run(run, software, CONFIG["analysis"]["MiSeq"])
+ elif "NextSeq" in runtype:
+ return NextSeq_Run(run, software, CONFIG["analysis"]["NextSeq"])
+ elif "NovaSeqXPlus" in runtype:
+ return NovaSeqXPlus_Run(run, software, CONFIG["analysis"]["NovaSeqXPlus"])
+ elif "NovaSeq" in runtype:
+ return NovaSeq_Run(run, software, CONFIG["analysis"]["NovaSeq"])
else:
- logger.warn('Unrecognized run type {}, cannot archive the run {}. '
- 'Someone as likely bought a new sequencer without telling '
- 'it to the bioinfo team'.format(runtype, run))
+ logger.warn(
+ "Unrecognized run type {}, cannot archive the run {}. "
+                "Someone has likely bought a new sequencer without telling "
+                "the bioinfo team".format(runtype, run)
+ )
return None
+
def upload_to_statusdb(run_dir, software):
"""Function to upload run_dir informations to statusDB directly from click interface.
@@ -90,60 +104,80 @@ def upload_to_statusdb(run_dir, software):
# Make the actual upload
_upload_to_statusdb(runObj)
+
def _upload_to_statusdb(run):
"""Triggers the upload to statusdb using the dependency flowcell_parser.
:param Run run: the object run
"""
- couch_conf = CONFIG['statusdb']
+ couch_conf = CONFIG["statusdb"]
couch_connection = statusdb.StatusdbSession(couch_conf).connection
- db = couch_connection[couch_conf['xten_db']]
+ db = couch_connection[couch_conf["xten_db"]]
parser = run.runParserObj
# Check if I have NoIndex lanes
- for element in parser.obj['samplesheet_csv']:
- if 'NoIndex' in element['index'] or not element['index']: # NoIndex in the case of HiSeq, empty in the case of HiSeqX
- lane = element['Lane'] # This is a lane with NoIndex
+ for element in parser.obj["samplesheet_csv"]:
+ if (
+ "NoIndex" in element["index"] or not element["index"]
+ ): # NoIndex in the case of HiSeq, empty in the case of HiSeqX
+ lane = element["Lane"] # This is a lane with NoIndex
# In this case PF Cluster is the number of undetermined reads
try:
- PFclusters = parser.obj['Undetermined'][lane]['unknown']
+ PFclusters = parser.obj["Undetermined"][lane]["unknown"]
except KeyError:
- logger.error(f'While taking extra care of lane {lane} of NoIndex type ' \
- 'I found out that not all values were available')
+ logger.error(
+ f"While taking extra care of lane {lane} of NoIndex type "
+ "I found out that not all values were available"
+ )
continue
# In Lanes_stats fix the lane yield
- parser.obj['illumina']['Demultiplex_Stats']['Lanes_stats'][int(lane) - 1]['PF Clusters'] = str(PFclusters)
+ parser.obj["illumina"]["Demultiplex_Stats"]["Lanes_stats"][int(lane) - 1][
+ "PF Clusters"
+ ] = str(PFclusters)
# Now fix Barcode lane stats
- updated = 0 # Check that only one update is made
- for sample in parser.obj['illumina']['Demultiplex_Stats']['Barcode_lane_statistics']:
- if lane in sample['Lane']:
+ updated = 0 # Check that only one update is made
+ for sample in parser.obj["illumina"]["Demultiplex_Stats"][
+ "Barcode_lane_statistics"
+ ]:
+ if lane in sample["Lane"]:
updated += 1
- sample['PF Clusters'] = str(PFclusters)
+ sample["PF Clusters"] = str(PFclusters)
if updated != 1:
- logger.error(f'While taking extra care of lane {lane} of NoIndex type '
- 'I updated more than once the barcode_lane. '
- 'This is too much to continue so I will fail.')
+ logger.error(
+ f"While taking extra care of lane {lane} of NoIndex type "
+                        "I updated the barcode_lane more than once. "
+                        "This is too much to continue, so I will fail."
+ )
os.sys.exit()
# If I am here it means I changed the HTML representation to something
# else to accomodate the wired things we do
# someone told me that in such cases it is better to put a place holder for this
- parser.obj['illumina']['Demultiplex_Stats']['NotOriginal'] = 'True'
+ parser.obj["illumina"]["Demultiplex_Stats"]["NotOriginal"] = "True"
# Update info about bcl2fastq tool
- if not parser.obj.get('DemultiplexConfig'):
- parser.obj['DemultiplexConfig'] = {'Setup': {'Software': run.CONFIG.get('bcl2fastq', {})}}
+ if not parser.obj.get("DemultiplexConfig"):
+ parser.obj["DemultiplexConfig"] = {
+ "Setup": {"Software": run.CONFIG.get("bcl2fastq", {})}
+ }
statusdb.update_doc(db, parser.obj, over_write_db_entry=True)
+
def transfer_run(run_dir):
"""Interface for click to force a transfer a run to uppmax.
:param: string run_dir: the run to tranfer
"""
runObj = get_runObj(run_dir, software)
- mail_recipients = CONFIG.get('mail', {}).get('recipients')
+ mail_recipients = CONFIG.get("mail", {}).get("recipients")
if runObj is None:
- mail_recipients = CONFIG.get('mail', {}).get('recipients')
- logger.error(f'Trying to force a transfer of run {run_dir} but the sequencer was not recognized.')
+ mail_recipients = CONFIG.get("mail", {}).get("recipients")
+ logger.error(
+ f"Trying to force a transfer of run {run_dir} but the sequencer was not recognized."
+ )
else:
- runObj.transfer_run(os.path.join('nosync', CONFIG['analysis']['status_dir'], 'transfer.tsv'), mail_recipients)
+ runObj.transfer_run(
+ os.path.join("nosync", CONFIG["analysis"]["status_dir"], "transfer.tsv"),
+ mail_recipients,
+ )
+
def transfer_runfolder(run_dir, pid, exclude_lane):
"""Transfer the entire run folder for a specified project and run to uppmax.
@@ -156,82 +190,105 @@ def transfer_runfolder(run_dir, pid, exclude_lane):
# Validate whether run_dir exists or is valid
run_dir = os.path.abspath(run_dir)
if not os.path.exists(run_dir) or not os.path.isdir(run_dir):
- logger.error('Unable to locate the specified run directory for transfer.')
+ logger.error("Unable to locate the specified run directory for transfer.")
sys.exit()
- original_sample_sheet = os.path.join(run_dir, 'SampleSheet.csv')
- pid_list = list(set([x.strip() for x in pid.split(',')]))
- new_sample_sheet = os.path.join(run_dir, '_'.join(pid_list) + '_SampleSheet.txt')
+ original_sample_sheet = os.path.join(run_dir, "SampleSheet.csv")
+ pid_list = list(set([x.strip() for x in pid.split(",")]))
+ new_sample_sheet = os.path.join(run_dir, "_".join(pid_list) + "_SampleSheet.txt")
# Write new sample sheet including only rows for the specified project
try:
- with open(new_sample_sheet, 'w') as nss:
+ with open(new_sample_sheet, "w") as nss:
nss.write(extract_project_samplesheet(original_sample_sheet, pid_list))
except OSError as e:
- logger.error('An error occured while parsing the samplesheet. '
- 'Please check the sample sheet and try again.')
+ logger.error(
+            "An error occurred while parsing the sample sheet. "
+ "Please check the sample sheet and try again."
+ )
raise e
# Create a tar archive of the runfolder
dir_name = os.path.basename(run_dir)
- archive = run_dir + '.tar.gz'
+ archive = run_dir + ".tar.gz"
run_dir_path = os.path.dirname(run_dir)
# Prepare the options for excluding lanes
- if exclude_lane != '':
+ if exclude_lane != "":
dir_for_excluding_lane = []
- lane_to_exclude = exclude_lane.split(',')
+ lane_to_exclude = exclude_lane.split(",")
for lane in lane_to_exclude:
- if os.path.isdir(f'{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}'):
- dir_for_excluding_lane.extend(['--exclude', f'Thumbnail_Images/L00{lane}'])
- if os.path.isdir(f'{run_dir_path}/{dir_name}/Images/Focus/L00{lane}'):
- dir_for_excluding_lane.extend(['--exclude', f'Images/Focus/L00{lane}'])
- if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}'):
- dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/L00{lane}'])
- if os.path.isdir(f'{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}'):
- dir_for_excluding_lane.extend(['--exclude', f'Data/Intensities/BaseCalls/L00{lane}'])
+ if os.path.isdir(f"{run_dir_path}/{dir_name}/Thumbnail_Images/L00{lane}"):
+ dir_for_excluding_lane.extend(
+ ["--exclude", f"Thumbnail_Images/L00{lane}"]
+ )
+ if os.path.isdir(f"{run_dir_path}/{dir_name}/Images/Focus/L00{lane}"):
+ dir_for_excluding_lane.extend(["--exclude", f"Images/Focus/L00{lane}"])
+ if os.path.isdir(f"{run_dir_path}/{dir_name}/Data/Intensities/L00{lane}"):
+ dir_for_excluding_lane.extend(
+ ["--exclude", f"Data/Intensities/L00{lane}"]
+ )
+ if os.path.isdir(
+ f"{run_dir_path}/{dir_name}/Data/Intensities/BaseCalls/L00{lane}"
+ ):
+ dir_for_excluding_lane.extend(
+ ["--exclude", f"Data/Intensities/BaseCalls/L00{lane}"]
+ )
try:
- exclude_options_for_tar = ['--exclude', 'Demultiplexing*',
- '--exclude', 'demux_*',
- '--exclude', 'rsync*',
- '--exclude', '*.csv']
- if exclude_lane != '':
+ exclude_options_for_tar = [
+ "--exclude",
+ "Demultiplexing*",
+ "--exclude",
+ "demux_*",
+ "--exclude",
+ "rsync*",
+ "--exclude",
+ "*.csv",
+ ]
+ if exclude_lane != "":
exclude_options_for_tar += dir_for_excluding_lane
- subprocess.call(['tar'] + exclude_options_for_tar + ['-cvzf', archive, '-C', run_dir_path, dir_name])
+ subprocess.call(
+ ["tar"]
+ + exclude_options_for_tar
+ + ["-cvzf", archive, "-C", run_dir_path, dir_name]
+ )
except subprocess.CalledProcessError as e:
- logger.error('Error creating tar archive')
+ logger.error("Error creating tar archive")
raise e
# Generate the md5sum under the same folder as run_dir
- md5file = archive + '.md5'
+ md5file = archive + ".md5"
try:
- f = open(md5file, 'w')
+ f = open(md5file, "w")
os.chdir(run_dir_path)
- subprocess.call(['md5sum', os.path.basename(archive)], stdout=f)
+ subprocess.call(["md5sum", os.path.basename(archive)], stdout=f)
f.close()
except subprocess.CalledProcessError as e:
- logger.error('Error creating md5 file')
+ logger.error("Error creating md5 file")
raise e
# Rsync the files to the analysis cluster
- destination = CONFIG['analysis']['deliver_runfolder'].get('destination')
- rsync_opts = {'-LtDrv': None,
- '--chmod': 'g+rw'}
- connection_details = CONFIG['analysis']['deliver_runfolder'].get('analysis_server')
- archive_transfer = RsyncAgent(archive,
- dest_path=destination,
- remote_host=connection_details['host'],
- remote_user=connection_details['user'],
- validate=False,
- opts=rsync_opts)
- md5_transfer = RsyncAgent(md5file,
- dest_path=destination,
- remote_host=connection_details['host'],
- remote_user=connection_details['user'],
- validate=False,
- opts=rsync_opts)
+ destination = CONFIG["analysis"]["deliver_runfolder"].get("destination")
+ rsync_opts = {"-LtDrv": None, "--chmod": "g+rw"}
+ connection_details = CONFIG["analysis"]["deliver_runfolder"].get("analysis_server")
+ archive_transfer = RsyncAgent(
+ archive,
+ dest_path=destination,
+ remote_host=connection_details["host"],
+ remote_user=connection_details["user"],
+ validate=False,
+ opts=rsync_opts,
+ )
+ md5_transfer = RsyncAgent(
+ md5file,
+ dest_path=destination,
+ remote_host=connection_details["host"],
+ remote_user=connection_details["user"],
+ validate=False,
+ opts=rsync_opts,
+ )
archive_transfer.transfer()
md5_transfer.transfer()
@@ -242,82 +299,107 @@ def transfer_runfolder(run_dir, pid, exclude_lane):
os.remove(archive)
os.remove(md5file)
except OSError as e:
- logger.error('Was not able to delete all temporary files')
+ logger.error("Was not able to delete all temporary files")
raise e
return
+
def extract_project_samplesheet(sample_sheet, pid_list):
- header_line = ''
- project_entries = ''
+ header_line = ""
+ project_entries = ""
with open(sample_sheet) as f:
for line in f:
- if line.split(',')[0] in ('Lane', 'FCID'): # include the header
+ if line.split(",")[0] in ("Lane", "FCID"): # include the header
header_line += line
elif any(pid in line for pid in pid_list):
- project_entries += line # include only lines related to the specified project
+ project_entries += (
+ line # include only lines related to the specified project
+ )
new_samplesheet_content = header_line + project_entries
return new_samplesheet_content
+
def run_preprocessing(run, software):
"""Run demultiplexing in all data directories.
:param str run: Process a particular run instead of looking for runs
"""
+
def _process(run):
"""Process a run/flowcell and transfer to analysis server.
:param taca.illumina.Run run: Run to be processed and transferred
"""
- logger.info(f'Checking run {run.id}')
- transfer_file = os.path.join(CONFIG['analysis']['status_dir'], 'transfer.tsv')
- if run.is_transferred(transfer_file): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot
- logger.info(f'Run {run.id} already transferred to analysis server, skipping it')
+ logger.info(f"Checking run {run.id}")
+ transfer_file = os.path.join(CONFIG["analysis"]["status_dir"], "transfer.tsv")
+ if run.is_transferred(
+ transfer_file
+ ): # Transfer is ongoing or finished. Do nothing. Sometimes caused by runs that are copied back from NAS after a reboot
+ logger.info(
+ f"Run {run.id} already transferred to analysis server, skipping it"
+ )
return
- if run.get_run_status() == 'SEQUENCING':
- logger.info(f'Run {run.id} is not finished yet')
- if 'statusdb' in CONFIG:
+ if run.get_run_status() == "SEQUENCING":
+ logger.info(f"Run {run.id} is not finished yet")
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
- elif run.get_run_status() == 'TO_START':
- if run.get_run_type() == 'NON-NGI-RUN':
+ elif run.get_run_status() == "TO_START":
+ if run.get_run_type() == "NON-NGI-RUN":
# For now MiSeq specific case. Process only NGI-run, skip all the others (PhD student runs)
- logger.warn(f'Run {run.id} marked as {run.get_run_type()}, '
- 'TACA will skip this and move the run to '
- 'no-sync directory')
- if 'storage' in CONFIG:
- run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
+ logger.warn(
+ f"Run {run.id} marked as {run.get_run_type()}, "
+ "TACA will skip this and move the run to "
+                    "the no-sync directory"
+ )
+ if "storage" in CONFIG:
+ run.archive_run(
+ CONFIG["storage"]["archive_dirs"][run.sequencer_type]
+ )
return
- logger.info(f'Starting BCL to FASTQ conversion and demultiplexing for run {run.id}')
- if 'statusdb' in CONFIG:
+ logger.info(
+ f"Starting BCL to FASTQ conversion and demultiplexing for run {run.id}"
+ )
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
run.demultiplex_run()
- elif run.get_run_status() == 'IN_PROGRESS':
- logger.info('BCL conversion and demultiplexing process in '
- f'progress for run {run.id}, skipping it')
+ elif run.get_run_status() == "IN_PROGRESS":
+ logger.info(
+ "BCL conversion and demultiplexing process in "
+ f"progress for run {run.id}, skipping it"
+ )
# Upload to statusDB if applies
- if 'statusdb' in CONFIG:
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
# This function checks if demux is done
run.check_run_status()
# Previous elif might change the status to COMPLETED, therefore to avoid skipping
# a cycle take the last if out of the elif
- if run.get_run_status() == 'COMPLETED':
+ if run.get_run_status() == "COMPLETED":
run.check_run_status()
- logger.info(f'Preprocessing of run {run.id} is finished, transferring it')
+ logger.info(f"Preprocessing of run {run.id} is finished, transferring it")
# Upload to statusDB if applies
- if 'statusdb' in CONFIG:
+ if "statusdb" in CONFIG:
_upload_to_statusdb(run)
demux_summary_message = []
for demux_id, demux_log in run.demux_summary.items():
- if demux_log['errors'] or demux_log['warnings']:
- demux_summary_message.append("Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(demux_id, demux_log['errors'], demux_log['warnings']))
- demux_summary_message.append("\n".join(demux_log['error_and_warning_messages'][:5]))
- if len(demux_log['error_and_warning_messages'])>5:
- demux_summary_message.append(f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}.")
+ if demux_log["errors"] or demux_log["warnings"]:
+ demux_summary_message.append(
+ "Sub-Demultiplexing in Demultiplexing_{} completed with {} errors and {} warnings:".format(
+ demux_id, demux_log["errors"], demux_log["warnings"]
+ )
+ )
+ demux_summary_message.append(
+ "\n".join(demux_log["error_and_warning_messages"][:5])
+ )
+ if len(demux_log["error_and_warning_messages"]) > 5:
+ demux_summary_message.append(
+ f"...... Only the first 5 errors or warnings are displayed for Demultiplexing_{demux_id}."
+ )
# Notify with a mail run completion and stats uploaded
if demux_summary_message:
- sbt = (f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!")
+ sbt = f"{run.id} Demultiplexing Completed with ERRORs or WARNINGS!"
msg = """The run {run} has been demultiplexed with errors or warnings!
{errors_warnings}
@@ -326,9 +408,11 @@ def _process(run):
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
- """.format(errors_warnings='\n'.join(demux_summary_message), run=run.id)
+ """.format(
+ errors_warnings="\n".join(demux_summary_message), run=run.id
+ )
else:
- sbt = (f"{run.id} Demultiplexing Completed!")
+ sbt = f"{run.id} Demultiplexing Completed!"
msg = """The run {run} has been demultiplexed without any error or warning.
The Run will be transferred to the analysis cluster for further analysis.
@@ -336,67 +420,103 @@ def _process(run):
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
""".format(run=run.id)
- run.send_mail(sbt, msg, rcp=CONFIG['mail']['recipients'])
+ run.send_mail(sbt, msg, rcp=CONFIG["mail"]["recipients"])
# Copy demultiplex stats file, InterOp meta data and run xml files to shared file system for LIMS purpose
- if 'mfs_path' in CONFIG['analysis']:
+ if "mfs_path" in CONFIG["analysis"]:
try:
- mfs_dest = os.path.join(CONFIG['analysis']['mfs_path'][run.sequencer_type.lower()],run.id)
- logger.info(f'Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}')
+ mfs_dest = os.path.join(
+ CONFIG["analysis"]["mfs_path"][run.sequencer_type.lower()],
+ run.id,
+ )
+ logger.info(
+ f"Copying demultiplex stats, InterOp metadata and XML files for run {run.id} to {mfs_dest}"
+ )
if not os.path.exists(mfs_dest):
os.mkdir(mfs_dest)
- demulti_stat_src = os.path.join(run.run_dir, run.demux_dir, 'Reports',
- 'html', run.flowcell_id, 'all', 'all', 'all', 'laneBarcode.html')
- copyfile(demulti_stat_src, os.path.join(mfs_dest, 'laneBarcode.html'))
+ demulti_stat_src = os.path.join(
+ run.run_dir,
+ run.demux_dir,
+ "Reports",
+ "html",
+ run.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
+ copyfile(
+ demulti_stat_src, os.path.join(mfs_dest, "laneBarcode.html")
+ )
# Copy RunInfo.xml
- run_info_xml_src = os.path.join(run.run_dir, 'RunInfo.xml')
+ run_info_xml_src = os.path.join(run.run_dir, "RunInfo.xml")
if os.path.isfile(run_info_xml_src):
- copyfile(run_info_xml_src, os.path.join(mfs_dest, 'RunInfo.xml'))
+ copyfile(
+ run_info_xml_src, os.path.join(mfs_dest, "RunInfo.xml")
+ )
# Copy RunParameters.xml
- run_parameters_xml_src = os.path.join(run.run_dir, 'RunParameters.xml')
+ run_parameters_xml_src = os.path.join(
+ run.run_dir, "RunParameters.xml"
+ )
if os.path.isfile(run_info_xml_src):
- copyfile(run_parameters_xml_src, os.path.join(mfs_dest, 'RunParameters.xml'))
+ copyfile(
+ run_parameters_xml_src,
+ os.path.join(mfs_dest, "RunParameters.xml"),
+ )
# Copy InterOp
- interop_src = os.path.join(run.run_dir, 'InterOp')
+ interop_src = os.path.join(run.run_dir, "InterOp")
if os.path.exists(interop_src):
- copytree(interop_src, os.path.join(mfs_dest, 'InterOp'), dirs_exist_ok=True)
+ copytree(
+ interop_src,
+ os.path.join(mfs_dest, "InterOp"),
+ dirs_exist_ok=True,
+ )
except:
- logger.warn(f'Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}')
+ logger.warn(
+ f"Could not copy demultiplex stats, InterOp metadata or XML files for run {run.id}"
+ )
# Transfer to analysis server if flag is True
if run.transfer_to_analysis_server:
- mail_recipients = CONFIG.get('mail', {}).get('recipients')
- logger.info('Transferring run {} to {} into {}'
- .format(run.id,
- run.CONFIG['analysis_server']['host'],
- run.CONFIG['analysis_server']['sync']['data_archive']))
+ mail_recipients = CONFIG.get("mail", {}).get("recipients")
+ logger.info(
+ "Transferring run {} to {} into {}".format(
+ run.id,
+ run.CONFIG["analysis_server"]["host"],
+ run.CONFIG["analysis_server"]["sync"]["data_archive"],
+ )
+ )
run.transfer_run(transfer_file, mail_recipients)
# Archive the run if indicated in the config file
- if 'storage' in CONFIG: #TODO: make sure archiving to PDC is not ongoing
- run.archive_run(CONFIG['storage']['archive_dirs'][run.sequencer_type])
+ if "storage" in CONFIG: # TODO: make sure archiving to PDC is not ongoing
+ run.archive_run(CONFIG["storage"]["archive_dirs"][run.sequencer_type])
if run:
# Determine the run type
runObj = get_runObj(run, software)
if not runObj:
- raise RuntimeError(f"Unrecognized instrument type or incorrect run folder {run}")
+ raise RuntimeError(
+ f"Unrecognized instrument type or incorrect run folder {run}"
+ )
else:
_process(runObj)
else:
- data_dirs = CONFIG.get('analysis').get('data_dirs')
+ data_dirs = CONFIG.get("analysis").get("data_dirs")
for data_dir in data_dirs:
# Run folder looks like DATE_*_*_*, the last section is the FC name.
- runs = glob.glob(os.path.join(data_dir, '[1-9]*_*_*_*'))
+ runs = glob.glob(os.path.join(data_dir, "[1-9]*_*_*_*"))
for _run in runs:
runObj = get_runObj(_run, software)
if not runObj:
- logger.warning(f'Unrecognized instrument type or incorrect run folder {run}')
+ logger.warning(
+                        f"Unrecognized instrument type or incorrect run folder {_run}"
+ )
else:
try:
_process(runObj)
except:
# This function might throw and exception,
# it is better to continue processing other runs
- logger.warning(f'There was an error processing the run {run}')
+                    logger.warning(f"There was an error processing the run {_run}")
pass
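
The analysis.py hunks above are quote-style and line-wrapping changes. As a minimal sketch of how such a reformat can be sanity-checked, assuming copies of the module from before and after formatting are kept side by side (the file names below are illustrative, not part of this patch), comparing the parsed ASTs ignores quoting, wrapping and comments and therefore flags any change that is more than cosmetic:

# Editor's sketch, not part of this patch: confirm a reformat is purely
# cosmetic by comparing the ASTs of a module before and after formatting.
import ast
import sys


def same_ast(path_before: str, path_after: str) -> bool:
    with open(path_before) as before, open(path_after) as after:
        # ast.dump() is unaffected by quote style, line wrapping and comments
        return ast.dump(ast.parse(before.read())) == ast.dump(ast.parse(after.read()))


if __name__ == "__main__":
    # e.g. python same_ast.py analysis_before.py analysis_after.py
    print(same_ast(sys.argv[1], sys.argv[2]))
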
diff --git a/taca/analysis/analysis_nanopore.py b/taca/analysis/analysis_nanopore.py
index 9141551a..c9dea404 100644
--- a/taca/analysis/analysis_nanopore.py
+++ b/taca/analysis/analysis_nanopore.py
@@ -36,7 +36,6 @@ def find_run_dirs(dir_to_search: str, skip_dirs: list):
def send_error_mail(run_name, error: BaseException):
-
email_subject = f"Run processed with errors: {run_name}"
email_message = f"{str(error)}\n\n{traceback.format_exc()}"
email_recipients = CONFIG["mail"]["recipients"]
@@ -75,7 +74,6 @@ def process_user_run(ont_user_run: ONT_user_run):
if not ont_user_run.is_synced():
logger.info(f"{ont_user_run.run_name}: Run is not fully synced, skipping.")
else:
-
if ont_user_run.is_transferred():
logger.warning(
f"{ont_user_run.run_name}: Run is already logged as transferred, sending mail."
@@ -157,7 +155,6 @@ def process_qc_run(ont_qc_run: ONT_qc_run):
if not ont_qc_run.is_synced():
logger.info(f"{ont_qc_run.run_name}: Run is not fully synced, skipping.")
else:
-
# Assert all files are in place
logger.info(f"{ont_qc_run.run_name}: Asserting run contents...")
ont_qc_run.assert_contents()
@@ -257,7 +254,6 @@ def ont_transfer(run_abspath: str or None, qc: bool = False):
# If no run is specified, locate all runs
else:
-
for run_type in ["user_run", "qc_run"]:
logger.info(f"Looking for runs of type '{run_type}'...")
diff --git a/taca/analysis/cli.py b/taca/analysis/cli.py
index ba101d66..52b6423b 100644
--- a/taca/analysis/cli.py
+++ b/taca/analysis/cli.py
@@ -13,21 +13,42 @@ def analysis():
# Illumina analysis subcommands
+
@analysis.command()
-@click.option('-r', '--run', type=click.Path(exists=True), default=None,
- help='Demultiplex only a particular run')
-@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq',
- help='Available software for demultiplexing: bcl2fastq (default), bclconvert')
+@click.option(
+ "-r",
+ "--run",
+ type=click.Path(exists=True),
+ default=None,
+ help="Demultiplex only a particular run",
+)
+@click.option(
+ "-s",
+ "--software",
+ type=click.Choice(["bcl2fastq", "bclconvert"]),
+ default="bcl2fastq",
+ help="Available software for demultiplexing: bcl2fastq (default), bclconvert",
+)
def demultiplex(run, software):
- """Demultiplex and transfer all runs present in the data directories."""
- an.run_preprocessing(run, software)
+ """Demultiplex and transfer all runs present in the data directories."""
+ an.run_preprocessing(run, software)
+
@analysis.command()
-@click.option('--runfolder-project', is_flag=False, help='Project IDs for runfolder transfer separated by comma')
-@click.option('--exclude-lane', default='', help='Lanes to exclude separated by comma')
-@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq',
- help='Available software for demultiplexing: bcl2fastq (default), bclconvert')
-@click.argument('rundir')
+@click.option(
+ "--runfolder-project",
+ is_flag=False,
+ help="Project IDs for runfolder transfer separated by comma",
+)
+@click.option("--exclude-lane", default="", help="Lanes to exclude separated by comma")
+@click.option(
+ "-s",
+ "--software",
+ type=click.Choice(["bcl2fastq", "bclconvert"]),
+ default="bcl2fastq",
+ help="Available software for demultiplexing: bcl2fastq (default), bclconvert",
+)
+@click.argument("rundir")
def transfer(rundir, runfolder_project, exclude_lane, software):
"""Transfers the run without qc."""
if not runfolder_project:
@@ -35,10 +56,16 @@ def transfer(rundir, runfolder_project, exclude_lane, software):
else:
an.transfer_runfolder(rundir, pid=runfolder_project, exclude_lane=exclude_lane)
+
@analysis.command()
-@click.option('-s', '--software', type=click.Choice(['bcl2fastq', 'bclconvert']), default='bcl2fastq',
- help='Available software for demultiplexing: bcl2fastq (default), bclconvert')
-@click.argument('rundir')
+@click.option(
+ "-s",
+ "--software",
+ type=click.Choice(["bcl2fastq", "bclconvert"]),
+ default="bcl2fastq",
+ help="Available software for demultiplexing: bcl2fastq (default), bclconvert",
+)
+@click.argument("rundir")
def updatedb(rundir, software):
"""Save the run to statusdb."""
an.upload_to_statusdb(rundir, software)
@@ -46,6 +73,7 @@ def updatedb(rundir, software):
# Nanopore analysis subcommands
+
@analysis.command()
@click.option(
"-r",
@@ -65,6 +93,7 @@ def ont_transfer(run, qc):
"""Find and process all runs"""
analysis_nanopore.ont_transfer(run, qc)
+
@analysis.command()
@click.argument("run")
def ont_updatedb(run):
diff --git a/taca/backup/backup.py b/taca/backup/backup.py
index 97fdec7b..8d43a558 100644
--- a/taca/backup/backup.py
+++ b/taca/backup/backup.py
@@ -13,16 +13,19 @@
logger = logging.getLogger(__name__)
+
class run_vars:
"""A simple variable storage class."""
+
def __init__(self, run, archive_path):
self.abs_path = os.path.abspath(run)
self.path, self.name = os.path.split(self.abs_path)
- self.name = self.name.split('.', 1)[0]
- self.zip = os.path.join(archive_path, f'{self.name}.tar.gz')
- self.key = f'{self.name}.key'
- self.key_encrypted = f'{self.name}.key.gpg'
- self.zip_encrypted = os.path.join(archive_path, f'{self.name}.tar.gz.gpg')
+ self.name = self.name.split(".", 1)[0]
+ self.zip = os.path.join(archive_path, f"{self.name}.tar.gz")
+ self.key = f"{self.name}.key"
+ self.key_encrypted = f"{self.name}.key.gpg"
+ self.zip_encrypted = os.path.join(archive_path, f"{self.name}.tar.gz.gpg")
+
class backup_utils:
"""A class object with main utility methods related to backing up."""
@@ -30,25 +33,31 @@ class backup_utils:
def __init__(self, run=None):
self.run = run
self.fetch_config_info()
- self.host_name = os.getenv('HOSTNAME', os.uname()[1]).split('.', 1)[0]
+ self.host_name = os.getenv("HOSTNAME", os.uname()[1]).split(".", 1)[0]
def fetch_config_info(self):
"""Try to fecth required info from the config file. Log and exit if any neccesary info is missing."""
try:
- self.data_dirs = CONFIG['backup']['data_dirs']
- self.archive_dirs = CONFIG['backup']['archive_dirs']
- self.archived_dirs = CONFIG['backup']['archived_dirs']
- self.exclude_list = CONFIG['backup']['exclude_list']
- self.keys_path = CONFIG['backup']['keys_path']
- self.gpg_receiver = CONFIG['backup']['gpg_receiver']
- self.mail_recipients = CONFIG['mail']['recipients']
- self.check_demux = CONFIG.get('backup', {}).get('check_demux', False)
- self.couch_info = CONFIG.get('statusdb')
- self.finished_run_indicator = CONFIG.get('storage', {}).get('finished_run_indicator', 'RTAComplete.txt')
- self.copy_complete_indicator = CONFIG.get('storage', {}).get('copy_complete_indicator', 'CopyComplete.txt')
- self.archive_log_location = CONFIG['backup']['archive_log']
+ self.data_dirs = CONFIG["backup"]["data_dirs"]
+ self.archive_dirs = CONFIG["backup"]["archive_dirs"]
+ self.archived_dirs = CONFIG["backup"]["archived_dirs"]
+ self.exclude_list = CONFIG["backup"]["exclude_list"]
+ self.keys_path = CONFIG["backup"]["keys_path"]
+ self.gpg_receiver = CONFIG["backup"]["gpg_receiver"]
+ self.mail_recipients = CONFIG["mail"]["recipients"]
+ self.check_demux = CONFIG.get("backup", {}).get("check_demux", False)
+ self.couch_info = CONFIG.get("statusdb")
+ self.finished_run_indicator = CONFIG.get("storage", {}).get(
+ "finished_run_indicator", "RTAComplete.txt"
+ )
+ self.copy_complete_indicator = CONFIG.get("storage", {}).get(
+ "copy_complete_indicator", "CopyComplete.txt"
+ )
+ self.archive_log_location = CONFIG["backup"]["archive_log"]
except KeyError as e:
- logger.error(f'Config file is missing the key {str(e)}, make sure it have all required information')
+ logger.error(
+                f"Config file is missing the key {str(e)}, make sure it has all required information"
+ )
raise SystemExit
def collect_runs(self, ext=None, filter_by_ext=False):
@@ -58,24 +67,30 @@ def collect_runs(self, ext=None, filter_by_ext=False):
run_type = self._get_run_type(self.run)
archive_path = self.archive_dirs[run_type]
run = run_vars(self.run, archive_path)
- if not (re.match(filesystem.RUN_RE, run.name) or re.match(filesystem.RUN_RE_ONT, run.name)):
- logger.error(f'Given run {self.run} did not match a FC pattern')
+ if not (
+ re.match(filesystem.RUN_RE, run.name)
+ or re.match(filesystem.RUN_RE_ONT, run.name)
+ ):
+ logger.error(f"Given run {self.run} did not match a FC pattern")
raise SystemExit
if self._is_ready_to_archive(run, ext):
self.runs.append(run)
else:
for adir in self.archive_dirs.values():
if not os.path.isdir(adir):
- logger.warn(f'Path {adir} does not exist or it is not a directory')
+                    logger.warn(f"Path {adir} does not exist or is not a directory")
continue
for item in os.listdir(adir):
if filter_by_ext and not item.endswith(ext):
continue
elif item.endswith(ext):
- item = item.replace(ext, '')
+ item = item.replace(ext, "")
elif not os.path.isdir(os.path.join(adir, item)):
continue
- if (re.match(filesystem.RUN_RE, item) or re.match(filesystem.RUN_RE_ONT, item)) and item not in self.runs:
+ if (
+ re.match(filesystem.RUN_RE, item)
+ or re.match(filesystem.RUN_RE_ONT, item)
+ ) and item not in self.runs:
run_type = self._get_run_type(item)
archive_path = self.archive_dirs[run_type]
run = run_vars(os.path.join(adir, item), archive_path)
@@ -85,7 +100,14 @@ def collect_runs(self, ext=None, filter_by_ext=False):
def avail_disk_space(self, path, run):
"""Check the space on file system based on parent directory of the run."""
# not able to fetch runtype use the max size as precaution, size units in GB
- illumina_run_sizes = {'novaseq': 1800, 'miseq': 20, 'nextseq': 250, 'NovaSeqXPlus': 3600, 'promethion': 3000, 'minion': 1000}
+ illumina_run_sizes = {
+ "novaseq": 1800,
+ "miseq": 20,
+ "nextseq": 250,
+ "NovaSeqXPlus": 3600,
+ "promethion": 3000,
+ "minion": 1000,
+ }
required_size = illumina_run_sizes.get(self._get_run_type(run), 900) * 2
# check for any ongoing runs and add up the required size accrdingly
for ddir in self.data_dirs.values():
@@ -94,19 +116,25 @@ def avail_disk_space(self, path, run):
for item in os.listdir(ddir):
if not re.match(filesystem.RUN_RE, item):
continue
- if not os.path.exists(os.path.join(ddir, item, 'RTAComplete.txt')):
- required_size += illumina_run_sizes.get(self._get_run_type(run), 900)
+ if not os.path.exists(os.path.join(ddir, item, "RTAComplete.txt")):
+ required_size += illumina_run_sizes.get(
+ self._get_run_type(run), 900
+ )
# get available free space from the file system
try:
- df_proc = sp.Popen(['df', path], stdout=sp.PIPE, stderr=sp.PIPE)
+ df_proc = sp.Popen(["df", path], stdout=sp.PIPE, stderr=sp.PIPE)
df_out, df_err = df_proc.communicate()
- available_size = int(df_out.strip().decode("utf-8").split('\n')[-1].strip().split()[3])/1024/1024
+ available_size = (
+ int(df_out.strip().decode("utf-8").split("\n")[-1].strip().split()[3])
+ / 1024
+ / 1024
+ )
except Exception as e:
- logger.error(f'Evaluation of disk space failed with error {e}')
+ logger.error(f"Evaluation of disk space failed with error {e}")
raise SystemExit
if available_size < required_size:
- e_msg = f'Required space for encryption is {required_size}GB, but only {available_size}GB available'
- subjt = f'Low space for encryption - {self.host_name}'
+ e_msg = f"Required space for encryption is {required_size}GB, but only {available_size}GB available"
+ subjt = f"Low space for encryption - {self.host_name}"
logger.error(e_msg)
misc.send_mail(subjt, e_msg, self.mail_recipients)
raise SystemExit
@@ -117,47 +145,63 @@ def file_in_pdc(self, src_file, silent=True):
# non-zero/False though cmd is execudted but file not found
src_file_abs = os.path.abspath(src_file)
try:
- sp.check_call(['dsmc', 'query', 'archive', src_file_abs], stdout=sp.PIPE, stderr=sp.PIPE)
+ sp.check_call(
+ ["dsmc", "query", "archive", src_file_abs],
+ stdout=sp.PIPE,
+ stderr=sp.PIPE,
+ )
value = True
except sp.CalledProcessError:
value = False
if not silent:
- msg = 'File {} {} in PDC'.format(src_file_abs, 'exist' if value else 'do not exist')
+ msg = "File {} {} in PDC".format(
+                src_file_abs, "exists" if value else "does not exist"
+ )
logger.info(msg)
return value
def _get_run_type(self, run):
"""Returns run type based on the flowcell name."""
- run_type = ''
+ run_type = ""
try:
- if '_A0' in run:
- run_type = 'novaseq'
- elif '-' in run.split('_')[-1]:
- run_type = 'miseq'
- elif '_NS' in run or '_VH' in run:
- run_type = 'nextseq'
- elif '_LH' in run:
- run_type = 'NovaSeqXPlus'
- elif '_MN' in run:
- run_type = 'minion'
- elif re.match("^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$",run):
- run_type = 'promethion'
+ if "_A0" in run:
+ run_type = "novaseq"
+ elif "-" in run.split("_")[-1]:
+ run_type = "miseq"
+ elif "_NS" in run or "_VH" in run:
+ run_type = "nextseq"
+ elif "_LH" in run:
+ run_type = "NovaSeqXPlus"
+ elif "_MN" in run:
+ run_type = "minion"
+ elif re.match(
+                r"^(\d{8})_(\d{4})_([1-3][A-H])_([0-9a-zA-Z]+)_([0-9a-zA-Z]+)$", run
+ ):
+ run_type = "promethion"
else:
- run_type = ''
+ run_type = ""
except:
- logger.warn(f'Could not fetch run type for run {run}')
+ logger.warn(f"Could not fetch run type for run {run}")
return run_type
- def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_failed=False, tmp_files=[]):
+ def _call_commands(
+ self,
+ cmd1,
+ cmd2=None,
+ out_file=None,
+ return_out=False,
+ mail_failed=False,
+ tmp_files=[],
+ ):
"""Call an external command(s) with atmost two commands per function call.
Given 'out_file' is always used for the later cmd and also stdout can be return
for the later cmd. In case of failure, the 'tmp_files' are removed"""
if out_file:
if not cmd2:
- stdout1 = open(out_file, 'w')
+ stdout1 = open(out_file, "w")
else:
stdout1 = sp.PIPE
- stdout2 = open(out_file, 'w')
+ stdout2 = open(out_file, "w")
else:
stdout1 = sp.PIPE
stdout2 = sp.PIPE
@@ -170,7 +214,9 @@ def _call_commands(self, cmd1, cmd2=None, out_file=None, return_out=False, mail_
p2 = sp.Popen(cmd2, stdin=p1.stdout, stdout=stdout2, stderr=sp.PIPE)
p2_stat = p2.wait()
p2_out, p2_err = p2.communicate()
- if not self._check_status(cmd2, p2_stat, p2_err, mail_failed, tmp_files):
+ if not self._check_status(
+ cmd2, p2_stat, p2_err, mail_failed, tmp_files
+ ):
return (False, p2_err) if return_out else False
p1_stat = p1.wait()
p1_out, p1_err = p1.communicate()
@@ -193,10 +239,12 @@ def _check_status(self, cmd, status, err_msg, mail_failed, files_to_remove=[]):
if status != 0:
self._clean_tmp_files(files_to_remove)
if mail_failed:
- subjt = f'Command call failed - {self.host_name}'
- e_msg = 'Called cmd: {}\n\nError msg: {}'.format(' '.join(cmd), err_msg)
+ subjt = f"Command call failed - {self.host_name}"
+ e_msg = "Called cmd: {}\n\nError msg: {}".format(" ".join(cmd), err_msg)
misc.send_mail(subjt, e_msg, self.mail_recipients)
- logger.error('Command "{}" failed with the error "{}"'.format(' '.join(cmd),err_msg))
+ logger.error(
+ 'Command "{}" failed with the error "{}"'.format(" ".join(cmd), err_msg)
+ )
return False
return True
@@ -209,20 +257,22 @@ def _clean_tmp_files(self, files):
def _log_pdc_statusdb(self, run):
"""Log the time stamp in statusDB if a file is succussfully sent to PDC."""
try:
- run_vals = run.split('_')
+ run_vals = run.split("_")
if len(run_vals[0]) == 8:
run_date = run_vals[0][2:]
else:
run_date = run_vals[0]
- run_fc = f'{run_date}_{run_vals[-1]}'
+ run_fc = f"{run_date}_{run_vals[-1]}"
couch_connection = statusdb.StatusdbSession(self.couch_info).connection
- db = couch_connection[self.couch_info['db']]
- fc_names = {e.key:e.id for e in db.view('names/name', reduce=False)}
+ db = couch_connection[self.couch_info["db"]]
+ fc_names = {e.key: e.id for e in db.view("names/name", reduce=False)}
d_id = fc_names[run_fc]
doc = db.get(d_id)
- doc['pdc_archived'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ doc["pdc_archived"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
db.save(doc)
- logger.info(f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"')
+ logger.info(
+ f'Logged "pdc_archived" timestamp for fc {run} in statusdb doc "{d_id}"'
+ )
except:
logger.warn(f'Not able to log "pdc_archived" timestamp for run {run}')
@@ -232,24 +282,35 @@ def _is_ready_to_archive(self, run, ext):
run_path = run.abs_path
rta_file = os.path.join(run_path, self.finished_run_indicator)
cp_file = os.path.join(run_path, self.copy_complete_indicator)
- if (os.path.exists(rta_file) and os.path.exists(cp_file) and (not self.file_in_pdc(run.zip_encrypted))) or (self._get_run_type(run.name) in ['promethion', 'minion'] and os.path.exists(os.path.join(run_path, ".sync_finished"))):
+ if (
+ os.path.exists(rta_file)
+ and os.path.exists(cp_file)
+ and (not self.file_in_pdc(run.zip_encrypted))
+ ) or (
+ self._get_run_type(run.name) in ["promethion", "minion"]
+ and os.path.exists(os.path.join(run_path, ".sync_finished"))
+ ):
# Case for encrypting
# Run has NOT been encrypted (run.tar.gz.gpg not exists)
- if ext == '.tar.gz' and (not os.path.exists(run.zip_encrypted)):
- logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for archiving')
+ if ext == ".tar.gz" and (not os.path.exists(run.zip_encrypted)):
+ logger.info(
+                    f"Sequencing has finished and copying is complete for run {os.path.basename(run_path)}, which is ready for archiving"
+ )
archive_ready = True
# Case for putting data to PDC
# Run has already been encrypted (run.tar.gz.gpg exists)
- elif ext == '.tar.gz.gpg' and os.path.exists(run.zip_encrypted):
- logger.info(f'Sequencing has finished and copying completed for run {os.path.basename(run_path)} and is ready for sending to PDC')
+ elif ext == ".tar.gz.gpg" and os.path.exists(run.zip_encrypted):
+ logger.info(
+                    f"Sequencing has finished and copying is complete for run {os.path.basename(run_path)}, which is ready for sending to PDC"
+ )
archive_ready = True
return archive_ready
def log_archived_run(self, file_name):
"""Write files archived to PDC to log file"""
- with open(self.archive_log_location, 'a') as archive_file:
- tsv_writer = csv.writer(archive_file, delimiter='\t')
+ with open(self.archive_log_location, "a") as archive_file:
+ tsv_writer = csv.writer(archive_file, delimiter="\t")
tsv_writer.writerow([file_name, str(datetime.now())])
def _move_run_to_archived(self, run):
@@ -257,7 +318,7 @@ def _move_run_to_archived(self, run):
run_type = self._get_run_type(run.name)
archived_path = self.archived_dirs[run_type]
if os.path.isdir(archived_path):
- logger.info(f'Moving run {run.name} to the archived folder')
+ logger.info(f"Moving run {run.name} to the archived folder")
shutil.move(run.name, archived_path)
else:
logger.warning("Cannot move run to archived, destination does not exist")
@@ -266,130 +327,207 @@ def _move_run_to_archived(self, run):
def encrypt_runs(cls, run, force):
"""Encrypt the runs that have been collected."""
bk = cls(run)
- bk.collect_runs(ext='.tar.gz')
- logger.info(f'In total, found {len(bk.runs)} run(s) to be encrypted')
+ bk.collect_runs(ext=".tar.gz")
+ logger.info(f"In total, found {len(bk.runs)} run(s) to be encrypted")
for run in bk.runs:
- run.flag = f'{run.name}.encrypting'
+ run.flag = f"{run.name}.encrypting"
run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
tmp_files = [run.zip_encrypted, run.key_encrypted, run.key, run.flag]
- logger.info(f'Encryption of run {run.name} is now started')
+ logger.info(f"Encryption of run {run.name} is now started")
# Check if there is enough space and exit if not
bk.avail_disk_space(run.path, run.name)
# Check if the run in demultiplexed
if not force and bk.check_demux:
- if not misc.run_is_demuxed(run, bk.couch_info, bk._get_run_type(run.name)):
- logger.warn(f'Run {run.name} is not demultiplexed yet, so skipping it')
+ if not misc.run_is_demuxed(
+ run, bk.couch_info, bk._get_run_type(run.name)
+ ):
+ logger.warn(
+ f"Run {run.name} is not demultiplexed yet, so skipping it"
+ )
continue
- logger.info(f'Run {run.name} is demultiplexed and proceeding with encryption')
+ logger.info(
+ f"Run {run.name} is demultiplexed and proceeding with encryption"
+ )
with filesystem.chdir(run.path):
# skip run if already ongoing
if os.path.exists(run.flag):
- logger.warn(f'Run {run.name} is already being encrypted, so skipping now')
+ logger.warn(
+ f"Run {run.name} is already being encrypted, so skipping now"
+ )
continue
- open(run.flag, 'w').close()
+ open(run.flag, "w").close()
# zip the run directory
if os.path.exists(run.zip):
if os.path.isdir(run.name):
- logger.warn(f'Both run source and zipped archive exist for run {run.name}, skipping run as precaution')
+ logger.warn(
+                            f"Both run source and zipped archive exist for run {run.name}, skipping run as a precaution"
+ )
bk._clean_tmp_files([run.flag])
continue
- logger.info(f'Zipped archive already exist for run {run.name}, so using it for encryption')
+ logger.info(
+                        f"Zipped archive already exists for run {run.name}, so using it for encryption"
+ )
else:
- exclude_files = " ".join([f'--exclude {x}' for x in bk.exclude_list])
- logger.info(f'Creating zipped archive for run {run.name}')
- if bk._call_commands(cmd1=f'tar {exclude_files} -cf - {run.name}', cmd2='pigz --fast -c -',
- out_file=run.zip, mail_failed=True, tmp_files=[run.zip, run.flag]):
- logger.info(f'Run {run.name} was successfully compressed and transferred to {run.zip}')
+ exclude_files = " ".join(
+ [f"--exclude {x}" for x in bk.exclude_list]
+ )
+ logger.info(f"Creating zipped archive for run {run.name}")
+ if bk._call_commands(
+ cmd1=f"tar {exclude_files} -cf - {run.name}",
+ cmd2="pigz --fast -c -",
+ out_file=run.zip,
+ mail_failed=True,
+ tmp_files=[run.zip, run.flag],
+ ):
+ logger.info(
+ f"Run {run.name} was successfully compressed and transferred to {run.zip}"
+ )
else:
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
# Remove encrypted file if already exists
if os.path.exists(run.zip_encrypted):
- logger.warn(f'Removing already existing encrypted file for run {run.name}, this is a precaution '
- 'to make sure the file was encrypted with correct key file')
- bk._clean_tmp_files([run.zip_encrypted, run.key, run.key_encrypted, run.dst_key_encrypted])
+ logger.warn(
+ f"Removing already existing encrypted file for run {run.name}, this is a precaution "
+                        "to make sure the file was encrypted with the correct key file"
+ )
+ bk._clean_tmp_files(
+ [
+ run.zip_encrypted,
+ run.key,
+ run.key_encrypted,
+ run.dst_key_encrypted,
+ ]
+ )
# Generate random key to use as pasphrase
- if not bk._call_commands(cmd1='gpg --gen-random 1 256', out_file=run.key, tmp_files=tmp_files):
- logger.warn(f'Skipping run {run.name} and moving on')
+ if not bk._call_commands(
+ cmd1="gpg --gen-random 1 256", out_file=run.key, tmp_files=tmp_files
+ ):
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
- logger.info(f'Generated random phrase key for run {run.name}')
+ logger.info(f"Generated random phrase key for run {run.name}")
# Calculate md5 sum pre encryption
if not force:
- logger.info('Calculating md5sum before encryption')
- md5_call, md5_out = bk._call_commands(cmd1=f'md5sum {run.zip}', return_out=True, tmp_files=tmp_files)
+ logger.info("Calculating md5sum before encryption")
+ md5_call, md5_out = bk._call_commands(
+ cmd1=f"md5sum {run.zip}", return_out=True, tmp_files=tmp_files
+ )
if not md5_call:
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
md5_pre_encrypt = md5_out.split()[0]
# Encrypt the zipped run file
- logger.info('Encrypting the zipped run file')
- if not bk._call_commands(cmd1=(f'gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo '
- f'none -o {run.zip_encrypted} {run.zip}'), tmp_files=tmp_files):
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.info("Encrypting the zipped run file")
+ if not bk._call_commands(
+ cmd1=(
+ f"gpg --symmetric --cipher-algo aes256 --passphrase-file {run.key} --batch --compress-algo "
+ f"none -o {run.zip_encrypted} {run.zip}"
+ ),
+ tmp_files=tmp_files,
+ ):
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
# Decrypt and check for md5
if not force:
- logger.info('Calculating md5sum after encryption')
- md5_call, md5_out = bk._call_commands(cmd1=f'gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}',
- cmd2='md5sum', return_out=True, tmp_files=tmp_files)
+ logger.info("Calculating md5sum after encryption")
+ md5_call, md5_out = bk._call_commands(
+ cmd1=f"gpg --decrypt --cipher-algo aes256 --passphrase-file {run.key} --batch {run.zip_encrypted}",
+ cmd2="md5sum",
+ return_out=True,
+ tmp_files=tmp_files,
+ )
if not md5_call:
- logger.warn(f'Skipping run {run.name} and moving on')
+ logger.warn(f"Skipping run {run.name} and moving on")
continue
md5_post_encrypt = md5_out.split()[0]
if md5_pre_encrypt != md5_post_encrypt:
- logger.error(f'md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on')
+ logger.error(
+ f"md5sum did not match before {md5_pre_encrypt} and after {md5_post_encrypt} encryption. Will remove temp files and move on"
+ )
bk._clean_tmp_files(tmp_files)
continue
- logger.info('Md5sum matches before and after encryption')
+ logger.info("Md5sum matches before and after encryption")
# Encrypt and move the key file
- if bk._call_commands(cmd1=f'gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}', tmp_files=tmp_files):
+ if bk._call_commands(
+ cmd1=f"gpg -e -r {bk.gpg_receiver} -o {run.key_encrypted} {run.key}",
+ tmp_files=tmp_files,
+ ):
shutil.move(run.key_encrypted, run.dst_key_encrypted)
else:
- logger.error('Encryption of key file failed, skipping run')
+ logger.error("Encryption of key file failed, skipping run")
continue
bk._clean_tmp_files([run.zip, run.key, run.flag])
- logger.info(f'Encryption of run {run.name} is successfully done, removing zipped run file')
+ logger.info(
+ f"Encryption of run {run.name} is successfully done, removing zipped run file"
+ )
@classmethod
def pdc_put(cls, run):
"""Archive the collected runs to PDC."""
bk = cls(run)
- bk.collect_runs(ext='.tar.gz.gpg', filter_by_ext=True)
- logger.info(f'In total, found {len(bk.runs)} run(s) to send PDC')
+ bk.collect_runs(ext=".tar.gz.gpg", filter_by_ext=True)
+        logger.info(f"In total, found {len(bk.runs)} run(s) to send to PDC")
for run in bk.runs:
- run.flag = f'{run.name}.archiving'
+ run.flag = f"{run.name}.archiving"
run.dst_key_encrypted = os.path.join(bk.keys_path, run.key_encrypted)
if run.path not in bk.archive_dirs.values():
- logger.error('Given run is not in one of the archive directories {}. Kindly move the run {} to appropriate '
- 'archive dir before sending it to PDC'.format(','.join(list(bk.archive_dirs.values())), run.name))
+ logger.error(
+                    "Given run is not in one of the archive directories {}. Kindly move the run {} to the appropriate "
+ "archive dir before sending it to PDC".format(
+ ",".join(list(bk.archive_dirs.values())), run.name
+ )
+ )
continue
if not os.path.exists(run.dst_key_encrypted):
- logger.error(f'Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it')
+ logger.error(
+ f"Encrypted key file {run.dst_key_encrypted} is not found for file {run.zip_encrypted}, skipping it"
+ )
continue
with filesystem.chdir(run.path):
- #skip run if being encrypted
- if os.path.exists(f'{run.name}.encrypting'):
- logger.warn(f'Run {run.name} is currently being encrypted, so skipping now')
+ # skip run if being encrypted
+ if os.path.exists(f"{run.name}.encrypting"):
+ logger.warn(
+ f"Run {run.name} is currently being encrypted, so skipping now"
+ )
continue
# skip run if already ongoing
if os.path.exists(run.flag):
- logger.warn(f'Run {run.name} is already being archived, so skipping now')
+ logger.warn(
+ f"Run {run.name} is already being archived, so skipping now"
+ )
continue
- if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(run.dst_key_encrypted, silent=False):
- logger.warn(f'Seems like files related to run {run.name} already exist in PDC, check and cleanup')
+ if bk.file_in_pdc(run.zip_encrypted, silent=False) or bk.file_in_pdc(
+ run.dst_key_encrypted, silent=False
+ ):
+ logger.warn(
+                        f"Seems like files related to run {run.name} already exist in PDC, check and clean up"
+ )
continue
- open(run.flag, 'w').close()
- logger.info(f'Sending file {run.zip_encrypted} to PDC')
- if bk._call_commands(cmd1=f'dsmc archive {run.zip_encrypted}', tmp_files=[run.flag]):
- time.sleep(15) # give some time just in case 'dsmc' needs to settle
- if bk._call_commands(cmd1=f'dsmc archive {run.dst_key_encrypted}', tmp_files=[run.flag]):
- time.sleep(5) # give some time just in case 'dsmc' needs to settle
- if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(run.dst_key_encrypted):
- logger.info(f'Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to archived folder')
+ open(run.flag, "w").close()
+ logger.info(f"Sending file {run.zip_encrypted} to PDC")
+ if bk._call_commands(
+ cmd1=f"dsmc archive {run.zip_encrypted}", tmp_files=[run.flag]
+ ):
+ time.sleep(15) # give some time just in case 'dsmc' needs to settle
+ if bk._call_commands(
+ cmd1=f"dsmc archive {run.dst_key_encrypted}",
+ tmp_files=[run.flag],
+ ):
+ time.sleep(
+ 5
+ ) # give some time just in case 'dsmc' needs to settle
+ if bk.file_in_pdc(run.zip_encrypted) and bk.file_in_pdc(
+ run.dst_key_encrypted
+ ):
+ logger.info(
+                                f"Successfully sent file {run.zip_encrypted} to PDC, moving file locally from {run.path} to the archived folder"
+ )
bk.log_archived_run(run.zip_encrypted)
if bk.couch_info:
bk._log_pdc_statusdb(run.name)
- bk._clean_tmp_files([run.zip_encrypted, run.dst_key_encrypted, run.flag])
+ bk._clean_tmp_files(
+ [run.zip_encrypted, run.dst_key_encrypted, run.flag]
+ )
bk._move_run_to_archived(run)
continue
- logger.warn(f'Sending file {run.zip_encrypted} to PDC failed')
+ logger.warn(f"Sending file {run.zip_encrypted} to PDC failed")
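
The backup.py hunks above reformat an encrypt-then-verify flow: the zipped run is hashed, encrypted with a random passphrase, then decrypted to stdout and hashed again before the plaintext archive is removed. A minimal sketch of that verify-by-decrypt pattern, assuming gpg and md5sum are on PATH (the file and key names below are illustrative, not part of this patch):

# Editor's sketch, not part of this patch: hash an archive, then hash the
# decrypted stream of its encrypted copy, mirroring the two-command pipeline
# used by _call_commands above.
import subprocess


def md5_of_file(path: str) -> str:
    # md5sum prints "<hash>  <file>"; keep only the hash
    out = subprocess.run(
        ["md5sum", path], check=True, capture_output=True, text=True
    ).stdout
    return out.split()[0]


def md5_of_decrypted(encrypted: str, keyfile: str) -> str:
    # Stream the decrypted archive straight into md5sum without writing it to disk
    gpg = subprocess.Popen(
        ["gpg", "--decrypt", "--batch", "--passphrase-file", keyfile, encrypted],
        stdout=subprocess.PIPE,
    )
    md5 = subprocess.run(
        ["md5sum"], stdin=gpg.stdout, check=True, capture_output=True, text=True
    )
    gpg.stdout.close()
    gpg.wait()
    return md5.stdout.split()[0]


# Usage (illustrative names): both hashes should match before trusting the copy.
# assert md5_of_file("run.tar.gz") == md5_of_decrypted("run.tar.gz.gpg", "run.key")
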
diff --git a/taca/backup/cli.py b/taca/backup/cli.py
index 89128002..60d8c442 100644
--- a/taca/backup/cli.py
+++ b/taca/backup/cli.py
@@ -7,35 +7,69 @@
@click.group()
@click.pass_context
def backup(ctx):
- """ Backup management methods and utilities """
+ """Backup management methods and utilities"""
pass
+
@backup.command()
-@click.option('-r', '--run', type=click.Path(exists=True), help="A run (directory or a zipped archive) to be encrypted")
-@click.option('-f', '--force', is_flag=True, help="Ignore the checks and just try encryption. USE IT WITH CAUTION.")
+@click.option(
+ "-r",
+ "--run",
+ type=click.Path(exists=True),
+ help="A run (directory or a zipped archive) to be encrypted",
+)
+@click.option(
+ "-f",
+ "--force",
+ is_flag=True,
+ help="Ignore the checks and just try encryption. USE IT WITH CAUTION.",
+)
@click.pass_context
def encrypt(ctx, run, force):
bkut.encrypt_runs(run, force)
-@backup.command(name='put_data')
-@click.option('-r', '--run', type=click.Path(exists=True), help="A run name (without extension) to be sent to PDC")
+
+@backup.command(name="put_data")
+@click.option(
+ "-r",
+ "--run",
+ type=click.Path(exists=True),
+ help="A run name (without extension) to be sent to PDC",
+)
@click.pass_context
def put_data(ctx, run):
bkut.pdc_put(run)
-@backup.command(name='get_data')
-@click.option('-r', '--run', required=True, help="A run name (without extension) to download from PDC")
-@click.option('-o', '--outdir', type=click.Path(exists=True, file_okay=False, writable=True),
- help="Optional directory name to save the downloaded file. Directory should exist")
+
+@backup.command(name="get_data")
+@click.option(
+ "-r",
+ "--run",
+ required=True,
+ help="A run name (without extension) to download from PDC",
+)
+@click.option(
+ "-o",
+ "--outdir",
+ type=click.Path(exists=True, file_okay=False, writable=True),
+ help="Optional directory name to save the downloaded file. Directory should exist",
+)
@click.pass_context
def get_data(ctx, run, outdir):
## W I P ##
raise NotImplementedError
+
@backup.command()
-@click.option('-r', '--run', required=True, type=click.Path(exists=True, dir_okay=False), help="A encripted run file")
-@click.option('-k', '--key', required=True, help="Key file to be used for decryption")
-@click.option('-p', '--password', help="To pass decryption passphrase via command line")
+@click.option(
+ "-r",
+ "--run",
+ required=True,
+ type=click.Path(exists=True, dir_okay=False),
+    help="An encrypted run file",
+)
+@click.option("-k", "--key", required=True, help="Key file to be used for decryption")
+@click.option("-p", "--password", help="To pass decryption passphrase via command line")
@click.pass_context
def decrypt(ctx, run, key, password):
## W I P ##
diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py
index 498ffcf9..3a58d871 100644
--- a/taca/cleanup/cleanup.py
+++ b/taca/cleanup/cleanup.py
@@ -11,11 +11,19 @@
logger = logging.getLogger(__name__)
-def cleanup_miarka(days_fastq, days_analysis,
- only_fastq, only_analysis,
- clean_undetermined, status_db_config,
- exclude_projects, list_only,
- date, dry_run=False):
+
+def cleanup_miarka(
+ days_fastq,
+ days_analysis,
+ only_fastq,
+ only_analysis,
+ clean_undetermined,
+ status_db_config,
+ exclude_projects,
+ list_only,
+ date,
+ dry_run=False,
+):
"""Remove fastq/analysis data for projects that have been closed more than given
days (as days_fastq/days_analysis) from the given 'miarka' cluster.
@@ -46,26 +54,30 @@ def cleanup_miarka(days_fastq, days_analysis,
- "*.bam"
"""
try:
- config = CONFIG['cleanup']['miarka']
- flowcell_dir_root = config['flowcell']['root']
- flowcell_project_source = config['flowcell']['relative_project_source']
- flowcell_undet_files = config['flowcell']['undet_file_pattern']
- data_dir = config['data_dir']
- analysis_dir = config['analysis']['root']
- analysis_data_to_remove = config['analysis']['files_to_remove']
+ config = CONFIG["cleanup"]["miarka"]
+ flowcell_dir_root = config["flowcell"]["root"]
+ flowcell_project_source = config["flowcell"]["relative_project_source"]
+ flowcell_undet_files = config["flowcell"]["undet_file_pattern"]
+ data_dir = config["data_dir"]
+ analysis_dir = config["analysis"]["root"]
+ analysis_data_to_remove = config["analysis"]["files_to_remove"]
if date:
- date = datetime.strptime(date, '%Y-%m-%d')
+ date = datetime.strptime(date, "%Y-%m-%d")
except KeyError as e:
- logger.error(f'Config file is missing the key {str(e)}, make sure it has all required information')
+ logger.error(
+ f"Config file is missing the key {str(e)}, make sure it has all required information"
+ )
raise SystemExit
except ValueError:
- logger.error('Date given with "--date" option is not in required format, see help for more info')
+ logger.error(
+ 'Date given with "--date" option is not in required format, see help for more info'
+ )
raise SystemExit
# make a connection for project db
db_config = load_config(status_db_config)
- pcon = statusdb.ProjectSummaryConnection(db_config.get('statusdb'))
- assert pcon, 'Could not connect to project database in StatusDB'
+ pcon = statusdb.ProjectSummaryConnection(db_config.get("statusdb"))
+ assert pcon, "Could not connect to project database in StatusDB"
# make exclude project list if provided
exclude_list = []
@@ -74,189 +86,302 @@ def cleanup_miarka(days_fastq, days_analysis,
with open(exclude_projects) as in_file:
exclude_list.extend([p.strip() for p in in_file.readlines()])
else:
- exclude_list.extend(exclude_projects.split(','))
+ exclude_list.extend(exclude_projects.split(","))
# sanity check for mentioned project to exculde or valid
- invalid_projects = [p for p in exclude_list if p not in pcon.id_view.keys() and p not in pcon.name_view.keys()]
+ invalid_projects = [
+ p
+ for p in exclude_list
+ if p not in pcon.id_view.keys() and p not in pcon.name_view.keys()
+ ]
if invalid_projects:
- logger.error('"--exclude_projects" was called with some invalid projects "{}", '
- 'provide valid project name/id'.format(','.join(invalid_projects)))
+ logger.error(
+ '"--exclude_projects" was called with some invalid projects "{}", '
+ "provide valid project name/id".format(",".join(invalid_projects))
+ )
raise SystemExit
- #compile list for project to delete
+ # compile list for project to delete
project_clean_list, project_processed_list = ({}, [])
if not list_only and not clean_undetermined:
- logger.info('Building initial project list for removing data...')
+ logger.info("Building initial project list for removing data...")
if only_fastq:
- logger.info('Option "--only_fastq" is given, so will not look for analysis data')
+ logger.info(
+ 'Option "--only_fastq" is given, so will not look for analysis data'
+ )
elif only_analysis:
- logger.info('Option "--only_analysis" is given, so will not look for fastq data')
+ logger.info(
+ 'Option "--only_analysis" is given, so will not look for fastq data'
+ )
if clean_undetermined:
all_undet_files = []
for flowcell_dir in flowcell_dir_root:
- for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)]:
+ for fc in [
+ d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)
+ ]:
fc_abs_path = os.path.join(flowcell_dir, fc)
with filesystem.chdir(fc_abs_path):
if not os.path.exists(flowcell_project_source):
- logger.warn(f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory')
+ logger.warn(
+ f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory'
+ )
continue
- projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
- if re.match(r'^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$',d) and \
- not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))]
+ projects_in_fc = [
+ d
+ for d in os.listdir(flowcell_project_source)
+ if re.match(r"^[A-Z]+[_\.]+[A-Za-z]+_\d\d_\d\d$", d)
+ and not os.path.exists(
+ os.path.join(flowcell_project_source, d, "cleaned")
+ )
+ ]
# the above check looked for project directories and also that are not cleaned
# so if it could not find any project, means there is no project diretory at all
# or all the project directory is already cleaned. Then we can remove the undet
if len(projects_in_fc) > 0:
continue
- fc_undet_files = glob(os.path.join(flowcell_project_source, flowcell_undet_files))
+ fc_undet_files = glob(
+ os.path.join(flowcell_project_source, flowcell_undet_files)
+ )
if fc_undet_files:
- logger.info(f'All projects was cleaned for FC {fc}, found {len(fc_undet_files)} undeterminded files')
- all_undet_files.extend(list(map(os.path.abspath, fc_undet_files)))
+ logger.info(
+ f"All projects was cleaned for FC {fc}, found {len(fc_undet_files)} undeterminded files"
+ )
+ all_undet_files.extend(
+ list(map(os.path.abspath, fc_undet_files))
+ )
if all_undet_files:
undet_size = _def_get_size_unit(sum(map(os.path.getsize, all_undet_files)))
- if misc.query_yes_no('In total found {} undetermined files which are {} in size, delete now ?'.format(len(all_undet_files),
- undet_size), default='no'):
- _remove_files(all_undet_files)
+ if misc.query_yes_no(
+ "In total found {} undetermined files which are {} in size, delete now ?".format(
+ len(all_undet_files), undet_size
+ ),
+ default="no",
+ ):
+ _remove_files(all_undet_files)
return
elif only_analysis:
- for pid in [d for d in os.listdir(analysis_dir) if re.match(r'^P\d+$', d) and \
- not os.path.exists(os.path.join(analysis_dir, d, 'cleaned'))]:
+ for pid in [
+ d
+ for d in os.listdir(analysis_dir)
+ if re.match(r"^P\d+$", d)
+ and not os.path.exists(os.path.join(analysis_dir, d, "cleaned"))
+ ]:
os.path.join(analysis_dir, pid)
- proj_info = get_closed_proj_info(pid, pcon.get_entry(pid, use_id_view=True), date)
- if proj_info and proj_info['closed_days'] >= days_analysis:
+ proj_info = get_closed_proj_info(
+ pid, pcon.get_entry(pid, use_id_view=True), date
+ )
+ if proj_info and proj_info["closed_days"] >= days_analysis:
# move on if this project has to be excluded
- if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
+ if (
+ proj_info["name"] in exclude_list
+ or proj_info["pid"] in exclude_list
+ ):
continue
- analysis_data, analysis_size = collect_analysis_data_miarka(pid, analysis_dir, analysis_data_to_remove)
- proj_info['analysis_to_remove'] = analysis_data
- proj_info['analysis_size'] = analysis_size
- proj_info['fastq_to_remove'] = 'not_selected'
- proj_info['fastq_size'] = 0
- project_clean_list[proj_info['name']] = proj_info
+ analysis_data, analysis_size = collect_analysis_data_miarka(
+ pid, analysis_dir, analysis_data_to_remove
+ )
+ proj_info["analysis_to_remove"] = analysis_data
+ proj_info["analysis_size"] = analysis_size
+ proj_info["fastq_to_remove"] = "not_selected"
+ proj_info["fastq_size"] = 0
+ project_clean_list[proj_info["name"]] = proj_info
else:
for flowcell_dir in flowcell_dir_root:
- for fc in [d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE,d)]:
+ for fc in [
+ d for d in os.listdir(flowcell_dir) if re.match(filesystem.RUN_RE, d)
+ ]:
fc_abs_path = os.path.join(flowcell_dir, fc)
with filesystem.chdir(fc_abs_path):
if not os.path.exists(flowcell_project_source):
- logger.warn(f'Flowcell {fc} do not contain a "{flowcell_project_source}" direcotry')
+ logger.warn(
+                            f'Flowcell {fc} does not contain a "{flowcell_project_source}" directory'
+ )
continue
- projects_in_fc = [d for d in os.listdir(flowcell_project_source) \
- if re.match(r'^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$',d) and \
- not os.path.exists(os.path.join(flowcell_project_source, d, 'cleaned'))]
+ projects_in_fc = [
+ d
+ for d in os.listdir(flowcell_project_source)
+ if re.match(r"^[A-Z]+[_\.]+[A-Za-z0-9]+_\d\d_\d\d$", d)
+ and not os.path.exists(
+ os.path.join(flowcell_project_source, d, "cleaned")
+ )
+ ]
for _proj in projects_in_fc:
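+                        # The directory name is presumably mapped to the statusdb
+                        # project name by replacing its first "_" with "."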
- proj = re.sub(r'_+', '.', _proj, 1)
+ proj = re.sub(r"_+", ".", _proj, 1)
# if a project is already processed no need of fetching it again from status db
if proj in project_processed_list:
# if the project is closed more than threshold days collect the fastq files from FC
# no need of looking for analysis data as they would have been collected in the first time
- if proj in project_clean_list and project_clean_list[proj]['closed_days'] >= days_fastq:
- fc_fq_files, fq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj))
- project_clean_list[proj]['fastq_to_remove']['flowcells'][fc] = fc_fq_files['flowcells'][fc]
- project_clean_list[proj]['fastq_size'] += fq_size
+ if (
+ proj in project_clean_list
+ and project_clean_list[proj]["closed_days"]
+ >= days_fastq
+ ):
+ fc_fq_files, fq_size = collect_fastq_data_miarka(
+ fc_abs_path,
+ os.path.join(flowcell_project_source, _proj),
+ )
+ project_clean_list[proj]["fastq_to_remove"][
+ "flowcells"
+ ][fc] = fc_fq_files["flowcells"][fc]
+ project_clean_list[proj]["fastq_size"] += fq_size
continue
project_processed_list.append(proj)
- #by default assume all projects are not old enough for delete
- fastq_data, analysis_data = ('young', 'young')
+ # by default assume all projects are not old enough for delete
+ fastq_data, analysis_data = ("young", "young")
fastq_size, analysis_size = (0, 0)
- proj_info = get_closed_proj_info(proj, pcon.get_entry(proj), date)
+ proj_info = get_closed_proj_info(
+ proj, pcon.get_entry(proj), date
+ )
if proj_info:
# move on if this project has to be excluded
- if proj_info['name'] in exclude_list or proj_info['pid'] in exclude_list:
+ if (
+ proj_info["name"] in exclude_list
+ or proj_info["pid"] in exclude_list
+ ):
continue
# if project not old enough for fastq files and only fastq files selected move on to next project
- if proj_info['closed_days'] >= days_fastq:
- fastq_data, fastq_size = collect_fastq_data_miarka(fc_abs_path, os.path.join(flowcell_project_source, _proj),
- data_dir, proj_info['pid'])
+ if proj_info["closed_days"] >= days_fastq:
+ fastq_data, fastq_size = collect_fastq_data_miarka(
+ fc_abs_path,
+ os.path.join(flowcell_project_source, _proj),
+ data_dir,
+ proj_info["pid"],
+ )
if not only_fastq:
# if project is old enough for fastq files and not 'only_fastq' try collect analysis files
- if proj_info['closed_days'] >= days_analysis:
- analysis_data, analysis_size = collect_analysis_data_miarka(proj_info['pid'], analysis_dir, analysis_data_to_remove)
+ if proj_info["closed_days"] >= days_analysis:
+ (
+ analysis_data,
+ analysis_size,
+ ) = collect_analysis_data_miarka(
+ proj_info["pid"],
+ analysis_dir,
+ analysis_data_to_remove,
+ )
# if both fastq and analysis files are not old enough move on
- if (analysis_data == fastq_data) or ((not analysis_data or analysis_data == 'cleaned') and fastq_data == 'young'):
+ if (analysis_data == fastq_data) or (
+ (not analysis_data or analysis_data == "cleaned")
+ and fastq_data == "young"
+ ):
continue
- elif fastq_data == 'young':
+ elif fastq_data == "young":
continue
else:
- analysis_data = 'not_selected'
- proj_info['fastq_to_remove'] = fastq_data
- proj_info['fastq_size'] = fastq_size
- proj_info['analysis_to_remove'] = analysis_data
- proj_info['analysis_size'] = analysis_size
+ analysis_data = "not_selected"
+ proj_info["fastq_to_remove"] = fastq_data
+ proj_info["fastq_size"] = fastq_size
+ proj_info["analysis_to_remove"] = analysis_data
+ proj_info["analysis_size"] = analysis_size
project_clean_list[proj] = proj_info
if not project_clean_list:
- logger.info('There are no projects to clean')
+ logger.info("There are no projects to clean")
return
# list only the project and exit if 'list_only' option is selected
if list_only:
- print('Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size')
- for p_info in sorted(list(project_clean_list.values()), key=lambda d: d['closed_days'], reverse=True):
- print('\t'.join([p_info['name'], p_info['pid'], p_info['bioinfo_responsible'],
- str(p_info['closed_days']), p_info['closed_date'],
- _def_get_size_unit(p_info['fastq_size']), _def_get_size_unit(p_info['analysis_size'])]))
+ print(
+ "Project ID\tProject Name\tBioinfo resp.\tClosed Days\tClosed Date\tFastq size\tAnalysis size"
+ )
+ for p_info in sorted(
+ list(project_clean_list.values()),
+ key=lambda d: d["closed_days"],
+ reverse=True,
+ ):
+ print(
+ "\t".join(
+ [
+ p_info["name"],
+ p_info["pid"],
+ p_info["bioinfo_responsible"],
+ str(p_info["closed_days"]),
+ p_info["closed_date"],
+ _def_get_size_unit(p_info["fastq_size"]),
+ _def_get_size_unit(p_info["analysis_size"]),
+ ]
+ )
+ )
raise SystemExit
- logger.info(f'Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}')
- if misc.query_yes_no('Interactively filter projects for cleanup ?', default='yes'):
+ logger.info(
+ f"Initial list is built with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}"
+ )
+ if misc.query_yes_no("Interactively filter projects for cleanup ?", default="yes"):
filtered_project, proj_count = ([], 0)
- #go through complied project list and remove files
+ # go through complied project list and remove files
for proj, info in project_clean_list.items():
proj_count += 1
- if not misc.query_yes_no('{}Delete files for this project ({}/{})'.format(get_proj_meta_info(info, days_fastq),
- proj_count, len(project_clean_list)), default='no'):
- logger.info(f'Will not remove files for project {proj}')
+ if not misc.query_yes_no(
+ "{}Delete files for this project ({}/{})".format(
+ get_proj_meta_info(info, days_fastq),
+ proj_count,
+ len(project_clean_list),
+ ),
+ default="no",
+ ):
+ logger.info(f"Will not remove files for project {proj}")
filtered_project.append(proj)
# remove projects that were decided not to delete
map(project_clean_list.pop, filtered_project)
- logger.info(f'Removed {len(filtered_project)}/{proj_count} projects from initial list')
+ logger.info(
+ f"Removed {len(filtered_project)}/{proj_count} projects from initial list"
+ )
if not project_clean_list:
- logger.info('There are no projects to clean after filtering')
+ logger.info("There are no projects to clean after filtering")
return
- logger.info(f'Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}')
- if not misc.query_yes_no('Proceed with cleanup ?', default='no'):
- logger.info('Aborting cleanup')
+ logger.info(
+ f"Final list is created with {len(project_clean_list)} projects {get_files_size_text(project_clean_list)}"
+ )
+ if not misc.query_yes_no("Proceed with cleanup ?", default="no"):
+ logger.info("Aborting cleanup")
return
- logger.info('Will start cleaning up project now')
+ logger.info("Will start cleaning up project now")
for proj, info in project_clean_list.items():
- fastq_info = info.get('fastq_to_remove')
+ fastq_info = info.get("fastq_to_remove")
if fastq_info and isinstance(fastq_info, dict):
- logger.info(f'Cleaning fastq files for project {proj}')
- fastq_fc = fastq_info.get('flowcells', {})
+ logger.info(f"Cleaning fastq files for project {proj}")
+ fastq_fc = fastq_info.get("flowcells", {})
removed_fc = []
for fc, fc_info in fastq_fc.items():
- proj_fc_root = fc_info['proj_root']
- logger.info(f'Removing fastq files from {proj_fc_root}')
+ proj_fc_root = fc_info["proj_root"]
+ logger.info(f"Removing fastq files from {proj_fc_root}")
if not dry_run:
- if _remove_files(fc_info['fq_files']):
- logger.info(f'Removed fastq files from FC {fc} for project {proj}, marking it as cleaned')
+ if _remove_files(fc_info["fq_files"]):
+ logger.info(
+ f"Removed fastq files from FC {fc} for project {proj}, marking it as cleaned"
+ )
_touch_cleaned(proj_fc_root)
removed_fc.append(fc)
if len(fastq_fc) == len(removed_fc):
try:
- proj_data_root = fastq_info['proj_data']['proj_data_root']
- logger.info(f'All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}')
+ proj_data_root = fastq_info["proj_data"]["proj_data_root"]
+ logger.info(
+ f"All flowcells cleaned for this project, marking it as cleaned in {proj_data_root}"
+ )
_touch_cleaned(proj_data_root)
except:
pass
- analysis_info = info.get('analysis_to_remove')
+ analysis_info = info.get("analysis_to_remove")
if analysis_info and isinstance(analysis_info, dict):
- proj_analysis_root = analysis_info['proj_analysis_root']
- logger.info(f'cleaning analysis data for project {proj}')
+ proj_analysis_root = analysis_info["proj_analysis_root"]
+ logger.info(f"cleaning analysis data for project {proj}")
removed_qc = []
- for qc, files in analysis_info['analysis_files'].items():
+ for qc, files in analysis_info["analysis_files"].items():
logger.info(f'Removing files of "{qc}" from {proj_analysis_root}')
if not dry_run:
if _remove_files(files):
removed_qc.append(qc)
else:
- logger.warn(f'Could not remove some files in qc directory "{qc}"')
- map(analysis_info['analysis_files'].pop, removed_qc)
- if len(analysis_info['analysis_files']) == 0:
- logger.info(f'Removed analysis data for project {proj}, marking it cleaned')
+ logger.warn(
+ f'Could not remove some files in qc directory "{qc}"'
+ )
+ map(analysis_info["analysis_files"].pop, removed_qc)
+ if len(analysis_info["analysis_files"]) == 0:
+ logger.info(
+ f"Removed analysis data for project {proj}, marking it cleaned"
+ )
_touch_cleaned(proj_analysis_root)
@@ -264,27 +389,38 @@ def cleanup_miarka(days_fastq, days_analysis,
# Class helper methods, not exposed as commands/subcommands #
#############################################################
+
def get_closed_proj_info(prj, pdoc, tdate=None):
"""Check and return a dict if project is closed."""
pdict = None
if not tdate:
tdate = datetime.today()
if not pdoc:
- logger.warn(f'Seems like project {prj} does not have a proper statusdb document, skipping it')
- elif 'close_date' in pdoc:
- closed_date = pdoc['close_date']
+ logger.warn(
+ f"Seems like project {prj} does not have a proper statusdb document, skipping it"
+ )
+ elif "close_date" in pdoc:
+ closed_date = pdoc["close_date"]
try:
- closed_days = tdate - datetime.strptime(closed_date, '%Y-%m-%d')
- pdict = {'name' : pdoc.get('project_name'),
- 'pid' : pdoc.get('project_id'),
- 'closed_date' : closed_date,
- 'closed_days' : closed_days.days,
- 'bioinfo_responsible' : pdoc.get('project_summary',{}).get('bioinfo_responsible','')}
+ closed_days = tdate - datetime.strptime(closed_date, "%Y-%m-%d")
+ pdict = {
+ "name": pdoc.get("project_name"),
+ "pid": pdoc.get("project_id"),
+ "closed_date": closed_date,
+ "closed_days": closed_days.days,
+ "bioinfo_responsible": pdoc.get("project_summary", {}).get(
+ "bioinfo_responsible", ""
+ ),
+ }
except:
- logger.warn('Problem calculating closed days for project {} with close date {}. Skipping it'.format(
- pdoc.get('project_name'), closed_date))
+ logger.warn(
+ "Problem calculating closed days for project {} with close date {}. Skipping it".format(
+ pdoc.get("project_name"), closed_date
+ )
+ )
return pdict
+
def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}):
"""Collect the analysis files that have to be removed from Miarka
return a tuple with files and total size of collected files."""
@@ -292,42 +428,57 @@ def collect_analysis_data_miarka(pid, analysis_root, files_ext_to_remove={}):
proj_abs_path = os.path.join(analysis_root, pid)
if not os.path.exists(proj_abs_path):
file_list = None
- elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')):
- file_list = 'cleaned'
+ elif os.path.exists(os.path.join(proj_abs_path, "cleaned")):
+ file_list = "cleaned"
else:
- file_list = {'proj_analysis_root':proj_abs_path,
- 'analysis_files': defaultdict(list)}
- for qc_type,ext in files_ext_to_remove.items():
+ file_list = {
+ "proj_analysis_root": proj_abs_path,
+ "analysis_files": defaultdict(list),
+ }
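+        # files_ext_to_remove presumably maps a qc directory name to the glob
+        # patterns (e.g. "*.bam") whose matches should be removed under it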
+ for qc_type, ext in files_ext_to_remove.items():
qc_path = os.path.join(proj_abs_path, qc_type)
if os.path.exists(qc_path):
- file_list['analysis_files'][qc_type].extend(collect_files_by_ext(qc_path, ext))
+ file_list["analysis_files"][qc_type].extend(
+ collect_files_by_ext(qc_path, ext)
+ )
try:
- size += sum([sum(map(os.path.getsize, fls)) for fls in file_list['analysis_files'].values()])
+ size += sum(
+ [
+ sum(map(os.path.getsize, fls))
+ for fls in file_list["analysis_files"].values()
+ ]
+ )
except:
pass
return (file_list, size)
+
def collect_fastq_data_miarka(fc_root, fc_proj_src, proj_root=None, pid=None):
"""Collect the fastq files that have to be removed from Miarka.
Return a tuple with files and total size of collected files."""
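+    # The returned dict presumably looks like
+    # {"flowcells": {<fc_id>: {"proj_root": ..., "fq_files": [...]}}, "proj_data": ...}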
size = 0
- file_list = {'flowcells': defaultdict(dict)}
+ file_list = {"flowcells": defaultdict(dict)}
fc_proj_path = os.path.join(fc_root, fc_proj_src)
fc_id = os.path.basename(fc_root)
- file_list['flowcells'][fc_id] = {'proj_root': fc_proj_path,
- 'fq_files': collect_files_by_ext(fc_proj_path, '*.fastq.gz')}
+ file_list["flowcells"][fc_id] = {
+ "proj_root": fc_proj_path,
+ "fq_files": collect_files_by_ext(fc_proj_path, "*.fastq.gz"),
+ }
if proj_root and pid:
proj_abs_path = os.path.join(proj_root, pid)
if not os.path.exists(proj_abs_path):
- file_list['proj_data'] = None
- elif os.path.exists(os.path.join(proj_abs_path, 'cleaned')):
- file_list['proj_data'] = 'cleaned'
+ file_list["proj_data"] = None
+ elif os.path.exists(os.path.join(proj_abs_path, "cleaned")):
+ file_list["proj_data"] = "cleaned"
else:
- file_list['proj_data'] = {'proj_data_root': proj_abs_path,
- 'fastq_files' : collect_files_by_ext(proj_abs_path, '*.fastq.gz')}
- size += sum(map(os.path.getsize, file_list['flowcells'][fc_id]['fq_files']))
+ file_list["proj_data"] = {
+ "proj_data_root": proj_abs_path,
+ "fastq_files": collect_files_by_ext(proj_abs_path, "*.fastq.gz"),
+ }
+ size += sum(map(os.path.getsize, file_list["flowcells"][fc_id]["fq_files"]))
return (file_list, size)
+
def collect_files_by_ext(path, ext=[]):
"""Collect files with a given extension from a given path."""
if isinstance(ext, str):
@@ -340,60 +491,79 @@ def collect_files_by_ext(path, ext=[]):
collected_files.extend(collect_files_by_ext(d, ext))
return collected_files
+
def get_proj_meta_info(info, days_fastq):
"""From given info collect meta info for a project."""
- template = '\n'
+ template = "\n"
+
def _get_template_string(h, v):
try:
- v = f'{h}: {v}\n'
+ v = f"{h}: {v}\n"
except:
- v = f'{h}: Problem getting this'
+ v = f"{h}: Problem getting this"
return v
- template += _get_template_string('Project overview', info.get('name'))
- template += _get_template_string('Project ID', info.get('pid'))
- template += _get_template_string('Bioinfo Responsible', info.get('bioinfo_responsible',''))
- template += _get_template_string('Closed for (days)', info.get('closed_days'))
- template += _get_template_string('Closed from (date)', info.get('closed_date'))
+
+ template += _get_template_string("Project overview", info.get("name"))
+ template += _get_template_string("Project ID", info.get("pid"))
+ template += _get_template_string(
+ "Bioinfo Responsible", info.get("bioinfo_responsible", "")
+ )
+ template += _get_template_string("Closed for (days)", info.get("closed_days"))
+ template += _get_template_string("Closed from (date)", info.get("closed_date"))
# set analysis info based upon what we have
- analysis_info = info.get('analysis_to_remove')
+ analysis_info = info.get("analysis_to_remove")
if not analysis_info:
- template += 'Project analysis: No analysis directory\n'
- elif isinstance(analysis_info, str) and analysis_info == 'cleaned':
- template += 'Project analysis: Analysis directory already cleaned\n'
+ template += "Project analysis: No analysis directory\n"
+ elif isinstance(analysis_info, str) and analysis_info == "cleaned":
+ template += "Project analysis: Analysis directory already cleaned\n"
elif isinstance(analysis_info, dict):
f_stat = []
- for qc_type, files in analysis_info['analysis_files'].items():
- f_stat.append(f'{qc_type} ({len(files)} files)')
- template += 'Project analyzed: {}\n'.format(', '.join(f_stat))
+ for qc_type, files in analysis_info["analysis_files"].items():
+ f_stat.append(f"{qc_type} ({len(files)} files)")
+ template += "Project analyzed: {}\n".format(", ".join(f_stat))
# set fastq info based upon what we have
- fq_info = info.get('fastq_to_remove')
+ fq_info = info.get("fastq_to_remove")
if isinstance(fq_info, str) and fq_info == "young":
- template += f'Project been closed less than {days_fastq} days, so will not remove any fastq files\n'
+ template += f"Project been closed less than {days_fastq} days, so will not remove any fastq files\n"
elif isinstance(fq_info, dict):
- proj_fq_info = fq_info.get('proj_data')
+ proj_fq_info = fq_info.get("proj_data")
if not proj_fq_info:
- template += 'Project organized: No organized directory for project\n'
+ template += "Project organized: No organized directory for project\n"
elif isinstance(proj_fq_info, str) and proj_fq_info == "cleaned":
- template += 'Project organized: Project directory is already cleaned\n'
+ template += "Project organized: Project directory is already cleaned\n"
elif isinstance(proj_fq_info, dict):
- template += 'Project organized: Project is organized with {} fastq files\n'.format(len(proj_fq_info['fastq_files']))
- fc_fq_info = fq_info.get('flowcells', {})
+ template += (
+ "Project organized: Project is organized with {} fastq files\n".format(
+ len(proj_fq_info["fastq_files"])
+ )
+ )
+ fc_fq_info = fq_info.get("flowcells", {})
fc_num = len(fc_fq_info.keys())
- fc_files = sum(map(len, [fc_info.get('fq_files', [])for fc_info in fc_fq_info.values()]))
- template += f'Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n'
- template += 'Estimated data size: {}\n'.format(_def_get_size_unit(info.get('fastq_size',0) + info.get('fastq_size', 0)))
+ fc_files = sum(
+ map(len, [fc_info.get("fq_files", []) for fc_info in fc_fq_info.values()])
+ )
+ template += (
+ f"Flowcells: There are {fc_num} FC with total {fc_files} fastq files\n"
+ )
+ template += "Estimated data size: {}\n".format(
+ _def_get_size_unit(info.get("fastq_size", 0) + info.get("fastq_size", 0))
+ )
return template
+
def get_files_size_text(plist):
"""Get project list dict and give back string with overll sizes."""
- fsize = _def_get_size_unit(sum([i.get('fastq_size',0) for i in plist.values()]))
- asize = _def_get_size_unit(sum([i.get('analysis_size',0) for i in plist.values()]))
- return '({f}{s}{a}) '.format(f = f'~{fsize} fastq data' if fsize else '',
- a = f'~{asize} analysis data' if asize else '',
- s = ' and ' if fsize and asize else '')
+ fsize = _def_get_size_unit(sum([i.get("fastq_size", 0) for i in plist.values()]))
+ asize = _def_get_size_unit(sum([i.get("analysis_size", 0) for i in plist.values()]))
+ return "({f}{s}{a}) ".format(
+ f=f"~{fsize} fastq data" if fsize else "",
+ a=f"~{asize} analysis data" if asize else "",
+ s=" and " if fsize and asize else "",
+ )
+
def _def_get_size_unit(s):
"""Change the given size to appropriate unit measurement for better readability."""
@@ -402,17 +572,18 @@ def _def_get_size_unit(s):
gb = mb * 1000
tb = gb * 1000
if s > tb:
- s = f'~{int(s/tb)}tb'
+ s = f"~{int(s/tb)}tb"
elif s > gb:
- s = f'~{int(s/gb)}gb'
+ s = f"~{int(s/gb)}gb"
elif s > mb:
- s = f'~{int(s/mb)}mb'
+ s = f"~{int(s/mb)}mb"
elif s > kb:
- s = f'~{int(s/kb)}kb'
+ s = f"~{int(s/kb)}kb"
elif s > 0:
- s = f'~{int(s/b)}b'
+ s = f"~{int(s/b)}b"
return str(s)
+
def _remove_files(files):
"""Remove files from given list."""
status = True
@@ -424,9 +595,12 @@ def _remove_files(files):
status = False
return status
+
def _touch_cleaned(path):
"""Touch a 'cleaned' file in a given path."""
try:
- open(os.path.join(path, 'cleaned'), 'w').close()
+ open(os.path.join(path, "cleaned"), "w").close()
except Exception as e:
- logger.warn(f'Could not create "cleaned" file in path {path} due to "{e.message}"')
+ logger.warn(
+ f'Could not create "cleaned" file in path {path} due to "{e.message}"'
+ )
diff --git a/taca/cleanup/cli.py b/taca/cleanup/cli.py
index 6410567b..fe7e11ba 100644
--- a/taca/cleanup/cli.py
+++ b/taca/cleanup/cli.py
@@ -7,63 +7,119 @@
@click.group()
@click.pass_context
-@click.option('--status_db_config',
- type=click.Path(exists=True, dir_okay=False),
- envvar='STATUS_DB_CONFIG',
- help='Path to statusdb-configuration.')
+@click.option(
+ "--status_db_config",
+ type=click.Path(exists=True, dir_okay=False),
+ envvar="STATUS_DB_CONFIG",
+ help="Path to statusdb-configuration.",
+)
def cleanup(ctx, status_db_config):
"""Cleaning up servers - management methods and utilities."""
pass
+
# cleanup subcommands
@cleanup.command()
-@click.option('-d', '--days', type=click.IntRange(min=1),
- help='Days to consider as thershold, should not be combined with option "--hours"')
-@click.option('-h', '--hours', type=click.IntRange(min=1),
- help='Hours to consider as thershold, should not be combined with option "--days"')
+@click.option(
+ "-d",
+ "--days",
+ type=click.IntRange(min=1),
+    help='Days to consider as threshold, should not be combined with option "--hours"',
+)
+@click.option(
+ "-h",
+ "--hours",
+ type=click.IntRange(min=1),
+    help='Hours to consider as threshold, should not be combined with option "--days"',
+)
@click.pass_context
def preproc(ctx, days, hours):
"""Do appropriate cleanup on preproc."""
seconds = misc.to_seconds(days, hours)
cln.cleanup_processing(seconds)
+
@cleanup.command()
-@click.option('--days_fastq', type=click.IntRange(min=1),
- help='Days to consider as thershold for removing "fastq" files')
-@click.option('--days_analysis', type=click.IntRange(min=1),
- help='Days to consider as thershold for removing analysis data')
-@click.option('--only_fastq', is_flag=True,
- help='Clean only fastq data in "miarka"')
-@click.option('--only_analysis', is_flag=True,
- help='Clean only analysis data in "miarka"')
-@click.option('--date', type=click.STRING,
- help='Consider the given date instead of today while collecting closed projects. '
- 'Date format should be "YYYY-MM-DD", ex: "2016-01-31"')
-@click.option('--exclude_projects', type=click.STRING,
- help='A project or a file with a list of projects to exclude from deleting. '
- 'Either name or id can be given. Examples: --exclude_projects P1234 or '
- '--exclude_projects P1234,P5678 or '
- '--exclude_projects file_with_projects_id.txt')
-@click.option('--clean_undetermined', is_flag=True,
- help='Remove only the undetermined reads for a flowcell that have '
- 'all project cleaned. All other parameters are ignored if this '
- 'flag is called.')
-@click.option('-l', '--list_only', is_flag=True,
- help='Only build the project list that will be cleaned')
-@click.option('-n', '--dry_run', is_flag=True,
- help='Perform dry run i.e. execute nothing but log')
+@click.option(
+ "--days_fastq",
+ type=click.IntRange(min=1),
+ help='Days to consider as thershold for removing "fastq" files',
+)
+@click.option(
+ "--days_analysis",
+ type=click.IntRange(min=1),
+ help="Days to consider as thershold for removing analysis data",
+)
+@click.option("--only_fastq", is_flag=True, help='Clean only fastq data in "miarka"')
+@click.option(
+ "--only_analysis", is_flag=True, help='Clean only analysis data in "miarka"'
+)
+@click.option(
+ "--date",
+ type=click.STRING,
+ help="Consider the given date instead of today while collecting closed projects. "
+ 'Date format should be "YYYY-MM-DD", ex: "2016-01-31"',
+)
+@click.option(
+ "--exclude_projects",
+ type=click.STRING,
+ help="A project or a file with a list of projects to exclude from deleting. "
+ "Either name or id can be given. Examples: --exclude_projects P1234 or "
+ "--exclude_projects P1234,P5678 or "
+ "--exclude_projects file_with_projects_id.txt",
+)
+@click.option(
+ "--clean_undetermined",
+ is_flag=True,
+ help="Remove only the undetermined reads for a flowcell that have "
+ "all project cleaned. All other parameters are ignored if this "
+ "flag is called.",
+)
+@click.option(
+ "-l",
+ "--list_only",
+ is_flag=True,
+ help="Only build the project list that will be cleaned",
+)
+@click.option(
+ "-n", "--dry_run", is_flag=True, help="Perform dry run i.e. execute nothing but log"
+)
@click.pass_context
-def miarka(ctx, days_fastq, days_analysis, only_fastq, only_analysis, clean_undetermined, date, exclude_projects, list_only, dry_run):
+def miarka(
+ ctx,
+ days_fastq,
+ days_analysis,
+ only_fastq,
+ only_analysis,
+ clean_undetermined,
+ date,
+ exclude_projects,
+ list_only,
+ dry_run,
+):
"""Do appropriate cleanup on Miarka."""
- status_db_config = ctx.parent.params['status_db_config']
+ status_db_config = ctx.parent.params["status_db_config"]
if only_fastq and only_analysis:
- raise SystemExit('ERROR: Both option "only_fastq" and "only_analysis" is given, should only give either one')
+ raise SystemExit(
+ 'ERROR: Both option "only_fastq" and "only_analysis" is given, should only give either one'
+ )
if not days_fastq and not only_analysis and not clean_undetermined:
- raise SystemExit('ERROR: "days_fastq" is not given while not selecting "only_analysis" option')
+ raise SystemExit(
+ 'ERROR: "days_fastq" is not given while not selecting "only_analysis" option'
+ )
if not days_analysis and not only_fastq and not clean_undetermined:
- raise SystemExit('ERROR: "days_analysis" is not given while not selecting "only_fastq" option')
- cln.cleanup_miarka(days_fastq, days_analysis,
- only_fastq, only_analysis,
- clean_undetermined, status_db_config,
- exclude_projects, list_only,
- date, dry_run)
+ raise SystemExit(
+ 'ERROR: "days_analysis" is not given while not selecting "only_fastq" option'
+ )
+ cln.cleanup_miarka(
+ days_fastq,
+ days_analysis,
+ only_fastq,
+ only_analysis,
+ clean_undetermined,
+ status_db_config,
+ exclude_projects,
+ list_only,
+ date,
+ dry_run,
+ )
diff --git a/taca/cli.py b/taca/cli.py
index ad8d59b6..d777884a 100644
--- a/taca/cli.py
+++ b/taca/cli.py
@@ -10,26 +10,30 @@
logger = logging.getLogger(__name__)
+
@click.group()
@click.version_option(__version__)
# Priority for the configuration file is: environment variable > -c option > default
-@click.option('-c', '--config-file',
- default=os.path.join(os.environ['HOME'], '.taca/taca.yaml'),
- envvar='TACA_CONFIG',
- type=click.File('r'),
- help='Path to TACA configuration file')
-
+@click.option(
+ "-c",
+ "--config-file",
+ default=os.path.join(os.environ["HOME"], ".taca/taca.yaml"),
+ envvar="TACA_CONFIG",
+ type=click.File("r"),
+ help="Path to TACA configuration file",
+)
@click.pass_context
def cli(ctx, config_file):
- """ Tool for the Automation of Storage and Analyses """
+ """Tool for the Automation of Storage and Analyses"""
ctx.obj = {}
config = conf.load_yaml_config(config_file.name)
- log_file = config.get('log', {}).get('file', None)
+ log_file = config.get("log", {}).get("file", None)
if log_file:
- level = config.get('log').get('log_level', 'INFO')
+ level = config.get("log").get("log_level", "INFO")
taca.log.init_logger_file(log_file, level)
- logger.debug('starting up CLI')
+ logger.debug("starting up CLI")
+
-#Add subcommands dynamically to the CLI
-for entry_point in iter_entry_points('taca.subcommands'):
+# Add subcommands dynamically to the CLI
+for entry_point in iter_entry_points("taca.subcommands"):
cli.add_command(entry_point.load())
diff --git a/taca/illumina/MiSeq_Runs.py b/taca/illumina/MiSeq_Runs.py
index ff7d1095..0428db3c 100644
--- a/taca/illumina/MiSeq_Runs.py
+++ b/taca/illumina/MiSeq_Runs.py
@@ -9,11 +9,12 @@
logger = logging.getLogger(__name__)
-TENX_SINGLE_PAT = re.compile('SI-(?:GA|NA)-[A-H][1-9][0-2]?')
-TENX_DUAL_PAT = re.compile('SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?')
-SMARTSEQ_PAT = re.compile('SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]')
-IDT_UMI_PAT = re.compile('([ATCG]{4,}N+$)')
-RECIPE_PAT = re.compile('[0-9]+-[0-9]+')
+TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?")
+TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?")
+SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]")
+IDT_UMI_PAT = re.compile("([ATCG]{4,}N+$)")
+RECIPE_PAT = re.compile("[0-9]+-[0-9]+")
+
class MiSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
@@ -33,8 +34,7 @@ def _get_samplesheet(self):
"""Locate and parse the samplesheet for a run.
In MiSeq case this is located in FC_DIR/SampleSheet.csv
"""
- ssname = os.path.join(self.run_dir,
- 'SampleSheet.csv')
+ ssname = os.path.join(self.run_dir, "SampleSheet.csv")
if os.path.exists(ssname):
# If exists parse the SampleSheet
return ssname
@@ -49,14 +49,14 @@ def _copy_samplesheet(self):
# Load index files
indexfile = dict()
try:
- indexfile['tenX'] = self.CONFIG[self.software]['tenX_index_path']
+ indexfile["tenX"] = self.CONFIG[self.software]["tenX_index_path"]
except KeyError:
- logger.error('Path to index file (10X) not found in the config file')
+ logger.error("Path to index file (10X) not found in the config file")
raise RuntimeError
try:
- indexfile['smartseq'] = self.CONFIG[self.software]['smartseq_index_path']
+ indexfile["smartseq"] = self.CONFIG[self.software]["smartseq_index_path"]
except KeyError:
- logger.error('Path to index file (Smart-seq) not found in the config file')
+ logger.error("Path to index file (Smart-seq) not found in the config file")
raise RuntimeError
if ssname is None:
return None
@@ -65,97 +65,144 @@ def _copy_samplesheet(self):
# Copy the original samplesheet locally.
# Copy again if already done as there might have been changes to the samplesheet
try:
- shutil.copy(ssname, os.path.join(self.run_dir, f'{self.flowcell_id}.csv'))
+ shutil.copy(ssname, os.path.join(self.run_dir, f"{self.flowcell_id}.csv"))
ssname = os.path.join(self.run_dir, os.path.split(ssname)[1])
except:
- raise RuntimeError(f"unable to copy file {ssname} to destination {self.run_dir}")
+ raise RuntimeError(
+ f"unable to copy file {ssname} to destination {self.run_dir}"
+ )
# This sample sheet has been created by the LIMS and copied by a sequencing operator. It is not ready
# to be used it needs some editing.
# This will contain the samplesheet with all the renaiming to be used with bcl2fastq
- samplesheet_dest = os.path.join(self.run_dir, 'SampleSheet_copy.csv')
+ samplesheet_dest = os.path.join(self.run_dir, "SampleSheet_copy.csv")
# Check that the samplesheet is not already present. In this case go the next step
if os.path.exists(samplesheet_dest):
- logger.info('SampleSheet_copy.csv found ... overwriting it')
+ logger.info("SampleSheet_copy.csv found ... overwriting it")
try:
- with open(samplesheet_dest, 'w') as fcd:
- fcd.write(self._generate_clean_samplesheet(ssparser,
- indexfile,
- fields_to_remove=None,
- rename_samples=True,
- rename_qPCR_suffix = True,
- fields_qPCR=[ssparser.dfield_snm]))
+ with open(samplesheet_dest, "w") as fcd:
+ fcd.write(
+ self._generate_clean_samplesheet(
+ ssparser,
+ indexfile,
+ fields_to_remove=None,
+ rename_samples=True,
+ rename_qPCR_suffix=True,
+ fields_qPCR=[ssparser.dfield_snm],
+ )
+ )
except Exception as e:
logger.error(e)
return False
- logger.info(f'Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} ')
+ logger.info(
+ f"Created SampleSheet_copy.csv for Flowcell {self.id} in {samplesheet_dest} "
+ )
# SampleSheet.csv generated
# When demultiplexing SampleSheet.csv is the one I need to use
- self.runParserObj.samplesheet = SampleSheetParser(os.path.join(self.run_dir, 'SampleSheet_copy.csv'))
- if not self.runParserObj.obj.get('samplesheet_csv'):
- self.runParserObj.obj['samplesheet_csv'] = self.runParserObj.samplesheet.data
+ self.runParserObj.samplesheet = SampleSheetParser(
+ os.path.join(self.run_dir, "SampleSheet_copy.csv")
+ )
+ if not self.runParserObj.obj.get("samplesheet_csv"):
+ self.runParserObj.obj[
+ "samplesheet_csv"
+ ] = self.runParserObj.samplesheet.data
- def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None, rename_samples=True, rename_qPCR_suffix = False, fields_qPCR= None):
+ def _generate_clean_samplesheet(
+ self,
+ ssparser,
+ indexfile,
+ fields_to_remove=None,
+ rename_samples=True,
+ rename_qPCR_suffix=False,
+ fields_qPCR=None,
+ ):
"""Generate a 'clean' samplesheet, the given fields will be removed.
If rename_samples is True, samples prepended with 'Sample_' are renamed to match the sample name
Will also replace 10X or Smart-seq indicies (e.g. SI-GA-A3 into TGTGCGGG)
Note that the index 2 of 10X or Smart-seq dual indexes will be converted to RC
"""
- output = ''
- compl = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
+ output = ""
+ compl = {"A": "T", "C": "G", "G": "C", "T": "A"}
# Expand the ssparser if there are lanes with 10X or Smart-seq samples
- index_dict_tenX = self._parse_10X_indexes(indexfile['tenX'])
- index_dict_smartseq = self._parse_smartseq_indexes(indexfile['smartseq'])
+ index_dict_tenX = self._parse_10X_indexes(indexfile["tenX"])
+ index_dict_smartseq = self._parse_smartseq_indexes(indexfile["smartseq"])
# Replace 10X or Smart-seq indices
for sample in ssparser.data:
- if sample['index'] in index_dict_tenX.keys():
- tenX_index = sample['index']
+ if sample["index"] in index_dict_tenX.keys():
+ tenX_index = sample["index"]
# In the case of 10X dual indexes, replace index and index2
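+                # Note: index2 is presumably replaced by its reverse complement
+                # (complement each base, then reverse), per the RC note in the docstring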
if TENX_DUAL_PAT.findall(tenX_index):
- sample['index'] = index_dict_tenX[tenX_index][0]
- sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_tenX[tenX_index][1].replace(',','').upper() ] ) )
+ sample["index"] = index_dict_tenX[tenX_index][0]
+ sample["index2"] = "".join(
+ reversed(
+ [
+ compl.get(b, b)
+ for b in index_dict_tenX[tenX_index][1]
+ .replace(",", "")
+ .upper()
+ ]
+ )
+ )
# In the case of 10X single indexes, replace the index name with the 4 actual indicies
else:
x = 0
indices_number = len(index_dict_tenX[tenX_index])
while x < indices_number - 1:
new_sample = dict(sample)
- new_sample['index'] = index_dict_tenX[tenX_index][x]
+ new_sample["index"] = index_dict_tenX[tenX_index][x]
ssparser.data.append(new_sample)
x += 1
# Set the original 10X index to the 4th correct index
- sample['index'] = index_dict_tenX[tenX_index][x]
- elif SMARTSEQ_PAT.findall(sample['index']):
+ sample["index"] = index_dict_tenX[tenX_index][x]
+ elif SMARTSEQ_PAT.findall(sample["index"]):
x = 0
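+                # A Smart-seq plate position presumably expands to several index pairs;
+                # one samplesheet row is added per pair, with index2 reverse-complemented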
- smartseq_index = sample['index'].split('-')[1]
+ smartseq_index = sample["index"].split("-")[1]
indices_number = len(index_dict_smartseq[smartseq_index])
while x < indices_number - 1:
new_sample = dict(sample)
- new_sample['index'] = index_dict_smartseq[smartseq_index][x][0]
- new_sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) )
+ new_sample["index"] = index_dict_smartseq[smartseq_index][x][0]
+ new_sample["index2"] = "".join(
+ reversed(
+ [
+ compl.get(b, b)
+ for b in index_dict_smartseq[smartseq_index][x][1]
+ .replace(",", "")
+ .upper()
+ ]
+ )
+ )
ssparser.data.append(new_sample)
x += 1
- sample['index'] = index_dict_smartseq[smartseq_index][x][0]
- sample['index2'] = ''.join( reversed( [compl.get(b,b) for b in index_dict_smartseq[smartseq_index][x][1].replace(',','').upper() ] ) )
+ sample["index"] = index_dict_smartseq[smartseq_index][x][0]
+ sample["index2"] = "".join(
+ reversed(
+ [
+ compl.get(b, b)
+ for b in index_dict_smartseq[smartseq_index][x][1]
+ .replace(",", "")
+ .upper()
+ ]
+ )
+ )
# Sort to get the added indicies from 10x in the right place
# Python 3 doesn't support sorting a list of dicts implicitly. Sort by lane and then Sample_ID
- ssparser.data.sort(key=lambda item: (item.get('Lane'), item.get('Sample_ID')))
+ ssparser.data.sort(key=lambda item: (item.get("Lane"), item.get("Sample_ID")))
if not fields_to_remove:
fields_to_remove = []
# Header
- output += f'[Header]{os.linesep}'
+ output += f"[Header]{os.linesep}"
for field in sorted(ssparser.header):
- output += f'{field.rstrip()},{ssparser.header[field].rstrip()}'
+ output += f"{field.rstrip()},{ssparser.header[field].rstrip()}"
output += os.linesep
# Data
- output += f'[Data]{os.linesep}'
+ output += f"[Data]{os.linesep}"
datafields = []
for field in ssparser.datafields:
if field not in fields_to_remove:
datafields.append(field)
- output += ','.join(datafields)
+ output += ",".join(datafields)
output += os.linesep
for line in ssparser.data:
line_ar = []
@@ -165,16 +212,18 @@ def _generate_clean_samplesheet(self, ssparser, indexfile, fields_to_remove=None
try:
if rename_qPCR_suffix and ssparser.dfield_snm in fields_qPCR:
# Substitute SampleID with SampleName, add Sample_ as prefix and remove __qPCR_ suffix
- value = re.sub('__qPCR_$', '', f'Sample_{line[ssparser.dfield_snm]}')
+ value = re.sub(
+ "__qPCR_$", "", f"Sample_{line[ssparser.dfield_snm]}"
+ )
else:
# Substitute SampleID with SampleName, add Sample_ as prefix
- value =f'Sample_{line[ssparser.dfield_snm]}'
+ value = f"Sample_{line[ssparser.dfield_snm]}"
except:
- # Otherwise add Sample_ as prefix
- value = f'Sample_{line[ssparser.dfield_sid]}'
+ # Otherwise add Sample_ as prefix
+ value = f"Sample_{line[ssparser.dfield_sid]}"
elif rename_qPCR_suffix and field in fields_qPCR:
- value = re.sub('__qPCR_$', '', line[field])
+ value = re.sub("__qPCR_$", "", line[field])
line_ar.append(value)
- output += ','.join(line_ar)
+ output += ",".join(line_ar)
output += os.linesep
return output
diff --git a/taca/illumina/NextSeq_Runs.py b/taca/illumina/NextSeq_Runs.py
index d03b1e9e..6dc8cee1 100755
--- a/taca/illumina/NextSeq_Runs.py
+++ b/taca/illumina/NextSeq_Runs.py
@@ -3,7 +3,7 @@
class NextSeq_Run(Standard_Run):
def __init__(self, run_dir, software, configuration):
- super(Standard_Runs, self).__init__( run_dir, software, configuration)
+ super(Standard_Runs, self).__init__(run_dir, software, configuration)
self._set_sequencer_type()
self._set_run_type()
# NextSeq2000 has a different FC ID pattern that ID contains the first letter for position
diff --git a/taca/illumina/Runs.py b/taca/illumina/Runs.py
index 56724ac7..5fbac30a 100644
--- a/taca/illumina/Runs.py
+++ b/taca/illumina/Runs.py
@@ -15,32 +15,39 @@
logger = logging.getLogger(__name__)
+
class Run:
- """ Defines an Illumina run
- """
+ """Defines an Illumina run"""
def __init__(self, run_dir, software, configuration):
if not os.path.exists(run_dir):
raise RuntimeError(f"Could not locate run directory {run_dir}")
- if 'analysis_server' not in configuration or \
- 'bcl2fastq' not in configuration or \
- 'bclconvert' not in configuration or \
- 'samplesheets_dir' not in configuration:
- raise RuntimeError("configuration missing required entries "
- "(analysis_server, bcl2fastq, bclconvert, samplesheets_dir)")
- if not os.path.exists(os.path.join(run_dir, 'runParameters.xml')) \
- and os.path.exists(os.path.join(run_dir, 'RunParameters.xml')):
+ if (
+ "analysis_server" not in configuration
+ or "bcl2fastq" not in configuration
+ or "bclconvert" not in configuration
+ or "samplesheets_dir" not in configuration
+ ):
+ raise RuntimeError(
+ "configuration missing required entries "
+ "(analysis_server, bcl2fastq, bclconvert, samplesheets_dir)"
+ )
+ if not os.path.exists(
+ os.path.join(run_dir, "runParameters.xml")
+ ) and os.path.exists(os.path.join(run_dir, "RunParameters.xml")):
# In NextSeq runParameters is named RunParameters
logger.warning("Creating link from runParameters.xml to RunParameters.xml")
- os.symlink('RunParameters.xml', os.path.join(run_dir, 'runParameters.xml'))
- elif not os.path.exists(os.path.join(run_dir, 'runParameters.xml')):
- raise RuntimeError(f"Could not locate runParameters.xml in run directory {run_dir}")
+ os.symlink("RunParameters.xml", os.path.join(run_dir, "runParameters.xml"))
+ elif not os.path.exists(os.path.join(run_dir, "runParameters.xml")):
+ raise RuntimeError(
+ f"Could not locate runParameters.xml in run directory {run_dir}"
+ )
self.run_dir = os.path.abspath(run_dir)
self.software = software
self.id = os.path.basename(os.path.normpath(run_dir))
- pattern = r'(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)'
+ pattern = r"(\d{6,8})_([ST-]*\w+\d+)_\d+_([AB]?)([A-Z0-9\-]+)"
m = re.match(pattern, self.id)
self.date = m.group(1)
self.instrument = m.group(2)
@@ -63,51 +70,78 @@ def check_run_status(self):
This function checks the status of a run while in progress.
In the case of HiSeq check that all demux have been done and in that case perform aggregation
"""
- dex_status = self.get_run_status()
- if self.software == 'bcl2fastq':
- legacy_path = ''
- elif self.software == 'bclconvert':
+ dex_status = self.get_run_status()
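+        # bcl2fastq writes its Stats/ directory at the demux folder root, while
+        # bcl-convert appears to keep bcl2fastq-style stats under Reports/<legacy_dir>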
+ if self.software == "bcl2fastq":
+ legacy_path = ""
+ elif self.software == "bclconvert":
legacy_path = f"Reports/{self.legacy_dir}"
# Check the status of running demux
# Collect all samplesheets generated before
- samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv")) # A single digit, this hypothesis should hold for a while
+ samplesheets = glob.glob(
+ os.path.join(self.run_dir, "*_[0-9].csv")
+ ) # A single digit, this hypothesis should hold for a while
all_demux_done = True
for samplesheet in samplesheets:
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
demux_folder = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")
# Check if this job is done
- if os.path.exists(os.path.join(self.run_dir, demux_folder, legacy_path, 'Stats', 'DemultiplexingStats.xml')):
+ if os.path.exists(
+ os.path.join(
+ self.run_dir,
+ demux_folder,
+ legacy_path,
+ "Stats",
+ "DemultiplexingStats.xml",
+ )
+ ):
all_demux_done = all_demux_done and True
- if self.software == 'bcl2fastq':
- demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl2fastq.err")
- elif self.software == 'bclconvert':
- demux_log = os.path.join(self.run_dir, f"demux_{demux_id}_bcl-convert.err")
+ if self.software == "bcl2fastq":
+ demux_log = os.path.join(
+ self.run_dir, f"demux_{demux_id}_bcl2fastq.err"
+ )
+ elif self.software == "bclconvert":
+ demux_log = os.path.join(
+ self.run_dir, f"demux_{demux_id}_bcl-convert.err"
+ )
else:
raise RuntimeError("Unrecognized software!")
if os.path.isfile(demux_log):
- errors, warnings, error_and_warning_messages = self._check_demux_log(demux_id, demux_log)
+ (
+ errors,
+ warnings,
+ error_and_warning_messages,
+ ) = self._check_demux_log(demux_id, demux_log)
else:
- raise RuntimeError(f"No demux log file found for sub-demultiplexing {demux_id}!")
- self.demux_summary[demux_id] = {'errors' : errors,
- 'warnings' : warnings,
- 'error_and_warning_messages' : error_and_warning_messages
- }
+ raise RuntimeError(
+ f"No demux log file found for sub-demultiplexing {demux_id}!"
+ )
+ self.demux_summary[demux_id] = {
+ "errors": errors,
+ "warnings": warnings,
+ "error_and_warning_messages": error_and_warning_messages,
+ }
if errors or warnings:
- logger.info(f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!")
+ logger.info(
+ f"Sub-Demultiplexing in {demux_folder} completed with {errors} errors and {warnings} warnings!"
+ )
else:
- logger.info(f"Sub-Demultiplexing in {demux_folder} completed without any error or warning.")
+ logger.info(
+ f"Sub-Demultiplexing in {demux_folder} completed without any error or warning."
+ )
else:
all_demux_done = all_demux_done and False
logger.info(f"Sub-Demultiplexing in {demux_folder} not completed yet.")
# All demux jobs finished and all stats aggregated under Demultiplexing
# Aggreate all the results in the Demultiplexing folder
- if all_demux_done and dex_status!='COMPLETED':
- dex_status = 'COMPLETED'
+ if all_demux_done and dex_status != "COMPLETED":
+ dex_status = "COMPLETED"
self._aggregate_demux_results()
self.runParserObj = RunParser(self.run_dir)
# Rename undetermined if needed
- lanes = misc.return_unique([lanes['Lane'] for lanes in self.runParserObj.samplesheet.data])
+ lanes = misc.return_unique(
+ [lanes["Lane"] for lanes in self.runParserObj.samplesheet.data]
+ )
samples_per_lane = self.get_samples_per_lane()
for lane in lanes:
if self.is_unpooled_lane(lane):
@@ -121,8 +155,8 @@ def _check_demux_log(self, demux_id, demux_log):
"""
with open(demux_log) as demux_log_file:
demux_log_content = demux_log_file.readlines()
- if self.software == 'bcl2fastq':
- pattern = r'Processing completed with (\d+) errors and (\d+) warnings'
+ if self.software == "bcl2fastq":
+ pattern = r"Processing completed with (\d+) errors and (\d+) warnings"
match = re.search(pattern, demux_log_content[-1])
if match:
errors = int(match.group(1))
@@ -130,20 +164,22 @@ def _check_demux_log(self, demux_id, demux_log):
error_and_warning_messages = []
if errors or warnings:
for line in demux_log_content:
- if 'ERROR' in line or 'WARN' in line:
+ if "ERROR" in line or "WARN" in line:
error_and_warning_messages.append(line)
return errors, warnings, error_and_warning_messages
else:
- raise RuntimeError(f"Bad format with log file demux_{demux_id}_bcl2fastq.err")
- elif self.software == 'bclconvert':
+ raise RuntimeError(
+ f"Bad format with log file demux_{demux_id}_bcl2fastq.err"
+ )
+ elif self.software == "bclconvert":
errors = 0
warnings = 0
error_and_warning_messages = []
for line in demux_log_content:
- if 'ERROR' in line:
+ if "ERROR" in line:
errors += 1
error_and_warning_messages.append(line)
- elif 'WARNING' in line:
+ elif "WARNING" in line:
warnnings += 1
error_and_warning_messages.append(line)
return errors, warnings, error_and_warning_messages
@@ -170,50 +206,53 @@ def _get_demux_folder(self):
def _get_samplesheet(self):
"""
- Locate and parse the samplesheet for a run. The idea is that there is a folder in
- samplesheet_folders that contains a samplesheet named flowecell_id.csv.
+ Locate and parse the samplesheet for a run. The idea is that there is a folder in
+        samplesheet_folders that contains a samplesheet named flowcell_id.csv.
"""
try:
# Only implemented for some, (e.g. NovaSeqXPlus)
# Will raise AttributeError if not implemented.
current_year = self._current_year()
except AttributeError:
- current_year = '20' + self.id[0:2]
+ current_year = "20" + self.id[0:2]
- samplesheets_dir = os.path.join(self.CONFIG['samplesheets_dir'],
- current_year)
- ssname = os.path.join(samplesheets_dir, f'{self.flowcell_id}.csv')
+ samplesheets_dir = os.path.join(self.CONFIG["samplesheets_dir"], current_year)
+ ssname = os.path.join(samplesheets_dir, f"{self.flowcell_id}.csv")
if os.path.exists(ssname):
return ssname
else:
- raise RuntimeError("not able to find samplesheet {}.csv in {}".format(self.flowcell_id, self.CONFIG['samplesheets_dir']))
+ raise RuntimeError(
+ "not able to find samplesheet {}.csv in {}".format(
+ self.flowcell_id, self.CONFIG["samplesheets_dir"]
+ )
+ )
def _is_demultiplexing_done(self):
- return os.path.exists(os.path.join(self.run_dir,
- self._get_demux_folder(),
- 'Stats',
- 'Stats.json'))
+ return os.path.exists(
+ os.path.join(self.run_dir, self._get_demux_folder(), "Stats", "Stats.json")
+ )
def _is_demultiplexing_started(self):
return os.path.exists(os.path.join(self.run_dir, self._get_demux_folder()))
def _is_sequencing_done(self):
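+        # RTAComplete.txt presumably marks the end of sequencing and CopyComplete.txt
+        # the end of the instrument's copy to the output folder; both are required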
- return os.path.exists(os.path.join(self.run_dir, 'RTAComplete.txt')) and os.path.exists(os.path.join(self.run_dir, 'CopyComplete.txt'))
+ return os.path.exists(
+ os.path.join(self.run_dir, "RTAComplete.txt")
+ ) and os.path.exists(os.path.join(self.run_dir, "CopyComplete.txt"))
def get_run_status(self):
- """ Return the current status of the run.
- """
+ """Return the current status of the run."""
demux_started = self._is_demultiplexing_started()
demux_done = self._is_demultiplexing_done()
sequencing_done = self._is_sequencing_done()
if sequencing_done and demux_done:
- return 'COMPLETED' # run is done, transfer might be ongoing.
+ return "COMPLETED" # run is done, transfer might be ongoing.
elif sequencing_done and demux_started and not demux_done:
- return 'IN_PROGRESS'
+ return "IN_PROGRESS"
elif sequencing_done and not demux_started:
- return 'TO_START'
+ return "TO_START"
elif not sequencing_done:
- return 'SEQUENCING'
+ return "SEQUENCING"
else:
raise RuntimeError("Unexpected status in get_run_status")
@@ -249,49 +288,52 @@ def _compute_base_mask(self):
raise NotImplementedError("Please Implement this method")
def transfer_run(self, t_file, mail_recipients=None):
- """ Transfer a run to the analysis server. Will add group R/W permissions to
- the run directory in the destination server so that the run can be processed
- by any user/account in that group (i.e a functional account...).
- :param str t_file: File where to put the transfer information
+ """Transfer a run to the analysis server. Will add group R/W permissions to
+ the run directory in the destination server so that the run can be processed
+ by any user/account in that group (i.e a functional account...).
+ :param str t_file: File where to put the transfer information
"""
# The option -a implies -o and -g which is not the desired behaviour
- command_line = ['rsync', '-LtDrv']
+ command_line = ["rsync", "-LtDrv"]
# Add R/W permissions to the group
- command_line.append('--chmod=g+rw')
+ command_line.append("--chmod=g+rw")
# This horrible thing here avoids data dup when we use multiple indexes in a lane/FC
command_line.append("--exclude=Demultiplexing_*/*_*")
command_line.append("--include=*/")
- for to_include in self.CONFIG['analysis_server']['sync']['include']:
+ for to_include in self.CONFIG["analysis_server"]["sync"]["include"]:
command_line.append(f"--include={to_include}")
command_line.extend(["--exclude=*", "--prune-empty-dirs"])
- r_user = self.CONFIG['analysis_server']['user']
- r_host = self.CONFIG['analysis_server']['host']
- r_dir = self.CONFIG['analysis_server']['sync']['data_archive']
+ r_user = self.CONFIG["analysis_server"]["user"]
+ r_host = self.CONFIG["analysis_server"]["host"]
+ r_dir = self.CONFIG["analysis_server"]["sync"]["data_archive"]
remote = f"{r_user}@{r_host}:{r_dir}"
command_line.extend([self.run_dir, remote])
# Create temp file indicating that the run is being transferred
try:
- open(os.path.join(self.run_dir, 'transferring'), 'w').close()
+ open(os.path.join(self.run_dir, "transferring"), "w").close()
except OSError as e:
- logger.error(f"Cannot create a file in {self.id}. "
- "Check the run name, and the permissions.")
+ logger.error(
+ f"Cannot create a file in {self.id}. "
+ "Check the run name, and the permissions."
+ )
raise e
- started = (f"Started transfer of run {self.id} on {datetime.now()}")
+ started = f"Started transfer of run {self.id} on {datetime.now()}"
logger.info(started)
# In this particular case we want to capture the exception because we want
# to delete the transfer file
try:
- msge_text=f"I am about to transfer with this command \n{command_line}"
- logger.info(msge_text)
- misc.call_external_command(command_line, with_log_files=True,
- prefix="", log_dir=self.run_dir)
+ msge_text = f"I am about to transfer with this command \n{command_line}"
+ logger.info(msge_text)
+ misc.call_external_command(
+ command_line, with_log_files=True, prefix="", log_dir=self.run_dir
+ )
except subprocess.CalledProcessError as exception:
- os.remove(os.path.join(self.run_dir, 'transferring'))
- #Send an email notifying that the transfer failed
+ os.remove(os.path.join(self.run_dir, "transferring"))
+ # Send an email notifying that the transfer failed
runname = self.id
- sbt = (f"Rsync of run {runname} failed")
- msg= f""" Rsync of data for run {runname} has failed!
+ sbt = f"Rsync of run {runname} failed"
+ msg = f""" Rsync of data for run {runname} has failed!
Raised the following exception: {exception}
"""
if mail_recipients:
@@ -299,16 +341,16 @@ def transfer_run(self, t_file, mail_recipients=None):
raise exception
- logger.info(f'Adding run {self.id} to {t_file}')
- with open(t_file, 'a') as tranfer_file:
- tsv_writer = csv.writer(tranfer_file, delimiter='\t')
+ logger.info(f"Adding run {self.id} to {t_file}")
+ with open(t_file, "a") as transfer_file:
+ tsv_writer = csv.writer(transfer_file, delimiter="\t")
tsv_writer.writerow([self.id, str(datetime.now())])
- os.remove(os.path.join(self.run_dir, 'transferring'))
+ os.remove(os.path.join(self.run_dir, "transferring"))
- #Send an email notifying that the transfer was successful
+ # Send an email notifying that the transfer was successful
runname = self.id
- sbt = (f"Rsync of data for run {runname} to the analysis cluster has finished")
- msg= """ Rsync of data for run {run} to the analysis cluster has finished!
+ sbt = f"Rsync of data for run {runname} to the analysis cluster has finished"
+ msg = """ Rsync of data for run {run} to the analysis cluster has finished!
The run is available at : https://genomics-status.scilifelab.se/flowcells/{run}
""".format(run=runname)
@@ -316,36 +358,35 @@ def transfer_run(self, t_file, mail_recipients=None):
send_mail(sbt, msg, mail_recipients)
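For reference, with made-up configuration values (user, host, paths and the include pattern below are assumptions, not taken from any real CONFIG), the rsync command assembled by transfer_run() would look roughly like:

command_line = [
    "rsync", "-LtDrv",       # -a is avoided since it implies -o and -g
    "--chmod=g+rw",          # give the destination group read/write access
    "--exclude=Demultiplexing_*/*_*",
    "--include=*/",
    "--include=*.fastq.gz",  # example entry from CONFIG["analysis_server"]["sync"]["include"]
    "--exclude=*",
    "--prune-empty-dirs",
    "/data/runs/230101_A00000_0001_AHEXAMPLE",           # self.run_dir (example)
    "funcacct@analysis.example.org:/proj/data_archive",  # f"{r_user}@{r_host}:{r_dir}"
]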
def archive_run(self, destination):
- """ Move run to the archive folder
- :param str destination: the destination folder
+ """Move run to the archive folder
+ :param str destination: the destination folder
"""
if destination and os.path.isdir(destination):
- logger.info(f'archiving run {self.id}')
+ logger.info(f"archiving run {self.id}")
shutil.move(self.run_dir, os.path.join(destination, self.id))
else:
logger.warning("Cannot move run to archive, destination does not exist")
def send_mail(self, sbt, msg, rcp):
- """ Sends mail about run completion
- """
+ """Sends mail about run completion"""
runname = self.id
if not sbt:
sbt = f"{runname}"
misc.send_mail(sbt, msg, rcp)
def is_transferred(self, transfer_file):
- """ Checks wether a run has been transferred to the analysis server or not.
- Returns true in the case in which the tranfer is finished or ongoing.
- :param str transfer_file: Path to file with information about transferred runs
+ """Checks wether a run has been transferred to the analysis server or not.
+ Returns true in the case in which the tranfer is finished or ongoing.
+ :param str transfer_file: Path to file with information about transferred runs
"""
try:
with open(transfer_file) as file_handle:
- transfer_file_contents = csv.reader(file_handle, delimiter='\t')
+ transfer_file_contents = csv.reader(file_handle, delimiter="\t")
for row in transfer_file_contents:
# Rows have two columns: run and transfer date
if row[0] == os.path.basename(self.id):
return True
- if os.path.exists(os.path.join(self.run_dir, 'transferring')):
+ if os.path.exists(os.path.join(self.run_dir, "transferring")):
return True
return False
except OSError:
@@ -353,14 +394,14 @@ def is_transferred(self, transfer_file):
def is_unpooled_lane(self, lane):
"""
- :param lane: lane identifier
- :type lane: string
- :rtype: boolean
- :returns: True if the samplesheet has one entry for that lane, False otherwise
+ :param lane: lane identifier
+ :type lane: string
+ :rtype: boolean
+ :returns: True if the samplesheet has one entry for that lane, False otherwise
"""
count = 0
for l in self.runParserObj.samplesheet.data:
- if l['Lane'] == lane:
+ if l["Lane"] == lane:
count += 1
return count == 1
@@ -374,7 +415,7 @@ def get_samples_per_lane(self):
ss = self.runParserObj.samplesheet
d = {}
for l in ss.data:
- d[l['Lane']] = l[ss.dfield_snm]
+ d[l["Lane"]] = l[ss.dfield_snm]
return d
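A toy samplesheet makes the two helpers above concrete; here dfield_snm is assumed to resolve to "Sample_Name", which is an assumption about the parser:

samplesheet_data = [
    {"Lane": "1", "Sample_Name": "P123_101"},
    {"Lane": "2", "Sample_Name": "P123_102"},
    {"Lane": "2", "Sample_Name": "P123_103"},
]
# is_unpooled_lane("1") -> True   (exactly one entry for lane 1)
# is_unpooled_lane("2") -> False  (two entries for lane 2)
# get_samples_per_lane() -> {"1": "P123_101", "2": "P123_103"}  (last entry per lane wins)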
def _rename_undet(self, lane, samples_per_lane):
@@ -387,25 +428,35 @@ def _rename_undet(self, lane, samples_per_lane):
:param samples_per_lane: lane:sample dict
:type samples_per_lane: dict
"""
- for file in glob.glob(os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*")):
- old_name=os.path.basename(file)
- old_name_comps=old_name.split("_")
- old_name_comps[1]=old_name_comps[0]# replace S0 with Undetermined
- old_name_comps[0]=samples_per_lane[lane]#replace Undetermined with samplename
+ for file in glob.glob(
+ os.path.join(self.run_dir, self.demux_dir, f"Undetermined*L0?{lane}*")
+ ):
+ old_name = os.path.basename(file)
+ old_name_comps = old_name.split("_")
+ old_name_comps[1] = old_name_comps[0] # replace S0 with Undetermined
+ old_name_comps[0] = samples_per_lane[
+ lane
+ ] # replace Undetermined with samplename
for index, comp in enumerate(old_name_comps):
- if comp.startswith('L00'):
- old_name_comps[index]=comp.replace('L00','L01')#adds a 1 as the second lane number in order to differentiate undetermined from normal in piper
-
- new_name="_".join(old_name_comps)
- logger.info(f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}")
+ if comp.startswith("L00"):
+ old_name_comps[index] = comp.replace(
+ "L00", "L01"
+ ) # adds a 1 as the second lane number in order to differentiate undetermined from normal in piper
+
+ new_name = "_".join(old_name_comps)
+ logger.info(
+ f"Renaming {file} to {os.path.join(os.path.dirname(file), new_name)}"
+ )
os.rename(file, os.path.join(os.path.dirname(file), new_name))
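The renaming above can be traced on a single made-up filename:

samples_per_lane = {"1": "P123_101"}
old_name = "Undetermined_S0_L001_R1_001.fastq.gz"
comps = old_name.split("_")       # ['Undetermined', 'S0', 'L001', 'R1', '001.fastq.gz']
comps[1] = comps[0]               # 'S0' -> 'Undetermined'
comps[0] = samples_per_lane["1"]  # 'Undetermined' -> 'P123_101'
comps = [c.replace("L00", "L01") if c.startswith("L00") else c for c in comps]
"_".join(comps)                   # 'P123_101_Undetermined_L011_R1_001.fastq.gz'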
def _classify_lanes(self, samplesheets):
# Prepare a list for lanes with NoIndex samples
noindex_lanes = []
for entry in self.runParserObj.samplesheet.data:
- if entry['index'].upper() == 'NOINDEX' or (entry['index'] == '' and entry['index2'] == ''):
- noindex_lanes.append(entry['Lane'])
+ if entry["index"].upper() == "NOINDEX" or (
+ entry["index"] == "" and entry["index2"] == ""
+ ):
+ noindex_lanes.append(entry["Lane"])
# Prepare a dict with the lane, demux_id and index_length info based on the sub-samplesheets
# This is used to decide simple_lanes and complex_lanes, plus to determine which demux_id's Stats.json file to start from for each lane
lane_demuxid_indexlength = dict()
@@ -413,10 +464,18 @@ def _classify_lanes(self, samplesheets):
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
ssparser = SampleSheetParser(samplesheet)
for row in ssparser.data:
- if row['Lane'] not in lane_demuxid_indexlength.keys():
- lane_demuxid_indexlength[row['Lane']] = {demux_id: [len(row.get('index','')), len(row.get('index2',''))]}
- elif demux_id not in lane_demuxid_indexlength[row['Lane']].keys():
- lane_demuxid_indexlength[row['Lane']][demux_id] = [len(row.get('index','')), len(row.get('index2',''))]
+ if row["Lane"] not in lane_demuxid_indexlength.keys():
+ lane_demuxid_indexlength[row["Lane"]] = {
+ demux_id: [
+ len(row.get("index", "")),
+ len(row.get("index2", "")),
+ ]
+ }
+ elif demux_id not in lane_demuxid_indexlength[row["Lane"]].keys():
+ lane_demuxid_indexlength[row["Lane"]][demux_id] = [
+ len(row.get("index", "")),
+ len(row.get("index2", "")),
+ ]
else:
pass
@@ -433,7 +492,12 @@ def _classify_lanes(self, samplesheets):
# Dual and longer indexes have higher priority
if 0 in list(complex_lanes[key].values())[0] and 0 not in vv:
complex_lanes[key] = {vk: vv}
- elif (0 in list(complex_lanes[key].values())[0] and 0 in vv) or (0 not in list(complex_lanes[key].values())[0] and 0 not in vv):
+ elif (
+ 0 in list(complex_lanes[key].values())[0] and 0 in vv
+ ) or (
+ 0 not in list(complex_lanes[key].values())[0]
+ and 0 not in vv
+ ):
if sum(vv) > sum(list(complex_lanes[key].values())[0]):
complex_lanes[key] = {vk: vv}
else:
@@ -441,113 +505,192 @@ def _classify_lanes(self, samplesheets):
return noindex_lanes, simple_lanes, complex_lanes
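The bookkeeping above keeps, per complex lane, the demux run with the highest-priority index layout; a hedged sketch of that comparison, operating on the [len(index), len(index2)] pairs collected from the sub-samplesheets (function name is illustrative only):

def should_replace(current, candidate):
    # Dual-index (no zero length) beats single-index; among equals, the
    # longer total index length wins, matching the rule in _classify_lanes().
    if 0 in current and 0 not in candidate:
        return True
    if (0 in current) == (0 in candidate):
        return sum(candidate) > sum(current)
    return False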
- def _process_noindex_sample_with_fake_index_with_single_demux(self, demux_id, legacy_path):
+ def _process_noindex_sample_with_fake_index_with_single_demux(
+ self, demux_id, legacy_path
+ ):
demux_folder = os.path.join(self.run_dir, self.demux_dir)
sample_counter = 1
- for entry in sorted(self.runParserObj.samplesheet.data, key=lambda k: k['Lane']):
- lane = entry['Lane']
- project = entry['Sample_Project']
- sample = entry['Sample_ID']
+ for entry in sorted(
+ self.runParserObj.samplesheet.data, key=lambda k: k["Lane"]
+ ):
+ lane = entry["Lane"]
+ project = entry["Sample_Project"]
+ sample = entry["Sample_ID"]
project_dest = os.path.join(demux_folder, project)
if not os.path.exists(project_dest):
os.makedirs(project_dest)
sample_dest = os.path.join(project_dest, sample)
if not os.path.exists(sample_dest):
os.makedirs(sample_dest)
- for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")):
+ for file in glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ f"Undetermined*L0?{lane}*",
+ )
+ ):
old_name = os.path.basename(file)
old_name_comps = old_name.split("_")
- new_name_comps = [sample.replace('Sample_',''), f'S{str(sample_counter)}'] + old_name_comps[2:]
+ new_name_comps = [
+ sample.replace("Sample_", ""),
+ f"S{str(sample_counter)}",
+ ] + old_name_comps[2:]
new_name = "_".join(new_name_comps)
os.symlink(file, os.path.join(sample_dest, new_name))
- logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_',''), old_name, new_name))
+ logger.info(
+ "For undet sample {}, renaming {} to {}".format(
+ sample.replace("Sample_", ""), old_name, new_name
+ )
+ )
sample_counter += 1
# Make a softlink of lane.html
- html_report_lane_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html")
- html_report_lane_dest = os.path.join(demux_folder, "Reports", "html", self.flowcell_id, "all", "all", "all", "lane.html")
+ html_report_lane_source = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "lane.html",
+ )
+ html_report_lane_dest = os.path.join(
+ demux_folder,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "lane.html",
+ )
if not os.path.isdir(os.path.dirname(html_report_lane_dest)):
os.makedirs(os.path.dirname(html_report_lane_dest))
os.symlink(html_report_lane_source, html_report_lane_dest)
# Modify the laneBarcode.html file
- html_report_laneBarcode = os.path.join(self.run_dir,
- f"Demultiplexing_{demux_id}",
- legacy_path,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "laneBarcode.html"
- )
+ html_report_laneBarcode = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
html_report_laneBarcode_parser = LaneBarcodeParser(html_report_laneBarcode)
lane_project_sample = dict()
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Sample'] != 'Undetermined':
- lane_project_sample[entry['Lane']] = {'Project': entry['Project'],
- 'Sample': entry['Sample']
- }
+ if entry["Sample"] != "Undetermined":
+ lane_project_sample[entry["Lane"]] = {
+ "Project": entry["Project"],
+ "Sample": entry["Sample"],
+ }
for entry in html_report_laneBarcode_parser.sample_data[:]:
- if entry['Sample'] == 'Undetermined':
- entry['Project'] = lane_project_sample[entry['Lane']]['Project']
- entry['Sample'] = lane_project_sample[entry['Lane']]['Sample']
+ if entry["Sample"] == "Undetermined":
+ entry["Project"] = lane_project_sample[entry["Lane"]]["Project"]
+ entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"]
else:
html_report_laneBarcode_parser.sample_data.remove(entry)
- html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data,
- key=lambda k: (k['Lane'].lower(), k['Sample']))
- new_html_report_laneBarcode = os.path.join(demux_folder,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "laneBarcode.html"
- )
+ html_report_laneBarcode_parser.sample_data = sorted(
+ html_report_laneBarcode_parser.sample_data,
+ key=lambda k: (k["Lane"].lower(), k["Sample"]),
+ )
+ new_html_report_laneBarcode = os.path.join(
+ demux_folder,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
_generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser)
if not os.path.exists(os.path.join(demux_folder, "Stats")):
os.makedirs(os.path.join(demux_folder, "Stats"))
# Modify the Stats.json file
- stat_json_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json")
+ stat_json_source = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Stats",
+ "Stats.json",
+ )
stat_json_new = os.path.join(demux_folder, "Stats", "Stats.json")
with open(stat_json_source) as json_data:
data = json.load(json_data)
# Fix the sample stats per lane
- for entry in data['ConversionResults'][:]:
- del entry['DemuxResults'][0]['IndexMetrics']
- entry['DemuxResults'][0].update(entry['Undetermined'])
- del entry['Undetermined']
+ for entry in data["ConversionResults"][:]:
+ del entry["DemuxResults"][0]["IndexMetrics"]
+ entry["DemuxResults"][0].update(entry["Undetermined"])
+ del entry["Undetermined"]
# Reset unknown barcodes list
- for entry in data['UnknownBarcodes'][:]:
- entry['Barcodes'] = {'unknown': 1}
+ for entry in data["UnknownBarcodes"][:]:
+ entry["Barcodes"] = {"unknown": 1}
# Write to a new Stats.json file
- with open(stat_json_new, 'w') as stat_json_new_file:
+ with open(stat_json_new, "w") as stat_json_new_file:
json.dump(data, stat_json_new_file)
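The Stats.json rewrite above folds the Undetermined counts into the single (fake-index) sample entry; on a minimal made-up ConversionResults entry:

entry = {
    "DemuxResults": [{"SampleId": "P123_101", "NumberReads": 10, "IndexMetrics": []}],
    "Undetermined": {"NumberReads": 90, "Yield": 9000},
}
del entry["DemuxResults"][0]["IndexMetrics"]
entry["DemuxResults"][0].update(entry["Undetermined"])
del entry["Undetermined"]
# entry["DemuxResults"][0] == {"SampleId": "P123_101", "NumberReads": 90, "Yield": 9000}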
- def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_lanes):
- elements = [element for element in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) ]
+ def _process_simple_lane_with_single_demux(
+ self, demux_id, legacy_path, noindex_lanes
+ ):
+ elements = os.listdir(
+ os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")
+ )
for element in elements:
- if "Stats" not in element and "Reports" not in element: #skip this folder and treat it differently to take into account the NoIndex case
- source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", element)
+ if (
+ "Stats" not in element and "Reports" not in element
+ ): # skip this folder and treat it differently to take into account the NoIndex case
+ source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", element
+ )
dest = os.path.join(self.run_dir, self.demux_dir, element)
os.symlink(source, dest)
os.makedirs(os.path.join(self.run_dir, self.demux_dir, "Stats"))
# Fetch the lanes that have NoIndex
- statsFiles = glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*" ))
+ statsFiles = glob.glob(
+ os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "*"
+ )
+ )
for source in statsFiles:
source_name = os.path.split(source)[1]
- if source_name not in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]:
- lane = os.path.splitext(os.path.split(source)[1])[0][-1] #lane
+ if source_name not in [
+ "DemultiplexingStats.xml",
+ "AdapterTrimming.txt",
+ "ConversionStats.xml",
+ "Stats.json",
+ ]:
+ lane = os.path.splitext(os.path.split(source)[1])[0][-1] # lane
if lane not in noindex_lanes:
- dest = os.path.join(self.run_dir, self.demux_dir, "Stats", source_name)
+ dest = os.path.join(
+ self.run_dir, self.demux_dir, "Stats", source_name
+ )
os.symlink(source, dest)
- for file in ["DemultiplexingStats.xml", "AdapterTrimming.txt", "ConversionStats.xml", "Stats.json"]:
- source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file)
+ for file in [
+ "DemultiplexingStats.xml",
+ "AdapterTrimming.txt",
+ "ConversionStats.xml",
+ "Stats.json",
+ ]:
+ source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", file
+ )
dest = os.path.join(self.run_dir, self.demux_dir, "Stats", file)
os.symlink(source, dest)
- source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports")
+ source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Reports"
+ )
dest = os.path.join(self.run_dir, self.demux_dir, "Reports")
if os.path.exists(dest):
try:
@@ -556,17 +699,27 @@ def _process_simple_lane_with_single_demux(self, demux_id, legacy_path, noindex_
os.unlink(dest)
os.symlink(source, dest)
- def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode):
+ def _fix_html_reports_for_complex_lanes(
+ self,
+ demux_folder,
+ index_cycles,
+ complex_lanes,
+ noindex_lanes,
+ html_reports_lane,
+ html_reports_laneBarcode,
+ ):
# Start with the lane
html_report_lane_parser = None
for next_html_report_lane in html_reports_lane:
if html_report_lane_parser is None:
html_report_lane_parser = LaneBarcodeParser(next_html_report_lane)
else:
- lanesInReport = [Lane['Lane'] for Lane in html_report_lane_parser.sample_data]
+ lanesInReport = [
+ Lane["Lane"] for Lane in html_report_lane_parser.sample_data
+ ]
next_html_report_lane_parser = LaneBarcodeParser(next_html_report_lane)
for entry in next_html_report_lane_parser.sample_data:
- if entry['Lane'] not in lanesInReport:
+ if entry["Lane"] not in lanesInReport:
# If this is a new lane not included before
html_report_lane_parser.sample_data.append(entry)
# Now all lanes have been inserted
@@ -579,193 +732,392 @@ def _fix_html_reports_for_complex_lanes(self, demux_folder, index_cycles, comple
Yield_Mbases = 0
for entry in html_report_lane_parser.sample_data:
# Update NumberReads for total lane clusters
- NumberReads_Summary[entry['Lane']] = {'total_lane_cluster': int(entry['PF Clusters'].replace(',', '')),
- 'total_lane_yield': int(entry['Yield (Mbases)'].replace(',', ''))}
- Clusters_Raw += int(int(entry['PF Clusters'].replace(',', '')) / float(entry['% PFClusters']) * 100)
- Clusters_PF += int(entry['PF Clusters'].replace(',', ''))
- Yield_Mbases += int(entry['Yield (Mbases)'].replace(',', ''))
- if entry['Lane'] in complex_lanes.keys():
- entry['% Perfectbarcode'] = None
- entry['% One mismatchbarcode'] = None
+ NumberReads_Summary[entry["Lane"]] = {
+ "total_lane_cluster": int(entry["PF Clusters"].replace(",", "")),
+ "total_lane_yield": int(entry["Yield (Mbases)"].replace(",", "")),
+ }
+ Clusters_Raw += int(
+ int(entry["PF Clusters"].replace(",", ""))
+ / float(entry["% PFClusters"])
+ * 100
+ )
+ Clusters_PF += int(entry["PF Clusters"].replace(",", ""))
+ Yield_Mbases += int(entry["Yield (Mbases)"].replace(",", ""))
+ if entry["Lane"] in complex_lanes.keys():
+ entry["% Perfectbarcode"] = None
+ entry["% One mismatchbarcode"] = None
# Update the values in Flowcell Summary
- html_report_lane_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}'
- html_report_lane_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}'
- html_report_lane_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}'
+ html_report_lane_parser.flowcell_data["Clusters (Raw)"] = f"{Clusters_Raw:,}"
+ html_report_lane_parser.flowcell_data["Clusters(PF)"] = f"{Clusters_PF:,}"
+ html_report_lane_parser.flowcell_data["Yield (MBases)"] = f"{Yield_Mbases:,}"
# Add lanes not present in this demux
# Create the new lane.html
- new_html_report_lane_dir = _create_folder_structure(demux_folder, ['Reports', 'html', self.flowcell_id, 'all', 'all', 'all'])
- new_html_report_lane = os.path.join(new_html_report_lane_dir, 'lane.html')
+ new_html_report_lane_dir = _create_folder_structure(
+ demux_folder, ["Reports", "html", self.flowcell_id, "all", "all", "all"]
+ )
+ new_html_report_lane = os.path.join(new_html_report_lane_dir, "lane.html")
_generate_lane_html(new_html_report_lane, html_report_lane_parser)
# Generate the laneBarcode
html_report_laneBarcode_parser = None
for next_html_report_laneBarcode in html_reports_laneBarcode:
if html_report_laneBarcode_parser is None:
- html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode)
+ html_report_laneBarcode_parser = LaneBarcodeParser(
+ next_html_report_laneBarcode
+ )
else:
# No need to check samples occurring in more than one file as it would be spotted while softlinking
- next_html_report_laneBarcode_parser = LaneBarcodeParser(next_html_report_laneBarcode)
+ next_html_report_laneBarcode_parser = LaneBarcodeParser(
+ next_html_report_laneBarcode
+ )
for entry in next_html_report_laneBarcode_parser.sample_data:
html_report_laneBarcode_parser.sample_data.append(entry)
# For complex lanes, set all numbers of undetermined to 0. And only keep one such entry
- constant_keys = ['Lane', 'Barcode sequence', 'Project', 'Sample']
+ constant_keys = ["Lane", "Barcode sequence", "Project", "Sample"]
modified_complex_lanes = []
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Lane'] in list(complex_lanes.keys()) and entry['Project'] in 'default':
- if entry['Lane'] not in modified_complex_lanes:
+ if (
+ entry["Lane"] in list(complex_lanes.keys())
+ and entry["Project"] in "default"
+ ):
+ if entry["Lane"] not in modified_complex_lanes:
for key in entry.keys():
if key not in constant_keys:
- entry[key] = '0'
- modified_complex_lanes.append(entry['Lane'])
+ entry[key] = "0"
+ modified_complex_lanes.append(entry["Lane"])
else:
html_report_laneBarcode_parser.sample_data.remove(entry)
# Update NumberReads for total sample yields
for entry in html_report_laneBarcode_parser.sample_data:
- if 'total_sample_cluster' not in NumberReads_Summary[entry['Lane']].keys():
- NumberReads_Summary[entry['Lane']]['total_sample_cluster'] = 0
- NumberReads_Summary[entry['Lane']]['total_sample_yield'] = 0
- if entry['Project'] != 'default':
- NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', ''))
- NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', ''))
+ if "total_sample_cluster" not in NumberReads_Summary[entry["Lane"]].keys():
+ NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] = 0
+ NumberReads_Summary[entry["Lane"]]["total_sample_yield"] = 0
+ if entry["Project"] != "default":
+ NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int(
+ entry["PF Clusters"].replace(",", "")
+ )
+ NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int(
+ entry["Yield (Mbases)"].replace(",", "")
+ )
else:
- if entry['Project'] != 'default':
- NumberReads_Summary[entry['Lane']]['total_sample_cluster'] += int(entry['PF Clusters'].replace(',', ''))
- NumberReads_Summary[entry['Lane']]['total_sample_yield'] += int(entry['Yield (Mbases)'].replace(',', ''))
+ if entry["Project"] != "default":
+ NumberReads_Summary[entry["Lane"]]["total_sample_cluster"] += int(
+ entry["PF Clusters"].replace(",", "")
+ )
+ NumberReads_Summary[entry["Lane"]]["total_sample_yield"] += int(
+ entry["Yield (Mbases)"].replace(",", "")
+ )
# Calculate the cluster/yield numbers of undetermined reads
for key, value in NumberReads_Summary.items():
- value['undet_cluster'] = value['total_lane_cluster'] - value['total_sample_cluster']
- value['undet_yield'] = value['total_lane_yield'] - value['total_sample_yield']
+ value["undet_cluster"] = (
+ value["total_lane_cluster"] - value["total_sample_cluster"]
+ )
+ value["undet_yield"] = (
+ value["total_lane_yield"] - value["total_sample_yield"]
+ )
# Update the cluster/yield info of undet for complex lanes
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Project'] == 'default' and entry['Lane'] in complex_lanes.keys():
- entry['PF Clusters'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_cluster'])
- entry['Yield (Mbases)'] = '{:,}'.format(NumberReads_Summary[entry['Lane']]['undet_yield'])
+ if entry["Project"] == "default" and entry["Lane"] in complex_lanes.keys():
+ entry["PF Clusters"] = "{:,}".format(
+ NumberReads_Summary[entry["Lane"]]["undet_cluster"]
+ )
+ entry["Yield (Mbases)"] = "{:,}".format(
+ NumberReads_Summary[entry["Lane"]]["undet_yield"]
+ )
# Fix the special case where we assign fake indexes for NoIndex samples
if noindex_lanes and index_cycles != [0, 0]:
lane_project_sample = dict()
for entry in html_report_laneBarcode_parser.sample_data:
- if entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined':
- lane_project_sample[entry['Lane']] = {'Project': entry['Project'],
- 'Sample': entry['Sample']}
+ if entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined":
+ lane_project_sample[entry["Lane"]] = {
+ "Project": entry["Project"],
+ "Sample": entry["Sample"],
+ }
for entry in html_report_laneBarcode_parser.sample_data[:]:
- if entry['Lane'] in noindex_lanes and entry['Sample'] == 'Undetermined':
- entry['Project'] = lane_project_sample[entry['Lane']]['Project']
- entry['Sample'] = lane_project_sample[entry['Lane']]['Sample']
- elif entry['Lane'] in noindex_lanes and entry['Sample'] != 'Undetermined':
+ if entry["Lane"] in noindex_lanes and entry["Sample"] == "Undetermined":
+ entry["Project"] = lane_project_sample[entry["Lane"]]["Project"]
+ entry["Sample"] = lane_project_sample[entry["Lane"]]["Sample"]
+ elif (
+ entry["Lane"] in noindex_lanes and entry["Sample"] != "Undetermined"
+ ):
html_report_laneBarcode_parser.sample_data.remove(entry)
# Sort sample_data: first by lane then by sample ID
- html_report_laneBarcode_parser.sample_data = sorted(html_report_laneBarcode_parser.sample_data,
- key=lambda k: (k['Lane'].lower(), k['Sample']))
+ html_report_laneBarcode_parser.sample_data = sorted(
+ html_report_laneBarcode_parser.sample_data,
+ key=lambda k: (k["Lane"].lower(), k["Sample"]),
+ )
# Update the values in Flowcell Summary
- html_report_laneBarcode_parser.flowcell_data['Clusters (Raw)'] = f'{Clusters_Raw:,}'
- html_report_laneBarcode_parser.flowcell_data['Clusters(PF)'] = f'{Clusters_PF:,}'
- html_report_laneBarcode_parser.flowcell_data['Yield (MBases)'] = f'{Yield_Mbases:,}'
+ html_report_laneBarcode_parser.flowcell_data[
+ "Clusters (Raw)"
+ ] = f"{Clusters_Raw:,}"
+ html_report_laneBarcode_parser.flowcell_data[
+ "Clusters(PF)"
+ ] = f"{Clusters_PF:,}"
+ html_report_laneBarcode_parser.flowcell_data[
+ "Yield (MBases)"
+ ] = f"{Yield_Mbases:,}"
# Generate the new report for laneBarcode.html
- new_html_report_laneBarcode = os.path.join(new_html_report_lane_dir, 'laneBarcode.html')
+ new_html_report_laneBarcode = os.path.join(
+ new_html_report_lane_dir, "laneBarcode.html"
+ )
_generate_lane_html(new_html_report_laneBarcode, html_report_laneBarcode_parser)
- def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes):
+ def _fix_demultiplexingstats_xml_dir(
+ self,
+ demux_folder,
+ stats_json,
+ samplesheets,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ ):
# Create the DemultiplexingStats.xml (empty; it is here only to indicate that demux is done)
- DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ['Stats'])
+ DemultiplexingStats_xml_dir = _create_folder_structure(demux_folder, ["Stats"])
# For creating DemuxSummary.txt files for complex lanes
DemuxSummaryFiles_complex_lanes = dict()
# Generate the Stats.json
- with open(os.path.join(DemultiplexingStats_xml_dir, 'Stats.json'), 'w') as json_data_cumulative:
+ with open(
+ os.path.join(DemultiplexingStats_xml_dir, "Stats.json"), "w"
+ ) as json_data_cumulative:
stats_list = {}
for stat_json in stats_json:
- demux_id = re.findall('Demultiplexing_([0-9])', stat_json)[0]
+ demux_id = re.findall("Demultiplexing_([0-9])", stat_json)[0]
with open(stat_json) as json_data_partial:
data = json.load(json_data_partial)
if len(stats_list) == 0:
# First time I do this
- stats_list['RunNumber'] = data['RunNumber']
- stats_list['Flowcell'] = data['Flowcell']
- stats_list['RunId'] = data['RunId']
- stats_list['ConversionResults'] = data['ConversionResults']
- stats_list['ReadInfosForLanes'] = data['ReadInfosForLanes']
- stats_list['UnknownBarcodes'] = []
+ stats_list["RunNumber"] = data["RunNumber"]
+ stats_list["Flowcell"] = data["Flowcell"]
+ stats_list["RunId"] = data["RunId"]
+ stats_list["ConversionResults"] = data["ConversionResults"]
+ stats_list["ReadInfosForLanes"] = data["ReadInfosForLanes"]
+ stats_list["UnknownBarcodes"] = []
else:
# Update only the important fields
- lanes_present_in_stats_json = [entry['LaneNumber'] for entry in stats_list['ConversionResults']]
- for ReadInfosForLanes_lane in data['ReadInfosForLanes']:
- if ReadInfosForLanes_lane['LaneNumber'] not in lanes_present_in_stats_json:
- stats_list['ReadInfosForLanes'].extend([ReadInfosForLanes_lane])
- for ConversionResults_lane in data['ConversionResults']:
- if ConversionResults_lane['LaneNumber'] in lanes_present_in_stats_json and str(ConversionResults_lane['LaneNumber']) in complex_lanes.keys():
+ lanes_present_in_stats_json = [
+ entry["LaneNumber"]
+ for entry in stats_list["ConversionResults"]
+ ]
+ for ReadInfosForLanes_lane in data["ReadInfosForLanes"]:
+ if (
+ ReadInfosForLanes_lane["LaneNumber"]
+ not in lanes_present_in_stats_json
+ ):
+ stats_list["ReadInfosForLanes"].extend(
+ [ReadInfosForLanes_lane]
+ )
+ for ConversionResults_lane in data["ConversionResults"]:
+ if (
+ ConversionResults_lane["LaneNumber"]
+ in lanes_present_in_stats_json
+ and str(ConversionResults_lane["LaneNumber"])
+ in complex_lanes.keys()
+ ):
# For complex lanes, we set all stats to 0, except for read number and yield which will use values from NumberReads_Summary
- ConversionResults_lane['Undetermined']['NumberReads'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_cluster']
- ConversionResults_lane['Undetermined']['Yield'] = NumberReads_Summary[str(ConversionResults_lane['LaneNumber'])]['undet_yield']*1000000
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['QualityScoreSum'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['TrimmedBases'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['Yield'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][0]['YieldQ30'] = 0
- if len([r for r in self.runParserObj.runinfo.data['Reads'] if r['IsIndexedRead'] == 'N']) == 2:
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['QualityScoreSum'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['TrimmedBases'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['Yield'] = 0
- ConversionResults_lane['Undetermined']['ReadMetrics'][1]['YieldQ30'] = 0
+ ConversionResults_lane["Undetermined"][
+ "NumberReads"
+ ] = NumberReads_Summary[
+ str(ConversionResults_lane["LaneNumber"])
+ ]["undet_cluster"]
+ ConversionResults_lane["Undetermined"]["Yield"] = (
+ NumberReads_Summary[
+ str(ConversionResults_lane["LaneNumber"])
+ ]["undet_yield"]
+ * 1000000
+ )
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["QualityScoreSum"] = 0
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["TrimmedBases"] = 0
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["Yield"] = 0
+ ConversionResults_lane["Undetermined"]["ReadMetrics"][
+ 0
+ ]["YieldQ30"] = 0
+ if (
+ len(
+ [
+ r
+ for r in self.runParserObj.runinfo.data[
+ "Reads"
+ ]
+ if r["IsIndexedRead"] == "N"
+ ]
+ )
+ == 2
+ ):
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["QualityScoreSum"] = 0
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["TrimmedBases"] = 0
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["Yield"] = 0
+ ConversionResults_lane["Undetermined"][
+ "ReadMetrics"
+ ][1]["YieldQ30"] = 0
# Find the list containing info for this lane #TODO: can lane_to_update be removed?
- lane_to_update = [entry for entry in stats_list['ConversionResults'] if entry['LaneNumber'] == ConversionResults_lane['LaneNumber']][0]
- lane_to_update['DemuxResults'].extend(ConversionResults_lane['DemuxResults'])
- lane_to_update['Undetermined'] = ConversionResults_lane['Undetermined']
+ lane_to_update = [
+ entry
+ for entry in stats_list["ConversionResults"]
+ if entry["LaneNumber"]
+ == ConversionResults_lane["LaneNumber"]
+ ][0]
+ lane_to_update["DemuxResults"].extend(
+ ConversionResults_lane["DemuxResults"]
+ )
+ lane_to_update["Undetermined"] = ConversionResults_lane[
+ "Undetermined"
+ ]
else:
- stats_list['ConversionResults'].extend([ConversionResults_lane])
-
- for unknown_barcode_lane in data['UnknownBarcodes']:
- if str(unknown_barcode_lane['Lane']) in simple_lanes.keys():
- stats_list['UnknownBarcodes'].extend([unknown_barcode_lane])
- elif str(unknown_barcode_lane['Lane']) in complex_lanes.keys():
- if list(complex_lanes[str(unknown_barcode_lane['Lane'])].keys())[0] == demux_id:
+ stats_list["ConversionResults"].extend(
+ [ConversionResults_lane]
+ )
+
+ for unknown_barcode_lane in data["UnknownBarcodes"]:
+ if str(unknown_barcode_lane["Lane"]) in simple_lanes.keys():
+ stats_list["UnknownBarcodes"].extend([unknown_barcode_lane])
+ elif str(unknown_barcode_lane["Lane"]) in complex_lanes.keys():
+ if (
+ list(
+ complex_lanes[
+ str(unknown_barcode_lane["Lane"])
+ ].keys()
+ )[0]
+ == demux_id
+ ):
# First have the list of unknown indexes from the top priority demux run
full_list_unknownbarcodes = unknown_barcode_lane
# Remove the samples involved in the other samplesheets
for samplesheet in samplesheets:
- demux_id_ss = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
+ demux_id_ss = os.path.splitext(
+ os.path.split(samplesheet)[1]
+ )[0].split("_")[1]
if demux_id_ss != demux_id:
ssparser = SampleSheetParser(samplesheet)
- ssparser_data_lane = [row for row in ssparser.data if row['Lane'] == str(unknown_barcode_lane['Lane'])]
+ ssparser_data_lane = [
+ row
+ for row in ssparser.data
+ if row["Lane"]
+ == str(unknown_barcode_lane["Lane"])
+ ]
for row in ssparser_data_lane:
- sample_idx1 = row.get('index','')
- sample_idx2 = row.get('index2','')
- idx_copy = tuple(full_list_unknownbarcodes['Barcodes'].keys())
+ sample_idx1 = row.get("index", "")
+ sample_idx2 = row.get("index2", "")
+ idx_copy = tuple(
+ full_list_unknownbarcodes[
+ "Barcodes"
+ ].keys()
+ )
for idx in idx_copy:
- unknownbarcode_idx1 = idx.split('+')[0] if '+' in idx else idx
- unknownbarcode_idx2 = idx.split('+')[1] if '+' in idx else ''
+ unknownbarcode_idx1 = (
+ idx.split("+")[0]
+ if "+" in idx
+ else idx
+ )
+ unknownbarcode_idx2 = (
+ idx.split("+")[1]
+ if "+" in idx
+ else ""
+ )
if sample_idx1 and sample_idx2:
- comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)]
- comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx2) else sample_idx2[:len(unknownbarcode_idx2)]
- if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)] and comparepart_idx2 == unknownbarcode_idx2[:len(comparepart_idx2)]:
- del full_list_unknownbarcodes['Barcodes'][idx]
+ comparepart_idx1 = (
+ sample_idx1
+ if len(sample_idx1)
+ <= len(unknownbarcode_idx1)
+ else sample_idx1[
+ : len(unknownbarcode_idx1)
+ ]
+ )
+ comparepart_idx2 = (
+ sample_idx2
+ if len(sample_idx2)
+ <= len(unknownbarcode_idx2)
+ else sample_idx2[
+ : len(unknownbarcode_idx2)
+ ]
+ )
+ if (
+ comparepart_idx1
+ == unknownbarcode_idx1[
+ : len(comparepart_idx1)
+ ]
+ and comparepart_idx2
+ == unknownbarcode_idx2[
+ : len(comparepart_idx2)
+ ]
+ ):
+ del full_list_unknownbarcodes[
+ "Barcodes"
+ ][idx]
elif sample_idx1 and not sample_idx2:
- comparepart_idx1 = sample_idx1 if len(sample_idx1) <= len(unknownbarcode_idx1) else sample_idx1[:len(unknownbarcode_idx1)]
- if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx1)]:
- del full_list_unknownbarcodes['Barcodes'][idx]
+ comparepart_idx1 = (
+ sample_idx1
+ if len(sample_idx1)
+ <= len(unknownbarcode_idx1)
+ else sample_idx1[
+ : len(unknownbarcode_idx1)
+ ]
+ )
+ if (
+ comparepart_idx1
+ == unknownbarcode_idx1[
+ : len(comparepart_idx1)
+ ]
+ ):
+ del full_list_unknownbarcodes[
+ "Barcodes"
+ ][idx]
elif not sample_idx1 and sample_idx2:
- comparepart_idx2 = sample_idx2 if len(sample_idx2) <= len(unknownbarcode_idx1) else sample_idx2[:len(unknownbarcode_idx1)]
- if comparepart_idx1 == unknownbarcode_idx1[:len(comparepart_idx2)]:
- del full_list_unknownbarcodes['Barcodes'][idx]
- stats_list['UnknownBarcodes'].extend([full_list_unknownbarcodes])
- DemuxSummaryFiles_complex_lanes[str(unknown_barcode_lane['Lane'])] = full_list_unknownbarcodes
+ comparepart_idx2 = (
+ sample_idx2
+ if len(sample_idx2)
+ <= len(unknownbarcode_idx2)
+ else sample_idx2[
+ : len(unknownbarcode_idx2)
+ ]
+ )
+ if (
+ comparepart_idx2
+ == unknownbarcode_idx2[
+ : len(comparepart_idx2)
+ ]
+ ):
+ del full_list_unknownbarcodes[
+ "Barcodes"
+ ][idx]
+ stats_list["UnknownBarcodes"].extend(
+ [full_list_unknownbarcodes]
+ )
+ DemuxSummaryFiles_complex_lanes[
+ str(unknown_barcode_lane["Lane"])
+ ] = full_list_unknownbarcodes
else:
pass
# Fix the special case where we assign fake indexes for NoIndex samples
if noindex_lanes and index_cycles != [0, 0]:
- for entry in stats_list['ConversionResults'][:]:
- if str(entry['LaneNumber']) in noindex_lanes:
- del entry['DemuxResults'][0]['IndexMetrics']
- entry['DemuxResults'][0].update(entry['Undetermined'])
- del entry['Undetermined']
+ for entry in stats_list["ConversionResults"][:]:
+ if str(entry["LaneNumber"]) in noindex_lanes:
+ del entry["DemuxResults"][0]["IndexMetrics"]
+ entry["DemuxResults"][0].update(entry["Undetermined"])
+ del entry["Undetermined"]
# Reset unknown barcodes list
- for entry in stats_list['UnknownBarcodes'][:]:
- if str(entry['Lane']) in noindex_lanes:
- entry['Barcodes'] = {'unknown': 1}
+ for entry in stats_list["UnknownBarcodes"][:]:
+ if str(entry["Lane"]) in noindex_lanes:
+ entry["Barcodes"] = {"unknown": 1}
# Write the final version of Stats.json file
json.dump(stats_list, json_data_cumulative)
@@ -773,100 +1125,161 @@ def _fix_demultiplexingstats_xml_dir(self, demux_folder, stats_json, samplesheet
# Create DemuxSummary.txt files for complex lanes
if len(DemuxSummaryFiles_complex_lanes) > 0:
for key, value in DemuxSummaryFiles_complex_lanes.items():
- with open(os.path.join(DemultiplexingStats_xml_dir, f'DemuxSummaryF1L{key}.txt'), 'w') as DemuxSummaryFile:
- DemuxSummaryFile.write('### Most Popular Unknown Index Sequences\n')
- DemuxSummaryFile.write('### Columns: Index_Sequence Hit_Count\n')
- for idx, count in value['Barcodes'].items():
- DemuxSummaryFile.write(f'{idx}\t{count}\n')
-
- open(os.path.join(DemultiplexingStats_xml_dir, 'DemultiplexingStats.xml'), 'a').close()
-
- def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes):
+ with open(
+ os.path.join(
+ DemultiplexingStats_xml_dir, f"DemuxSummaryF1L{key}.txt"
+ ),
+ "w",
+ ) as DemuxSummaryFile:
+ DemuxSummaryFile.write("### Most Popular Unknown Index Sequences\n")
+ DemuxSummaryFile.write("### Columns: Index_Sequence Hit_Count\n")
+ for idx, count in value["Barcodes"].items():
+ DemuxSummaryFile.write(f"{idx}\t{count}\n")
+
+ open(
+ os.path.join(DemultiplexingStats_xml_dir, "DemultiplexingStats.xml"), "a"
+ ).close()
+
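The unknown-barcode filtering above boils down to a prefix comparison between a sample's index and an "unknown" barcode that may differ in length; a simplified standalone sketch (function names are illustrative only, not part of the patch):

def _prefix_match(sample_idx, unknown_idx):
    # Truncate both to the shorter length, as in the comparepart_idx logic.
    if not sample_idx:
        return True
    n = min(len(sample_idx), len(unknown_idx))
    return sample_idx[:n] == unknown_idx[:n]

def belongs_to_sample(sample_idx1, sample_idx2, unknown):
    unk1, _, unk2 = unknown.partition("+")
    return _prefix_match(sample_idx1, unk1) and _prefix_match(sample_idx2, unk2)

# belongs_to_sample("ACGTACGT", "TTGGCCAA", "ACGTACGT+TTGGCCAA") -> True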
+ def _process_demux_with_complex_lanes(
+ self,
+ demux_folder,
+ samplesheets,
+ legacy_path,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ ):
html_reports_lane = []
html_reports_laneBarcode = []
stats_json = []
for samplesheet in samplesheets:
ssparser = SampleSheetParser(samplesheet)
demux_id = os.path.splitext(os.path.split(samplesheet)[1])[0].split("_")[1]
- html_report_lane = os.path.join(self.run_dir,
- f"Demultiplexing_{demux_id}",
- legacy_path,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "lane.html"
- )
+ html_report_lane = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "lane.html",
+ )
if os.path.exists(html_report_lane):
html_reports_lane.append(html_report_lane)
else:
- raise RuntimeError(f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing")
-
- html_report_laneBarcode = os.path.join(self.run_dir,
- f"Demultiplexing_{demux_id}",
- legacy_path,
- "Reports",
- "html",
- self.flowcell_id,
- "all",
- "all",
- "all",
- "laneBarcode.html"
- )
+ raise RuntimeError(
+ f"Not able to find html report {html_report_lane}: possible cause is problem in demultiplexing"
+ )
+
+ html_report_laneBarcode = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Reports",
+ "html",
+ self.flowcell_id,
+ "all",
+ "all",
+ "all",
+ "laneBarcode.html",
+ )
if os.path.exists(html_report_laneBarcode):
html_reports_laneBarcode.append(html_report_laneBarcode)
else:
- raise RuntimeError(f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing")
-
- stat_json = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", legacy_path, "Stats", "Stats.json")
+ raise RuntimeError(
+ f"Not able to find html report {html_report_laneBarcode}: possible cause is problem in demultiplexing"
+ )
+
+ stat_json = os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Stats",
+ "Stats.json",
+ )
if os.path.exists(stat_json):
stats_json.append(stat_json)
else:
- raise RuntimeError(f"Not able to find Stats.json report {stat_json}: possible cause is problem in demultiplexing")
+ raise RuntimeError(
+ f"Not able to find Stats.json report {stat_json}: possible cause is problem in demultiplexing"
+ )
# Aggregate fastq
lanes_samples = dict()
for row in ssparser.data:
- if row['Lane'] not in lanes_samples.keys():
- lanes_samples[row['Lane']] = [row['Sample_Name']]
+ if row["Lane"] not in lanes_samples.keys():
+ lanes_samples[row["Lane"]] = [row["Sample_Name"]]
else:
- lanes_samples[row['Lane']].append(row['Sample_Name'])
+ lanes_samples[row["Lane"]].append(row["Sample_Name"])
# Special case where we assign fake indexes for NoIndex samples
- if (set(list(lanes_samples.keys())) & set(noindex_lanes)) and index_cycles != [0, 0]:
+ if (
+ set(list(lanes_samples.keys())) & set(noindex_lanes)
+ ) and index_cycles != [0, 0]:
sample_counter = 1
- for entry in sorted(ssparser.data, key=lambda k: k['Lane']):
- lane = entry['Lane']
- project = entry['Sample_Project']
- sample = entry['Sample_ID']
+ for entry in sorted(ssparser.data, key=lambda k: k["Lane"]):
+ lane = entry["Lane"]
+ project = entry["Sample_Project"]
+ sample = entry["Sample_ID"]
project_dest = os.path.join(demux_folder, project)
if not os.path.exists(project_dest):
os.makedirs(project_dest)
sample_dest = os.path.join(project_dest, sample)
if not os.path.exists(sample_dest):
os.makedirs(sample_dest)
- for file in glob.glob(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", f"Undetermined*L0?{lane}*")):
+ for file in glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ f"Undetermined*L0?{lane}*",
+ )
+ ):
old_name = os.path.basename(file)
old_name_comps = old_name.split("_")
- new_name_comps = [sample.replace('Sample_', ''), f'S{str(sample_counter)}'] + old_name_comps[2:]
+ new_name_comps = [
+ sample.replace("Sample_", ""),
+ f"S{str(sample_counter)}",
+ ] + old_name_comps[2:]
new_name = "_".join(new_name_comps)
os.symlink(file, os.path.join(sample_dest, new_name))
- logger.info("For undet sample {}, renaming {} to {}".format(sample.replace('Sample_', ''), old_name, new_name))
+ logger.info(
+ "For undet sample {}, renaming {} to {}".format(
+ sample.replace("Sample_", ""), old_name, new_name
+ )
+ )
sample_counter += 1
# Ordinary cases
else:
- projects = [project for project in os.listdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")) if os.path.isdir(os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project))]
+ projects = [
+ project
+ for project in os.listdir(
+ os.path.join(self.run_dir, f"Demultiplexing_{demux_id}")
+ )
+ if os.path.isdir(
+ os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", project
+ )
+ )
+ ]
for project in projects:
if project in "Reports" or project in "Stats":
continue
- project_source = os.path.join(self.run_dir, f"Demultiplexing_{demux_id}", project)
+ project_source = os.path.join(
+ self.run_dir, f"Demultiplexing_{demux_id}", project
+ )
project_dest = os.path.join(demux_folder, project)
if not os.path.exists(project_dest):
# There might be projects sequenced with multiple index lengths
os.makedirs(project_dest)
- samples = [sample for sample in os.listdir(project_source) if os.path.isdir(os.path.join(project_source, sample))]
+ samples = [
+ sample
+ for sample in os.listdir(project_source)
+ if os.path.isdir(os.path.join(project_source, sample))
+ ]
for sample in samples:
sample_source = os.path.join(project_source, sample)
sample_dest = os.path.join(project_dest, sample)
@@ -874,12 +1287,30 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
# The same sample should never be sequenced with different index lengths;
# however, a sample might be pooled in several lanes and therefore sequenced using different samplesheets
os.makedirs(sample_dest)
- fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*"))
+ fastqfiles = glob.glob(os.path.join(sample_source, "*.fastq*"))
for fastqfile in fastqfiles:
- os.symlink(fastqfile, os.path.join(sample_dest, os.path.split(fastqfile)[1]))
+ os.symlink(
+ fastqfile,
+ os.path.join(sample_dest, os.path.split(fastqfile)[1]),
+ )
# Copy fastq files for undetermined and the undetermined stats for simple lanes only
lanes_in_sub_samplesheet = []
- header = ['[Header]','[Data]','FCID','Lane', 'Sample_ID', 'Sample_Name', 'Sample_Ref', 'index', 'index2', 'Description', 'Control', 'Recipe', 'Operator', 'Sample_Project']
+ header = [
+ "[Header]",
+ "[Data]",
+ "FCID",
+ "Lane",
+ "Sample_ID",
+ "Sample_Name",
+ "Sample_Ref",
+ "index",
+ "index2",
+ "Description",
+ "Control",
+ "Recipe",
+ "Operator",
+ "Sample_Project",
+ ]
with open(samplesheet) as sub_samplesheet_file:
sub_samplesheet_reader = csv.reader(sub_samplesheet_file)
for row in sub_samplesheet_reader:
@@ -888,68 +1319,119 @@ def _process_demux_with_complex_lanes(self, demux_folder, samplesheets, legacy_p
lanes_in_sub_samplesheet = list(set(lanes_in_sub_samplesheet))
for lane in lanes_in_sub_samplesheet:
if lane in simple_lanes.keys():
- undetermined_fastq_files = glob.glob(os.path.join(self.run_dir,
- f"Demultiplexing_{demux_id}",
- f"Undetermined_S0_L00{lane}*.fastq*")) # Contains only simple lanes undetermined
+ undetermined_fastq_files = glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ f"Undetermined_S0_L00{lane}*.fastq*",
+ )
+ ) # Contains only simple lanes undetermined
for fastqfile in undetermined_fastq_files:
- os.symlink(fastqfile, os.path.join(demux_folder, os.path.split(fastqfile)[1]))
- DemuxSummaryFiles = glob.glob(os.path.join(self.run_dir,
- f"Demultiplexing_{demux_id}",
- legacy_path,
- "Stats",
- f"*L{lane}*txt"))
+ os.symlink(
+ fastqfile,
+ os.path.join(demux_folder, os.path.split(fastqfile)[1]),
+ )
+ DemuxSummaryFiles = glob.glob(
+ os.path.join(
+ self.run_dir,
+ f"Demultiplexing_{demux_id}",
+ legacy_path,
+ "Stats",
+ f"*L{lane}*txt",
+ )
+ )
if not os.path.exists(os.path.join(demux_folder, "Stats")):
os.makedirs(os.path.join(demux_folder, "Stats"))
for DemuxSummaryFile in DemuxSummaryFiles:
- os.symlink(DemuxSummaryFile, os.path.join(demux_folder, "Stats", os.path.split(DemuxSummaryFile)[1]))
+ os.symlink(
+ DemuxSummaryFile,
+ os.path.join(
+ demux_folder,
+ "Stats",
+ os.path.split(DemuxSummaryFile)[1],
+ ),
+ )
return html_reports_lane, html_reports_laneBarcode, stats_json
def _aggregate_demux_results_simple_complex(self):
runSetup = self.runParserObj.runinfo.get_read_configuration()
- demux_folder = os.path.join(self.run_dir , self.demux_dir)
+ demux_folder = os.path.join(self.run_dir, self.demux_dir)
samplesheets = glob.glob(os.path.join(self.run_dir, "*_[0-9].csv"))
- if self.software == 'bcl2fastq':
- legacy_path = ''
- elif self.software == 'bclconvert':
+ if self.software == "bcl2fastq":
+ legacy_path = ""
+ elif self.software == "bclconvert":
legacy_path = f"Reports/{self.legacy_dir}"
else:
raise RuntimeError("Unrecognized software!")
index_cycles = [0, 0]
for read in runSetup:
- if read['IsIndexedRead'] == 'Y':
- if int(read['Number']) == 2:
- index_cycles[0] = int(read['NumCycles'])
+ if read["IsIndexedRead"] == "Y":
+ if int(read["Number"]) == 2:
+ index_cycles[0] = int(read["NumCycles"])
else:
- index_cycles[1] = int(read['NumCycles'])
+ index_cycles[1] = int(read["NumCycles"])
# Classify lanes in samplesheets
- (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes(samplesheets)
+ (noindex_lanes, simple_lanes, complex_lanes) = self._classify_lanes(
+ samplesheets
+ )
# Case with only one sub-demultiplexing
if len(complex_lanes) == 0 and len(samplesheets) == 1:
- demux_id = "0" # in this case this is the only demux dir
+ demux_id = "0" # in this case this is the only demux dir
# Special case where we assign fake indexes for NoIndex samples
if noindex_lanes and index_cycles != [0, 0]:
# We first softlink the FastQ files of undet as the FastQ files of samples
- self._process_noindex_sample_with_fake_index_with_single_demux(demux_id, legacy_path)
+ self._process_noindex_sample_with_fake_index_with_single_demux(
+ demux_id, legacy_path
+ )
# This is the simple case, Demultiplexing dir is simply a symlink to the only sub-demultiplexing dir
else:
- self._process_simple_lane_with_single_demux(demux_id, legacy_path, noindex_lanes)
+ self._process_simple_lane_with_single_demux(
+ demux_id, legacy_path, noindex_lanes
+ )
return True
# Case with multiple sub-demultiplexings
- (html_reports_lane, html_reports_laneBarcode, stats_json) = self._process_demux_with_complex_lanes(samplesheets, legacy_path, index_cycles, simple_lanes, complex_lanes, noindex_lanes)
+ (
+ html_reports_lane,
+ html_reports_laneBarcode,
+ stats_json,
+ ) = self._process_demux_with_complex_lanes(
+ demux_folder,
+ samplesheets,
+ legacy_path,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ )
# Create the html reports
- self._fix_html_reports_for_complex_lanes(demux_folder, index_cycles, complex_lanes, noindex_lanes, html_reports_lane, html_reports_laneBarcode)
+ self._fix_html_reports_for_complex_lanes(
+ demux_folder,
+ index_cycles,
+ complex_lanes,
+ noindex_lanes,
+ html_reports_lane,
+ html_reports_laneBarcode,
+ )
# Fix contents under the DemultiplexingStats folder
- self._fix_demultiplexingstats_xml_dir(demux_folder, stats_json, samplesheets, index_cycles, simple_lanes, complex_lanes, noindex_lanes)
+ self._fix_demultiplexingstats_xml_dir(
+ demux_folder,
+ stats_json,
+ samplesheets,
+ index_cycles,
+ simple_lanes,
+ complex_lanes,
+ noindex_lanes,
+ )
return True
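The index_cycles bookkeeping at the top of _aggregate_demux_results_simple_complex can be exercised on a made-up RunInfo read configuration (paired-end, dual 8 bp indexes):

run_setup = [
    {"Number": "1", "NumCycles": "151", "IsIndexedRead": "N"},
    {"Number": "2", "NumCycles": "8", "IsIndexedRead": "Y"},
    {"Number": "3", "NumCycles": "8", "IsIndexedRead": "Y"},
    {"Number": "4", "NumCycles": "151", "IsIndexedRead": "N"},
]
index_cycles = [0, 0]
for read in run_setup:
    if read["IsIndexedRead"] == "Y":
        if int(read["Number"]) == 2:
            index_cycles[0] = int(read["NumCycles"])
        else:
            index_cycles[1] = int(read["NumCycles"])
# index_cycles == [8, 8]; a true NoIndex run would leave it at [0, 0]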
+
def _create_folder_structure(root, dirs):
"""Creates a fodler stucture rooted in root usinf all dirs listed in dirs (a list)
returns the path to the deepest directory
@@ -961,49 +1443,56 @@ def _create_folder_structure(root, dirs):
os.makedirs(path)
return path
+
def _generate_lane_html(html_file, html_report_lane_parser):
- with open(html_file, 'w') as html:
+ with open(html_file, "w") as html:
# HEADER
- html.write('\n')
- html.write('\n')
- html.write('\n')
- html.write('\n')
+ html.write(
+ '\n'
+ )
+ html.write("\n")
+ html.write(
+ '\n'
+ )
+ html.write("\n")
html.write('