Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store balsamic #551

Merged
merged 76 commits into from
Mar 25, 2020
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
bad7473
store-like-mip
patrikgrenfeldt Jan 24, 2020
ac07248
Test store
patrikgrenfeldt Jan 24, 2020
2ccbe94
store balsamic files in meta-file
patrikgrenfeldt Feb 12, 2020
e7f35b5
Merge branch 'master' into store-balsamic
patrikgrenfeldt Feb 12, 2020
2662751
remove unused argument
patrikgrenfeldt Feb 12, 2020
9d38d82
Merge branch 'store-balsamic' of https://github.com/Clinical-Genomics…
patrikgrenfeldt Feb 12, 2020
1634ef4
add missing required parameter
patrikgrenfeldt Feb 12, 2020
9215da1
move calls to store object
patrikgrenfeldt Feb 12, 2020
4bac8a2
change black pre-commit line length to what we use
patrikgrenfeldt Feb 13, 2020
ee8174b
Remove Python2 charset declaration. Format code
patrikgrenfeldt Feb 13, 2020
5ee1c81
Remove Python2 charset declaration
patrikgrenfeldt Feb 13, 2020
820d73d
Capitalize in log message
patrikgrenfeldt Feb 13, 2020
71889f2
add method description
patrikgrenfeldt Feb 13, 2020
7bef772
output with click instead of print
patrikgrenfeldt Feb 13, 2020
523278c
remove unused imports
patrikgrenfeldt Feb 13, 2020
2770128
remove unneccesary whitespace
patrikgrenfeldt Feb 13, 2020
abc5b5c
describe methods
patrikgrenfeldt Feb 13, 2020
418673f
Merge branch 'master' into store-balsamic
patrikgrenfeldt Feb 17, 2020
6825863
use existing case instead of refetching
patrikgrenfeldt Feb 19, 2020
b471db6
remove hard coded dummy values
patrikgrenfeldt Feb 19, 2020
3617120
Merge branch 'master' into store-balsamic
patrikgrenfeldt Feb 27, 2020
c9805c5
remove unused assignment
patrikgrenfeldt Feb 27, 2020
c2f1bb1
add missing argument in call
patrikgrenfeldt Feb 27, 2020
f6dee12
make mock behave more like real thing
patrikgrenfeldt Feb 27, 2020
c526de9
store root_dir in hk_api
patrikgrenfeldt Feb 27, 2020
a33836d
fix string
patrikgrenfeldt Feb 27, 2020
642aa90
handle directories and multiple tags
patrikgrenfeldt Mar 20, 2020
55c6f42
Merge branch 'master' into store-balsamic
patrikgrenfeldt Mar 20, 2020
685e806
Merge with master
patrikgrenfeldt Mar 20, 2020
d6c118d
clarify the priority mapping
patrikgrenfeldt Mar 23, 2020
dd761d0
simplify test
patrikgrenfeldt Mar 23, 2020
e4d6471
remove debug print
patrikgrenfeldt Mar 23, 2020
3a3740b
move test helpers to a module
patrikgrenfeldt Mar 23, 2020
28067a0
fix import of store_helpers
patrikgrenfeldt Mar 23, 2020
1ae475f
remove redundant test setup code
patrikgrenfeldt Mar 23, 2020
3770d5a
fix store_helper module path
patrikgrenfeldt Mar 23, 2020
8b60548
remove redundant import
patrikgrenfeldt Mar 23, 2020
74e94d9
fix import order
patrikgrenfeldt Mar 23, 2020
1c6a013
Add docstring
patrikgrenfeldt Mar 23, 2020
f5d2153
linitng
patrikgrenfeldt Mar 23, 2020
54b485b
linting
patrikgrenfeldt Mar 23, 2020
31e0a76
Merge branch 'master' into store-balsamic
patrikgrenfeldt Mar 23, 2020
5187a59
skip conversion to path
patrikgrenfeldt Mar 23, 2020
16e1b8f
Merge branch 'store-balsamic' of https://github.com/Clinical-Genomics…
patrikgrenfeldt Mar 23, 2020
29c3178
use full path
patrikgrenfeldt Mar 23, 2020
027cccb
rework the path and tag parsing
patrikgrenfeldt Mar 24, 2020
b120d21
return actual compressed filename
patrikgrenfeldt Mar 24, 2020
e78476e
fix broken test
patrikgrenfeldt Mar 24, 2020
0e29ae9
fix path to generated .hk file
patrikgrenfeldt Mar 24, 2020
e1d0726
fix path to generated .hk file
patrikgrenfeldt Mar 24, 2020
5d3188d
fix deliverables file path
patrikgrenfeldt Mar 24, 2020
755a033
Merge branch 'master' into store-balsamic
patrikgrenfeldt Mar 24, 2020
8dbbefb
merge with master
patrikgrenfeldt Mar 24, 2020
e9c9c3b
Protect the store in housekeeper from direct usage
patrikgrenfeldt Mar 24, 2020
68fea14
format code
patrikgrenfeldt Mar 24, 2020
164e476
linting
patrikgrenfeldt Mar 24, 2020
44b4c13
linting
patrikgrenfeldt Mar 24, 2020
585e94f
check black with our decided linelength
patrikgrenfeldt Mar 24, 2020
f8ca62d
format code
patrikgrenfeldt Mar 24, 2020
a4f1f24
call fixture what it is
patrikgrenfeldt Mar 25, 2020
279b8c0
test magic __getattr__
patrikgrenfeldt Mar 25, 2020
b3b02f4
Update cg/cli/workflow/balsamic/base.py
patrikgrenfeldt Mar 25, 2020
e45ef80
simplify creation of balsamic command
patrikgrenfeldt Mar 25, 2020
4530b17
Merge branch 'store-balsamic' of https://github.com/Clinical-Genomics…
patrikgrenfeldt Mar 25, 2020
4c7b169
create the deliverables file path in one way only
patrikgrenfeldt Mar 25, 2020
ab1b7ec
simplify balsamic command
patrikgrenfeldt Mar 25, 2020
539af7a
restore unsecure call
patrikgrenfeldt Mar 25, 2020
259b4bc
Merge branch 'store-balsamic' of https://github.com/Clinical-Genomics…
patrikgrenfeldt Mar 25, 2020
2cc3810
Merge branch 'master' into store-balsamic
patrikgrenfeldt Mar 25, 2020
656223a
remove erronous usage of store on API
patrikgrenfeldt Mar 25, 2020
54a7fdd
forward all arguments to wrapped add_commit
patrikgrenfeldt Mar 25, 2020
596b6d1
fix docstring
patrikgrenfeldt Mar 25, 2020
31ad4f0
Wrap version method in Store
patrikgrenfeldt Mar 25, 2020
518927a
capture log at right level
patrikgrenfeldt Mar 25, 2020
12226ef
more tests
patrikgrenfeldt Mar 25, 2020
635f4f3
fix store name according to real implementation
patrikgrenfeldt Mar 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 26 additions & 20 deletions cg/apps/hk.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,26 @@
# -*- coding: utf-8 -*-
""" Module to decouple cg code from Housekeeper code """
import datetime as dt
import logging
import os
from pathlib import Path

from housekeeper.exc import VersionIncludedError
from housekeeper.include import include_version, checksum as hk_checksum
from housekeeper.store import Store, models

log = logging.getLogger(__name__)
LOG = logging.getLogger(__name__)


class HousekeeperAPI(Store):
class HousekeeperAPI:
""" API to decouple cg code from Housekeeper """

def __init__(self, config):
super(HousekeeperAPI, self).__init__(config['housekeeper']['database'],
config['housekeeper']['root'])
self.root_dir = config['housekeeper']['root']
self.store = Store(
config["housekeeper"]["database"], config["housekeeper"]["root"]
)

def add_bundle(self, bundle_data):
""" Wrap method in Housekeeper Store """
return self.store.add_bundle(bundle_data)

def include(self, version_obj: models.Version):
"""Call the include version function to import related assets."""
Expand All @@ -30,26 +34,28 @@ def include_file(self, file_obj: models.File, version_obj: models.Version):
# generate root directory
version_root_dir = global_root_dir / version_obj.relative_root_dir
version_root_dir.mkdir(parents=True, exist_ok=True)
log.info(f"created new bundle version dir: {version_root_dir}")
LOG.info("created new bundle version dir: %s", version_root_dir)

if file_obj.to_archive:
# calculate sha1 checksum if file is to be archived
file_obj.checksum = HousekeeperAPI.checksum(file_obj.path)
# hardlink file to the internal structure
new_path = version_root_dir / Path(file_obj.path).name
os.link(file_obj.path, new_path)
log.info(f"linked file: {file_obj.path} -> {new_path}")
file_obj.path = str(new_path).replace(f"{global_root_dir}/", '', 1)
LOG.info("linked file: %s -> %s", file_obj.path, new_path)
file_obj.path = str(new_path).replace(f"{global_root_dir}/", "", 1)

def last_version(self, bundle: str) -> models.Version:
return (self.Version.query
.join(models.Version.bundle)
.filter(models.Bundle.name == bundle)
.order_by(models.Version.created_at.desc())
.first())
return (
self.store.Version.query.join(models.Version.bundle)
.filter(models.Bundle.name == bundle)
.order_by(models.Version.created_at.desc())
.first()
)

def get_root_dir(self):
return self.root_dir
"""Returns the root dir of Housekeeper"""
return self.store.root_dir

def get_files(self, bundle: str, tags: list, version: int = None):
"""Fetch all the files in housekeeper, optionally filtered by bundle and/or tags and/or
Expand All @@ -58,17 +64,17 @@ def get_files(self, bundle: str, tags: list, version: int = None):
Returns:
iterable(hk.Models.File)
"""
return self.files(bundle=bundle, tags=tags, version=version)
return self.store.files(bundle=bundle, tags=tags, version=version)

def add_file(self, file, version_obj: models.Version, tag_name, to_archive=False):
"""Add a file to housekeeper."""
new_file = self.new_file(
new_file = self.store.new_file(
path=str(Path(file).absolute()),
to_archive=to_archive,
tags=[self.tag(tag_name)]
tags=[self.store.tag(tag_name)],
)
new_file.version = version_obj
self.add_commit(new_file)
self.store.add_commit(new_file)
return new_file

@staticmethod
Expand Down
124 changes: 23 additions & 101 deletions cg/cli/workflow/balsamic/store.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
"""Click commands to store balsamic analyses"""
import datetime as dt
import logging
from pathlib import Path

import logging
import click

from cg.apps import hk, tb
from cg.exc import AnalysisNotFinishedError, AnalysisDuplicationError
from cg.meta.store.balsamic import _gather_files_and_bundle_in_housekeeper

from cg.store import Store

from cg.exc import AnalysisNotFinishedError, AnalysisDuplicationError
from housekeeper.exc import VersionIncludedError

LOG = logging.getLogger(__name__)


Expand All @@ -22,117 +24,37 @@ def store(context):


@store.command()
@click.argument("config-stream", type=click.File("r"), required=False)
@click.argument("case_id")
@click.argument("config-stream", type=click.File("r"))
@click.pass_context
def analysis(context, config_stream):
def analysis(context, case_id, config_stream):
"""Store a finished analysis in Housekeeper."""

status = context.obj["db"]
tb_api = context.obj["tb_api"]
hk_api = context.obj["hk_api"]
case_obj = status.family(case_id)

if not config_stream:
LOG.error("provide a config, suggestions:")
for analysis_obj in tb_api.analyses(
status="completed", deleted=False
)[:25]:
click.echo(analysis_obj.config_path)
if not case_obj:
click.echo(click.style(f"Case {case_id} not found", fg="red"))
context.abort()

new_analysis = _gather_files_and_bundle_in_housekeeper(
config_stream, context, hk_api, status, tb_api
)

status.add_commit(new_analysis)
click.echo(click.style("included files in Housekeeper", fg="green"))


def _gather_files_and_bundle_in_housekeeper(
config_stream, context, hk_api, status, tb_api
):
"""Function to gather files and bundle in housekeeper"""
hk_api = context.obj["hk_api"]

try:
bundle_data = tb_api.add_analysis(config_stream)
new_analysis = _gather_files_and_bundle_in_housekeeper(
config_stream, hk_api, status
)
except AnalysisNotFinishedError as error:
click.echo(click.style(error.message, fg="red"))
context.abort()

try:
results = hk_api.add_bundle(bundle_data)
if results is None:
print(click.style("analysis version already added", fg="yellow"))
context.abort()
bundle_obj, version_obj = results
except FileNotFoundError as error:
click.echo(click.style(f"missing file: {error.args[0]}", fg="red"))
context.abort()

family_obj = _add_new_analysis_to_the_status_api(bundle_obj, status)
_reset_action_from_running_on_family(family_obj)
new_analysis = _add_new_complete_analysis_record(
bundle_data, family_obj, status, version_obj
)
version_date = version_obj.created_at.date()
click.echo(f"new bundle added: {bundle_obj.name}, version {version_date}")
_include_files_in_housekeeper(bundle_obj, context, hk_api, version_obj)

return new_analysis


def _include_files_in_housekeeper(bundle_obj, context, hk_api, version_obj):
"""Function to include files in housekeeper"""
try:
hk_api.include(version_obj)
except hk.VersionIncludedError as error:
except AnalysisDuplicationError as error:
print(click.style("analysis version already added", fg="yellow"))
context.abort()
except VersionIncludedError as error:
click.echo(click.style(error.message, fg="red"))
context.abort()
hk_api.add_commit(bundle_obj, version_obj)


def _add_new_complete_analysis_record(bundle_data, family_obj, status, version_obj):
"""Function to create and return a new analysis database record"""
pipeline = family_obj.links[0].sample.data_analysis
pipeline = pipeline if pipeline else "mip" # TODO remove this default from here

if status.analysis(family=family_obj, started_at=version_obj.created_at):
raise AnalysisDuplicationError(
f"Analysis object already exists for {family_obj.internal_id}{version_obj.created_at}"
)

new_analysis = status.add_analysis(
pipeline=pipeline,
version=bundle_data["pipeline_version"],
started_at=version_obj.created_at,
completed_at=dt.datetime.now(),
primary=(len(family_obj.analyses) == 0),
)
new_analysis.family = family_obj
return new_analysis


def _reset_action_from_running_on_family(family_obj):
family_obj.action = None


def _add_new_analysis_to_the_status_api(bundle_obj, status):
family_obj = status.family(bundle_obj.name)
return family_obj


@store.command()
@click.pass_context
def completed(context):
"""Store all completed analyses."""
hk_api = context.obj["hk_api"]
for analysis_obj in context.obj["tb_api"].analyses(
status="completed", deleted=False
):
existing_record = hk_api.version(analysis_obj.family, analysis_obj.started_at)
if existing_record:
LOG.debug(
"analysis stored: %s - %s", analysis_obj.family, analysis_obj.started_at
)
continue
click.echo(click.style(f"storing family: {analysis_obj.family}", fg="blue"))
with Path(analysis_obj.config_path).open() as config_stream:
context.invoke(analysis, config_stream=config_stream)
status.add_commit(new_analysis)
click.echo(click.style("included files in Housekeeper", fg="green"))
102 changes: 102 additions & 0 deletions cg/meta/store/balsamic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Builds balsamic bundle for linking in Housekeeper"""


import datetime as dt
import logging
import ruamel.yaml
from cg.exc import AnalysisDuplicationError

LOG = logging.getLogger(__name__)


def _gather_files_and_bundle_in_housekeeper(config_stream, hk_api, status):
"""Function to gather files and bundle in housekeeper"""

bundle_data = _add_analysis(config_stream)

results = hk_api.add_bundle(bundle_data)
if not results:
raise AnalysisDuplicationError("analysis version already added")
bundle_obj, version_obj = results

case_obj = _get_case(bundle_obj, status)
_reset_analysis_action(case_obj)
new_analysis = _create_analysis(bundle_data, case_obj, status, version_obj)
version_date = version_obj.created_at.date()
LOG.info(f"new bundle added: {bundle_obj.name}, version {version_date}")
_include_files_in_housekeeper(bundle_obj, hk_api, version_obj)
return new_analysis


def _include_files_in_housekeeper(bundle_obj, hk_api, version_obj):
"""Function to include files in housekeeper"""
hk_api.include(version_obj)
hk_api.store.add_commit(bundle_obj, version_obj)


def _create_analysis(bundle_data, case_obj, status, version_obj):
"""Function to create and return a new analysis database record"""
pipeline = case_obj.links[0].sample.data_analysis
pipeline = pipeline if pipeline else "balsamic"

if status.analysis(family=case_obj, started_at=version_obj.created_at):
raise AnalysisDuplicationError(
f"Analysis object already exists for {case_obj.internal_id}{version_obj.created_at}"
)

new_analysis = status.add_analysis(
pipeline=pipeline,
version=bundle_data["pipeline_version"],
started_at=version_obj.created_at,
completed_at=dt.datetime.now(),
primary=(len(case_obj.analyses) == 0),
)
new_analysis.family = case_obj
return new_analysis


def _reset_analysis_action(case_obj):
case_obj.action = None


def _get_case(bundle_obj, status):
case_obj = status.family(bundle_obj.name)
return case_obj


def _parse_meta_data(config_raw):
return config_raw


def _add_analysis(config_stream):
"""Gather information from balsamic analysis to store."""
meta_raw = ruamel.yaml.safe_load(config_stream)
new_bundle = _build_bundle(meta_raw)
return new_bundle


def _build_bundle(meta_data: dict) -> dict:
"""Create a new bundle."""
data = {
"name": "not-implemented",
"created": "not-implemented",
"pipeline_version": "not-implemented",
"files": _get_files(meta_data),
}
return data


def _get_files(meta_data: dict) -> list:
"""Get all the files from the balsamic files."""

data = []

for tag in meta_data["files"]:

paths = meta_data["files"][tag]

for path in paths:

data.append({"path": path, "tags": [tag], "archive": False})

return data
Loading