Skip to content

Commit

Permalink
Merge pull request #546 from kbase/dev-prov
Browse files Browse the repository at this point in the history
RE2022-268: Add default parameters to match record
  • Loading branch information
MrCreosote authored Nov 15, 2023
2 parents 1286010 + 2e92278 commit c5f2552
Show file tree
Hide file tree
Showing 6 changed files with 342 additions and 307 deletions.
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ PyYAML = "==6.0.1"
urllib3 = "==1.26.17"
python-dateutil = "==2.8.2"
numpy = "==1.26.1"
jsonschema-default = "==1.6.0"
mergedeep = "==1.3.4"

[dev-packages]
pytest = "==7.4.3"
Expand Down
607 changes: 317 additions & 290 deletions Pipfile.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/service/matchers/lineage_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ class GTDBLineageMatcherCollectionParameters(BaseModel):

class GTDBLineageMatcherUserParameters(BaseModel):
"User parameters for the GTDB lineage matcher."
gtdb_rank: GTDBRank | None = Field(
gtdb_rank: GTDBRank = Field(
default=GTDBRank.SPECIES,
example=GTDBRank.SPECIES,
description="A rank in the the GTDB lineage."
)
Expand Down Expand Up @@ -102,8 +103,7 @@ def generate_match_process(
token - the user's token.
"""
lineages = set() # remove duplicates
rank = user_parameters.get("gtdb_rank") if user_parameters else None
rank = GTDBRank(rank) if rank else GTDBRank.SPECIES
rank = GTDBRank(user_parameters["gtdb_rank"])
for upa, meta in metadata.items():
lin = meta.get(_GTDB_LINEAGE_METADATA_KEY)
if not lin:
Expand Down
5 changes: 2 additions & 3 deletions src/service/matchers/minhash_homology_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class MinHashHomologyMatcherCollectionParameters(BaseModel):

class MinHashHomologyMatcherUserParameters(BaseModel):
"User parameters for the Minhash homology matcher."
maximum_distance: float | None = Field(
maximum_distance: float = Field(
default=_DEFAULT_MAX_DIST,
example=0.2,
ge=0,
Expand Down Expand Up @@ -203,8 +203,7 @@ def generate_match_process(
"""
# No checks necessary since the calling code checks that the UPAs are accessible
# and are assemblies and / or genomes, which is all we need
max_dist = (user_parameters.get("maximum_distance")
if user_parameters else _DEFAULT_MAX_DIST)
max_dist = user_parameters["maximum_distance"]
return CollectionProcess(
process=_process_match,
data_id=internal_match_id,
Expand Down
28 changes: 18 additions & 10 deletions src/service/processing_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import hashlib
import json
import jsonschema
import jsonschema_default
import logging
import mergedeep
import uuid

from typing import Any, Callable, Awaitable
Expand Down Expand Up @@ -43,19 +45,19 @@ async def create_match(
) -> models.Match:
coll = await appstate.arangostorage.get_collection_active(collection_id)
matcher_info = _get_matcher_from_collection(coll, matcher_id)
matcher = appstate.get_matcher(matcher_info.matcher)
params = _process_user_params(match_params, matcher)
ww = WorkspaceWrapper(appstate.sdk_client, token=user.token)
internal_match_id = str(uuid.uuid4())
matcher = appstate.get_matcher(matcher_info.matcher)
match_process, upas, wsids = await _create_match_process(
internal_match_id,
matcher,
ww,
upas,
match_params,
params,
matcher_info.parameters,
)
perm_check = appstate.get_epoch_ms()
params = match_params or {}
int_match = models.InternalMatch(
match_id=_calc_match_id_md5(matcher_id, collection_id, coll.ver_num, params, upas),
matcher_id=matcher_id,
Expand All @@ -81,6 +83,19 @@ async def create_match(
return curr_match


def _process_user_params(params: dict[str, Any] | None, matcher: Matcher
                         ) -> dict[str, Any]:
    """
    Build the full set of user parameters for a match by overlaying the user supplied
    parameters on the defaults from the matcher's user parameter schema, then validate the
    result against that schema.

    params - the parameters supplied by the user, if any. None or an empty dict means that
        only the schema defaults are used.
    matcher - the matcher whose user_parameters JSON schema provides the defaults and is used
        for validation.

    Returns the defaults with any user supplied values replacing them.
    Raises IllegalParameterError if the combined parameters fail schema validation.
    """
    # Start from the schema defaults so the returned parameters are always fully populated,
    # even when the user provides nothing.
    ret = jsonschema_default.create_from(matcher.user_parameters)
    if params:
        # User supplied values take precedence over the defaults.
        ret = mergedeep.merge(ret, params, strategy=mergedeep.Strategy.REPLACE)
    try:
        jsonschema.validate(instance=ret, schema=matcher.user_parameters)
    except jsonschema.exceptions.ValidationError as e:
        # Chain explicitly so the original validation error is recorded as the cause.
        raise errors.IllegalParameterError(
            # TODO MATCHERS str(e) is pretty gnarly. Figure out a nicer representation
            f"Failed to validate user parameters for matcher {matcher.id}: {e}") from e
    return ret


def _get_matcher_from_collection(collection: models.SavedCollection, matcher_id: str
) -> models.Matcher:
for m in collection.matchers:
Expand Down Expand Up @@ -276,13 +291,6 @@ async def _create_match_process(
# leave that to the matchers themselves, which should probably start a ee2 (?) job if object
# downloads are required
upas, _ = _check_and_sort_UPAs_and_get_wsids(upas)
if user_parameters:
try:
jsonschema.validate(instance=user_parameters, schema=matcher.user_parameters)
except jsonschema.exceptions.ValidationError as e:
raise errors.IllegalParameterError(
# TODO MATCHERS str(e) is pretty gnarly. Figure out a nicer representation
f"Failed to validate user parameters for matcher {matcher.id}: {e}")
if len(upas) > MAX_UPAS:
raise errors.IllegalParameterError(f"No more than {MAX_UPAS} UPAs are allowed per match")
meta = await ww.get_object_metadata(
Expand Down
1 change: 0 additions & 1 deletion src/service/processing_selections.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,6 @@ def _make_prov(
"user's selection"
}
if match_:
# TODO PROV Need to fill match with default params if not provided by user
prov["method_params"] = [match_.user_parameters, match_.collection_parameters]
prov["custom"] |= {
"matcher_id": match_.matcher_id,
Expand Down

0 comments on commit c5f2552

Please sign in to comment.