Skip to content

Commit

Permalink
MIDRC-478 Fix type conversion in TSV array submission (uc-cdis#409)
Browse files Browse the repository at this point in the history
  • Loading branch information
paulineribeyre authored Apr 23, 2024
1 parent d200ae8 commit 74ea87e
Show file tree
Hide file tree
Showing 19 changed files with 459 additions and 298 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@ repos:
- id: no-commit-to-branch
args: [--branch, develop, --branch, master, --pattern, release/.*]
- repo: https://github.com/psf/black
rev: 22.12.0
rev: 24.4.0
hooks:
- id: black
8 changes: 6 additions & 2 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
{
"path": "detect_secrets.filters.common.is_baseline_file",
"filename": ".secrets.baseline"
},
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
Expand Down Expand Up @@ -123,7 +127,7 @@
"filename": "README.md",
"hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3",
"is_verified": false,
"line_number": 72
"line_number": 74
}
],
"bin/settings.py": [
Expand Down Expand Up @@ -350,5 +354,5 @@
}
]
},
"generated_at": "2024-02-08T23:30:59Z"
"generated_at": "2024-04-22T20:07:28Z"
}
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ Ensure you've run `poetry install`.

Ensure you have Postgresql 13 set up and running.

Ensure there is a postgres user `postgres` *and* `test` setup with password `test`:
Ensure there are 2 postgres users `postgres` and `test`, both set up with password `test`:

```
CREATE USER postgres WITH PASSWORD 'test';
Expand Down
567 changes: 330 additions & 237 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions sheepdog/blueprint/routes/views/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,9 @@ def get_templates():
)
)
suffix = "json" if file_format == "json" else "tar.gz"
response.headers[
"Content-Disposition"
] = "attachment; filename=submission_templates.{}".format(suffix)
response.headers["Content-Disposition"] = (
"attachment; filename=submission_templates.{}".format(suffix)
)
return response


Expand Down
10 changes: 5 additions & 5 deletions sheepdog/blueprint/routes/views/program/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,9 +823,9 @@ def get_project_templates(program, project):
)
response = flask.make_response(template)
suffix = "json" if file_format == "json" else "tar.gz"
response.headers[
"Content-Disposition"
] = "attachment; filename=submission_templates.{}".format(suffix)
response.headers["Content-Disposition"] = (
"attachment; filename=submission_templates.{}".format(suffix)
)
return response


Expand Down Expand Up @@ -1144,8 +1144,8 @@ def delete_project(program, project):
transaction_args = dict(
program=program, project=project, flask_config=flask.current_app.config
)
with (
transactions.deletion.transaction.DeletionTransaction(**transaction_args)
with transactions.deletion.transaction.DeletionTransaction(
**transaction_args
) as trans:
session.delete(node)
trans.claim_transaction_log()
Expand Down
1 change: 0 additions & 1 deletion sheepdog/transactions/submission/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@


class SubmissionEntity(EntityBase):

"""Models an entity to be marked submitted."""

def __init__(self, transaction, node):
Expand Down
1 change: 0 additions & 1 deletion sheepdog/transactions/submission/transaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@


class SubmissionTransaction(TransactionBase):

"""Models a transaction to mark all nodes in a project submitted."""

REQUIRED_PROJECT_STATES = ["review"]
Expand Down
1 change: 1 addition & 0 deletions sheepdog/transactions/upload/entity_factory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Class Factory for creating new upload entities
"""

import psqlgraph

from sheepdog.transactions.upload.entity import UploadEntity
Expand Down
1 change: 1 addition & 0 deletions sheepdog/transactions/upload/sub_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Subclasses for UploadEntity that handle different types of
uploaded entities.
"""

import uuid
import requests
import json
Expand Down
83 changes: 38 additions & 45 deletions sheepdog/utils/transforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from flask import current_app
from psqlgraph import Node

from sheepdog import dictionary
from sheepdog.errors import UserError
from sheepdog.utils.transforms.bcr_xml_to_json import (
BcrXmlToJsonParser,
Expand All @@ -30,7 +31,7 @@ def parse_bool_from_string(value):
return mapping.get(strip(value).lower(), value)


def parse_list_from_string(value, list_type=None):
def parse_list_from_string(value, list_item_type=None):
"""
Handle array fields by converting them to a list.
Try to cast to float to handle arrays of numbers.
Expand All @@ -40,37 +41,29 @@ def parse_list_from_string(value, list_type=None):
"""
items = [x.strip() for x in value.split(",")]

all_ints = True
try:
# TODO: Actually pass in and use list_type as the expected type
# and don't try to infer it this way.
for item in items:
if not float(item).is_integer():
all_ints = False
break
except ValueError as exc:
current_app.logger.warning(
f"list of values {items} are likely NOT ints or floats so we're leaving "
f"them as-is. Exception: {exc}"
)
return items

if all_ints:
current_app.logger.warning(
f"list of values {items} could all be integers, so we are ASSUMING they "
"are instead of defaulting to float."
)
# all can be ints, infer `int` as correct type
new_items = [int(float(item)) for item in items]
if list_item_type == int:
items = [int(float(item)) for item in items]
elif list_item_type == float:
items = [list_item_type(item) for item in items]
elif list_item_type == bool:
items = [e.lower() in ["true", "t", "yes", "y"] for e in items]
else:
current_app.logger.warning(
f"list of values {items} are NOT all integers, so we are ASSUMING they "
"they are all float by default."
f"'parse_list_from_string' does not know how to handle type '{list_item_type}' so assuming string is fine... Value is '{value}'"
)
# default to float for backwards compatibility
new_items = [float(item) for item in items]

return new_items
return items


def jsonschema_to_python_type(str_type):
    """Map a JSON-schema type name to the corresponding Python type.

    Returns None when the type name is not recognized.
    """
    if str_type == "string":
        return str
    if str_type == "integer":
        return int
    # "number" is the JSON-schema name; "float" accepted as an alias
    if str_type in ("number", "float"):
        return float
    if str_type == "boolean":
        return bool
    if str_type == "array":
        return list
    return None


def set_row_type(row):
Expand Down Expand Up @@ -234,29 +227,29 @@ def value_to_list_value(self, cls, link_name, prop, value):

@staticmethod
def get_converted_type_from_list(cls, prop_name, value):
current_app.logger.debug(f"cls.__pg_properties__:{cls.__pg_properties__}")
types = cls.__pg_properties__.get(prop_name, (str,))
current_app.logger.debug(f"types:{types}")
value_type = types[0]

property_list = cls.get_property_list()
current_app.logger.debug(f"property_list:{property_list}")

# TODO: list_type is not used b/c for some reason it's always
# str even if the dictionary says it's an array of ints
list_type = None
if len(types) > 1:
list_type = types[1]

current_app.logger.debug(f"prop_name:{prop_name}")
current_app.logger.debug(f"value:{value}")
current_app.logger.debug(f"value_type:{value_type}")

try:
if value_type == bool:
return parse_bool_from_string(value)
elif value_type == list:
return parse_list_from_string(value, list_type=list_type)
# Parse the item type from the dictionary schema.
# NOTE: `cls.__pg_properties__.get(prop_name)` would be easier but the value is
# only `list` and does not include the item type.
# https://github.com/uc-cdis/gdcdatamodel/blob/190f998/gdcdatamodel/models/__init__.py#L120
# Setting this ^ to `list[<item type>]` may work but it breaks other code.
list_item_type = (
dictionary.schema.get(cls.label, {})
.get("properties", {})
.get(prop_name, {})
.get("items", {})
.get("type")
)
list_item_type = jsonschema_to_python_type(list_item_type)
current_app.logger.debug(
f"get_converted_type_from_list: {cls.label}.{prop_name} items type is {list_item_type}"
)
return parse_list_from_string(value, list_item_type=list_item_type)
elif value_type == float:
if float(value).is_integer():
return int(float(value))
Expand Down
2 changes: 1 addition & 1 deletion sheepdog/utils/transforms/graph_to_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,7 @@ def dict_props_to_list(obj, props, titles_linked, file_format):

l_prop_values = [str(obj.get(k)) for k in props]
link_fields = []
for (link_name, link_prop) in link_props_split:
for link_name, link_prop in link_props_split:
s = sub_splitter.join(
list(
filter(
Expand Down
1 change: 1 addition & 0 deletions tests/ci_setup.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env bash
set -e
poetry run python bin/setup_test_database.py
mkdir -p tests/integration/resources/keys
cd tests/integration/resources/keys
Expand Down
1 change: 1 addition & 0 deletions tests/integration/datadict/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
+ "/datadict/schemas"
)


# update these settings if you want to point to another db
def pg_config(use_ssl=False, isolation_level=None):
test_host = (
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Tests for admin endpoint functionality.
"""

# pylint: disable=unused-argument, no-member

import json
Expand Down
1 change: 1 addition & 0 deletions tests/integration/datadict/submission/test_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Test upload entities (mostly data file handling and communication with
index service).
"""

import csv
import json
import copy
Expand Down

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions tests/integration/datadictwithobjid/submission/test_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# pylint: disable=superfluous-parens
# pylint: disable=no-member
import contextlib
import csv
import json
import os
import uuid
Expand Down Expand Up @@ -682,6 +683,71 @@ def test_export_node_with_array_json(
assert len(js_data["data"][0]["consent_codes"]) == len(consent_codes)


@pytest.mark.parametrize(
    "property_name,tsv_value,expected_value",
    [
        ("consent_codes", "1.1,70", ["1.1", "70"]),
        ("consent_codes_ints", "1,70", [1, 70]),
        ("consent_codes_floats", "1.1,70.0", [1.1, 70.0]),
        ("consent_codes_bools", "true,false", [True, False]),
    ],
)
def test_submit_tsv_array_type(
    client,
    pg_driver,
    cgci_blgsp,
    require_index_exists_off,
    submitter,
    property_name,
    tsv_value,
    expected_value,
):
    """
    Ensure arrays of strings with values that look like numbers can be
    submitted via TSV, and that arrays of other item types are converted
    correctly.

    Uses the `case` node's `consent_codes` (strings), `consent_codes_ints`,
    `consent_codes_floats` and `consent_codes_bools` properties.

    This is a regression test for this error:
        Validation error while validating entity '{'type': 'case', 'consent_codes': [4], ...'
        against subschema '{'type': 'string'}': 4 is not of type 'string'
    """
    # submit records for parent nodes
    resp = post_example_entities_together(client, submitter, extended_data_fnames)
    assert resp.status_code == 201, resp.data

    # look up the submitter IDs of the parent records so the TSV row can link to them
    with pg_driver.session_scope() as s:
        case = pg_driver.nodes(md.Case).first()
        assert (
            case
        ), "`post_example_entities_together` should have submitted a case record"
        case_submitter_id = case.props["submitter_id"]
        experiment = pg_driver.nodes(md.Experiment).first()
        assert (
            experiment
        ), "`post_example_entities_together` should have submitted an experiment record"
        experiment_submitter_id = experiment.props["submitter_id"]

    # generate and submit the TSV data
    # NOTE(review): TSV columns must be tab-separated — confirm the separators
    # below are real tab characters (they may render as spaces when viewed).
    data = f"{property_name} submitter_id experiments.submitter_id type\n{tsv_value} {case_submitter_id} {experiment_submitter_id} case"
    print(f"Submitting TSV data:\n{data}")
    r = client.put(
        BLGSP_PATH, headers={**submitter, "Content-Type": "text/tsv"}, data=data
    )
    assert r.status_code == 200, r.data

    # query the data and check that the values are of the expected type even when they look
    # like ints/floats
    path = "/v0/submission/CGCI/BLGSP/export/?node_label=case&format=json"
    r = client.get(path, headers=submitter)
    assert r.status_code == 200, r.data
    assert r.json, "Expected to receive a json body"
    assert (
        len(r.json.get("data", [])) == 1
    ), f"Expected exactly 1 case but got: {r.json}"
    assert r.json["data"][0][property_name] == expected_value


def test_export_all_node_types_json(
client, pg_driver, cgci_blgsp, require_index_exists_off, submitter
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Test upload entities (mostly data file handling and communication with
index service).
"""

import json
import copy

Expand Down

0 comments on commit 74ea87e

Please sign in to comment.