Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[bug fix] Fix check_synapse_cache_size function; allow file size to be float #1389

Merged
merged 11 commits into from
Apr 5, 2024
7 changes: 2 additions & 5 deletions schematic/store/synapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,9 @@
from schematic.utils.general import (
entity_type_mapping,
get_dir_size,
convert_gb_to_bytes,
create_temp_folder,
check_synapse_cache_size,
clear_synapse_cache,
profile,
calculate_datetime,
)

from schematic.utils.schema_utils import get_class_label_from_display_name
Expand Down Expand Up @@ -234,8 +231,8 @@ def _purge_synapse_cache(self, maximum_storage_allowed_cache_gb=1):
# try clearing the cache
# scan a directory and check size of files
if os.path.exists(self.root_synapse_cache):
maximum_storage_allowed_cache_bytes = convert_gb_to_bytes(
maximum_storage_allowed_cache_gb
maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
1024**3
)
nbytes = get_dir_size(self.root_synapse_cache)
dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
Expand Down
17 changes: 4 additions & 13 deletions schematic/utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,14 @@ def calculate_datetime(

def check_synapse_cache_size(
directory: str = "/root/.synapseCache",
) -> Union[float, int]:
) -> float:
"""use du --sh command to calculate size of .synapseCache.

Args:
directory (str, optional): .synapseCache directory. Defaults to '/root/.synapseCache'

Returns:
float or integer: returns size of .synapsecache directory in bytes
float: returns size of .synapsecache directory in bytes
"""
# Note: this command might fail for Windows users.
# But since this command is primarily run on AWS, that is acceptable.
Expand All @@ -154,8 +154,8 @@ def check_synapse_cache_size(
size_in_mb = float(size.rstrip("M"))
byte_size = size_in_mb * 1000000
elif "G" in size:
size_in_gb = int(size.rstrip("G"))
byte_size = convert_gb_to_bytes(size_in_gb)
size_in_gb = float(size.rstrip("G"))
byte_size = size_in_gb * (1024**3)
elif "B" in size:
byte_size = float(size.rstrip("B"))
else:
Expand All @@ -180,15 +180,6 @@ def clear_synapse_cache(synapse_cache: cache.Cache, minutes: int) -> int:
return num_of_deleted_files


def convert_gb_to_bytes(g_bytes: float) -> float:
    """Convert a size expressed in GB to bytes, using 1 GB = 1024**3 bytes.

    Args:
        g_bytes: number of GB; may be a whole number or a fraction.

    Returns:
        Total number of bytes. An int input yields an int; a float input
        yields a float.
    """
    # 1024**3 is a power of two, so the product is exact for floats as well.
    return g_bytes * 1024**3


def entity_type_mapping(syn: Synapse, entity_id: str) -> str:
"""Return the entity type of manifest

Expand Down
110 changes: 75 additions & 35 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import time
from datetime import datetime
from unittest import mock
from pathlib import Path
from typing import Union, Generator
from _pytest.fixtures import FixtureRequest

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -147,8 +150,7 @@
"duplicated_component": {
"validation_rules": ["#Patient unique^^#Patient int"],
"parsed_rules": "raises_exception",
}

},
}

TEST_DN_DICT = {
Expand All @@ -159,22 +161,55 @@
"bio_things": {"class": "BioThings", "property": "bioThings"},
}

test_disk_storage = [
(2, 4000, 16000),
(1000, 4000, 16000),
(2000000, 1900000, 2000000),
(1073741825, 1073741824, 1181116006.4),
]


# build a mock synapse cache tree holding a single file of the requested size
@pytest.fixture()
def create_temp_query_file(
    tmp_path: Path, request: FixtureRequest
) -> Generator[tuple[Path, Path, Path], None, None]:
    """Create a mock synapse cache containing one zero-filled csv file.

    The number of bytes written to the csv is taken from ``request.param``.

    Args:
        tmp_path (Path): temporary directory under which the mock cache is built
        request (FixtureRequest): fixture request whose ``param`` is the file size in bytes

    Yields:
        Generator[Tuple[Path, Path, Path]]: path of the mock synapse cache directory,
        the mock table query folder, and the mock table query csv
    """
    cache_root = tmp_path / ".synapseCache/"
    query_dir = cache_root / "123" / "456"

    # create each level explicitly, outermost first
    for folder in (cache_root, query_dir.parent, query_dir):
        folder.mkdir()

    # fill the mock table query csv with the requested number of null bytes
    csv_path = query_dir / "mock_synapse_table_query.csv"
    csv_path.write_bytes(b"\0" * request.param)

    yield cache_root, query_dir, csv_path


class TestGeneral:
def test_clear_synapse_cache(self, tmp_path):
@pytest.mark.parametrize("create_temp_query_file", [3, 1000], indirect=True)
def test_clear_synapse_cache(self, create_temp_query_file) -> None:
# define location of mock synapse cache
mock_synapse_cache_dir = tmp_path / ".synapseCache/"
mock_synapse_cache_dir.mkdir()
mock_sub_folder = mock_synapse_cache_dir / "123"
mock_sub_folder.mkdir()
mock_table_query_folder = mock_sub_folder / "456"
mock_table_query_folder.mkdir()

# create mock table query csv and a mock cache map
mock_synapse_table_query_csv = (
mock_table_query_folder / "mock_synapse_table_query.csv"
)
mock_synapse_table_query_csv.write_text("mock table query content")
(
mock_synapse_cache_dir,
mock_table_query_folder,
mock_synapse_table_query_csv,
) = create_temp_query_file
# create a mock cache map
mock_cache_map = mock_table_query_folder / ".cacheMap"
mock_cache_map.write_text(
f"{mock_synapse_table_query_csv}: '2022-06-13T19:24:27.000Z'"
Expand Down Expand Up @@ -222,22 +257,25 @@ def test_calculate_datetime_raise_error(self):

# this test might fail on Windows machines
@pytest.mark.not_windows
def test_check_synapse_cache_size(self, tmp_path):
mock_synapse_cache_dir = tmp_path / ".synapseCache"
mock_synapse_cache_dir.mkdir()

mock_synapse_table_query_csv = (
mock_synapse_cache_dir / "mock_synapse_table_query.csv"
)
mock_synapse_table_query_csv.write_text("example file for calculating cache")

file_size = check_synapse_cache_size(mock_synapse_cache_dir)
@pytest.mark.parametrize(
"create_temp_query_file,local_disk_size,gh_disk_size",
test_disk_storage,
indirect=["create_temp_query_file"],
)
def test_check_synapse_cache_size(
self,
create_temp_query_file,
local_disk_size: int,
gh_disk_size: Union[int, float],
) -> None:
mock_synapse_cache_dir, _, _ = create_temp_query_file
disk_size = check_synapse_cache_size(mock_synapse_cache_dir)

# For some reason, when running in GitHub Actions, the size of the file changes.
if IN_GITHUB_ACTIONS:
assert file_size == 8000
assert disk_size == gh_disk_size
else:
assert file_size == 4000
assert disk_size == local_disk_size

def test_find_duplicates(self):
mock_list = ["foo", "bar", "foo"]
Expand Down Expand Up @@ -775,18 +813,20 @@ def test_parse_single_set_validation_rules(self, test_individual_rule_set):
@pytest.mark.parametrize(
"component_names",
[
["duplicated_component", ['Patient', 'Biospecimen', 'Patient']],
["individual_component", ['Patient', 'Biospecimen']],
["no_component", []]
["duplicated_component", ["Patient", "Biospecimen", "Patient"]],
["individual_component", ["Patient", "Biospecimen"]],
["no_component", []],
],
ids=["duplicated_component", "individual_component", "no_component"],
)
def test_check_for_duplicate_components(self, component_names):
"""Test that we are properly identifying duplicates in a list.
Exception should only be triggered when the duplicate component list is passed.
Exception should only be triggered when the duplicate component list is passed.
"""
try:
check_for_duplicate_components(component_names=component_names[1], validation_rule_string='dummy_str')
check_for_duplicate_components(
component_names=component_names[1], validation_rule_string="dummy_str"
)
except:
assert component_names[0] == "duplicated_component"

Expand All @@ -812,7 +852,7 @@ def test_parse_validation_rules(self, test_rule_name):
)
assert expected_parsed_rules == parsed_validation_rules
except:
assert test_rule_name in ["str_rule", "duplicated_component"]
assert test_rule_name in ["str_rule", "duplicated_component"]

@pytest.mark.parametrize(
"test_rule_name",
Expand All @@ -836,6 +876,7 @@ def test_extract_component_validation_rules(self, test_rule_name):
component
]
)

@pytest.mark.parametrize(
"test_dn",
list(TEST_DN_DICT.keys()),
Expand Down Expand Up @@ -950,8 +991,7 @@ def test_get_label_from_display_name(self, test_dn: str, data_model_labels: str)

class TestValidateUtils:
def test_validate_schema(self, helpers):
"""
"""
""" """

# Get data model path
data_model_path = helpers.get_data_path("example.model.jsonld")
Expand Down
Loading