diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py
index 56c9a3443..69eea3fab 100644
--- a/schematic/models/validate_attribute.py
+++ b/schematic/models/validate_attribute.py
@@ -17,6 +17,7 @@

 from schematic.schemas.data_model_graph import DataModelGraphExplorer
 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.validate_rules_utils import validation_rule_info
 from schematic.utils.validate_utils import (
     comma_separated_list_regex,
@@ -868,7 +869,7 @@ def _get_target_manifest_dataframes(
             entity: File = self.synStore.getDatasetManifest(
                 datasetId=dataset_id, downloadFile=True
             )
-            manifests.append(pd.read_csv(entity.path))
+            manifests.append(read_csv(entity.path))
         return dict(zip(manifest_ids, manifests))

     def get_target_manifests(
diff --git a/schematic/store/database/synapse_database_wrapper.py b/schematic/store/database/synapse_database_wrapper.py
index b827b140f..ba0ed2dc9 100644
--- a/schematic/store/database/synapse_database_wrapper.py
+++ b/schematic/store/database/synapse_database_wrapper.py
@@ -8,6 +8,7 @@
 from opentelemetry import trace

 from schematic.store.synapse_tracker import SynapseEntityTracker
+from schematic.utils.df_utils import read_csv


 class SynapseTableNameError(Exception):
@@ -108,7 +109,7 @@ def execute_sql_query(
             pandas.DataFrame: The queried table
         """
         result = self.execute_sql_statement(query, include_row_data)
-        table = pandas.read_csv(result.filepath)
+        table = read_csv(result.filepath)
         return table

     def execute_sql_statement(
diff --git a/schematic/store/synapse.py b/schematic/store/synapse.py
index e0e8017bd..93a7140f2 100644
--- a/schematic/store/synapse.py
+++ b/schematic/store/synapse.py
@@ -56,7 +56,12 @@
 from schematic.store.base import BaseStorage
 from schematic.store.database.synapse_database import SynapseDatabase
 from schematic.store.synapse_tracker import SynapseEntityTracker
-from schematic.utils.df_utils import col_in_dataframe, load_df, update_df
+from schematic.utils.df_utils import (
+    STR_NA_VALUES_FILTERED,
+    col_in_dataframe,
+    load_df,
+    update_df,
+)

 # entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
 # Please do not remove these import statements
@@ -399,7 +404,7 @@ def query_fileview(
         try:
             self.storageFileviewTable = self.syn.tableQuery(
                 query=self.fileview_query,
-            ).asDataFrame()
+            ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
         except SynapseHTTPError as exc:
             exception_text = str(exc)
             if "Unknown column path" in exception_text:
@@ -1418,7 +1423,11 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable
         """

         results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
-        df = results.asDataFrame(rowIdAndVersionInIndex=False)
+        df = results.asDataFrame(
+            rowIdAndVersionInIndex=False,
+            na_values=STR_NA_VALUES_FILTERED,
+            keep_default_na=False,
+        )

         return df, results

@@ -3470,7 +3479,11 @@ def query(self, tidy=True, force=False):
         if self.table is None or force:
             fileview_id = self.view_schema["id"]
             self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
-            self.table = self.results.asDataFrame(rowIdAndVersionInIndex=False)
+            self.table = self.results.asDataFrame(
+                rowIdAndVersionInIndex=False,
+                na_values=STR_NA_VALUES_FILTERED,
+                keep_default_na=False,
+            )
         if tidy:
             self.tidy_table()
         return self.table
diff --git a/schematic/utils/df_utils.py b/schematic/utils/df_utils.py
index b25e7db82..ee458ba17 100644
--- a/schematic/utils/df_utils.py
+++ b/schematic/utils/df_utils.py
@@ -4,17 +4,58 @@
 import logging
 from copy import deepcopy
-from time import perf_counter
-from typing import Union, Any, Optional
 from datetime import datetime
+from time import perf_counter
+from typing import Any, Optional, Union
+
 import dateparser as dp
-import pandas as pd
 import numpy as np
+import pandas as pd
 from pandarallel import pandarallel  # type: ignore

+# pylint:disable=no-name-in-module
+from pandas._libs.parsers import STR_NA_VALUES  # type: ignore
+
+STR_NA_VALUES_FILTERED = deepcopy(STR_NA_VALUES)
+
+try:
+    STR_NA_VALUES_FILTERED.remove("None")
+except KeyError:
+    pass
+
 logger = logging.getLogger(__name__)


+def read_csv(
+    path_or_buffer: str,
+    keep_default_na: bool = False,
+    encoding: str = "utf8",
+    **load_args: Any,
+) -> pd.DataFrame:
+    """
+    A wrapper around pd.read_csv that filters out "None" from the na_values list.
+
+    Args:
+        path_or_buffer: The path to the file or a buffer containing the file.
+        keep_default_na: Whether to keep the default na_values list.
+        encoding: The encoding of the file.
+        **load_args: Additional arguments to pass to pd.read_csv.
+
+    Returns:
+        pd.DataFrame: The dataframe created from the CSV file or buffer.
+    """
+    na_values = load_args.pop(
+        "na_values", STR_NA_VALUES_FILTERED if not keep_default_na else None
+    )
+    return pd.read_csv(  # type: ignore
+        path_or_buffer,
+        na_values=na_values,
+        keep_default_na=keep_default_na,
+        encoding=encoding,
+        **load_args,
+    )
+
+
 def load_df(
     file_path: str,
     preserve_raw_input: bool = True,
@@ -45,9 +86,7 @@
     t_load_df = perf_counter()

     # Read CSV to df as type specified in kwargs
-    org_df = pd.read_csv(  # type: ignore
-        file_path, keep_default_na=True, encoding="utf8", **load_args
-    )
+    org_df = read_csv(file_path, encoding="utf8", **load_args)  # type: ignore
     if not isinstance(org_df, pd.DataFrame):
         raise ValueError(
             (
diff --git a/schematic_api/api/routes.py b/schematic_api/api/routes.py
index 97484c15c..ad1e25279 100644
--- a/schematic_api/api/routes.py
+++ b/schematic_api/api/routes.py
@@ -20,6 +20,7 @@
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.synapse import ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder, entity_type_mapping
 from schematic.utils.schema_utils import (
     DisplayLabelType,
@@ -178,7 +179,7 @@ def parse_bool(str_bool):


 def return_as_json(manifest_local_file_path):
-    manifest_csv = pd.read_csv(manifest_local_file_path)
+    manifest_csv = read_csv(manifest_local_file_path)
     manifest_json = manifest_csv.to_dict(orient="records")
     return manifest_json

diff --git a/tests/data/example.model.csv b/tests/data/example.model.csv
index 7438c7145..1c2923535 100644
--- a/tests/data/example.model.csv
+++ b/tests/data/example.model.csv
@@ -10,7 +10,7 @@ Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
 Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict
 Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,,
 Sample ID,,,,,TRUE,DataProperty,,,
-Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,,
+Tissue Status,,"Healthy, Malignant, None",,,TRUE,DataProperty,,,
 Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,,
 Filename,,,,,TRUE,DataProperty,,,#MockFilename filenameExists^^
 File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,,
@@ -24,8 +24,8 @@ Check List,,,,,TRUE,DataProperty,,,list
 Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list
 Check List Like,,,,,TRUE,DataProperty,,,list like
 Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like
-Check List Strict,,,,,TRUE,DataProperty,,,list strict
-Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
+Check List Strict,,,,,TRUE,DataProperty,,,list strict
+Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
 Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f]
 Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
 Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f]
diff --git a/tests/data/example.model.jsonld b/tests/data/example.model.jsonld
index c4279a605..7e4560d50 100644
--- a/tests/data/example.model.jsonld
+++ b/tests/data/example.model.jsonld
@@ -540,6 +540,9 @@
         },
         {
           "@id": "bts:Malignant"
+        },
+        {
+          "@id": "bts:None"
         }
       ],
       "sms:displayName": "Tissue Status",
@@ -563,6 +566,23 @@
       "sms:required": "sms:false",
       "sms:validationRules": []
     },
+    {
+      "@id": "bts:None",
+      "@type": "rdfs:Class",
+      "rdfs:comment": "TBD",
+      "rdfs:label": "None",
+      "rdfs:subClassOf": [
+        {
+          "@id": "bts:TissueStatus"
+        }
+      ],
+      "schema:isPartOf": {
+        "@id": "http://schema.biothings.io"
+      },
+      "sms:displayName": "None",
+      "sms:required": "sms:false",
+      "sms:validationRules": []
+    },
     {
       "@id": "bts:BulkRNA-seqAssay",
       "@type": "rdfs:Class",
diff --git a/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv
new file mode 100644
index 000000000..8230442f3
--- /dev/null
+++ b/tests/data/mock_manifests/Invalid_none_value_test_manifest.csv
@@ -0,0 +1,6 @@
+Sample ID,Patient ID,Tissue Status,Component
+1,1,Healthy,Biospecimen
+2,2,Malignant,Biospecimen
+3,3,None,Biospecimen
+4,4,None,Biospecimen
+5,5,InvalidValue,Biospecimen
diff --git a/tests/data/mock_manifests/Valid_none_value_test_manifest.csv b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv
new file mode 100644
index 000000000..b7a0666d6
--- /dev/null
+++ b/tests/data/mock_manifests/Valid_none_value_test_manifest.csv
@@ -0,0 +1,6 @@
+Sample ID,Patient ID,Tissue Status,Component
+1,1,Healthy,Biospecimen
+2,2,Malignant,Biospecimen
+3,3,None,Biospecimen
+4,4,None,Biospecimen
+5,5,None,Biospecimen
\ No newline at end of file
diff --git a/tests/integration/test_commands.py b/tests/integration/test_commands.py
index 4367d817b..342736ab1 100644
--- a/tests/integration/test_commands.py
+++ b/tests/integration/test_commands.py
@@ -5,7 +5,6 @@
 from io import BytesIO

 import numpy as np
-import pandas as pd
 import pytest
 import requests
 from click.testing import CliRunner
@@ -14,6 +13,7 @@
 from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.manifest.commands import manifest
 from schematic.models.commands import model
+from schematic.utils.df_utils import read_csv
 from tests.conftest import ConfigurationForTesting

 LIGHT_BLUE = "FFEAF7F9"  # Required cell
@@ -155,8 +155,8 @@ def test_generate_empty_csv_manifests(self, runner: CliRunner) -> None:
             # command has no (python) errors, has exit code 0
             assert result.exit_code == 0

-            biospecimen_df = pd.read_csv("tests/data/example.Biospecimen.manifest.csv")
-            patient_df = pd.read_csv("tests/data/example.Patient.manifest.csv")
+            biospecimen_df = read_csv("tests/data/example.Biospecimen.manifest.csv")
read_csv("tests/data/example.Biospecimen.manifest.csv") + patient_df = read_csv("tests/data/example.Patient.manifest.csv") # Remove created files: finally: @@ -339,7 +339,7 @@ def test_generate_empty_google_sheet_manifests( assert False, f"Unexpected data validation found: {dv}" assert tissue_status_validation is not None assert tissue_status_validation.type == "list" - assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$3" + assert tissue_status_validation.formula1 == "Sheet2!$C$2:$C$4" # required fields are marked as “light blue”, while other non-required fields are marked as white. for col in ["Sample ID", "Patient ID", "Tissue Status", "Component"]: diff --git a/tests/integration/test_manifest_submission.py b/tests/integration/test_manifest_submission.py index 92e6911c1..84581f7d8 100644 --- a/tests/integration/test_manifest_submission.py +++ b/tests/integration/test_manifest_submission.py @@ -4,7 +4,6 @@ import uuid from typing import Any, Callable, Dict -import pandas as pd import pytest import requests from flask.testing import FlaskClient @@ -12,6 +11,7 @@ from schematic.configuration.configuration import CONFIG from schematic.store.synapse import SynapseStorage +from schematic.utils.df_utils import read_csv from tests.conftest import ConfigurationForTesting, Helpers from tests.utils import CleanupItem @@ -73,7 +73,7 @@ def validate_submitted_manifest_file( manifest_file_path = os.path.join( download_location, manifest_data["properties"]["name"] ) - manifest_submitted_df = pd.read_csv(manifest_file_path) + manifest_submitted_df = read_csv(manifest_file_path) assert "entityId" in manifest_submitted_df.columns assert "Id" in manifest_submitted_df.columns @@ -1035,6 +1035,11 @@ def test_submit_manifest_with_hide_blanks( randomized_annotation_content = str(uuid.uuid4()) df["RandomizedAnnotation"] = randomized_annotation_content + # AND a "None" string remains in the manifest + df["NoneString"] = "None" + df["NoneString1"] = "none" + df["NoneString2"] = "NoNe" + with tempfile.NamedTemporaryFile(delete=True, suffix=".csv") as tmp_file: # Write the DF to a temporary file df.to_csv(tmp_file.name, index=False) @@ -1070,6 +1075,9 @@ def test_submit_manifest_with_hide_blanks( modified_file = syn.get(df["entityId"][0], downloadFile=False) assert modified_file is not None assert modified_file["RandomizedAnnotation"][0] == randomized_annotation_content + assert modified_file["NoneString"][0] == "None" + assert modified_file["NoneString1"][0] == "none" + assert modified_file["NoneString2"][0] == "NoNe" # AND the blank annotations are not present assert "Genome Build" not in modified_file diff --git a/tests/integration/test_metadata_model.py b/tests/integration/test_metadata_model.py index 2f604ed5d..21da9cfd8 100644 --- a/tests/integration/test_metadata_model.py +++ b/tests/integration/test_metadata_model.py @@ -1,5 +1,5 @@ """ -This script contains a test suite for verifying the submission and annotation of +This script contains a test suite for verifying the validation, submission and annotation of file-based manifests using the `TestMetadataModel` class to communicate with Synapse and verify the expected behavior of uploading annotation manifest CSVs using the metadata model. 
@@ -13,7 +13,7 @@
 import tempfile
 import uuid
 from contextlib import nullcontext as does_not_raise
-from typing import Callable, Optional
+from typing import Any, Callable, Optional

 import pandas as pd
 import pytest
@@ -23,6 +23,7 @@
 from synapseclient.models import File, Folder

 from schematic.store.synapse import SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import create_temp_folder
 from tests.conftest import Helpers, metadata_model
 from tests.utils import CleanupItem
@@ -534,7 +535,7 @@ def _submit_and_verify_manifest(
         )
         manifest_table = synapse_store.syn.tableQuery(
             f"select * from {expected_table_id}", downloadLocation=download_dir
-        ).asDataFrame()
+        ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)

         # AND the columns in the manifest table should reflect the ones in the file
         table_columns = manifest_table.columns
@@ -569,3 +570,46 @@
         spy_upload_file_as_table.call_count == 1
         spy_upload_file_as_csv.assert_not_called()
         spy_upload_file_combo.assert_not_called()
+
+    def test_validate_model_manifest_valid_with_none_string(
+        self, helpers: Helpers
+    ) -> None:
+        """
+        Tests for validateModelManifest when the manifest is valid with 'None' values
+
+        Args:
+            helpers: Test helper functions
+        """
+        meta_data_model = metadata_model(helpers, "class_label")
+
+        errors, warnings = meta_data_model.validateModelManifest(
+            manifestPath="tests/data/mock_manifests/Valid_none_value_test_manifest.csv",
+            rootNode="Biospecimen",
+        )
+        assert not warnings
+        assert not errors
+
+    def test_validate_model_manifest_invalid(self, helpers: Helpers) -> None:
+        """
+        Tests for validateModelManifest when the manifest requires values to be from a set of values containing 'None'
+
+        Args:
+            helpers: Test helper functions
+        """
+        meta_data_model = metadata_model(helpers, "class_label")
+
+        errors, warnings = meta_data_model.validateModelManifest(
+            manifestPath="tests/data/mock_manifests/Invalid_none_value_test_manifest.csv",
+            rootNode="Biospecimen",
+        )
+        assert not warnings
+        assert errors[0][0] == "6"
+        assert errors[0][1] == "Tissue Status"
+        assert errors[0][3] == "InvalidValue"
+        # The order of the valid values in the error message is random, so the test must be
+        # slightly complicated:
+        # 'InvalidValue' is not one of ['Malignant', 'Healthy', 'None']
+        # 'InvalidValue' is not one of ['Healthy', 'Malignant', 'None']
+        error_message = errors[0][2]
+        assert isinstance(error_message, str)
+        assert error_message.startswith("'InvalidValue' is not one of")
diff --git a/tests/test_api.py b/tests/test_api.py
index 1f2d79add..842210d5b 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -14,11 +14,11 @@
 from flask.testing import FlaskClient
 from opentelemetry import trace

-from schematic.configuration.configuration import Configuration
+from schematic.configuration.configuration import CONFIG, Configuration
 from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 from schematic.schemas.data_model_parser import DataModelParser
+from schematic.utils.df_utils import read_csv
 from schematic.utils.general import create_temp_folder
-from schematic.configuration.configuration import CONFIG

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -838,7 +838,7 @@ def test_generate_manifest_file_based_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )
@@ -894,7 +894,7 @@ def test_generate_manifest_not_file_based_with_annotations(
         response_google_sheet = json.loads(response.data)

         # open the google sheet
-        google_sheet_df = pd.read_csv(
+        google_sheet_df = read_csv(
             response_google_sheet[0] + "/export?gid=0&format=csv"
         )
diff --git a/tests/test_schemas.py b/tests/test_schemas.py
index 409c653b0..670aa2474 100644
--- a/tests/test_schemas.py
+++ b/tests/test_schemas.py
@@ -462,7 +462,7 @@ def test_generate_data_model_graph(self, helpers, data_model, data_model_labels)
         )

         # Check Edge directions
-        assert 4 == (len(graph.out_edges("TissueStatus")))
+        assert 6 == (len(graph.out_edges("TissueStatus")))
         assert 2 == (len(graph.in_edges("TissueStatus")))

diff --git a/tests/test_store.py b/tests/test_store.py
index 516b7805d..49b79a3c0 100644
--- a/tests/test_store.py
+++ b/tests/test_store.py
@@ -29,6 +29,7 @@
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.store.base import BaseStorage
 from schematic.store.synapse import DatasetFileView, ManifestDownload, SynapseStorage
+from schematic.utils.df_utils import STR_NA_VALUES_FILTERED
 from schematic.utils.general import check_synapse_cache_size, create_temp_folder
 from tests.conftest import Helpers
 from tests.utils import CleanupItem
@@ -1322,7 +1323,7 @@ async def copy_folder_and_update_manifest(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         days_to_follow_up = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )
@@ -1359,7 +1360,7 @@ async def copy_folder_and_update_manifest(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         days_to_follow_up = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )
@@ -1421,7 +1422,7 @@ async def test_upsert_table(
         # Query table for DaystoFollowUp column
         table_query = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )
@@ -1462,7 +1463,7 @@ async def test_upsert_table(
         table_id = synapse_store.syn.findEntityId(name=table_name, parent=projectId)
         table_query = (
             synapse_store.syn.tableQuery(f"SELECT {column_of_interest} FROM {table_id}")
-            .asDataFrame()
+            .asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
             .squeeze()
         )
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 13dac46bb..4a8d30cd3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -19,7 +19,7 @@
 )
 from schematic.schemas.data_model_parser import DataModelParser
 from schematic.utils import cli_utils, df_utils, io_utils, validate_utils
-from schematic.utils.df_utils import load_df
+from schematic.utils.df_utils import load_df, read_csv
 from schematic.utils.schema_utils import (
     check_for_duplicate_components,
     check_if_display_name_is_valid_label,
@@ -294,7 +294,7 @@ def test_load_df(self, helpers, preserve_raw_input):
         test_col = "Check NA"
         file_path = helpers.get_data_path("mock_manifests", "Invalid_Test_Manifest.csv")

-        unprocessed_df = pd.read_csv(file_path, encoding="utf8")
+        unprocessed_df = read_csv(file_path, encoding="utf8")
         df = df_utils.load_df(
             file_path, preserve_raw_input=preserve_raw_input, data_model=False
         )
@@ -907,6 +907,11 @@ def test_validate_property_schema(self, helpers):
                 "example.model.csv",
                 "Patient",
             ),
+            (
+                "mock_manifests/Valid_none_value_test_manifest.csv",
+                "example.model.csv",
+                "Biospecimen",
+            ),
             (
                 "mock_manifests/Valid_Test_Manifest_with_nones.csv",
                 "example_test_nones.model.csv",
@@ -921,7 +926,7 @@ def test_convert_nan_entries_to_empty_strings(
         manifest_path = helpers.get_data_path(manifest)
         model_path = helpers.get_data_path(model)

-        ## Gather parmeters needed to run validate_manifest_rules
+        # Gather parameters needed to run validate_manifest_rules
         errors = []
         load_args = {
             "dtype": "string",
@@ -943,13 +948,13 @@
             **load_args,
         )

-        metadataModel = get_metadataModel(helpers, model)
+        get_metadataModel(helpers, model)

         # Instantiate Validate manifest, and run manifest validation
         # In this step the manifest is modified while running rule
         # validation so need to do this step to get the updated manfest.
         vm = ValidateManifest(errors, manifest, manifest_path, dmge, json_schema)
-        manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules(
+        manifest, _, _ = vm.validate_manifest_rules(
             manifest,
             dmge,
             restrict_rules=False,
@@ -964,6 +969,10 @@
         if root_node == "Patient":
             assert manifest["Family History"][0] == [""]
             assert output["Family History"][0] == [""]
+        elif root_node == "Biospecimen":
+            assert output["Tissue Status"][2] == "None"
+            assert output["Tissue Status"][3] == "None"
+            assert output["Tissue Status"][4] == "None"
         elif root_node == "MockComponent":
             assert manifest["Check List"][2] == [""]
             assert manifest["Check List Like Enum"][2] == []
diff --git a/tests/test_validation.py b/tests/test_validation.py
index c16a95bb7..646a2e543 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -54,11 +54,17 @@ class TestManifestValidation:
                 "mock_manifests/Valid_Test_Manifest_with_nones.csv",
                 "MockComponent",
             ),
+            (
+                "example.model.csv",
+                "mock_manifests/Valid_none_value_test_manifest.csv",
+                "Biospecimen",
+            ),
         ],
         ids=[
             "example_model",
             "example_with_no_entry_for_cond_required_columns",
-            "example_with_nones",
+            "example_with_nones_mock_component",
+            "example_with_nones_biospecimen",
         ],
     )
     @pytest.mark.parametrize(
diff --git a/tests/test_viz.py b/tests/test_viz.py
index b94d79688..2ab78b9ce 100644
--- a/tests/test_viz.py
+++ b/tests/test_viz.py
@@ -1,11 +1,10 @@
 import json
 import logging
-import os
 from io import StringIO

-import pandas as pd
 import pytest

+from schematic.utils.df_utils import read_csv
 from schematic.visualization.attributes_explorer import AttributesExplorer
 from schematic.visualization.tangled_tree import TangledTree
@@ -44,7 +43,7 @@ class TestVisualization:
     def test_ae(self, helpers, attributes_explorer):
         attributes_str = attributes_explorer.parse_attributes(save_file=False)

-        df = pd.read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(attributes_str)).drop(columns=["Unnamed: 0"])

         # For the attributes df define expected columns
         expect_col_names = [
@@ -76,7 +75,7 @@ def test_ce(self, component, attributes_explorer):
             component=component, save_file=False, include_index=False
         )
         # convert to dataframe
-        component_attributes = pd.read_csv(StringIO(component_attributes_str))
+        component_attributes = read_csv(StringIO(component_attributes_str))

         # For the attributes df define expected columns
         expect_col_names = [
@@ -103,7 +102,7 @@ def test_text(self, helpers, tangled_tree):
         # Get text for tangled tree.
         text_str = tangled_tree.get_text_for_tangled_tree(text_format, save_file=False)

-        df = pd.read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])
+        df = read_csv(StringIO(text_str)).drop(columns=["Unnamed: 0"])

         # Define expected text associated with 'Patient' and 'Imaging' tree
         expected_patient_text = ["Biospecimen", "BulkRNA-seqAssay"]
diff --git a/tests/unit/test_df_utils.py b/tests/unit/test_df_utils.py
new file mode 100644
index 000000000..28db591a2
--- /dev/null
+++ b/tests/unit/test_df_utils.py
@@ -0,0 +1,49 @@
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+from pandas._libs.parsers import STR_NA_VALUES
+
+from schematic.utils.df_utils import read_csv
+
+
+class TestReadCsv:
+    def test_none_in_na_values(self) -> None:
+        # GIVEN a pandas DF that includes a column with a None value
+        df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]})
+
+        # AND None is included in the STR_NA_VALUES
+        if "None" not in STR_NA_VALUES:
+            STR_NA_VALUES.add("None")
+
+        # AND its CSV representation
+        csv_buffer = BytesIO()
+        df.to_csv(csv_buffer, index=False)
+        csv_buffer.seek(0)
+
+        # WHEN the CSV is read
+        result = read_csv(csv_buffer, na_values=STR_NA_VALUES)
+
+        # THEN the None string value is not preserved
+        assert not result.equals(df)
+        assert result["col1"][0] == "AAA"
+        assert result["col1"][1] == "BBB"
+        assert result["col1"][2] is np.nan
+
+    def test_none_not_in_na_values(self) -> None:
+        # GIVEN a pandas DF that includes a column with a None value
+        df = pd.DataFrame({"col1": ["AAA", "BBB", "None"]})
+
+        # AND its CSV representation
+        csv_buffer = BytesIO()
+        df.to_csv(csv_buffer, index=False)
+        csv_buffer.seek(0)
+
+        # WHEN the CSV is read
+        result = read_csv(csv_buffer)
+
+        # THEN the None string value is preserved
+        assert result.equals(df)
+        assert result["col1"][0] == "AAA"
+        assert result["col1"][1] == "BBB"
+        assert result["col1"][2] == "None"
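
Background for reviewers: a minimal standalone sketch of the pandas behavior this diff works around, using only stock pandas (no schematic code); STR_NA_VALUES is the same sentinel set that df_utils.py imports above:

    from io import StringIO

    import pandas as pd

    # pylint:disable=no-name-in-module
    from pandas._libs.parsers import STR_NA_VALUES

    csv_data = "Tissue Status\nHealthy\nNone\n"

    # Stock pandas parses the literal string "None" as a missing value (NaN).
    default_df = pd.read_csv(StringIO(csv_data))
    assert default_df["Tissue Status"].isna()[1]

    # With "None" removed from the sentinel set, the literal string survives.
    na_values = set(STR_NA_VALUES) - {"None"}
    filtered_df = pd.read_csv(
        StringIO(csv_data), na_values=na_values, keep_default_na=False
    )
    assert filtered_df["Tissue Status"][1] == "None"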
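And a usage sketch of the new wrapper itself; buffer input works because the wrapper forwards straight to pd.read_csv (the new unit tests above rely on this), and keep_default_na=True is the escape hatch back to stock pandas NA handling:

    from io import StringIO

    from schematic.utils.df_utils import STR_NA_VALUES_FILTERED, read_csv

    buffer = StringIO("Tissue Status\nHealthy\nNone\n")

    # Default: "None" round-trips as a real value.
    df = read_csv(buffer)
    assert df["Tissue Status"][1] == "None"

    # Opting back into stock pandas semantics turns "None" into NaN again.
    buffer.seek(0)
    df_default = read_csv(buffer, keep_default_na=True)
    assert df_default["Tissue Status"].isna()[1]

    # The filtered sentinel set is also exported for synapseclient's
    # Table.asDataFrame(na_values=...), as the diff does throughout.
    assert "None" not in STR_NA_VALUES_FILTERED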