diff --git a/csvbase/conv.py b/csvbase/conv.py
index 409962d..079ba72 100644
--- a/csvbase/conv.py
+++ b/csvbase/conv.py
@@ -8,6 +8,24 @@
 WHITESPACE_REGEX = re.compile(r"^ *$")
 
 
+NULL_STRINGS = {
+    "",
+    "#n/a n/a",
+    "#n/a",
+    "#na",
+    "-1.#ind",
+    "-1.#qnan",
+    "-nan",
+    "1.#ind",
+    "1.#qnan",
+    "<na>",
+    "n/a",
+    "na",
+    "nan",
+    "null",
+}
+
+
 def sniff_and_allow_blanks(regex: Pattern, values: Iterable[str]) -> bool:
     """This function takes a regex and looks at the values, return if:
     - at least one value matches the regex
@@ -27,6 +45,10 @@ def sniff_and_allow_blanks(regex: Pattern, values: Iterable[str]) -> bool:
     return (non_match is False) and one_match
 
 
+def is_null_str(value: str) -> bool:
+    return value.lower() in NULL_STRINGS
+
+
 class DateConverter:
     DATE_REGEX = re.compile(r"^ ?\d{4}-\d{2}-\d{2} ?$")
     DATE_FORMAT = "%Y-%m-%d"
@@ -36,7 +58,7 @@ def sniff(self, values: Iterable[str]) -> bool:
 
     def convert(self, value: str) -> Optional[date]:
         stripped = value.strip()
-        if stripped == "":
+        if is_null_str(stripped):
             return None
 
         try:
@@ -46,33 +68,33 @@ def convert(self, value: str) -> Optional[date]:
 
 
 class IntegerConverter:
-    INTEGER_SNIFF_REGEX = re.compile(r"^ ?(-?(?:\d|,| )+)$")
-    INTEGER_CONVERT_REGEX = re.compile(r"^ ?(-?(?:\d|,| )+)(\.0)?$")
+    INTEGER_SNIFF_REGEX = re.compile(r"^(-?(?:\d|,| )+)$")
+    INTEGER_CONVERT_REGEX = re.compile(r"^(-?(?:\d|,| )+)(\.0)?$")
 
     def sniff(self, values: Iterable[str]) -> bool:
         return sniff_and_allow_blanks(self.INTEGER_SNIFF_REGEX, values)
 
     def convert(self, value: str) -> Optional[int]:
         stripped = value.strip()
-        if stripped == "":
+        if is_null_str(stripped):
             return None
-        match = self.INTEGER_CONVERT_REGEX.match(value)
+        match = self.INTEGER_CONVERT_REGEX.match(stripped)
         if not match:
             raise exc.UnconvertableValueException(ColumnType.INTEGER, value)
         return int(match.group(1).replace(",", ""))
 
 
 class FloatConverter:
-    FLOAT_REGEX = re.compile(r"^ ?-?(\d|,|\.| )+$")
+    FLOAT_REGEX = re.compile(r"^-?(\d|,|\.| )+$")
 
     def sniff(self, values: Iterable[str]) -> bool:
         return sniff_and_allow_blanks(self.FLOAT_REGEX, values)
 
     def convert(self, value: str) -> Optional[float]:
         stripped = value.strip()
-        if stripped == "":
+        if is_null_str(stripped):
             return None
-        match = self.FLOAT_REGEX.match(value)
+        match = self.FLOAT_REGEX.match(stripped)
         if not match:
             raise exc.UnconvertableValueException(ColumnType.FLOAT, value)
         return float(match.group().replace(",", ""))
@@ -88,7 +110,7 @@ def sniff(self, values: Iterable[str]) -> bool:
 
     def convert(self, value: str) -> Optional[float]:
         stripped = value.strip()
-        if stripped == "":
+        if is_null_str(stripped):
             return None
 
         false_match = self.FALSE_REGEX.match(stripped)
diff --git a/tests/test_conv.py b/tests/test_conv.py
index 12a12b2..767b696 100644
--- a/tests/test_conv.py
+++ b/tests/test_conv.py
@@ -185,3 +185,34 @@ def test_BooleanConverter__convert_failure():
     ic = BooleanConverter()
     with pytest.raises(exc.UnconvertableValueException):
         ic.convert("nonsense")
+
+
+@pytest.mark.parametrize(
+    "Converter", [BooleanConverter, DateConverter, FloatConverter, IntegerConverter]
+)
+@pytest.mark.parametrize(
+    "null_str",
+    [
+        "",
+        "#N/A",
+        "#N/A N/A",
+        "#NA",
+        "-1.#IND",
+        "-1.#QNAN",
+        "-NaN",
+        "-nan",
+        "1.#IND",
+        "1.#QNAN",
+        "<NA>",
+        "N/A",
+        "NA",
+        "NULL",
+        "NaN",
+        "n/a",
+        "nan",
+        "null",
+    ],
+)
+def test_nulls(Converter, null_str):
+    c = Converter()
+    assert c.convert(null_str) is None