Skip to content

Commit

Permalink
Implement a simple null filter
Browse files Browse the repository at this point in the history
Start with the values recognised by pandas (but case-insensitive)
  • Loading branch information
calpaterson committed Oct 8, 2022
1 parent 33d45a4 commit f75ac42
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 9 deletions.
40 changes: 31 additions & 9 deletions csvbase/conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,24 @@
WHITESPACE_REGEX = re.compile(r"^ *$")


# Strings that the converters treat as "no value".
# Every entry is lower-case because is_null_str() lower-cases its argument
# before testing membership, which makes the check case-insensitive.
# Per the commit message, the list starts from the values pandas recognises.
NULL_STRINGS = {
"",
"#n/a n/a",
"#n/a",
"#na",
"-1.#ind",
"-1.#qnan",
"-nan",
"1.#ind",
"1.#qnan",
"<na>",
"n/a",
"na",
"nan",
"null",
}


def sniff_and_allow_blanks(regex: Pattern, values: Iterable[str]) -> bool:
"""This function takes a regex and looks at the values, return if:
- at least one value matches the regex
Expand All @@ -27,6 +45,10 @@ def sniff_and_allow_blanks(regex: Pattern, values: Iterable[str]) -> bool:
return (non_match is False) and one_match


def is_null_str(value: str) -> bool:
    """Return True if ``value`` is a recognised null marker.

    The comparison is case-insensitive: ``value`` is lower-cased and then
    looked up in ``NULL_STRINGS``.  No whitespace is stripped here.
    """
    lowered = value.lower()
    return lowered in NULL_STRINGS


class DateConverter:
DATE_REGEX = re.compile(r"^ ?\d{4}-\d{2}-\d{2} ?$")
DATE_FORMAT = "%Y-%m-%d"
Expand All @@ -36,7 +58,7 @@ def sniff(self, values: Iterable[str]) -> bool:

def convert(self, value: str) -> Optional[date]:
stripped = value.strip()
if stripped == "":
if is_null_str(stripped):
return None

try:
Expand All @@ -46,33 +68,33 @@ def convert(self, value: str) -> Optional[date]:


class IntegerConverter:
    """Sniff and convert string cells that hold integers."""

    # An optional minus sign followed by one or more digits, commas or spaces.
    INTEGER_SNIFF_REGEX = re.compile(r"^(-?(?:\d|,| )+)$")
    # Same as the sniff pattern, but also tolerates a trailing ".0"; only the
    # first group (the part before ".0") is used for conversion.
    INTEGER_CONVERT_REGEX = re.compile(r"^(-?(?:\d|,| )+)(\.0)?$")

    def sniff(self, values: Iterable[str]) -> bool:
        """Return the verdict of sniff_and_allow_blanks for these values,
        using INTEGER_SNIFF_REGEX as the pattern."""
        return sniff_and_allow_blanks(self.INTEGER_SNIFF_REGEX, values)

    def convert(self, value: str) -> Optional[int]:
        """Convert ``value`` to an int, or None if it is a null string.

        Surrounding whitespace is stripped and commas are removed before
        parsing.

        Raises:
            exc.UnconvertableValueException: if the stripped value does not
                match INTEGER_CONVERT_REGEX, or matches it but still cannot
                be parsed by ``int`` (e.g. interior spaces, or no digits).
        """
        stripped = value.strip()
        if is_null_str(stripped):
            return None
        match = self.INTEGER_CONVERT_REGEX.match(stripped)
        if not match:
            raise exc.UnconvertableValueException(ColumnType.INTEGER, value)
        try:
            return int(match.group(1).replace(",", ""))
        except ValueError as e:
            # The regex is looser than int(): it admits things like "1 000"
            # or ",".  Report those the same way as any other bad value
            # rather than leaking a bare ValueError.
            raise exc.UnconvertableValueException(ColumnType.INTEGER, value) from e


class FloatConverter:
    """Sniff and convert string cells that hold floating point numbers."""

    # An optional minus sign followed by one or more digits, commas, dots or
    # spaces.  This is deliberately loose, so a match does not guarantee that
    # float() will accept the text.
    FLOAT_REGEX = re.compile(r"^-?(\d|,|\.| )+$")

    def sniff(self, values: Iterable[str]) -> bool:
        """Return the verdict of sniff_and_allow_blanks for these values,
        using FLOAT_REGEX as the pattern."""
        return sniff_and_allow_blanks(self.FLOAT_REGEX, values)

    def convert(self, value: str) -> Optional[float]:
        """Convert ``value`` to a float, or None if it is a null string.

        Surrounding whitespace is stripped and commas are removed before
        parsing.

        Raises:
            exc.UnconvertableValueException: if the stripped value does not
                match FLOAT_REGEX, or matches it but still cannot be parsed
                by ``float`` (e.g. "1.2.3", ".", or interior spaces).
        """
        stripped = value.strip()
        if is_null_str(stripped):
            return None
        match = self.FLOAT_REGEX.match(stripped)
        if not match:
            raise exc.UnconvertableValueException(ColumnType.FLOAT, value)
        try:
            return float(match.group().replace(",", ""))
        except ValueError as e:
            # FLOAT_REGEX accepts strings float() rejects; surface them as
            # the converter's own exception instead of a bare ValueError.
            raise exc.UnconvertableValueException(ColumnType.FLOAT, value) from e
Expand All @@ -88,7 +110,7 @@ def sniff(self, values: Iterable[str]) -> bool:

def convert(self, value: str) -> Optional[float]:
stripped = value.strip()
if stripped == "":
if is_null_str(stripped):
return None

false_match = self.FALSE_REGEX.match(stripped)
Expand Down
31 changes: 31 additions & 0 deletions tests/test_conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,34 @@ def test_BooleanConverter__convert_failure():
ic = BooleanConverter()
with pytest.raises(exc.UnconvertableValueException):
ic.convert("nonsense")


@pytest.mark.parametrize(
    "Converter", [BooleanConverter, DateConverter, FloatConverter, IntegerConverter]
)
@pytest.mark.parametrize(
    "null_str",
    [
        "",
        "#N/A",
        "#N/A N/A",
        "#NA",
        "-1.#IND",
        "-1.#QNAN",
        "-NaN",
        "-nan",
        "1.#IND",
        "1.#QNAN",
        "<NA>",
        "N/A",
        "NA",
        "NULL",
        "NaN",
        "n/a",
        "nan",
        "null",
    ],
)
def test_nulls(Converter, null_str):
    """Every converter must map every recognised null spelling to None."""
    assert Converter().convert(null_str) is None

0 comments on commit f75ac42

Please sign in to comment.