Skip to content

Commit

Permalink
Merge pull request #2 from woonstadrotterdam/feature/referentiedata-test
Browse files Browse the repository at this point in the history
ReferentiedataTest
  • Loading branch information
sTomerG authored Jun 28, 2024
2 parents beb5c8d + c54e5ae commit e3fe2e4
Show file tree
Hide file tree
Showing 8 changed files with 2,486 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,5 @@ cython_debug/
.python-version
lcov.info
_version.py
.DS_Store
tutorial.md
104 changes: 104 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,107 @@
⏳ Work in progress

Maakt het makkelijk om te testen of data voldoet aan de [VERA-standaard](https://github.com/Aedes-datastandaarden/vera-referentiedata) m.b.v. het [pyspark-testframework](https://github.com/woonstadrotterdam/pyspark-testframework).

# Tutorial

**Op het moment is het _vera-testframework_ alleen compatibel met _pyspark_.**

```python
from vera_testframework.pyspark import ReferentiedataTest
from pyspark.sql import SparkSession
from testframework.dataquality import DataFrameTester
```

```python
spark = SparkSession.builder.appName("vera_testframework").getOrCreate()
```

**Hieronder wordt een voorbeeld DataFrame gemaakt m.b.t. ruimten, waarvan we gaan testen of de waardes voldoen aan de VERA-standaard.**

```python
ruimten = [
(1, "LOG", "Loggia"),
(2, "WOO", "Woonkamer"),
(3, "BAD", "Badruimte"),
(4, "BAD", "Badkamer"),
(5, None, "Kelder"),
(6, "SLA", None),
]

ruimten_df = spark.createDataFrame(ruimten, ["id", "code", "naam"])
```

**We maken gebruik van de `DataFrameTester` van het _pyspark-testframework_ om onze testresultaten in bij te houden.**

```python
testframework = DataFrameTester(
df=ruimten_df,
primary_key="id",
spark=spark,
)
```

**Door middel van de `ReferentiedataTest` kunnen we testen of een kolom voldoet aan de VERA-standaard m.b.t. Referentiedata.**

```python
testframework.test(
col="code",
test=ReferentiedataTest(
soort="RUIMTEDETAILSOORT",
attribuut="Code",
),
nullable=False, # of een waarde leeg mag zijn. Dit is aan de gebruiker
).show()
```

+---+----+-------------------+
| id|code|code__VERAStandaard|
+---+----+-------------------+
| 1| LOG| true|
| 2| WOO| true|
| 3| BAD| true|
| 4| BAD| true|
| 5|NULL| false|
| 6| SLA| true|
+---+----+-------------------+

```python
testframework.test(
col="naam",
test=ReferentiedataTest(
soort="RUIMTEDETAILSOORT",
attribuut="Naam",
),
nullable=True,
).show()
```

+---+---------+-------------------+
| id| naam|naam__VERAStandaard|
+---+---------+-------------------+
| 1| Loggia| true|
| 2|Woonkamer| true|
| 3|Badruimte| false|
| 4| Badkamer| true|
| 5| Kelder| true|
| 6| NULL| true|
+---+---------+-------------------+

**De resultaten van de testen zijn te vinden in het `.results`-attribuut van de `DataFrameTester`.**

```python
testframework.results.show()
```

+---+-------------------+-------------------+
| id|code__VERAStandaard|naam__VERAStandaard|
+---+-------------------+-------------------+
| 1| true| true|
| 2| true| true|
| 3| true| false|
| 4| true| true|
| 5| false| true|
| 6| true| true|
+---+-------------------+-------------------+

**Voor meer informatie over hoe het _pyspark-testframework_ te gebruiken, raadpleeg de documentatie op [hun GitHub](https://github.com/woonstadrotterdam/pyspark-testframework).**
2,000 changes: 2,000 additions & 0 deletions src/vera_testframework/data/Referentiedata.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/vera_testframework/pyspark/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .referentiedata import ReferentiedataTest

__all__ = ["ReferentiedataTest"]
59 changes: 59 additions & 0 deletions src/vera_testframework/pyspark/referentiedata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import csv
from pathlib import Path
from typing import Literal, Optional

from testframework.dataquality.tests import ValidCategory


class ReferentiedataTest(ValidCategory):  # type: ignore
    """
    Test whether values occur in the VERA Referentiedata for a given soort.

    The reference data is read once, at class-definition time, from the
    ``Referentiedata.csv`` file that ships next to this package. The values
    of the requested attribute for the requested soort are handed to
    ``ValidCategory`` as its ``categories``.

    Args:
        name (Optional[str]): The name of the test. If not provided, defaults to "VERAStandaard".
        soort (str): The type/category of the data, which will be converted to uppercase.
        attribuut (Literal["Code", "Naam"]): The attribute to use; must be exactly "Code" or "Naam".

    Raises:
        TypeError: If soort is not a string.
        ValueError: If attribuut is not "Code" or "Naam", or if soort does not
            occur in the reference data.
    """

    # Resolve the CSV relative to this module instead of the current working
    # directory, so importing the package works from any location and not
    # only when the process is started in the repository root.
    _DATA_PATH = Path(__file__).resolve().parent.parent / "data" / "Referentiedata.csv"

    with open(_DATA_PATH, newline="", encoding="utf-8") as csvfile:
        # One dict per CSV row, keyed by the header names ("Soort", "Code", "Naam", ...).
        referentiedata = list(csv.DictReader(csvfile, delimiter=";"))

    def __init__(
        self,
        *,
        name: Optional[str] = None,
        soort: str,
        attribuut: Literal["Code", "Naam"],
    ):
        if not isinstance(soort, str):
            raise TypeError("soort must be a string")
        if attribuut not in ["Code", "Naam"]:
            raise ValueError("attribuut must be either 'Code' or 'Naam'")

        self.soort = soort.upper()
        # attribuut has just been validated to be exactly "Code" or "Naam",
        # so no further normalisation is needed.
        self.attribuut = attribuut

        super().__init__(name=name or "VERAStandaard", categories=self._categorieen())

    def _categorieen(self) -> set[str]:
        """
        Return the allowed values of ``self.attribuut`` for ``self.soort``.

        Raises:
            ValueError: If no row in the reference data has the requested soort.
        """
        categorieen_rows = [
            row for row in self.referentiedata if row["Soort"] == self.soort
        ]
        if not categorieen_rows:
            # List every known soort so the caller can spot a typo quickly.
            mogelijke_soorten = {row["Soort"] for row in self.referentiedata}
            raise ValueError(
                f"Geen soorten gevonden voor soort '{self.soort}'. Opties zijn: {', '.join(sorted(mogelijke_soorten))}"
            )

        return {row[self.attribuut] for row in categorieen_rows}

    def __str__(self) -> str:
        return f"ReferentiedataTest({self.soort}, {self.attribuut})"

    def __repr__(self) -> str:
        return self.__str__()
6 changes: 0 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +0,0 @@
import pytest


def pytest_collection_modifyitems(config, items):
    """Exit pytest with return code 0 when collection yielded no test items."""
    if items:
        return
    pytest.exit("No tests found.", returncode=0)
93 changes: 93 additions & 0 deletions tests/pyspark/referentiedata/test_ReferentiedataTest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import pytest
from pyspark.sql import SparkSession

# Adjust the import according to your module structure
from vera_testframework.pyspark import ReferentiedataTest


@pytest.fixture(scope="module")
def spark():
    """Provide a local two-core SparkSession named "pytest", shared per test module."""
    builder = SparkSession.builder.master("local[2]").appName("pytest")
    return builder.getOrCreate()


@pytest.fixture
def ruimten_df(spark):
    """Build a small rooms DataFrame with columns id, code and naam, including None values."""
    kolommen = ["id", "code", "naam"]
    rijen = [
        (1, "LOG", "Loggia"),
        (2, "WOO", "Woonkamer"),
        (3, "BAD", "Badruimte"),
        (4, "BAD", "Badkamer"),
        (5, None, "Kelder"),
        (6, "SLA", None),
    ]
    return spark.createDataFrame(rijen, kolommen)


def test_referentiedata_valid_code(ruimten_df):
    """Check that the "code" column is judged against the RUIMTEDETAILSOORT codes."""
    test = ReferentiedataTest(soort="RUIMTEDETAILSOORT", attribuut="Code")

    # Expected codes, taken from the reference data loaded on the test object.
    valid_codes = {
        row["Code"]
        for row in test.referentiedata
        if row["Soort"] == "RUIMTEDETAILSOORT"
    }

    # Run the test on the "code" column; the final False is presumably the
    # nullable flag (verify against the testframework signature).
    result_df = test.test(ruimten_df, "code", "id", False)

    for row in result_df.select("code", "code__VERAStandaard").collect():
        code = row["code"]
        outcome = row["code__VERAStandaard"]
        if code is None:
            # Missing values must fail here.
            assert outcome is False
        else:
            assert (code in valid_codes) == outcome


def test_referentiedata_valid_naam(ruimten_df):
    """Check that the "naam" column is judged against the RUIMTEDETAILSOORT names."""
    test = ReferentiedataTest(soort="RUIMTEDETAILSOORT", attribuut="Naam")

    # Expected names, taken from the reference data loaded on the test object.
    valid_namen = {
        row["Naam"]
        for row in test.referentiedata
        if row["Soort"] == "RUIMTEDETAILSOORT"
    }

    # Run the test on the "naam" column; the final False is presumably the
    # nullable flag (verify against the testframework signature).
    result_df = test.test(ruimten_df, "naam", "id", False)

    for row in result_df.select("naam", "naam__VERAStandaard").collect():
        naam = row["naam"]
        outcome = row["naam__VERAStandaard"]
        if naam is None:
            # Missing values must fail here.
            assert outcome is False
        else:
            assert (naam in valid_namen) == outcome


def test_referentiedata_invalid_soort():
    """A soort that is absent from the reference data must raise ValueError."""
    kwargs = {"soort": "INVALID", "attribuut": "Code"}
    with pytest.raises(ValueError):
        ReferentiedataTest(**kwargs)


def test_referentiedata_invalid_attribuut():
    """An attribuut other than "Code" or "Naam" must raise ValueError."""
    kwargs = {"soort": "RUIMTEDETAILSOORT", "attribuut": "InvalidAttribuut"}
    with pytest.raises(ValueError):
        ReferentiedataTest(**kwargs)


def test_wrong_type_soort():
    """A non-string soort must raise TypeError."""
    kwargs = {"soort": 123, "attribuut": "Code"}
    with pytest.raises(TypeError):
        ReferentiedataTest(**kwargs)


def test_str_and_repr():
    """str() and repr() must both render the soort and attribuut of the test."""
    expected = "ReferentiedataTest(RUIMTEDETAILSOORT, Code)"

    as_str = str(ReferentiedataTest(soort="RUIMTEDETAILSOORT", attribuut="Code"))
    assert as_str == expected

    as_repr = repr(ReferentiedataTest(soort="RUIMTEDETAILSOORT", attribuut="Code"))
    assert as_repr == expected
Loading

0 comments on commit e3fe2e4

Please sign in to comment.