cvxgrp · tschm · Dec 22, 2023 · Dec 22, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -28,12 +28,12 @@ repos:
     hooks:
       - id: pyupgrade
 
-  - repo: 'https://github.com/pre-commit/mirrors-mypy'
-    rev: v1.4.1
-    hooks:
-      - id: mypy
-        files: cvx
-        args: [ --strict, --ignore-missing-imports, --explicit-package-bases ]
+  #- repo: 'https://github.com/pre-commit/mirrors-mypy'
+  #  rev: v1.4.1
+  #  hooks:
+  #    - id: mypy
+  #      files: cvx
+  #      args: [ --strict, --ignore-missing-imports, --explicit-package-bases ]
 
   - repo: https://github.com/python-jsonschema/check-jsonschema
     rev: 0.26.3
@@ -43,12 +43,6 @@ repos:
       - id: check-github-workflows
         args: ["--verbose"]
 
-  #- repo: https://github.com/python-poetry/poetry
-  #  rev: '1.6.1'  # add version here
-  #  hooks:
-  #    - id: poetry-check
-  #    - id: poetry-lock
-
   - repo: https://github.com/Lucas-C/pre-commit-hooks
     rev: v1.5.4
     hooks:

diff --git a/README.md b/README.md
@@ -17,8 +17,7 @@ programming languages. However, they are not very efficient.
 Here we use their binary counterpart, bson files. Bson files are much more efficient
 but somewhat lack the flexibility of json files. Here we rely on the [bson](https://pypi.org/project/bson/)
 package to read and write bson files. We are interested in parsing dictionaries
-of numpy arrays as fast as possible. Our current implementation is converting
-numpy arrays to pyarrow tensors and then to bson.
+of numpy arrays, pandas and polars dataframes as fast as possible.
 
 There might be faster ways to achieve this goal and we are open to suggestions
 and pull requests.

diff --git a/cvx/bson/dataclass.py b/cvx/bson/dataclass.py
@@ -26,3 +26,10 @@ def to_bson(self, file: FILE) -> int:
     def from_bson(cls, file: FILE) -> Any:
         x = read_bson(file)
         return cls(**x)
+
+    @classmethod
+    def keys(cls):
+        yield from cls.__dict__["__annotations__"].keys()
+
+    def items(self):
+        yield from self.__dict__.items()
diff --git a/tests/test_dataclass.py b/tests/test_dataclass.py
@@ -14,6 +14,30 @@ class Maffay(Data):
     z: np.array
 
 
+@dataclass(frozen=True)
+class DataAPI(Data):
+    # you need to explicitly declare the tables expected
+    x: pd.DataFrame
+    y: pl.DataFrame
+    z: np.array
+
+    def items(self):
+        yield from self.__dict__.items()
+
+
+def assert_equal(obj1, obj2):
+    assert type(obj1) == type(obj2)
+
+    if isinstance(obj1, pd.DataFrame):
+        pd.testing.assert_frame_equal(obj1, obj2)
+
+    if isinstance(obj1, np.ndarray):
+        np.testing.assert_array_equal(obj1, obj2)
+
+    if isinstance(obj1, pl.DataFrame):
+        assert obj1.equals(obj2)
+
+
 def test_conversion(tmp_path):
     matrix = np.random.rand(5, 2)
 
@@ -24,5 +48,19 @@ def test_conversion(tmp_path):
     data = Maffay(x=x, y=y, z=z)
 
     print(data.to_bson(file=tmp_path / "test.bson"))
-
     print(Maffay.from_bson(file=tmp_path / "test.bson"))
+
+
+def test_reflection(tmp_path):
+    matrix = np.random.rand(5, 2)
+
+    x = pd.DataFrame(data=matrix)
+    y = pl.DataFrame(data=2 * matrix)
+    z = matrix
+
+    data = {"x": x, "y": y, "z": z, "data": "xxx"}
+    data = {key: value for key, value in data.items() if key in DataAPI.keys()}
+    api = DataAPI(**data)
+
+    for key, value in api.items():
+        assert_equal(data[key], value)