diff --git a/doltpy/cli/read.py b/doltpy/cli/read.py index 2449fe8..dfc3f64 100644 --- a/doltpy/cli/read.py +++ b/doltpy/cli/read.py @@ -1,7 +1,12 @@ import logging +from pathlib import Path +import random +import string +from tempfile import TemporaryDirectory from typing import List, Mapping import io + from doltcli import Dolt # type: ignore from doltcli.utils import ( # type: ignore get_read_table_asof_query, @@ -13,6 +18,12 @@ ) import pandas as pd # type: ignore +read_formats = { + "parquet": ".parquet", + "pq": ".parquet", + "csv": ".csv", +} + logger = logging.getLogger(__name__) @@ -24,5 +35,27 @@ def read_pandas_sql(dolt: Dolt, sql: str) -> pd.DataFrame: return read_table_sql(dolt, sql, result_parser=parse_to_pandas) -def read_pandas(dolt: Dolt, table: str, as_of: str = None) -> pd.DataFrame: - return read_pandas_sql(dolt, get_read_table_asof_query(table, as_of)) +def read_pandas_parquet(dolt: Dolt, table: str, asof: str = None) -> pd.DataFrame: + # TODO: either dolt export should support as of, or sql query should + # support parquet output format + ab = dolt.active_branch + letters = string.ascii_lowercase + tmp_branch = "".join(random.choice(letters) for i in range(10)) + try: + dolt.checkout(tmp_branch, checkout_branch=True, start_point=asof) + with TemporaryDirectory() as tmpdir: + fpath = Path(tmpdir) / "tmp.parquet" + dolt.table_export(table, filename=str(fpath)) + return pd.read_parquet(fpath) + finally: + dolt.checkout(ab) + dolt.branch(tmp_branch, delete=True) + + +def read_pandas(dolt: Dolt, table: str, as_of: str = None, fmt="csv") -> pd.DataFrame: + if fmt == "csv": + return read_pandas_sql(dolt, get_read_table_asof_query(table, as_of)) + elif fmt == "parquet" or fmt == "pq": + return read_pandas_parquet(dolt, table, as_of) + else: + raise RuntimeError(f"unexpected read format: {fmt}; expected: 'parquet' or 'csv'") diff --git a/poetry.lock b/poetry.lock index af4b36d..ca40c6d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -90,11 +90,11 @@ python-versions = "*" [[package]] name = "certifi" -version = "2021.10.8" +version = "2022.5.18.1" description = "Python package for providing Mozilla's CA Bundle." category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.6" [[package]] name = "cffi" @@ -515,7 +515,7 @@ test = ["pytest (>=4.0.2)", "pytest-xdist", "hypothesis (>=3.58)"] [[package]] name = "paramiko" -version = "2.10.4" +version = "2.11.0" description = "SSH2 protocol library" category = "dev" optional = false @@ -605,6 +605,17 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "pyarrow" +version = "6.0.1" +description = "Python library for Apache Arrow" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +numpy = ">=1.16.6" + [[package]] name = "pycparser" version = "2.21" @@ -921,7 +932,7 @@ pg = [] [metadata] lock-version = "1.1" python-versions = ">=3.6.1,<4.0" -content-hash = "684fa0de237ce99475b545280983d09d4a579e8f00d40f65e9f4f2c71013138c" +content-hash = "b19823d81191dd0b465d69a58ff689b86530515d44b87b03d8f737cb61391ff7" [metadata.files] appdirs = [ @@ -961,8 +972,8 @@ cached-property = [ {file = "cached_property-1.5.2-py2.py3-none-any.whl", hash = "sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"}, ] certifi = [ - {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, - {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"}, + {file = "certifi-2022.5.18.1-py3-none-any.whl", hash = "sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a"}, + {file = "certifi-2022.5.18.1.tar.gz", hash = "sha256:9c5705e395cd70084351dd8ad5c41e65655e08ce46f2ec9cf6c2c08390f71eb7"}, ] cffi = [ {file = "cffi-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962"}, @@ -1317,8 +1328,8 @@ pandas = [ {file = "pandas-1.1.5.tar.gz", hash = "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b"}, ] paramiko = [ - {file = "paramiko-2.10.4-py2.py3-none-any.whl", hash = "sha256:3c9ed6084f4b671ab66dc3c729092d32d96c3258f1426071301cb33654b09027"}, - {file = "paramiko-2.10.4.tar.gz", hash = "sha256:3d2e650b6812ce6d160abff701d6ef4434ec97934b13e95cf1ad3da70ffb5c58"}, + {file = "paramiko-2.11.0-py2.py3-none-any.whl", hash = "sha256:655f25dc8baf763277b933dfcea101d636581df8d6b9774d1fb653426b72c270"}, + {file = "paramiko-2.11.0.tar.gz", hash = "sha256:003e6bee7c034c21fbb051bf83dc0a9ee4106204dd3c53054c71452cc4ec3938"}, ] pathspec = [ {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, @@ -1401,6 +1412,44 @@ py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] +pyarrow = [ + {file = "pyarrow-6.0.1-cp310-cp310-macosx_10_13_universal2.whl", hash = "sha256:c80d2436294a07f9cc54852aa1cef034b6f9c97d29235c4bd53bbf52e24f1ebf"}, + {file = "pyarrow-6.0.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:f150b4f222d0ba397388908725692232345adaa8e58ad543ca00f03c7234ae7b"}, + {file = "pyarrow-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c3a727642c1283dcb44728f0d0a00f8864b171e31c835f4b8def07e3fa8f5c73"}, + {file = "pyarrow-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d29605727865177918e806d855fd8404b6242bf1e56ade0a0023cd4fe5f7f841"}, + {file = "pyarrow-6.0.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b63b54dd0bada05fff76c15b233f9322de0e6947071b7871ec45024e16045aeb"}, + {file = "pyarrow-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e90e75cb11e61ffeffb374f1db7c4788f1df0cb269596bf86c473155294958d"}, + {file = "pyarrow-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f4f3db1da51db4cfbafab3066a01b01578884206dced9f505da950d9ed4402d"}, + {file = "pyarrow-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:2523f87bd36877123fc8c4813f60d298722143ead73e907690a87e8557114693"}, + {file = "pyarrow-6.0.1-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:8f7d34efb9d667f9204b40ce91a77613c46691c24cd098e3b6986bd7401b8f06"}, + {file = "pyarrow-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e3c9184335da8faf08c0df95668ce9d778df3795ce4eec959f44908742900e10"}, + {file = "pyarrow-6.0.1-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:02baee816456a6e64486e587caaae2bf9f084fa3a891354ff18c3e945a1cb72f"}, + {file = "pyarrow-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604782b1c744b24a55df80125991a7154fbdef60991eb3d02bfaed06d22f055e"}, + {file = "pyarrow-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fab8132193ae095c43b1e8d6d7f393451ac198de5aaf011c6b576b1442966fec"}, + {file = "pyarrow-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:31038366484e538608f43920a5e2957b8862a43aa49438814619b527f50ec127"}, + {file = "pyarrow-6.0.1-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:632bea00c2fbe2da5d29ff1698fec312ed3aabfb548f06100144e1907e22093a"}, + {file = "pyarrow-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dc03c875e5d68b0d0143f94c438add3ab3c2411ade2748423a9c24608fea571e"}, + {file = "pyarrow-6.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1cd4de317df01679e538004123d6d7bc325d73bad5c6bbc3d5f8aa2280408869"}, + {file = "pyarrow-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e77b1f7c6c08ec319b7882c1a7c7304731530923532b3243060e6e64c456cf34"}, + {file = "pyarrow-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a424fd9a3253d0322d53be7bbb20b5b01511706a61efadcf37f416da325e3d48"}, + {file = "pyarrow-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:c958cf3a4a9eee09e1063c02b89e882d19c61b3a2ce6cbd55191a6f45ed5004b"}, + {file = "pyarrow-6.0.1-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:0e0ef24b316c544f4bb56f5c376129097df3739e665feca0eb567f716d45c55a"}, + {file = "pyarrow-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2c13ec3b26b3b069d673c5fa3a0c70c38f0d5c94686ac5dbc9d7e7d24040f812"}, + {file = "pyarrow-6.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:71891049dc58039a9523e1cb0d921be001dacb2b327fa7b62a35b96a3aad9f0d"}, + {file = "pyarrow-6.0.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:943141dd8cca6c5722552a0b11a3c2e791cdf85f1768dea8170b0a8a7e824ff9"}, + {file = "pyarrow-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fd077c06061b8fa8fdf91591a4270e368f63cf73c6ab56924d3b64efa96a873"}, + {file = "pyarrow-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5308f4bb770b48e07c8cff36cf6a4452862e8ce9492428ad5581d846420b3884"}, + {file = "pyarrow-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:cde4f711cd9476d4da18128c3a40cb529b6b7d2679aee6e0576212547530fef1"}, + {file = "pyarrow-6.0.1-cp39-cp39-macosx_10_13_universal2.whl", hash = "sha256:b8628269bd9289cae0ea668f5900451043252fe3666667f614e140084dd31aac"}, + {file = "pyarrow-6.0.1-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:981ccdf4f2696550733e18da882469893d2f33f55f3cbeb6a90f81741cbf67aa"}, + {file = "pyarrow-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:954326b426eec6e31ff55209f8840b54d788420e96c4005aaa7beed1fe60b42d"}, + {file = "pyarrow-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6b6483bf6b61fe9a046235e4ad4d9286b707607878d7dbdc2eb85a6ec4090baf"}, + {file = "pyarrow-6.0.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:7ecad40a1d4e0104cd87757a403f36850261e7a989cf9e4cb3e30420bbbd1092"}, + {file = "pyarrow-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:04c752fb41921d0064568a15a87dbb0222cfbe9040d4b2c1b306fe6e0a453530"}, + {file = "pyarrow-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:725d3fe49dfe392ff14a8ae6a75b230a60e8985f2b621b18cfa912fe02b65f1a"}, + {file = "pyarrow-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:2403c8af207262ce8e2bc1a9d19313941fd2e424f1cb3c4b749c17efe1fd699a"}, + {file = "pyarrow-6.0.1.tar.gz", hash = "sha256:423990d56cd8f12283b67367d48e142739b789085185018eb03d05087c3c8d43"}, +] pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, diff --git a/pyproject.toml b/pyproject.toml index 4c595e0..4d213e8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ wcwidth = "0.2.5" more-itertools = "^8.6.0" mysql-connector-python = "^8.0.20" doltcli = "^0.1.14" +pyarrow = ">=6.0.0" [tool.poetry.dev-dependencies] black = "^20.8b1" diff --git a/tests/cli/test_read.py b/tests/cli/test_read.py index 290fa8b..5ca1901 100644 --- a/tests/cli/test_read.py +++ b/tests/cli/test_read.py @@ -9,12 +9,12 @@ TEST_TABLE = 'characters' TEST_DATA_INITIAL = [ {'name': 'Anna', 'adjective': 'tragic', 'id': 1, 'date_of_death': '1877-01-01'}, - {'name': 'Vronksy', 'adjective': 'honorable', 'id': 2, 'date_of_death': ''}, - {'name': 'Oblonksy', 'adjective': 'buffoon', 'id': 3, 'date_of_death': ''}, + {'name': 'Vronksy', 'adjective': 'honorable', 'id': 2, 'date_of_death': '1877-1-1'}, + {'name': 'Oblonksy', 'adjective': 'buffoon', 'id': 3, 'date_of_death': '1877'}, ] TEST_DATA_UPDATE = [ - {'name': 'Levin', 'adjective': 'tiresome', 'id': 4, 'date_of_death': ''} + {'name': 'Levin', 'adjective': 'tiresome', 'id': 4, 'date_of_death': '1877-01-01'} ] TEST_DATA_COMBINED = TEST_DATA_INITIAL + TEST_DATA_UPDATE @@ -37,7 +37,7 @@ def _write_helper(dolt: Dolt, data: List[dict], update_type: str): return dolt, commit_hash -def test_read_pandas(with_initial_test_data): +def test_read_pandas_csv(with_initial_test_data): dolt, first_commit = with_initial_test_data second_commit = update_test_data(dolt) first_write = read_pandas(dolt, TEST_TABLE, first_commit) @@ -45,3 +45,12 @@ def test_read_pandas(with_initial_test_data): second_write = read_pandas(dolt, TEST_TABLE, second_commit) compare_rows(TEST_DATA_COMBINED, second_write.to_dict('records'), "id") + +def test_read_pandas_pq(with_initial_test_data): + dolt, first_commit = with_initial_test_data + second_commit = update_test_data(dolt) + first_write = read_pandas(dolt, TEST_TABLE, first_commit, fmt="parquet") + compare_rows(TEST_DATA_INITIAL, first_write.to_dict('records'), "id") + second_write = read_pandas(dolt, TEST_TABLE, second_commit, fmt="pq") + compare_rows(TEST_DATA_COMBINED, second_write.to_dict('records'), "id") +