Skip to content

Commit

Permalink
Merge pull request #9 from liquidata-inc/oscarbatori/refactoring-and-…
Browse files Browse the repository at this point in the history
…docs

Oscarbatori/refactoring and docs
  • Loading branch information
Oscar Batori authored Apr 13, 2020
2 parents c067e7f + 94cfbad commit 9d3d19a
Show file tree
Hide file tree
Showing 9 changed files with 297 additions and 80 deletions.
2 changes: 1 addition & 1 deletion doltpy/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .dolt import Dolt, DoltException, DoltCommitSummary
from .dolt import Dolt, DoltException, DoltCommitSummary, clone_repo, init_new_repo
274 changes: 215 additions & 59 deletions doltpy/core/dolt.py

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions doltpy/core/tests/dolt_testing_fixtures.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from doltpy.core.dolt import Dolt, _execute
from doltpy.core.dolt import init_new_repo, Dolt, _execute
import pytest
import shutil
from typing import Tuple
Expand All @@ -13,10 +13,9 @@ def get_repo_path_tmp_path(path: str) -> Tuple[str, str]:
def init_repo(tmp_path) -> Dolt:
repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
assert not os.path.exists(repo_data_dir)
repo = Dolt(repo_path)
repo.init_new_repo()
_execute(['rm', 'LICENSE.md'], repo.repo_dir)
_execute(['rm', 'README.md'], repo.repo_dir)
repo = init_new_repo(repo_path)
_execute(['rm', 'LICENSE.md'], repo.repo_dir())
_execute(['rm', 'README.md'], repo.repo_dir())
yield repo
if os.path.exists(repo_data_dir):
shutil.rmtree(repo_data_dir)
12 changes: 6 additions & 6 deletions doltpy/core/tests/test_dolt.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from doltpy.core.dolt import Dolt, _execute, DoltException, UPDATE
from doltpy.core.dolt import init_new_repo, Dolt, _execute, DoltException, UPDATE
import shutil
import pandas as pd
import uuid
Expand Down Expand Up @@ -28,7 +28,7 @@ def create_test_table(init_repo, create_test_data) -> Tuple[Dolt, str]:
''')
repo.import_df('test_players', pd.read_csv(test_data_path), ['id'], UPDATE)
yield repo, 'test_players'
_execute(['dolt', 'table', 'rm', 'test_players'], repo.repo_dir)
_execute(['dolt', 'table', 'rm', 'test_players'], repo.repo_dir())


@pytest.fixture
Expand All @@ -42,8 +42,7 @@ def run_serve_mode(init_repo):
def test_init_new_repo(tmp_path):
repo_path, repo_data_dir = get_repo_path_tmp_path(tmp_path)
assert not os.path.exists(repo_data_dir)
dolt = Dolt(repo_path)
dolt.init_new_repo()
init_new_repo(repo_path)
assert os.path.exists(repo_data_dir)
shutil.rmtree(repo_data_dir)

Expand Down Expand Up @@ -112,6 +111,7 @@ def test_clean_local(create_test_table):
assert repo.repo_is_clean()


# TODO test datetime types here
def test_sql_server(create_test_table, run_serve_mode):
repo, test_table = create_test_table
data = repo.pandas_read_sql('SELECT * FROM {}'.format(test_table))
Expand All @@ -129,9 +129,9 @@ def test_branch_list(create_test_table):

def test_remote_list(create_test_table):
repo, _ = create_test_table
_execute(['dolt', 'remote', 'add', 'origin', 'blah-blah'], repo.repo_dir)
_execute(['dolt', 'remote', 'add', 'origin', 'blah-blah'], repo.repo_dir())
assert repo.get_remote_list() == ['origin']
_execute(['dolt', 'remote', 'add', 'another-origin', 'la-la-land'], repo.repo_dir)
_execute(['dolt', 'remote', 'add', 'another-origin', 'la-la-land'], repo.repo_dir())
assert set(repo.get_remote_list()) == {'origin', 'another-origin'}


Expand Down
4 changes: 4 additions & 0 deletions doltpy/etl/cli_logging_config_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@


def config_cli_logger():
"""
A helper to provide a nicely configured logger across loaders.
:return:
"""
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
datefmt='%Y%m%d %H:%M:%S',
level=logging.INFO)
13 changes: 12 additions & 1 deletion doltpy/etl/dolt_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import argparse
from doltpy.core import Dolt
from typing import List
from doltpy.etl.loaders import resolve_function, DoltLoaderBuilder
import logging
from doltpy.etl.cli_logging_config_helper import config_cli_logger
Expand All @@ -9,6 +8,14 @@


def loader(loader_builder: DoltLoaderBuilder, dolt_dir: str, dry_run: bool):
"""
This function takes a `DoltLoaderBuilder`, repo and remote settings, and attempts to execute the loaders returned
by the builder.
:param loader_builder:
:param dolt_dir:
:param dry_run:
:return:
"""
logger.info(
'''Commencing load to Dolt with the following options:
- dolt_dir {dolt_dir}
Expand All @@ -22,6 +29,10 @@ def loader(loader_builder: DoltLoaderBuilder, dolt_dir: str, dry_run: bool):


def main():
"""
Used as a function backing shim for surfacing command line tool.
:return:
"""
config_cli_logger()
parser = argparse.ArgumentParser()
parser.add_argument('dolt_load_module', help='Fully qualified path to a module providing a set of loaders')
Expand Down
22 changes: 19 additions & 3 deletions doltpy/etl/dolthub_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import argparse
from doltpy.core import Dolt
from doltpy.core import Dolt, clone_repo
import os
import tempfile
from doltpy.etl.loaders import resolve_function, DoltLoaderBuilder
Expand All @@ -16,14 +16,26 @@ def loader(loader_builder: DoltLoaderBuilder,
remote_name: str,
dry_run: bool,
remote_url: str):
"""
This function takes a `DoltLoaderBuilder`, repo and remote settings, and attempts to execute the loaders returned
by the builder.
:param loader_builder:
:param dolt_dir:
:param clone:
:param push:
:param remote_name:
:param dry_run:
:param remote_url:
:return:
"""
if clone:
assert remote_url, 'If clone is True then remote must be passed'
temp_dir = tempfile.mkdtemp()
logger.info('Clone is set to true, so ignoring dolt_dir')
repo = Dolt(temp_dir)
if clone:
logger.info('Clone set to True, cloning remote {}'.format(remote_url))
repo.clone(remote_url)
clone_repo(remote_url, temp_dir)
repo = Dolt(temp_dir)
else:
assert os.path.exists(os.path.join(dolt_dir, '.dolt')), 'Repo must exist locally if not cloned'
repo = Dolt(dolt_dir)
Expand All @@ -49,6 +61,10 @@ def loader(loader_builder: DoltLoaderBuilder,


def main():
"""
Used as a function backing shim for surfacing command line tool.
:return:
"""
config_cli_logger()
parser = argparse.ArgumentParser()
parser.add_argument('dolt_load_module', help='Fully qualified path to a module providing a set of loaders')
Expand Down
39 changes: 35 additions & 4 deletions doltpy/etl/loaders.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Callable, List
import io
from doltpy.core.dolt import UPDATE, Dolt, DoltException
from doltpy.core.dolt import UPDATE, Dolt
import pandas as pd
import hashlib
import importlib
Expand All @@ -17,7 +17,9 @@

def resolve_function(module_path: str):
"""
Takes a string of the form you.module.member_containing_loaders and returns a list of loaders.
Takes a string of the form you.module.member_containing_loaders and returns a list of loaders. It exists to allow
commands to be called with arguments that are strings that can be resolved to functions. This is used when
specifying function parameters via the command line.
:param module_path:
:return:
"""
Expand Down Expand Up @@ -48,6 +50,13 @@ def resolve_branch(branch: str, module_generator_path: str, default: str):


def insert_unique_key(df: pd.DataFrame) -> pd.DataFrame:
"""
This function takes Pandas `DataFrame` and inserts a unique hash to each row created from the row itself, along
with a count of how many rows produce the same hash. The idea is to provide some rudimentary tools for writing data
with unique keys.
:param df:
:return:
"""
assert 'hash_id' not in df.columns and 'count' not in df.columns, 'Require hash_id and count not in df'
ids = df.apply(lambda r: hashlib.md5(','.join([str(el) for el in r]).encode('utf-8')).hexdigest(), axis=1)
with_id = df.assign(hash_id=ids).set_index('hash_id')
Expand Down Expand Up @@ -82,7 +91,10 @@ def get_bulk_table_writer(table: str,
import_mode: str = None,
transformers: List[FileTransformer] = None) -> DoltTableWriter:
"""
Returns a loader function that writes a file-like object to Dolt, the file must be a valid CSV file.
Returns a function that takes a Dolt repository object and writes the contents of the file like object returned by
the function parameter `get_data` to the table specified using the primary keys passed. Optionally toggle the import
mode and apply a list of transformers to do some data cleaning operations. For example, we might apply a transformer
that converts some date strings to proper datetime objects.
:param table:
:param get_data:
:param pk_cols:
Expand All @@ -105,7 +117,10 @@ def get_df_table_writer(table: str,
import_mode: str = None,
transformers: List[DataframeTransformer] = None) -> DoltTableWriter:
"""
Returns a loader that writes the `pd.DataFrame` produced by get_data to Dolt.
Returns a function that takes a Dolt repository object and writes the Pandas DataFrame returned by the function
parameter `get_data` to the table specified using the primary keys passed. Optionally toggle the import mode and
apply a list of transformers to do some data cleaning operations. For example, we might apply a transformer that
converts some date strings to proper datetime objects.
:param table:
:param get_data:
:param pk_cols:
Expand All @@ -127,6 +142,16 @@ def get_table_transfomer(get_data: Callable[[Dolt], pd.DataFrame],
target_pk_cols: List[str],
transformer: DataframeTransformer,
import_mode: str = UPDATE) -> DoltTableWriter:
"""
A version of get_df_table writer where the input is a Dolt repository. This is used for transforming raw data into
derived tables.
:param get_data:
:param target_table:
:param target_pk_cols:
:param transformer:
:param import_mode:
:return:
"""
def inner(repo: Dolt):
input_data = get_data(repo)
transformed_data = transformer(input_data)
Expand Down Expand Up @@ -192,6 +217,12 @@ def inner(repo: Dolt):


def get_branch_creator(new_branch_name: str, refspec: str = None):
"""
Returns a function that creates a branch at the specified refspec.
:param new_branch_name:
:param refspec:
:return:
"""
def inner(repo: Dolt):
assert new_branch_name not in repo.get_branch_list(), 'Branch {} already exists'.format(new_branch_name)
logger.info('Creating new branch on repo in {} named {} at refspec {}'.format(repo.repo_dir,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(name='doltpy',
version='0.0.6',
version='0.0.7',
packages=find_packages(),
install_requires=['pandas>=0.25.0', 'sqlalchemy>=1.3.8', 'mysql-connector-python==8.0.17', 'retry>=0.9.2'],
author='Liquidata',
Expand Down

0 comments on commit 9d3d19a

Please sign in to comment.