diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..f9e2281 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,13 @@ +[report] +exclude_lines = + # if a line gets matched with this, we don't count it as missing in the coverage report + # keep default behaviour of coverage + pragma: no cover + + # skip lines that raise a specific error (^\s matches beginning of line + any whitespace length) + ^\s*raise NotImplementedError + ^\s*raise AssertionError + ^\s*raise ValueError + + # skip lines that are only "pass" + ^\s*pass\s*$ \ No newline at end of file diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0e4441d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,11 @@ +# Ignore everything to use this dockerignore as a whitelist +* + +# Specifically add all files that should not be ignored +!requirements.txt +!setup.py +!src +!test +!pytest.ini +!.flake8 +!scripts \ No newline at end of file diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..29ee453 --- /dev/null +++ b/.flake8 @@ -0,0 +1,40 @@ +[flake8] +ignore = + # C812: Missing trailing comma. Deactivated because of incompatibility with black. + C812, + # C815 missing trailing comma in Python 3.5+. Deactivated because of incompatibility with black. + C815 + # E203: Space before :. Deactivated because of incompatibility with black, see https://github.com/ambv/black#slices. + E203, + # E402: Module level import not at top of file. We sometimes need this (e.g. mparams, dataflow). + E402, + # I100: Import statements are in the wrong order. Deactivated because we often need to import `future` at the beginning + # for Python2/3 compatibility. + I100, + # I201: Missing newline between import groups. Same as I100. + I201, + # W503: Handling of breaking around binary operators. Necessary because of Flake8 update in + # https://github.com/merantix/core/commit/dee61ff623b2cb08d6827afcea502edb9a8f76fb + W503, + # W605: Invalid escape sequence 'x'.
Necessary because of Flake8 update in + # https://github.com/merantix/core/commit/dee61ff623b2cb08d6827afcea502edb9a8f76fb + W605, + # Exceptions for the type hinting plugin, flake8-annotations + ANN002, # Missing type annotation for *args + ANN003, # Missing type annotation for **kwargs + ANN101, # Missing type annotation for self in method + ANN102, # Missing type annotation for cls in classmethod + ANN204, # Missing return type annotation for special method, e.g. init + # The following is for the docstring plugin, to make it less whiny. We are happy if we have docs on all functions + D100, # Missing docstring in public module + D101, # Missing docstring in public class + D104, # Missing docstring in public package + D202, # No blank lines allowed after function docstring + D205, # 1 blank line required between summary line and description + D212, # Multi-line docstring summary should start at the first line + D415 # First line should end with a period, question mark, or exclamation point + +max_line_length = 120 +import_order_style = appnexus +application_package_names = devtool +docstring-convention = google \ No newline at end of file diff --git a/.flake8_nb b/.flake8_nb new file mode 100644 index 0000000..2d26d4c --- /dev/null +++ b/.flake8_nb @@ -0,0 +1,42 @@ +[flake8_nb] +ignore = + # C812: Missing trailing comma. Deactivated because of incompatibility with black. + C812, + # C815 missing trailing comma in Python 3.5+. Deactivated because of incompatibility with black. + C815 + # E203: Space before :. Deactivated because of incompatibility with black, see https://github.com/ambv/black#slices. + E203, + # E402: Module level import not at top of file. We sometimes need this (e.g. mparams, dataflow). + E402, + # E501: Line too long. Some 'pip install' commands in the notebooks are just too long + E501 + # I100: Import statements are in the wrong order. Deactivated because we often need to import `future` at the beginning + # for Python2/3 compatibility. 
+ I100, + # I201: Missing newline between import groups. Same as I100. + I201, + # W503: Handling of breaking around binary operators. Necessary because of Flake8 update in + # https://github.com/merantix/core/commit/dee61ff623b2cb08d6827afcea502edb9a8f76fb + W503, + # W605: Invalid escape sequence 'x'. Necessary because of Flake8 update in + # https://github.com/merantix/core/commit/dee61ff623b2cb08d6827afcea502edb9a8f76fb + W605, + # Exceptions for the type hinting plugin, flake8-annotations + ANN002, # Missing type annotation for *args + ANN003, # Missing type annotation for **kwargs + ANN101, # Missing type annotation for self in method + ANN102, # Missing type annotation for cls in classmethod + ANN204, # Missing return type annotation for special method, e.g. init + # The following is for the docstring plugin, to make it less whiny. We are happy if we have docs on all functions + D100, # Missing docstring in public module + D101, # Missing docstring in public class + D104, # Missing docstring in public package + D202, # No blank lines allowed after function docstring + D205, # 1 blank line required between summary line and description + D212, # Multi-line docstring summary should start at the first line + D415, # First line should end with a period, question mark, or exclamation point + +max_line_length = 120 +import_order_style = appnexus +application_package_names = devtool +docstring-convention = google \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ec1865d --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.ipynb filter=nbstripout +*.ipynb diff=ipynb \ No newline at end of file diff --git a/.gitignore b/.gitignore index 06e110b..6690319 100644 --- a/.gitignore +++ b/.gitignore @@ -168,3 +168,6 @@ npm-debug.log # Hydra outputs/ + + +test/data \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..e2bb1a5 --- /dev/null +++ 
b/.pre-commit-config.yaml @@ -0,0 +1,69 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +# See +# https://medium.com/staqu-dev-logs/keeping-python-code-clean-with-pre-commit-hooks-black-flake8-and-isort-cac8b01e0ea1 +default_language_version: + python: python3.8 +exclude: (?x)(^docs/_build) +repos: +- repo: https://github.com/psf/black + rev: 21.9b0 + hooks: + - id: black # Format Python code + additional_dependencies: ["--index-url=https://pypi.org/simple/"] +- repo: https://github.com/pre-commit/mirrors-autopep8 + rev: v1.5.4 # Use the sha / tag you want to point at + hooks: + - id: autopep8 + additional_dependencies: ["--index-url=https://pypi.org/simple/"] +- repo: https://github.com/pycqa/flake8 # flake8 moved from GitLab to GitHub + rev: 3.8.4 + hooks: + - id: flake8 # Apply flake 8 python file linter + additional_dependencies: + - keyrings.google-artifactregistry-auth + - "--index-url=https://pypi.org/simple/" + - "flake8-annotations==2.5.0" + - "flake8-bugbear==21.9.2" + - "flake8-docstrings==1.5.0" +- repo: https://github.com/adrienverge/yamllint + rev: v1.26.1 + hooks: + - id: yamllint # Check YAML Files + args: ['-d', "{extends: relaxed, rules: {line-length: {max: 120 }}}"] + additional_dependencies: ["--index-url=https://pypi.org/simple/"] +- repo: local + hooks: # Not nice + - id: terraform-version + name: Check if terraform v1.0.4 is installed correctly. If this fails, check your version with `terraform --version` + language: system + entry: bash -c "exit $(terraform --version | sed -n 1p | sed 's/Terraform v//;q' | grep -qE '^\s*1\.0\.4\s*$')" + files: (\.tf|\.tfvars)$ # Only run if there actually are tf files to lint! + exclude: \.terraform\/.*$ + pass_filenames: false + require_serial: true +- repo: https://github.com/antonbabenko/pre-commit-terraform + rev: v1.60.0 + hooks: + - id: terraform_fmt + args: + - "--args=-diff" +- repo: https://github.com/sqlfluff/sqlfluff + rev: 0.4.0 # Oldest version with pre-commit hooks!
+ hooks: + - id: sqlfluff-lint + args: ["--dialect", "ansi"] + additional_dependencies: ["--index-url=https://pypi.org/simple/"] + - id: sqlfluff-fix + args: ["--dialect", "ansi"] + additional_dependencies: ["--index-url=https://pypi.org/simple/"] +- repo: https://github.com/psf/black + rev: 21.9b0 + hooks: + - id: black-jupyter # Format jupyter notebooks + additional_dependencies: [".[jupyter]", "--index-url=https://pypi.org/simple/"] +- repo: https://github.com/s-weigand/flake8-nb + rev: v0.4.0 + hooks: + - id: flake8-nb # Lint jupyter notebooks + additional_dependencies: ["--index-url=https://pypi.org/simple/"] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..59fde39 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2022 Merantix Labs GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/infrastructure/docker/Dockerfile b/infrastructure/docker/Dockerfile new file mode 100644 index 0000000..dc7af8b --- /dev/null +++ b/infrastructure/docker/Dockerfile @@ -0,0 +1,36 @@ +FROM python:3.8.9-slim + +RUN apt-get update && \ + apt-get -y install git findutils build-essential unzip wget && \ + apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +ADD . . + +RUN pip3 install keyrings.google-artifactregistry-auth==0.0.3 --index-url=https://pypi.org/simple/ +RUN pip3 install --upgrade pip && \ + pip3 install --require-hashes -r requirements.txt --no-deps --disable-pip-version-check && \ + pip3 cache purge + +# for testing to work, the base image has to have spark<=3.1, >=3.0 +# spark>=3.0 is required by python3.8 +# spark <=3.1 is required by pydeequ (deequ) + +# add java +COPY --from=adoptopenjdk/openjdk8 opt/java opt/java +ENV JAVA_HOME=/opt/java/openjdk +ENV PATH=$PATH:$JAVA_HOME/bin + +# add spark +ADD https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz . +RUN rm -rf /opt/spark && \ + tar -xzf spark-3.2.0-bin-hadoop3.2.tgz -C /opt && \ + mv /opt/spark-3.2.0-bin-hadoop3.2/ /opt/spark +ENV SPARK_HOME="/opt/spark" + +# add hadoop +ADD https://archive.apache.org/dist/hadoop/common/hadoop-3.2.0/hadoop-3.2.0.tar.gz . 
+RUN tar xf hadoop-3.2.0.tar.gz && mv hadoop-3.2.0 /opt/hadoop && rm hadoop-3.2.0.tar.gz +ENV HADOOP_HOME="/opt/hadoop" + +# add shaded gcs connector +ADD https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.2/gcs-connector-hadoop3-2.2.2-shaded.jar /opt/spark/jars/gcs-connector-hadoop3-latest.jar diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..3f851f4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[tool.autopep8] +max_line_length = 120 +ignore="E265,E501,E203" +in-place = true +recursive = true + +[tool.black] +line-length = 120 +include = '\.pyi?$' \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..54b7e5c --- /dev/null +++ b/pytest.ini @@ -0,0 +1,9 @@ +[pytest] +testpaths = + test + squirrel_datasets/integration_test +addopts = -p no:warnings -v --cov=squirrel_datasets --cov-config=.coveragerc +# Directory basename patterns skipped during collection: whitespace-separated fnmatch globs, no quotes or commas. +norecursedirs = .* build dist CVS _darcs {arch} *.egg +# Custom markers are registered under the "markers" option, one "name: description" entry per line. +markers = serial: Tests that should be run sequentially without any tests running in parallel. \ No newline at end of file diff --git a/requirements.dev.in b/requirements.dev.in index 112e5d8..e51c3fe 100644 --- a/requirements.dev.in +++ b/requirements.dev.in @@ -1,4 +1,5 @@ sphinx-rtd-theme +keyrings.google-artifactregistry-auth==0.0.3 twine wheel pytest>=5.3.5 @@ -6,8 +7,12 @@ pytest-timeout pytest-cov pytest-xdist Sphinx<4.0.0 +sphinx-mx-theme>=0.1.4,<1.0.0 sphinx_versions +myst_parser # For importing md into rst files.
pyspark>=3.2.0 flaky # for retry flaky tests nbmake jupyter +networkx +pre-commit==2.16.0 \ No newline at end of file diff --git a/requirements.hub.in b/requirements.hub.in new file mode 100644 index 0000000..f4a5657 --- /dev/null +++ b/requirements.hub.in @@ -0,0 +1 @@ +hub \ No newline at end of file diff --git a/requirements.preprocessing.in b/requirements.preprocessing.in new file mode 100644 index 0000000..cf9b401 --- /dev/null +++ b/requirements.preprocessing.in @@ -0,0 +1,2 @@ +pyspark==3.2.0 +hydra-core>=1.1.0 \ No newline at end of file diff --git a/requirements.torchvision.in b/requirements.torchvision.in new file mode 100644 index 0000000..e5be9d8 --- /dev/null +++ b/requirements.torchvision.in @@ -0,0 +1,2 @@ +torchvision +scipy # required by torch dataset "caltech" \ No newline at end of file