diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index d582eb6..14eed82 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -17,7 +17,7 @@ stages:
   jobs:
   - job: Build
     pool:
-      name: Hosted Ubuntu 1604
+      vmImage: 'ubuntu-18.04'
 
     steps:
     - task: UsePythonVersion@0
@@ -30,16 +30,6 @@ stages:
         # pip install -U databricks-cli
       displayName: 'Load Python Dependencies'
 
-    - task: riserrad.azdo-databricks.azdo-databricks-configuredatabricks.configuredatabricks@0
-      inputs:
-        url: '$(WORKSPACE_REGION_URL)/?o=$(WORKSPACE_ORG_ID)'
-        token: '$(DATABRICKS_TOKEN)'
-      displayName: 'Configure Databricks CLI for AZDO'
-
-    #- script: |
-    #    dbfs ls
-    #  displayName: 'Check config working'
-
    - checkout: self
      persistCredentials: true
      clean: true
@@ -50,6 +40,33 @@ stages:
     ##    git checkout $(Build.SourceBranch)
     #  displayName: 'Get Latest from Branch $(Build.SourceBranchName) / $(Build.SourceBranch) / $(branchName)'
 
+    - script: |
+        python -m pytest --junit-xml=$(Build.Repository.LocalPath)/logs/TEST-LOCAL.xml $(Build.Repository.LocalPath)/libraries/python/dbxdemo/test*.py || true
+        ls logs
+      displayName: 'Run Python Unit Tests for library code'
+
+    - task: PublishTestResults@2
+      inputs:
+        testResultsFiles: '**/TEST-*.xml'
+        failTaskOnFailedTests: true
+        publishRunAttachments: true
+
+    - script: |
+        cd $(Build.Repository.LocalPath)/libraries/python/dbxdemo
+        python3 setup.py sdist bdist_wheel
+        ls dist/
+      displayName: 'Build Python Wheel for Libs'
+
+    - task: riserrad.azdo-databricks.azdo-databricks-configuredatabricks.configuredatabricks@0
+      inputs:
+        url: '$(WORKSPACE_REGION_URL)/?o=$(WORKSPACE_ORG_ID)'
+        token: '$(DATABRICKS_TOKEN)'
+      displayName: 'Configure Databricks CLI for AZDO'
+
+    #- script: |
+    #    dbfs ls
+    #  displayName: 'Check config working'
+
     - script: |
         cat /home/vsts/.databrickscfg
         echo ""
@@ -84,6 +101,23 @@ stages:
         echo $(response)
       displayName: 'Batch test result'
 
+- stage: Release
+  dependsOn: Build
+  condition: and(succeeded(), startsWith(variables['Build.SourceBranchName'], 'releases'))
+  jobs:
+  - job: Release
+    pool:
+      vmImage: 'ubuntu-18.04'
+
+    steps:
+    - task: UsePythonVersion@0
+      displayName: 'Use Python 3.7'
+      inputs:
+        versionSpec: 3.7
+
+    - script: |
+        echo "Release"
+      displayName: 'Release stage'
 
 #- script: |
 #    cat /home/vsts/.databrickscfg
diff --git a/libraries/python/dbxdemo/.gitignore b/libraries/python/dbxdemo/.gitignore
new file mode 100644
index 0000000..c45cb38
--- /dev/null
+++ b/libraries/python/dbxdemo/.gitignore
@@ -0,0 +1,6 @@
+.cache
+__pycache__
+build/
+dist/
+gill.egg-info/
+.idea/
diff --git a/libraries/python/dbxdemo/README.md b/libraries/python/dbxdemo/README.md
new file mode 100644
index 0000000..6a5048b
--- /dev/null
+++ b/libraries/python/dbxdemo/README.md
@@ -0,0 +1,5 @@
+# DBX
+
+An example PySpark project.
+
+The project uses pyenv and pytest. It can also be used to generate egg/wheel files.
\ No newline at end of file
diff --git a/libraries/python/dbxdemo/dbxdemo/__init__.py b/libraries/python/dbxdemo/dbxdemo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libraries/python/dbxdemo/dbxdemo/appendcol.py b/libraries/python/dbxdemo/dbxdemo/appendcol.py
new file mode 100644
index 0000000..d0b2f54
--- /dev/null
+++ b/libraries/python/dbxdemo/dbxdemo/appendcol.py
@@ -0,0 +1,5 @@
+import pyspark.sql.functions as F
+
+def with_status(df):
+    return df.withColumn("status", F.lit("checked"))
+
diff --git a/libraries/python/dbxdemo/dbxdemo/spark.py b/libraries/python/dbxdemo/dbxdemo/spark.py
new file mode 100644
index 0000000..0415d85
--- /dev/null
+++ b/libraries/python/dbxdemo/dbxdemo/spark.py
@@ -0,0 +1,9 @@
+from pyspark.sql import SparkSession
+from functools import lru_cache
+
+@lru_cache(maxsize=None)
+def get_spark():
+    return (SparkSession.builder
+            .appName("dbxdemo")
+            .getOrCreate())
+
diff --git a/libraries/python/dbxdemo/requirements.txt b/libraries/python/dbxdemo/requirements.txt
new file mode 100644
index 0000000..f140a97
--- /dev/null
+++ b/libraries/python/dbxdemo/requirements.txt
@@ -0,0 +1,3 @@
+pytest==3.2.2
+pyspark==2.4.5
+setuptools==28.8.0
diff --git a/libraries/python/dbxdemo/setup.py b/libraries/python/dbxdemo/setup.py
new file mode 100644
index 0000000..df852f8
--- /dev/null
+++ b/libraries/python/dbxdemo/setup.py
@@ -0,0 +1,10 @@
+from setuptools import setup
+
+setup(name='dbxdemo',
+      version='0.0.15',
+      description='A sample PySpark application - 0.0.15',
+      author='Peter Tamisin',
+      author_email='peter.tamisin@databricks.com',
+      url='www.databricks.com',
+      packages=['dbxdemo'],
+      zip_safe=False)
diff --git a/libraries/python/dbxdemo/test_appendcol.py b/libraries/python/dbxdemo/test_appendcol.py
new file mode 100644
index 0000000..8958fab
--- /dev/null
+++ b/libraries/python/dbxdemo/test_appendcol.py
@@ -0,0 +1,30 @@
+import pytest
+
+from dbxdemo.spark import get_spark
+from dbxdemo.appendcol import with_status
+
+class TestAppendCol(object):
+
+    def test_with_status(self):
+        source_data = [
+            ("pete", "tamisin", "peter.tamisin@databricks.com"),
+            ("jason", "baer", "jason.baer@databricks.com")
+        ]
+        source_df = get_spark().createDataFrame(
+            source_data,
+            ["first_name", "last_name", "email"]
+        )
+
+        actual_df = with_status(source_df)
+
+        expected_data = [
+            ("pete", "tamisin", "peter.tamisin@databricks.com", "checked"),
+            ("jason", "baer", "jason.baer@databricks.com", "checked")
+        ]
+        expected_df = get_spark().createDataFrame(
+            expected_data,
+            ["first_name", "last_name", "email", "status"]
+        )
+
+        assert(expected_df.collect() == actual_df.collect())
+
diff --git a/pipeline/ETL/lib_use/files.py b/pipeline/ETL/lib_use/files.py
new file mode 100644
index 0000000..f3d0180
--- /dev/null
+++ b/pipeline/ETL/lib_use/files.py
@@ -0,0 +1,8 @@
+# Databricks notebook source
+from dbxdemo import appendcol
+
+# COMMAND ----------
+
+display(appendcol.with_status(spark.read.parquet("/databricks-datasets/samples/lending_club/parquet/")))
+
+
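
A possible follow-up step, not included in this patch: once 'Build Python Wheel for Libs' has produced dist/dbxdemo-0.0.15-py3-none-any.whl and the CLI has been configured, the wheel could be copied to DBFS and attached to a cluster. A minimal sketch, assuming a dbfs:/libs/python/dbxdemo target path and an EXISTING_CLUSTER_ID pipeline variable (neither is defined by this diff):

    - script: |
        # Sketch only: ship the wheel built above to DBFS and install it on an
        # existing cluster via the databricks CLI. The DBFS path and the
        # $(EXISTING_CLUSTER_ID) variable are assumptions, not part of this change.
        dbfs cp --overwrite $(Build.Repository.LocalPath)/libraries/python/dbxdemo/dist/dbxdemo-0.0.15-py3-none-any.whl dbfs:/libs/python/dbxdemo/dbxdemo-0.0.15-py3-none-any.whl
        databricks libraries install --cluster-id $(EXISTING_CLUSTER_ID) --whl dbfs:/libs/python/dbxdemo/dbxdemo-0.0.15-py3-none-any.whl
      displayName: 'Deploy Wheel to Databricks (sketch)'

Such a step would then let the files.py notebook above import dbxdemo on that cluster.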