
Commit

Merge pull request #3 from MiguelPeralvo/dev
Dev
MiguelPeralvo authored Jun 10, 2020
2 parents 7bb9bbe + bbb0839 commit 75013b4
Showing 10 changed files with 121 additions and 11 deletions.
56 changes: 45 additions & 11 deletions azure-pipelines.yml
@@ -17,7 +17,7 @@ stages:
  jobs:
  - job: Build
    pool:
      name: Hosted Ubuntu 1604
      vmImage: 'ubuntu-18.04'

    steps:
    - task: UsePythonVersion@0
@@ -30,16 +30,6 @@ stages:
        # pip install -U databricks-cli
      displayName: 'Load Python Dependencies'

    - task: riserrad.azdo-databricks.azdo-databricks-configuredatabricks.configuredatabricks@0
      inputs:
        url: '$(WORKSPACE_REGION_URL)/?o=$(WORKSPACE_ORG_ID)'
        token: '$(DATABRICKS_TOKEN)'
      displayName: 'Configure Databricks CLI for AZDO'

    #- script: |
    #    dbfs ls
    #  displayName: 'Check config working'

    - checkout: self
      persistCredentials: true
      clean: true
@@ -50,6 +40,33 @@
    ## git checkout $(Build.SourceBranch)
    #  displayName: 'Get Latest from Branch $(Build.SourceBranchName) / $(Build.SourceBranch) / $(branchName)'

    - script: |
        python -m pytest --junit-xml=$(Build.Repository.LocalPath)/logs/TEST-LOCAL.xml $(Build.Repository.LocalPath)/libraries/python/dbxdemo/test*.py || true
        ls logs
      displayName: 'Run Python Unit Tests for library code'
    - task: PublishTestResults@2
      inputs:
        testResultsFiles: '**/TEST-*.xml'
        failTaskOnFailedTests: true
        publishRunAttachments: true
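
The || true at the end of the pytest command deserves a note: it forces the script step to exit 0 even when tests fail, so the pipeline always reaches PublishTestResults@2, which reads the generated TEST-*.xml report and fails the build itself through failTaskOnFailedTests: true. The net effect is that test results are published even on a red run.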

    - script: |
        cd $(Build.Repository.LocalPath)/libraries/python/dbxdemo
        python3 setup.py sdist bdist_wheel
        ls dist/
      displayName: 'Build Python Wheel for Libs'
    - task: riserrad.azdo-databricks.azdo-databricks-configuredatabricks.configuredatabricks@0
      inputs:
        url: '$(WORKSPACE_REGION_URL)/?o=$(WORKSPACE_ORG_ID)'
        token: '$(DATABRICKS_TOKEN)'
      displayName: 'Configure Databricks CLI for AZDO'

    #- script: |
    #    dbfs ls
    #  displayName: 'Check config working'

    - script: |
        cat /home/vsts/.databrickscfg
        echo ""
@@ -84,6 +101,23 @@
        echo $(response)
      displayName: 'Batch test result'

- stage: Release
  dependsOn: Build
  condition: and(succeeded(), startsWith(variables['Build.SourceBranchName'], 'releases'))
  jobs:
  - job: Release
    pool:
      vmImage: 'ubuntu-18.04'

    steps:
    - task: UsePythonVersion@0
      displayName: 'Use Python 3.7'
      inputs:
        versionSpec: 3.7

    - script: |
        echo "Release"
      displayName: 'Release stage'
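
The new Release stage is a gated placeholder: it depends on a successful Build and runs only when the triggering branch satisfies the startsWith condition. One caveat: Build.SourceBranchName holds only the final segment of the branch ref (refs/heads/releases/v1 yields v1), so this condition matches branches literally named releases*; to target everything under a releases/ folder, the usual alternative is to test Build.SourceBranch against 'refs/heads/releases/'.
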
    #- script: |
    #    cat /home/vsts/.databrickscfg
6 changes: 6 additions & 0 deletions libraries/python/dbxdemo/.gitignore
@@ -0,0 +1,6 @@
.cache
__pycache__
build/
dist/
dbxdemo.egg-info/
.idea/
5 changes: 5 additions & 0 deletions libraries/python/dbxdemo/README.md
@@ -0,0 +1,5 @@
# DBX

An example PySpark project.

The project uses pyenv and pytest, and it can also be used to generate egg and wheel distributions.
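
As the pipeline above shows, the tests run via python -m pytest and the distributions are built with python3 setup.py sdist bdist_wheel, which drops a source tarball and a wheel into dist/.
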
Empty file.
5 changes: 5 additions & 0 deletions libraries/python/dbxdemo/dbxdemo/appendcol.py
@@ -0,0 +1,5 @@
import pyspark.sql.functions as F

def with_status(df):
    return df.withColumn("status", F.lit("checked"))
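
F.lit builds a constant column, so with_status simply appends a literal "checked" status field to every row of the incoming DataFrame; the unit test added below exercises exactly this behavior.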

9 changes: 9 additions & 0 deletions libraries/python/dbxdemo/dbxdemo/spark.py
@@ -0,0 +1,9 @@
from pyspark.sql import SparkSession
from functools import lru_cache

@lru_cache(maxsize=None)
def get_spark():
    return (SparkSession.builder
            .appName("dbxdemo")
            .getOrCreate())
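
The lru_cache decorator memoizes this zero-argument call, so every caller shares a single SparkSession object (SparkSession.builder.getOrCreate() would already return the same underlying session, but caching also skips the repeated builder work). A minimal sketch of the behavior, assuming pyspark is installed locally:

from dbxdemo.spark import get_spark

s1 = get_spark()
s2 = get_spark()
assert s1 is s2  # the cached call hands back the identical session object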

3 changes: 3 additions & 0 deletions libraries/python/dbxdemo/requirements.txt
@@ -0,0 +1,3 @@
pytest==3.2.2
pyspark==2.4.5
setuptools==28.8.0
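
These pins are deliberate companions to the pipeline configuration: pyspark 2.4.x predates Python 3.8 support, which is why both stages pin UsePythonVersion@0 to Python 3.7.
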
10 changes: 10 additions & 0 deletions libraries/python/dbxdemo/setup.py
@@ -0,0 +1,10 @@
from setuptools import setup

setup(name='dbxdemo',
      version='0.0.15',
      description='A sample PySpark application - 0.0.15',
      author='Peter Tamisin',
      author_email='peter.tamisin@databricks.com',
      url='https://www.databricks.com',
      packages=['dbxdemo'],
      zip_safe=False)
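
Running python3 setup.py sdist bdist_wheel against this file, as the Build stage does, produces a source tarball and a wheel (typically dbxdemo-0.0.15-py3-none-any.whl) under dist/; the dist/, build/ and egg-info entries in the new .gitignore keep those build artifacts out of version control.
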
30 changes: 30 additions & 0 deletions libraries/python/dbxdemo/test_appendcol.py
@@ -0,0 +1,30 @@
import pytest

from dbxdemo.spark import get_spark
from dbxdemo.appendcol import with_status

class TestAppendCol(object):

    def test_with_status(self):
        source_data = [
            ("pete", "tamisin", "peter.tamisin@databricks.com"),
            ("jason", "baer", "jason.baer@databricks.com")
        ]
        source_df = get_spark().createDataFrame(
            source_data,
            ["first_name", "last_name", "email"]
        )

        actual_df = with_status(source_df)

        expected_data = [
            ("pete", "tamisin", "peter.tamisin@databricks.com", "checked"),
            ("jason", "baer", "jason.baer@databricks.com", "checked")
        ]
        expected_df = get_spark().createDataFrame(
            expected_data,
            ["first_name", "last_name", "email", "status"]
        )

        assert(expected_df.collect() == actual_df.collect())
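
The direct collect() comparison works because this tiny local DataFrame preserves insertion order; on larger or repartitioned data the assertion would become order-sensitive. A defensive variant (hypothetical, not part of this commit) sorts both sides first:

# Row is a tuple subclass, so collected rows sort lexicographically
assert sorted(expected_df.collect()) == sorted(actual_df.collect())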

8 changes: 8 additions & 0 deletions pipeline/ETL/lib_use/files.py
@@ -0,0 +1,8 @@
# Databricks notebook source
from dbxdemo import appendcol

# COMMAND ----------

display(appendcol.with_status(spark.read.parquet("/databricks-datasets/samples/lending_club/parquet/")))
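
display() and the implicit spark session are globals that exist only inside a Databricks notebook, hence the "# Databricks notebook source" header. A rough local equivalent (hypothetical; the sample parquet path exists only on Databricks-managed storage) would be:

from dbxdemo import appendcol
from dbxdemo.spark import get_spark

df = get_spark().read.parquet("/databricks-datasets/samples/lending_club/parquet/")
appendcol.with_status(df).show()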

