Feature/pre release 2.4 #227

Open · wants to merge 15 commits into base: main
29 changes: 21 additions & 8 deletions .github/workflows/build-release.yml
@@ -104,18 +104,17 @@ jobs:
name: FunctionZip
path: ./artifacts

- name: Deploy Azure Function to Integration Env
uses: Azure/functions-action@v1.4.6
with:
app-name: ${{ secrets.INT_FUNC_NAME }}
package: ./artifacts/FunctionZip.zip
publish-profile: ${{ secrets.INT_PUBLISH_PROFILE }}

- name: Azure Login
uses: azure/login@v1
with:
creds: ${{ secrets.INT_AZ_CLI_CREDENTIALS }}


- name: Deploy Azure Function to Integration Env
uses: Azure/functions-action@v1.5.1
with:
app-name: ${{ secrets.INT_FUNC_NAME }}
package: ./artifacts/FunctionZip.zip

- name: Compare and Update App Settings on Deployed Function
uses: azure/CLI@v1
with:
@@ -134,6 +133,13 @@ jobs:
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}

- name: Start Integration ADX Cluster
run: source tests/integration/manage-adx-cluster.sh start ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.ADX_RG_NAME }} ${{ secrets.ADX_CLUSTER_NAME }}
env:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}

- name: Install Python Requirements and Databricks CLI
run: pip install pyapacheatlas==0.12.0 azure-identity databricks-cli

@@ -173,6 +179,13 @@ jobs:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}

- name: Stop Integration ADX Cluster
run: source tests/integration/manage-adx-cluster.sh stop ${{ secrets.INT_SUBSCRIPTION_ID }} ${{ secrets.ADX_RG_NAME }} ${{ secrets.ADX_CLUSTER_NAME }}
env:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}

createRelease:
name: Create Release
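The integration steps above start and stop an ADX cluster through `tests/integration/manage-adx-cluster.sh`, which is not included in this hunk. A minimal sketch of what such a script might look like, assuming it simply logs in with the service principal from the environment and wraps `az kusto cluster start`/`stop` (the positional arguments follow the workflow invocation; everything else is illustrative):

```bash
#!/usr/bin/env bash
# Hypothetical sketch of tests/integration/manage-adx-cluster.sh
# Usage: manage-adx-cluster.sh <start|stop> <subscription-id> <resource-group> <cluster-name>
# Requires the kusto extension: az extension add --name kusto
set -euo pipefail

ACTION="$1"
SUBSCRIPTION_ID="$2"
RESOURCE_GROUP="$3"
CLUSTER_NAME="$4"

# Credentials come from the workflow's env block
az login --service-principal \
  --username "$AZURE_CLIENT_ID" \
  --password "$AZURE_CLIENT_SECRET" \
  --tenant "$AZURE_TENANT_ID" >/dev/null

az account set --subscription "$SUBSCRIPTION_ID"

# Start or stop the cluster; the command blocks until the operation completes
az kusto cluster "$ACTION" \
  --resource-group "$RESOURCE_GROUP" \
  --name "$CLUSTER_NAME"
```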
4 changes: 4 additions & 0 deletions LIMITATIONS.md
@@ -159,3 +159,7 @@ Starting with OpenLineage 0.18.0 and release 2.3.0 of the solution accelerator,

* Delta Merge statements are not supported at this time
* Delta to Delta is NOT supported at this time

# Unity Catalog

Unity Catalog is not supported due to [OpenLineage not yet supporting Unity Catalog](https://github.com/OpenLineage/OpenLineage/issues/2121). Microsoft Purview supports [Unity Catalog metadata scanning](https://learn.microsoft.com/en-us/purview/register-scan-azure-databricks-unity-catalog), which is the preferred approach for handling Unity Catalog.
1 change: 1 addition & 0 deletions README.md
@@ -63,6 +63,7 @@ Gathering lineage data is performed in the following steps:
* Support **column level lineage** for ABFSS, WASBS, and default metastore hive tables (see [Limitations](./LIMITATIONS.md#column-level-mapping-supported-sources) for more detail)
* Once configured, <span style="color: red;">**does not require any code changes to notebooks or jobs**</span>
* Can [add new source support through configuration](./docs/extending-source-support.md)
* Note: Unity Catalog is not supported. [Unity Catalog metadata scanning](https://learn.microsoft.com/en-us/purview/register-scan-azure-databricks-unity-catalog) is supported in Microsoft Purview and is the preferred way to collect metadata and lineage for Unity Catalog-enabled Databricks workspaces.

## Videos

51 changes: 9 additions & 42 deletions deploy-base.md
@@ -156,6 +156,7 @@ Follow the instructions below and refer to the [OpenLineage Databricks Install I
> If you do not have line feed endings, your cluster will fail to start due to an init script error.

3. Upload the init script and jar to dbfs using the [Databricks CLI](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/cli/)
* Alternatively, use the [databricks workspace import --format SOURCE](https://github.com/databricks/cli/blob/main/docs/commands.md#databricks-workspace-import---import-a-workspace-object) command to upload the init script as a workspace file.

```text
dbfs mkdirs dbfs:/databricks/openlineage
@@ -181,7 +182,7 @@

After configuring the secret storage, the API key for OpenLineage can be configured in the Spark config, as in the following example:
`spark.openlineage.url.param.code {{secrets/secret_scope/Ol-Output-Api-Key}}`
1. Add a reference to the uploaded init script `dbfs:/databricks/openlineage/open-lineage-init-script.sh` on the [Init script section](https://docs.microsoft.com/en-us/azure/databricks/clusters/init-scripts#configure-a-cluster-scoped-init-script-using-the-ui) of the Advanced Options.
1. Add a reference to the uploaded init script `dbfs:/databricks/openlineage/open-lineage-init-script.sh` on the [Init script section](https://learn.microsoft.com/en-us/azure/databricks/init-scripts/cluster-scoped#configure-a-cluster-scoped-init-script-using-the-ui) of the Advanced Options.

5. At this point, you can run a Databricks notebook on an "all-purpose cluster" in your configured workspace and observe lineage in Microsoft Purview once the Databricks notebook has finished running all cells.

@@ -191,53 +192,19 @@ Follow the instructions below and refer to the [OpenLineage Databricks Install I

### <a id="jobs-lineage" />Support Extracting Lineage from Databricks Jobs

To support Databricks Jobs, you must add the service principal to your Databricks workspace. To use the below scripts, you must authenticate to Azure Databricks using either [access tokens](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication) or [AAD tokens](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/aad/). The snippets below assume you have generated an access token.

1. [Add your Service Principal to Databricks as a User](https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/scim/scim-sp#add-service-principal)
* Create a file named `add-service-principal.json` that contains
```json
{
"schemas": [ "urn:ietf:params:scim:schemas:core:2.0:ServicePrincipal" ],
"applicationId": "<azure-application-id>",
"displayName": "<display-name>",
"groups": [
{
"value": "<group-id>"
}
],
"entitlements": [
{
"value":"allow-cluster-create"
}
]
}
```
* Provide a group id by executing the `groups` Databricks API and extracting a group id.
```bash
curl -X GET \
https://<databricks-instance>/api/2.0/preview/scim/v2/Groups \
--header 'Authorization: Bearer DATABRICKS_ACCESS_TOKEN' \
| jq .
```
You may use the admin group id or create a separate group to isolate the service principal.

* Execute the following bash command after the file above has been created and populated.
```bash
curl -X POST \
https://<databricks-instance>/api/2.0/preview/scim/v2/ServicePrincipals \
--header 'Content-type: application/scim+json' \
--header 'Authorization: Bearer DATABRICKS_ACCESS_TOKEN' \
--data @add-service-principal.json \
| jq .
```
2. [Assign the Service Principal as a contributor to the Databricks Workspace](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-portal?tabs=current)
To support Databricks Jobs, you must add the service principal to your Databricks workspace.

1. Follow the [official documentation](https://learn.microsoft.com/en-us/azure/databricks/administration-guide/users-groups/service-principals#--add-a-service-principal-to-a-workspace-using-the-workspace-admin-settings) to add your service principal through the Workspace Admin Settings User Interface. This will also add it to the Admin group.
* For adding the service principal via REST API see [databricks jobs and service principals](./docs/databricks-jobs-service-principal.md)

2. This should be the same Service Principal that has the Data Curator role in Microsoft Purview.

3. At this point, you can run a Databricks job on a "job cluster" in your configured workspace and observe lineage in Microsoft Purview once the Databricks job has finished.

4. If you do not see any lineage, please follow the steps in the [troubleshooting guide](./TROUBLESHOOTING.md).

### <a id="global-init"/>Global Init Scripts

You can also configure the OpenLineage listener to run globally, so that any cluster which is created automatically runs the listener. To do this, you can utilize a [global init script](https://docs.microsoft.com/en-us/azure/databricks/clusters/init-scripts#global-init-scripts).
You can also configure the OpenLineage listener to run globally, so that any cluster which is created automatically runs the listener. To do this, you can utilize a [global init script](https://learn.microsoft.com/en-us/azure/databricks/init-scripts/global).

**Note**: Global initialization cannot currently use values from the Azure Databricks KeyVault integration mentioned above. If you use global initialization scripts, this key needs to be retrieved in the notebooks themselves or hardcoded into the global init script.
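If you go the global route, one way to register the script is the Databricks Global Init Scripts REST API, which takes a base64-encoded script body. The sketch below assumes `open-lineage-init-script.sh` is available locally and that you already have a workspace access token; the script name is illustrative.

```bash
# Register the OpenLineage init script globally (illustrative name, local path assumed)
SCRIPT_B64=$(base64 -w 0 open-lineage-init-script.sh)   # -w 0 disables line wrapping (GNU base64)

curl -X POST \
  https://<databricks-instance>/api/2.0/global-init-scripts \
  --header 'Authorization: Bearer <DATABRICKS_ACCESS_TOKEN>' \
  --header 'Content-Type: application/json' \
  --data "{\"name\": \"open-lineage\", \"script\": \"${SCRIPT_B64}\", \"enabled\": true, \"position\": 0}"
```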
6 changes: 3 additions & 3 deletions deployment/infra/newdeploymenttemp.json
@@ -57,7 +57,7 @@
"openlineageKeyVaultName": "[replace(replace(toLower(concat(concat('keyvaut',variables('paramName')),variables('uniqueName'))),'-',''),'_','')]",
"purviewAccountName": "[parameters('purviewName')]",
"eventHubSku": "Standard",
"captureEnabled": true,
"captureEnabled": false,
"captureEncodingFormat": "Avro",
"captureTime": 60,
"captureSize": 314572800,
@@ -86,7 +86,7 @@
"sku": {
"name": "Standard_LRS"
},
"kind": "Storage",
"kind": "StorageV2",
"tags": "[parameters('resourceTagValues')]",
"properties": {
"allowBlobPublicAccess": "False",
@@ -101,7 +101,7 @@
"sku": {
"name": "Standard_LRS"
},
"kind": "Storage",
"kind": "StorageV2",
"tags": "[parameters('resourceTagValues')]",
"properties": {
"allowBlobPublicAccess": "False"
1 change: 0 additions & 1 deletion docs/configuration.md
@@ -30,7 +30,6 @@ The following app settings are experimental and may be removed in future release
| App Setting| Default Value in Code| Note|
|----|----|----|
|useResourceSet|true|Experimental feature|
|maxQueryPlanSize|null|If the query plan bytes is greater than this value it will be removed from the databricks_process|
|prioritizeFirstResourceSet|true|When matching against existing assets, the first resource set found will be prioritized over other assets like folders or purview custom connector entities.|
|Spark_Entities|databricks_workspace;databricks_job;databricks_notebook;databricks_notebook_task||
|Spark_Process|databricks_process||
10 changes: 10 additions & 0 deletions docs/data-factory.md
@@ -0,0 +1,10 @@
# Data Factory and Databricks Notebook Lineage

The solution accelerator supports capturing lineage for Databricks Notebook activities in Azure Data Factory (ADF). After running a notebook through ADF on an interactive or job cluster, you will see a Databricks Job asset in Microsoft Purview with a name similar to `ADF_<factory name>_<pipeline name>`. For each Databricks notebook activity, you will also see a Databricks Task with a name similar to `ADF_<factory name>_<pipeline name>_<activity name>`.

* At this time, the Microsoft Purview view of Azure Data Factory lineage will not contain these tasks unless the Databricks Task uses a data source from, or feeds one into, a Data Flow or Copy activity.
* Copy activities may not show lineage connecting to these Databricks tasks, since lineage from Databricks is emitted as individual file assets rather than folder or resource set assets.

## Enable Collecting Data Factory Lineage

To enable Data Factory lineage, you must add the [Service Principal to the Databricks Workspace](./databricks-jobs-service-principal.md) and add it to at least the `users` group.
63 changes: 63 additions & 0 deletions docs/databricks-jobs-service-principal.md
@@ -0,0 +1,63 @@
# Add Service Principal to Databricks Workspace for Job and Data Factory Support

When extracting lineage for Databricks Jobs (also known as Workflows), the Solution Accelerator needs to be able to read Databricks Job information. This also applies to Data Factory calling Databricks Notebooks or Python Files as Activities in a Pipeline. To read this information, the Solution Accelerator Service Principal (the one that has been granted the Data Curator role in Microsoft Purview) must be added to the Databricks Workspace as a user.

Given that this solution does not support Unity Catalog, it is assumed that you are using the Workspace Admin settings and not the Account Console to add and manage Service Principals.

For the best experience, follow the [official documentation on how to add a service principal to the Databricks Workspace and users group](https://learn.microsoft.com/en-us/azure/databricks/administration-guide/users-groups/service-principals#--add-a-service-principal-to-a-workspace-using-the-workspace-admin-settings) via the UI.

The remainder of this page provides sample code for adding a service principal via REST API calls.

## Generate a Databricks Access Token

This sample uses [Databricks Personal Access Tokens](https://learn.microsoft.com/en-us/azure/databricks/dev-tools/auth/pat). Generate a token for use in the code below.
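If you would rather not create the token through the workspace UI, one possible alternative (a sketch, not part of the original instructions) is to mint a PAT via the Token API, authenticating with an Azure AD token issued for the Azure Databricks resource:

```bash
# Get an AAD token for the Azure Databricks first-party resource (well-known application ID)
AAD_TOKEN=$(az account get-access-token \
  --resource 2ff814a6-3304-4ab8-85cb-cd0e6f879c1d \
  --query accessToken -o tsv)

# Create a personal access token valid for 30 days; the token value is in the response
curl -X POST \
  https://<databricks-instance>/api/2.0/token/create \
  --header "Authorization: Bearer ${AAD_TOKEN}" \
  --data '{"comment": "solution-accelerator-setup", "lifetime_seconds": 2592000}' \
  | jq .
```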

## Add your Service Principal to Databricks as a User and Add to Users Group

[Databricks Workspace REST API: Service Principals - Create](https://docs.databricks.com/api/azure/workspace/serviceprincipals/create)

* Find the `users` group id by calling the Databricks `Groups` API and extracting the group id.
```bash
curl -X GET \
https://<databricks-instance>/api/2.0/preview/scim/v2/Groups \
--header 'Authorization: Bearer <DATABRICKS_ACCESS_TOKEN>' \
| jq .
```
You may use the `users` group id or create a separate group to isolate the service principal.
* Create a file named `add-service-principal.json` containing the payload below, with the `users` group id filled in.
```json
{
"schemas": [ "urn:ietf:params:scim:schemas:core:2.0:ServicePrincipal" ],
"applicationId": "<azure-application-id>",
"displayName": "<display-name>",
"groups": [
{
"value": "<group-id>"
}
],
"entitlements": [
{
"value":"allow-cluster-create"
}
]
}
```
* Execute the following bash command after the file above has been created and populated.
```bash
curl -X POST \
https://<databricks-instance>/api/2.0/preview/scim/v2/ServicePrincipals \
--header 'Content-type: application/scim+json' \
--header 'Authorization: Bearer <DATABRICKS_ACCESS_TOKEN>' \
--data @add-service-principal.json \
| jq .
```
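To confirm the service principal was registered, you can list service principals through the same SCIM endpoint (this assumes the same access token):

```bash
curl -X GET \
  https://<databricks-instance>/api/2.0/preview/scim/v2/ServicePrincipals \
  --header 'Authorization: Bearer <DATABRICKS_ACCESS_TOKEN>' \
  | jq '.Resources[] | {applicationId, displayName}'
```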

## Optional: Use the Admin Group

In some cases, you may need to use the Admin group. Repeat the steps above and add the service principal to the Admin group.

## Optional: Assign the Service Principal as a contributor to the Databricks Workspace

The above steps should be sufficient, but in some cases you may need to add the service principal as a Contributor on the Databricks Workspace resource.

[How to assign roles to a resource](https://docs.microsoft.com/en-us/azure/role-based-access-control/role-assignments-portal?tabs=current)
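If you do need the role assignment, a CLI equivalent of the portal steps would look roughly like the following; every identifier is a placeholder:

```bash
az role assignment create \
  --assignee "<azure-application-id>" \
  --role "Contributor" \
  --scope "/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Databricks/workspaces/<workspace-name>"
```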
@@ -39,18 +39,25 @@ public ValidateOlEvent(ILoggerFactory loggerFactory)
/// </summary>
/// <param name="olEvent">OpenLineage Event message</param>
/// <returns>true if input is valid, false if not</returns>
public bool Validate(Event olEvent){
public bool Validate(Event? olEvent){
if (olEvent == null){
_log.LogWarning("Event considered NOT valid as it was null");
return false;
}
_log.LogInformation($"Validating input of an event with {olEvent.Inputs.Count} inputs and {olEvent.Outputs.Count} outputs");
if (olEvent.Inputs.Count > 0 && olEvent.Outputs.Count > 0)
{
// Need to rework for multiple inputs and outputs in one packet - possibly combine and then hash
if (InOutEqual(olEvent))
{
_log.LogWarning($"Event considered NOT valid due to inputs and outputs being equal");
return false;
}
if (olEvent.EventType == "START")
{
if (olEvent.Run.Facets.EnvironmentProperties == null)
{
_log.LogWarning($"Start Event considered NOT valid due to missing Databricks Envrionment Properties");
return false;
}
return true;
@@ -61,9 +68,11 @@ public bool Validate(Event olEvent){
}
else
{
_log.LogWarning($"Event considered NOT valid due to not matching any other condition");
return false;
}
}
_log.LogWarning($"Event considered NOT valid due to not matching any other condition");
return false;
}

@@ -77,7 +86,7 @@ private bool InOutEqual(Event ev)
nms2.Sort();
nmspc.Sort();
nmspc2.Sort();
return Enumerable.SequenceEqual(nms, nms2) && Enumerable.SequenceEqual(nms, nms2);
return Enumerable.SequenceEqual(nms, nms2) && Enumerable.SequenceEqual(nmspc, nmspc2);
}
}
}