Merge branch 'master' into improve_iceberg_connector
skrydal authored Jan 10, 2025
2 parents 66d53f4 + a6cd995 commit eb14f5c
Showing 19 changed files with 876 additions and 702 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/build-and-test.yml
@@ -118,10 +118,12 @@ jobs:
run: |
echo "BACKEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(metadata-models|entity-registry|datahuyb-graphql-core|metadata-io|metadata-jobs|metadata-utils|metadata-service|medata-dao-impl|metadata-operation|li-utils|metadata-integration|metadata-events|metadata-auth|ingestion-scheduler|notifications|datahub-upgrade)' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
echo "FRONTEND_FILES=`find ./build/coverage-reports/ -type f | grep -E '(datahub-frontend|datahub-web-react).*\.(xml|json)$' | xargs | sed 's/ /,/g'`" >> $GITHUB_ENV
- name: Generate tz artifact name
run: echo "NAME_TZ=$(echo ${{ matrix.timezone }} | tr '/' '-')" >> $GITHUB_ENV
- uses: actions/upload-artifact@v4
if: always()
with:
name: Test Results (build) - ${{ matrix.command}}-${{ matrix.timezone }}
name: Test Results (build) - ${{ matrix.command}}-${{ env.NAME_TZ }}
path: |
**/build/reports/tests/test/**
**/build/test-results/test/**
1 change: 1 addition & 0 deletions build.gradle
@@ -379,6 +379,7 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {

resolutionStrategy.force externalDependency.antlr4Runtime
resolutionStrategy.force externalDependency.antlr4
resolutionStrategy.force 'org.apache.mina:mina-core:2.2.4'
}
}

24 changes: 20 additions & 4 deletions docs-website/README.md
@@ -130,7 +130,6 @@ The purpose of this section is to provide developers & technical users with conc

This section aims to provide plain-language feature overviews for technical and non-technical readers alike.


## Docs Generation Features

**Includes all markdown files**
@@ -145,16 +144,33 @@ You can suppress this check by adding the path to the file in a comment in `side

Use an "inline" directive to include code snippets from other files. The `show_path_as_comment` option will include the path to the file as a comment at the top of the snippet.

```python
{{ inline /metadata-ingestion/examples/library/data_quality_mcpw_rest.py show_path_as_comment }}
```

**Command Output**

Use the `{{ command-output cmd }}` directive to run subprocesses and inject the outputs into the final markdown.

{{ command-output python -c 'print("Hello world")' }}
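For example, the directive above would be replaced in the rendered page by the command's trimmed stdout:

```
Hello world
```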

This also works for multi-line scripts.

{{ command-output
source metadata-ingestion/venv/bin/activate
python -m <something>
}}

Regardless of the location of the markdown file, the commands will be executed with the working directory set to the repo root.

Only the stdout of the subprocess will be injected into the markdown. The stderr, if any, will be included as an HTML comment.

## Docs site generation process

This process is orchestrated by a combination of Gradle and Yarn tasks. The main entrypoint is the `docs-website:yarnGenerate` task, which eventually runs `yarn run generate`.

Steps:

1. Generate the combined GraphQL schema using the `docs-website:generateGraphQLSchema` Gradle task. This generates `./graphql/combined.graphql`.
2. Generate docs for ingestion sources using the `:metadata-ingestion:docGen` Gradle task.
3. Generate docs for our metadata model using the `:metadata-ingestion:modelDocGen` Gradle task.
37 changes: 37 additions & 0 deletions docs-website/generateDocsDir.ts
@@ -439,6 +439,42 @@ function markdown_process_inline_directives(
contents.content = new_content;
}

function markdown_process_command_output(
contents: matter.GrayMatterFile<string>,
filepath: string
): void {
const new_content = contents.content.replace(
/^{{\s*command-output\s*([\s\S]*?)\s*}}$/gm,
(_, command: string) => {
try {
// Change to repo root directory before executing command
const repoRoot = path.resolve(__dirname, "..");

console.log(`Executing command: ${command}`);

// Execute the command and capture output
const output = execSync(command, {
cwd: repoRoot,
encoding: "utf8",
stdio: ["pipe", "pipe", "pipe"],
});

// Return the command output
return output.trim();
} catch (error: any) {
// If there's an error, include it as a comment
const errorMessage = error.stderr
? error.stderr.toString()
: error.message;
return `${
error.stdout ? error.stdout.toString().trim() : ""
}\n<!-- Error: ${errorMessage.trim()} -->`;
}
}
);
contents.content = new_content;
}
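
For illustration, here is a rough Python equivalent of the directive expansion implemented above (a sketch only; the real implementation is the TypeScript function above, and the helper name `expand_command_output` is made up):

```python
import re
import subprocess

# Matches {{ command-output <cmd> }} directives, including multi-line commands.
DIRECTIVE = re.compile(r"^\{\{\s*command-output\s*([\s\S]*?)\s*\}\}$", re.MULTILINE)


def expand_command_output(markdown: str, repo_root: str) -> str:
    def run(match: "re.Match[str]") -> str:
        proc = subprocess.run(
            match.group(1),
            shell=True,
            cwd=repo_root,  # commands always run from the repo root
            capture_output=True,
            text=True,
        )
        if proc.returncode != 0:
            # Keep any partial stdout and embed stderr as an HTML comment,
            # mirroring the error handling above.
            return f"{proc.stdout.strip()}\n<!-- Error: {proc.stderr.strip()} -->"
        return proc.stdout.strip()

    return DIRECTIVE.sub(run, markdown)
```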

function markdown_sanitize_and_linkify(content: string): string {
// MDX escaping
content = content.replace(/</g, "&lt;");
@@ -602,6 +638,7 @@ function copy_python_wheels(): void {
markdown_rewrite_urls(contents, filepath);
markdown_enable_specials(contents, filepath);
markdown_process_inline_directives(contents, filepath);
markdown_process_command_output(contents, filepath);
//copy_platform_logos();
// console.log(contents);

@@ -0,0 +1,16 @@
### Configuration Notes

1. Grant your user access to the Report Server, as described in Microsoft's [Grant user access to a Report Server](https://docs.microsoft.com/en-us/sql/reporting-services/security/grant-user-access-to-a-report-server?view=sql-server-ver16) doc.
2. Use that user's credentials in the YAML recipe file.

### Concept mapping

| Power BI Report Server | DataHub     |
| ---------------------- | ----------- |
| `Paginated Report` | `Dashboard` |
| `Power BI Report` | `Dashboard` |
| `Mobile Report` | `Dashboard` |
| `Linked Report` | `Dashboard` |
| `Dataset, Datasource` | `N/A` |
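
For reference, a minimal ingestion run wired up through DataHub's Python API might look like the following sketch (the `powerbi-report-server` source type is real, but the config field names used here are assumptions; consult the source's configuration docs for the authoritative schema):

```python
# A hypothetical minimal recipe for the powerbi-report-server source;
# the config keys below are assumptions, not the authoritative schema.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "powerbi-report-server",
            "config": {
                # Credentials of the user granted Report Server access above.
                "username": "myuser",
                "password": "mypassword",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```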


38 changes: 25 additions & 13 deletions metadata-ingestion/scripts/avro_codegen.py
@@ -346,7 +346,7 @@ def write_urn_classes(key_aspects: List[dict], urn_dir: Path) -> None:
code = """
# This file contains classes corresponding to entity URNs.
from typing import ClassVar, List, Optional, Type, TYPE_CHECKING
from typing import ClassVar, List, Optional, Type, TYPE_CHECKING, Union
import functools
from deprecated.sphinx import deprecated as _sphinx_deprecated
@@ -547,10 +547,31 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
    assert fields[0]["type"] == ["null", "string"]
    fields[0]["type"] = "string"

    field_urn_type_classes = {}
    for field in fields:
        # Figure out if urn types are valid for each field.
        field_urn_type_class = None
        if field_name(field) == "platform":
            field_urn_type_class = "DataPlatformUrn"
        elif field.get("Urn"):
            if len(field.get("entityTypes", [])) == 1:
                field_entity_type = field["entityTypes"][0]
                field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn"
            else:
                field_urn_type_class = "Urn"

        field_urn_type_classes[field_name(field)] = field_urn_type_class

    _init_arg_parts: List[str] = []
    for field in fields:
        field_urn_type_class = field_urn_type_classes[field_name(field)]

        default = '"PROD"' if field_name(field) == "env" else None
        _arg_part = f"{field_name(field)}: {field_type(field)}"

        type_hint = field_type(field)
        if field_urn_type_class:
            type_hint = f'Union["{field_urn_type_class}", str]'
        _arg_part = f"{field_name(field)}: {type_hint}"
        if default:
            _arg_part += f" = {default}"
        _init_arg_parts.append(_arg_part)
@@ -579,16 +600,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
        init_validation += f'if not {field_name(field)}:\n raise InvalidUrnError("{class_name} {field_name(field)} cannot be empty")\n'

        # Generalized mechanism for validating embedded urns.
        field_urn_type_class = None
        if field_name(field) == "platform":
            field_urn_type_class = "DataPlatformUrn"
        elif field.get("Urn"):
            if len(field.get("entityTypes", [])) == 1:
                field_entity_type = field["entityTypes"][0]
                field_urn_type_class = f"{capitalize_entity_name(field_entity_type)}Urn"
            else:
                field_urn_type_class = "Urn"

        field_urn_type_class = field_urn_type_classes[field_name(field)]
        if field_urn_type_class:
            init_validation += f"{field_name(field)} = str({field_name(field)})\n"
            init_validation += (
@@ -608,7 +620,7 @@ def generate_urn_class(entity_type: str, key_aspect: dict) -> str:
init_coercion += " platform_name = DataPlatformUrn.from_string(platform_name).platform_name\n"

        if field_name(field) == "platform":
            init_coercion += "platform = DataPlatformUrn(platform).urn()\n"
            init_coercion += "platform = platform.urn() if isinstance(platform, DataPlatformUrn) else DataPlatformUrn(platform).urn()\n"
        elif field_urn_type_class is None:
            # For all non-urns, run the value through the UrnEncoder.
            init_coercion += (
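
To see what the new `Union` type hints buy, here is a small sketch of how the regenerated URN classes can now be constructed (assuming the generated `DatasetUrn`/`DataPlatformUrn` classes from `datahub.metadata.urns`):

```python
from datahub.metadata.urns import DataPlatformUrn, DatasetUrn

# With the Union["DataPlatformUrn", str] hints, `platform` can be passed
# either as a plain string or as an existing DataPlatformUrn; the new
# init_coercion line normalizes both to the same urn string.
urn_from_str = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")
urn_from_urn = DatasetUrn(platform=DataPlatformUrn("snowflake"), name="db.schema.table")
assert str(urn_from_str) == str(urn_from_urn)
```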
