Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ingestion/tableau): introduce project_path_pattern #10855

68 changes: 43 additions & 25 deletions metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,22 @@ class TableauConfig(
# Tableau project pattern
project_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Filter for specific Tableau projects. For example, use 'My Project' to ingest a root-level Project with name 'My Project', or 'My Project/Nested Project' to ingest a nested Project with name 'Nested Project'. "
description="[deprecated] Use project_path_pattern instead. Filter for specific Tableau projects. For example, use 'My Project' to ingest a root-level Project with name 'My Project', or 'My Project/Nested Project' to ingest a nested Project with name 'Nested Project'. "
"By default, all Projects nested inside a matching Project will be included in ingestion. "
"You can both allow and deny projects based on their name using their name, or a Regex pattern. "
"Deny patterns always take precedence over allow patterns. "
"By default, all projects will be ingested.",
)
_deprecate_projects_pattern = pydantic_field_deprecated("project_pattern")

project_path_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Filters Tableau projects by their full path. For instance, 'My Project/Nested Project' targets a specific nested project named 'Nested Project'."
" This is also useful when you need to exclude all nested projects under a particular project."
" You can allow or deny projects by specifying their path or a regular expression pattern."
" Deny patterns always override allow patterns."
" By default, all projects are ingested.",
)

project_path_separator: str = Field(
default="/",
Expand Down Expand Up @@ -454,17 +464,23 @@ class TableauConfig(
def projects_backward_compatibility(cls, values: Dict) -> Dict:
projects = values.get("projects")
project_pattern = values.get("project_pattern")
if project_pattern is None and projects:
project_path_pattern = values.get("project_path_pattern")
if project_pattern is None and project_path_pattern is None and projects:
logger.warning(
"project_pattern is not set but projects is set. projects is deprecated, please use "
"project_pattern instead."
"projects is deprecated, please use " "project_path_pattern instead."
)
logger.info("Initializing project_pattern from projects")
values["project_pattern"] = AllowDenyPattern(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sid-acryl I now deprecated project_pattern but I wanted to point out that for now this validation rule still initializes project_pattern when projects is set. Ideally, we would remove the old projects completely now but I tried it and it would require quite some refactoring in the tests.

allow=[f"^{prj}$" for prj in projects]
)
elif project_pattern != AllowDenyPattern.allow_all() and projects:
raise ValueError("projects is deprecated. Please use project_pattern only.")
elif (project_pattern or project_path_pattern) and projects:
raise ValueError(
"projects is deprecated. Please use project_path_pattern only."
)
elif project_path_pattern and project_pattern:
raise ValueError(
"project_pattern is deprecated. Please use project_path_pattern only."
)

return values

Expand Down Expand Up @@ -850,12 +866,13 @@ def form_path(project_id: str) -> List[str]:

def _is_allowed_project(self, project: TableauProject) -> bool:
# Either project name or project path should exist in allow
is_allowed: bool = self.config.project_pattern.allowed(
project.name
) or self.config.project_pattern.allowed(self._get_project_path(project))
is_allowed: bool = (
self.config.project_pattern.allowed(project.name)
or self.config.project_pattern.allowed(self._get_project_path(project))
) and self.config.project_path_pattern.allowed(self._get_project_path(project))
if is_allowed is False:
logger.info(
f"project({project.name}) is not allowed as per project_pattern"
f"Project ({project.name}) is not allowed as per project_pattern or project_path_pattern"
)
return is_allowed

Expand Down Expand Up @@ -887,28 +904,29 @@ def _init_tableau_project_registry(self, all_project_map: dict) -> None:
logger.debug(f"Project {project.name} is added in project registry")
projects_to_ingest[project.id] = project

# We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
# Otherwise, nested projects will not have the correct browse paths if not created in correct order / hierarchy.
self.tableau_project_registry = OrderedDict(
sorted(projects_to_ingest.items(), key=lambda item: len(item[1].path))
)

if self.config.extract_project_hierarchy is False:
logger.debug(
"Skipping project hierarchy processing as configuration extract_project_hierarchy is "
"disabled"
)
return
else:
logger.debug(
"Reevaluating projects as extract_project_hierarchy is enabled"
)

logger.debug("Reevaluating projects as extract_project_hierarchy is enabled")
for project in list_of_skip_projects:
if (
project.parent_id in projects_to_ingest
and self._is_denied_project(project) is False
):
logger.debug(f"Project {project.name} is added in project registry")
projects_to_ingest[project.id] = project

for project in list_of_skip_projects:
if (
project.parent_id in self.tableau_project_registry
and self._is_denied_project(project) is False
):
logger.debug(f"Project {project.name} is added in project registry")
self.tableau_project_registry[project.id] = project
# We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
Copy link
Contributor Author

@haeniya haeniya Sep 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reverted the changes to include the extract_project_hierarchy part again, as discussed. However, I slightly refactored it so we do the sorting below at the very end for all the projects to ingest. Let me know if you see any problem with this.

# Otherwise, nested projects will not have the correct browse paths if not created in correct order / hierarchy.
self.tableau_project_registry = OrderedDict(
sorted(projects_to_ingest.items(), key=lambda item: len(item[1].path))
)

def _init_datasource_registry(self) -> None:
if self.server is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,36 @@ def test_value_error_projects_and_project_pattern(
pipeline_config=new_config,
)
except Exception as e:
assert "projects is deprecated. Please use project_pattern only" in str(e)
assert "projects is deprecated. Please use project_path_pattern only" in str(e)


def test_project_pattern_deprecation(
pytestconfig, tmp_path, mock_datahub_graph
):
# Ingestion should raise ValueError
output_file_name: str = "tableau_project_pattern_deprecation_mces.json"
golden_file_name: str = "tableau_project_pattern_deprecation_mces_golden.json"

new_config = config_source_default.copy()
del new_config["projects"]
new_config["project_pattern"] = {"allow": ["^Samples$"]}
new_config["project_path_pattern"] = {"allow": ["^Samples$"]}

try:
tableau_ingest_common(
pytestconfig,
tmp_path,
mock_data(),
golden_file_name,
output_file_name,
mock_datahub_graph,
pipeline_config=new_config,
)
except Exception as e:
assert (
"project_pattern is deprecated. Please use project_path_pattern only"
in str(e)
)


@freeze_time(FROZEN_TIME)
Expand Down
Loading