Skip to content

Commit

Permalink
Merge pull request #22 from rtekal/rtekal-pl-join-ui-ERModelRelation
Browse files Browse the repository at this point in the history
Rtekal pl join UI er model relation
  • Loading branch information
rtekal authored Mar 18, 2024
2 parents 6f4d1b1 + ce7d553 commit 40da935
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 23 deletions.
2 changes: 1 addition & 1 deletion docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

### Breaking Changes

- #9934 - Stateful ingestion is now enabled by default if datahub-rest sink is used or if a `datahub_api` is specified. It will still be disabled by default when any other sink type is used.
- #9934 and #10075 - Stateful ingestion is now enabled by default if a `pipeline_name` is set and either a datahub-rest sink or `datahub_api` is specified. It will still be disabled by default when any other sink type is used or if there is no pipeline name set.
- #10002 - The `DataHubGraph` client no longer makes a request to the backend during initialization. If you want to preserve the old behavior, call `graph.test_connection()` after constructing the client.

### Potential Downtime
Expand Down
2 changes: 1 addition & 1 deletion metadata-ingestion/scripts/docgen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ DOCS_OUT_DIR=$DATAHUB_ROOT/docs/generated/ingestion
EXTRA_DOCS_DIR=$DATAHUB_ROOT/metadata-ingestion/docs/sources

rm -r $DOCS_OUT_DIR || true
SPARK_VERSION=3.3 python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@
python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@
21 changes: 5 additions & 16 deletions metadata-ingestion/src/datahub/emitter/rest_emitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,22 +168,11 @@ def test_connection(self) -> None:
return

else:
# Looks like we either connected to an old GMS or to some other service. Let's see if we can determine which before raising an error
# A common misconfiguration is connecting to datahub-frontend so we special-case this check
if (
config.get("config", {}).get("application") == "datahub-frontend"
or config.get("config", {}).get("shouldShowDatasetLineage")
is not None
):
raise ConfigurationError(
"You seem to have connected to the frontend instead of the GMS endpoint. "
"The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms)"
)
else:
raise ConfigurationError(
"You have either connected to a pre-v0.8.0 DataHub GMS instance, or to a different server altogether! "
"Please check your configuration and make sure you are talking to the DataHub GMS endpoint."
)
raise ConfigurationError(
"You seem to have connected to the frontend service instead of the GMS endpoint. "
"The rest emitter should connect to DataHub GMS (usually <datahub-gms-host>:8080) or Frontend GMS API (usually <frontend>:9002/api/gms). "
"For Acryl users, the endpoint should be https://<name>.acryl.io/gms"
)
else:
logger.debug(
f"Unable to connect to {url} with status_code: {response.status_code}. Response: {response.text}"
Expand Down
4 changes: 3 additions & 1 deletion metadata-ingestion/src/datahub/ingestion/source/s3/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,14 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
self.init_spark()

def init_spark(self):
os.environ.setdefault("SPARK_VERSION", "3.3")
spark_version = os.environ["SPARK_VERSION"]

# Importing here to avoid Deequ dependency for non profiling use cases
# Deequ fails if Spark is not available which is not needed for non profiling use cases
import pydeequ

conf = SparkConf()
spark_version = os.getenv("SPARK_VERSION", "3.3")
conf.set(
"spark.jars.packages",
",".join(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class StatefulIngestionConfig(ConfigModel):
enabled: bool = Field(
default=False,
description="Whether or not to enable stateful ingest. "
"Default: True if datahub-rest sink is used or if a `datahub_api` is specified, otherwise False",
"Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",
)
max_checkpoint_state_size: pydantic.PositiveInt = Field(
default=2**24, # 16 MB
Expand Down Expand Up @@ -233,9 +233,13 @@ def _initialize_checkpointing_state_provider(self) -> None:
IngestionCheckpointingProviderBase
] = None

if self.stateful_ingestion_config is None and self.ctx.graph:
if (
self.stateful_ingestion_config is None
and self.ctx.graph
and self.ctx.pipeline_name
):
logger.info(
"Stateful ingestion got enabled by default, as datahub-rest sink is used or `datahub_api` is specified"
"Stateful ingestion will be automatically enabled, as datahub-rest sink is used or `datahub_api` is specified"
)
self.stateful_ingestion_config = StatefulIngestionConfig(
enabled=True,
Expand Down
1 change: 0 additions & 1 deletion metadata-ingestion/tests/integration/s3/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ def test_data_lake_s3_ingest(
def test_data_lake_local_ingest(
pytestconfig, touch_local_files, source_file, tmp_path, mock_time
):
os.environ["SPARK_VERSION"] = "3.3.2"
test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"
f = open(os.path.join(SOURCE_FILES_PATH, source_file))
source = json.load(f)
Expand Down

0 comments on commit 40da935

Please sign in to comment.