From 77abb1d62b83679eca4b0aa7071b1e450d362219 Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Wed, 29 Jan 2025 12:13:31 +0900 Subject: [PATCH 1/6] init fresh scripts --- .../examples/ml/mlflow_dh_client.py | 443 ++++++++++++++++++ .../examples/ml/mlflow_dh_client_sample.py | 128 +++++ 2 files changed, 571 insertions(+) create mode 100644 metadata-ingestion/examples/ml/mlflow_dh_client.py create mode 100644 metadata-ingestion/examples/ml/mlflow_dh_client_sample.py diff --git a/metadata-ingestion/examples/ml/mlflow_dh_client.py b/metadata-ingestion/examples/ml/mlflow_dh_client.py new file mode 100644 index 00000000000000..f5581591e2fd84 --- /dev/null +++ b/metadata-ingestion/examples/ml/mlflow_dh_client.py @@ -0,0 +1,443 @@ +import logging +import time +from typing import Any, Dict, List, Optional, Union + +import datahub.metadata.schema_classes as models +from datahub.api.entities.dataset.dataset import Dataset +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import ( + DataProcessInstanceInput, + DataProcessInstanceOutput, +) +from datahub.metadata.schema_classes import ( + ChangeTypeClass, + DataProcessInstanceRunResultClass, + DataProcessRunStatusClass, +) +from datahub.metadata.urns import ( + ContainerUrn, + DataPlatformUrn, + MlModelGroupUrn, + MlModelUrn, + VersionSetUrn, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class MLflowDatahubClient: + """Client for creating and managing MLflow metadata in DataHub.""" + + def __init__( + self, + token: str, + server_url: str = "http://localhost:8080", + platform: str = "mlflow", + ) -> None: + """Initialize the MLflow DataHub client.""" + self.token = token + self.server_url = server_url + self.platform = platform + self.graph = DataHubGraph( + DatahubClientConfig( + server=server_url, + token=token, + 
extra_headers={"Authorization": f"Bearer {token}"}, + ) + ) + + def _create_timestamp( + self, timestamp: Optional[int] = None + ) -> models.TimeStampClass: + """Helper to create timestamp with current time if not provided""" + return models.TimeStampClass( + time=timestamp or int(time.time() * 1000), actor="urn:li:corpuser:datahub" + ) + + def _emit_mcps( + self, + mcps: Union[List[MetadataChangeProposalWrapper], MetadataChangeProposalWrapper], + ) -> None: + """Helper to emit MCPs with proper connection handling""" + if not isinstance(mcps, list): + mcps = [mcps] + with self.graph: + for mcp in mcps: + self.graph.emit(mcp) + + def _get_aspect( + self, entity_urn: str, aspect_type: Any, default_constructor: Any = None + ) -> Any: + """Helper to safely get an aspect with fallback""" + try: + return self.graph.get_aspect(entity_urn=entity_urn, aspect_type=aspect_type) + except Exception as e: + logger.warning(f"Could not fetch aspect for {entity_urn}: {e}") + return default_constructor() if default_constructor else None + + def _create_properties_class( + self, props_class: Any, props_dict: Optional[Dict[str, Any]] = None + ) -> Any: + """Helper to create properties class with provided values""" + if props_dict is None: + props_dict = {} + + filtered_props = {k: v for k, v in props_dict.items() if v is not None} + + if hasattr(props_class, "created"): + filtered_props.setdefault("created", self._create_timestamp()) + if hasattr(props_class, "lastModified"): + filtered_props.setdefault("lastModified", self._create_timestamp()) + + return props_class(**filtered_props) + + def _update_list_property( + self, existing_list: Optional[List[str]], new_item: str + ) -> List[str]: + """Helper to update a list property while maintaining uniqueness""" + items = set(existing_list if existing_list else []) + items.add(new_item) + return list(items) + + def _create_mcp( + self, + entity_urn: str, + aspect: Any, + entity_type: Optional[str] = None, + aspect_name: Optional[str] = 
None, + change_type: str = ChangeTypeClass.UPSERT, + ) -> MetadataChangeProposalWrapper: + """Helper to create an MCP with standard parameters""" + mcp_args = {"entityUrn": entity_urn, "aspect": aspect} + if entity_type: + mcp_args["entityType"] = entity_type + if aspect_name: + mcp_args["aspectName"] = aspect_name + mcp_args["changeType"] = change_type + return MetadataChangeProposalWrapper(**mcp_args) + + def _update_entity_properties( + self, + entity_urn: str, + aspect_type: Any, + updates: Dict[str, Any], + entity_type: str, + skip_properties: Optional[List[str]] = None, + ) -> None: + """Helper to update entity properties while preserving existing ones""" + existing_props = self._get_aspect(entity_urn, aspect_type, aspect_type) + skip_list = [] if skip_properties is None else skip_properties + props = self._copy_existing_properties(existing_props, skip_list) or {} + + for key, value in updates.items(): + if isinstance(value, str) and hasattr(existing_props, key): + existing_value = getattr(existing_props, key, []) + props[key] = self._update_list_property(existing_value, value) + else: + props[key] = value + + updated_props = self._create_properties_class(aspect_type, props) + mcp = self._create_mcp( + entity_urn, updated_props, entity_type, f"{entity_type}Properties" + ) + self._emit_mcps(mcp) + + def _copy_existing_properties( + self, existing_props: Any, skip_properties: Optional[List[str]] = None + ) -> Dict[str, Any]: + """Helper to copy existing properties while skipping specified ones""" + skip_list = [] if skip_properties is None else skip_properties + + internal_props = { + "ASPECT_INFO", + "ASPECT_NAME", + "ASPECT_TYPE", + "RECORD_SCHEMA", + } + skip_list.extend(internal_props) + + props: Dict[str, Any] = {} + if existing_props: + for prop in dir(existing_props): + if ( + prop.startswith("_") + or callable(getattr(existing_props, prop)) + or prop in skip_list + ): + continue + + value = getattr(existing_props, prop) + if value is not None: + 
props[prop] = value + + if hasattr(existing_props, "created"): + props.setdefault("created", self._create_timestamp()) + if hasattr(existing_props, "lastModified"): + props.setdefault("lastModified", self._create_timestamp()) + + return props + + def _create_run_event( + self, + status: str, + timestamp: int, + result: Optional[str] = None, + duration_millis: Optional[int] = None, + ) -> models.DataProcessInstanceRunEventClass: + """Helper to create run event with common parameters.""" + event_args: Dict[str, Any] = { + "timestampMillis": timestamp, + "status": status, + "attempt": 1, + } + + if result: + event_args["result"] = DataProcessInstanceRunResultClass( + type=result, nativeResultType=str(result) + ) + if duration_millis: + event_args["durationMillis"] = duration_millis + + return models.DataProcessInstanceRunEventClass(**event_args) + + def create_model_group( + self, + group_id: str, + properties: Optional[models.MLModelGroupPropertiesClass] = None, + **kwargs: Any, + ) -> str: + """Create an ML model group with either property class or kwargs.""" + model_group_urn = MlModelGroupUrn(platform=self.platform, name=group_id) + + if properties is None: + properties = self._create_properties_class( + models.MLModelGroupPropertiesClass, kwargs + ) + + mcp = self._create_mcp( + str(model_group_urn), properties, "mlModelGroup", "mlModelGroupProperties" + ) + self._emit_mcps(mcp) + logger.info(f"Created model group: {model_group_urn}") + return str(model_group_urn) + + def create_model( + self, + model_id: str, + version: str, + alias: Optional[str] = None, + properties: Optional[models.MLModelPropertiesClass] = None, + **kwargs: Any, + ) -> str: + """Create an ML model with either property classes or kwargs.""" + model_urn = MlModelUrn(platform=self.platform, name=model_id) + version_set_urn = VersionSetUrn( + id=f"mlmodel_{model_id}_versions", entity_type="mlModel" + ) + + # Handle model properties + if properties is None: + # If no properties provided, create 
from kwargs + properties = self._create_properties_class( + models.MLModelPropertiesClass, kwargs + ) + + # Ensure version is set in model properties + version_tag = models.VersionTagClass(versionTag=str(version)) + properties.version = version_tag + + # Create version properties + version_props = { + "version": version_tag, + "versionSet": str(version_set_urn), + "sortId": "AAAAAAAA", + } + + # Add alias if provided + if alias: + version_props["aliases"] = [models.VersionTagClass(versionTag=alias)] + + version_properties = self._create_properties_class( + models.VersionPropertiesClass, version_props + ) + + # Create version set properties + version_set_properties = models.VersionSetPropertiesClass( + latest=str(model_urn), + versioningScheme="ALPHANUMERIC_GENERATED_BY_DATAHUB", + ) + + mcps = [ + self._create_mcp( + str(model_urn), properties, "mlModel", "mlModelProperties" + ), + self._create_mcp( + str(version_set_urn), + version_set_properties, + "versionSet", + "versionSetProperties", + ), + self._create_mcp( + str(model_urn), version_properties, "mlModel", "versionProperties" + ), + ] + self._emit_mcps(mcps) + logger.info(f"Created model: {model_urn}") + return str(model_urn) + + def create_experiment( + self, + experiment_id: str, + properties: Optional[models.ContainerPropertiesClass] = None, + **kwargs: Any, + ) -> str: + """Create an ML experiment with either property class or kwargs.""" + container_urn = ContainerUrn(guid=experiment_id) + platform_urn = DataPlatformUrn(platform_name=self.platform) + + if properties is None: + properties = self._create_properties_class( + models.ContainerPropertiesClass, kwargs + ) + + container_subtype = models.SubTypesClass(typeNames=["ML Experiment"]) + browse_path = models.BrowsePathsV2Class(path=[]) + platform_instance = models.DataPlatformInstanceClass(platform=str(platform_urn)) + + mcps = MetadataChangeProposalWrapper.construct_many( + entityUrn=str(container_urn), + aspects=[container_subtype, properties, 
browse_path, platform_instance], + ) + self._emit_mcps(mcps) + logger.info(f"Created experiment: {container_urn}") + return str(container_urn) + + def create_training_run( + self, + run_id: str, + properties: Optional[models.DataProcessInstancePropertiesClass] = None, + training_run_properties: Optional[models.MLTrainingRunPropertiesClass] = None, + run_result: Optional[str] = None, + start_timestamp: Optional[int] = None, + end_timestamp: Optional[int] = None, + **kwargs: Any, + ) -> str: + """Create a training run with properties and events.""" + dpi_urn = f"urn:li:dataProcessInstance:{run_id}" + + # Create basic properties and aspects + aspects = [ + ( + properties + or self._create_properties_class( + models.DataProcessInstancePropertiesClass, kwargs + ) + ), + models.SubTypesClass(typeNames=["ML Training Run"]), + ] + + # Add training run properties if provided + if training_run_properties: + aspects.append(training_run_properties) + + # Handle run events + current_time = int(time.time() * 1000) + start_ts = start_timestamp or current_time + end_ts = end_timestamp or current_time + + # Create events + aspects.append( + self._create_run_event( + status=DataProcessRunStatusClass.STARTED, timestamp=start_ts + ) + ) + + if run_result: + aspects.append( + self._create_run_event( + status=DataProcessRunStatusClass.COMPLETE, + timestamp=end_ts, + result=run_result, + duration_millis=end_ts - start_ts, + ) + ) + + # Create and emit MCPs + mcps = [self._create_mcp(dpi_urn, aspect) for aspect in aspects] + self._emit_mcps(mcps) + logger.info(f"Created training run: {dpi_urn}") + return dpi_urn + + def create_dataset(self, name: str, platform: str, **kwargs: Any) -> str: + """Create a dataset with flexible properties.""" + dataset = Dataset(id=name, platform=platform, name=name, **kwargs) + mcps = list(dataset.generate_mcp()) + self._emit_mcps(mcps) + if dataset.urn is None: + raise ValueError(f"Failed to create dataset URN for {name}") + return dataset.urn + + def 
add_run_to_model(self, model_urn: str, run_urn: str) -> None: + """Add a run to a model while preserving existing properties.""" + self._update_entity_properties( + entity_urn=model_urn, + aspect_type=models.MLModelPropertiesClass, + updates={"trainingJobs": run_urn}, + entity_type="mlModel", + skip_properties=["trainingJobs"], + ) + logger.info(f"Added run {run_urn} to model {model_urn}") + + def add_run_to_model_group(self, model_group_urn: str, run_urn: str) -> None: + """Add a run to a model group while preserving existing properties.""" + self._update_entity_properties( + entity_urn=model_group_urn, + aspect_type=models.MLModelGroupPropertiesClass, + updates={"trainingJobs": run_urn}, + entity_type="mlModelGroup", + skip_properties=["trainingJobs"], + ) + logger.info(f"Added run {run_urn} to model group {model_group_urn}") + + def add_model_to_model_group(self, model_urn: str, group_urn: str) -> None: + """Add a model to a group while preserving existing properties""" + self._update_entity_properties( + entity_urn=model_urn, + aspect_type=models.MLModelPropertiesClass, + updates={"groups": group_urn}, + entity_type="mlModel", + skip_properties=["groups"], + ) + logger.info(f"Added model {model_urn} to group {group_urn}") + + def add_run_to_experiment(self, run_urn: str, experiment_urn: str) -> None: + """Add a run to an experiment""" + mcp = self._create_mcp( + entity_urn=run_urn, aspect=models.ContainerClass(container=experiment_urn) + ) + self._emit_mcps(mcp) + logger.info(f"Added run {run_urn} to experiment {experiment_urn}") + + def add_input_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> None: + """Add input datasets to a run""" + mcp = self._create_mcp( + entity_urn=run_urn, + entity_type="dataProcessInstance", + aspect_name="dataProcessInstanceInput", + aspect=DataProcessInstanceInput(inputs=dataset_urns), + ) + self._emit_mcps(mcp) + logger.info(f"Added input datasets to run {run_urn}") + + def add_output_datasets_to_run(self, run_urn: 
str, dataset_urns: List[str]) -> None: + """Add output datasets to a run""" + mcp = self._create_mcp( + entity_urn=run_urn, + entity_type="dataProcessInstance", + aspect_name="dataProcessInstanceOutput", + aspect=DataProcessInstanceOutput(outputs=dataset_urns), + ) + self._emit_mcps(mcp) + logger.info(f"Added output datasets to run {run_urn}") diff --git a/metadata-ingestion/examples/ml/mlflow_dh_client_sample.py b/metadata-ingestion/examples/ml/mlflow_dh_client_sample.py new file mode 100644 index 00000000000000..867f118fa88392 --- /dev/null +++ b/metadata-ingestion/examples/ml/mlflow_dh_client_sample.py @@ -0,0 +1,128 @@ +import argparse + +from mlflow_dh_client import MLflowDatahubClient + +import datahub.metadata.schema_classes as models +from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import RunResultType + +if __name__ == "__main__": + # Example usage + parser = argparse.ArgumentParser() + parser.add_argument("--token", required=True, help="DataHub access token") + args = parser.parse_args() + + client = MLflowDatahubClient(token=args.token) + + # Create model group + model_group_urn = client.create_model_group( + group_id="airline_forecast_models_group", + properties=models.MLModelGroupPropertiesClass( + name="Airline Forecast Models Group", + description="Group of models for airline passenger forecasting", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), + ) + + # Creating a model with property classes + model_urn = client.create_model( + model_id="arima_model", + properties=models.MLModelPropertiesClass( + name="ARIMA Model", + description="ARIMA model for airline passenger forecasting", + customProperties={"team": "forecasting"}, + trainingMetrics=[ + models.MLMetricClass(name="accuracy", value="0.9"), + models.MLMetricClass(name="precision", value="0.8"), + ], + hyperParams=[ + models.MLHyperParamClass(name="learning_rate", value="0.01"), + models.MLHyperParamClass(name="batch_size", 
value="32"), + ], + externalUrl="https:localhost:5000", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + tags=["forecasting", "arima"], + ), + version="1.0", + alias="champion", + ) + + # Creating an experiment with property class + experiment_urn = client.create_experiment( + experiment_id="airline_forecast_experiment", + properties=models.ContainerPropertiesClass( + name="Airline Forecast Experiment", + description="Experiment to forecast airline passenger numbers", + customProperties={"team": "forecasting"}, + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), + ) + + run_urn = client.create_training_run( + run_id="simple_training_run", + properties=models.DataProcessInstancePropertiesClass( + name="Simple Training Run", + created=models.AuditStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + customProperties={"team": "forecasting"}, + ), + training_run_properties=models.MLTrainingRunPropertiesClass( + id="simple_training_run", + outputUrls=["s3://my-bucket/output"], + trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], + hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], + externalUrl="https:localhost:5000", + ), + run_result=RunResultType.FAILURE, + start_timestamp=1628580000000, + end_timestamp=1628580001000, + ) + # Create datasets + input_dataset_urn = client.create_dataset( + platform="snowflake", + name="iris_input", + ) + + output_dataset_urn = client.create_dataset( + platform="snowflake", + name="iris_ouptut", + ) + + # Add run to experiment + client.add_run_to_experiment(run_urn=run_urn, experiment_urn=experiment_urn) + + # Add model to model group + client.add_model_to_model_group(model_urn=model_urn, 
group_urn=model_group_urn) + + # Add run to model + client.add_run_to_model( + model_urn=model_urn, + run_urn=run_urn, + ) + + # add run to model group + client.add_run_to_model_group( + model_group_urn=model_group_urn, + run_urn=run_urn, + ) + + # Add input and output datasets to run + client.add_input_datasets_to_run( + run_urn=run_urn, dataset_urns=[str(input_dataset_urn)] + ) + + client.add_output_datasets_to_run( + run_urn=run_urn, dataset_urns=[str(output_dataset_urn)] + ) From f490e4e70b4045f86118283b8fc52036eac407f2 Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Wed, 29 Jan 2025 18:22:48 +0900 Subject: [PATCH 2/6] draft tutorial for demo --- docs-website/sidebars.js | 1 + docs/api/tutorials/ml_v2.md | 792 ++++++++++++++++++++++++++++++++++++ 2 files changed, 793 insertions(+) create mode 100644 docs/api/tutorials/ml_v2.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index fbd35b60aedba9..97f19f071df0dc 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -933,6 +933,7 @@ module.exports = { "docs/api/tutorials/forms", "docs/api/tutorials/lineage", "docs/api/tutorials/ml", + "docs/api/tutorials/ml_v2", "docs/api/tutorials/owners", "docs/api/tutorials/structured-properties", "docs/api/tutorials/tags", diff --git a/docs/api/tutorials/ml_v2.md b/docs/api/tutorials/ml_v2.md new file mode 100644 index 00000000000000..d41a5926f04a8a --- /dev/null +++ b/docs/api/tutorials/ml_v2.md @@ -0,0 +1,792 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# ML System with DataHub + +## Why Would You Integrate ML System with DataHub? + +As a data practitioner, keeping track of your ML experiments, models, and their relationships can be challenging. DataHub makes this easier by providing a central place to organize and track your ML assets. + +This guide will show you how to integrate your ML workflows with DataHub. 
+With this integration, you can easily find and share ML models across your organization. +Your team can track how models evolve over time and understand how training data connects to each model. +Most importantly, it enables seamless collaboration on ML projects by making everything discoverable and connected. + +## Goals Of This Guide + +In this guide, you'll learn how to: +- Create your basic ML components (models, experiments, runs) +- Connect these components to build a complete ML system +- Track relationships between models, data, and experiments + +## Core ML Concepts + +Here's what you need to know about the key components, based on MLflow's terminology: + +- **Experiments** are collections of **training runs** for the same project, like all attempts to build a churn predictor. +- **Training Runs** are attempts to train a **model** within an **experiment**, capturing your parameters and results. +- **Models** organize related model versions together, like all versions of your churn predictor. +- **Model Versions** are successful training runs that you've registered for production use. + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/concept-diagram.png"/> +</p> + +The hierarchy works like this: +1. Every run belongs to an experiment +2. Successful runs can become model versions +3. Model versions belong to a model group +4. Not every run becomes a model version + +:::note Terminology +Here's how DataHub and MLflow terms map to each other. 
+For more details, see the [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md): + +| DataHub | MLflow | Description | +|---------|---------|-------------| +| ML Model Group | Model | Collection of related model versions | +| ML Model | Model Version | Specific version of a trained model | +| ML Training Run | Run | Single training attempt | +| ML Experiment | Experiment | Project workspace | +::: + +## Basic Setup + +For this tutorial, you need to deploy DataHub Quickstart locally. +For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart.md). + +Next, you need to set up the Python client for DataHub. +Create a token in DataHub UI and replace `<your_token>` with your token in the code below: + +```python +from mlflow_dh_client import MLflowDatahubClient + +client = MLflowDatahubClient(token="<your_token>") +``` + +:::note Verifying via GraphQL +In this guide, we'll show you how to verify your changes using GraphQL queries. +You can run these queries in the DataHub UI -- just go to `https://localhost:9002/api/graphiql` and paste the query. +::: + +## Create Simple ML Entities + +In this section, we'll create the basic building blocks of your ML system. These components will help you organize your ML work and make it discoverable by your team. + +### Create Model Group + +A model group is like a folder that contains different versions of a similar model. For example, all versions of your "Customer Churn Predictor" would go in one group. 
+ +<Tabs> +<TabItem value="simple" label="Simple Version"> +Here's how to create a basic model group with just an identifier: + +```python +client.create_model_group( + group_id="airline_forecast_models_group", +) +``` + +</TabItem> +<TabItem value="detailed" label="Detailed Version"> + +For production use, you can add rich metadata like descriptions, creation timestamps, and team information: + + +```python +client.create_model_group( + group_id="airline_forecast_models_group", + properties=models.MLModelGroupPropertiesClass( + name="Airline Forecast Models Group", + description="Group of models for airline passenger forecasting", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), +) +``` + +</TabItem> +</Tabs> + +Let's verify that our model group was created: + +<Tabs> +<TabItem value="UI" label="UI"> +You can see your new model group in the DataHub UI: + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-group-empty.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +You can also query your model group using GraphQL to check its properties: + +```graphql +query { + mlModelGroup( + urn:"urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_models_group,PROD)" + ) { + name + description + } +} +``` + +The response should show your model group's details: + +```json +{ + "data": { + "mlModelGroup": { + "name": "airline_forecast_models_group", + "description": "Group of models for airline passenger forecasting" + } + } +} +``` + +</TabItem> +</Tabs> + +### Create Model + +Now let's create a specific model version. This represents a trained model that you might deploy. 
+ +<Tabs> +<TabItem value="simple" label="Simple Version"> + +Here's the minimum needed to create a model (note that version is required): + + +```python +client.create_model( + model_id="arima_model", + version="1.0", +) +``` + +</TabItem> +<TabItem value="detailed" label="Detailed Version"> + +For a production model, you'll want to include metrics, parameters, and other metadata: + + +```python +client.create_model( + model_id="arima_model", + properties=models.MLModelPropertiesClass( + name="ARIMA Model", + description="ARIMA model for airline passenger forecasting", + customProperties={"team": "forecasting"}, + trainingMetrics=[ + models.MLMetricClass(name="accuracy", value="0.9"), + models.MLMetricClass(name="precision", value="0.8"), + ], + hyperParams=[ + models.MLHyperParamClass(name="learning_rate", value="0.01"), + models.MLHyperParamClass(name="batch_size", value="32"), + ], + externalUrl="https:localhost:5000", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + tags=["forecasting", "arima"], + ), + version="1.0", + alias="champion", +) +``` + +</TabItem> +</Tabs> + +Let's verify our model: + +<Tabs> +<TabItem value="UI" label="UI"> +You can view your model's details in the DataHub UI. 
+ +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-empty.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Here's how to query your model's information using GraphQL: + +```graphql +query { + mlModel( + urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" + ) { + name + description + versionProperties { + version { + versionTag + } + } + } +} +``` + +You should see details about your model: + +```json +{ + "data": { + "mlModel": { + "name": "arima_model", + "description": "ARIMA model for airline passenger forecasting", + "versionProperties": { + "version": { + "versionTag": "1.0" + } + } + } + } +} +``` + +</TabItem> +</Tabs> + +### Create Experiment + +An experiment helps you organize multiple training runs for a specific project. + +<Tabs> +<TabItem value="simple" label="Simple Version"> +Create a basic experiment with just an ID: + +```python +client.create_experiment( + experiment_id="airline_forecast_experiment", +) +``` + +</TabItem> +<TabItem value="detailed" label="Detailed Version"> +Add more context to your experiment with metadata: + +```python +client.create_experiment( + experiment_id="airline_forecast_experiment", + properties=models.ContainerPropertiesClass( + name="Airline Forecast Experiment", + description="Experiment to forecast airline passenger numbers", + customProperties={"team": "forecasting"}, + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), +) +``` + +</TabItem> +</Tabs> + +Verify your experiment: + +<Tabs> +<TabItem value="UI" label="UI"> +View your experiment's details in the UI: + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/experiment-empty.png"/> +</p> 
+</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Query your experiment's information: + +```graphql +query { + container( + urn:"urn:li:container:airline_forecast_experiment" + ) { + name + description + properties { + customProperties + } + } +} +``` + +Check the experiment's details in the response: + +```json +{ + "data": { + "container": { + "name": "Airline Forecast Experiment", + "description": "Experiment to forecast airline passenger numbers", + "properties": { + "customProperties": { + "team": "forecasting" + } + } + } + } +} +``` + +</TabItem> +</Tabs> + +### Create Training Run + +A training run captures everything about a specific model training attempt. + +<Tabs> +<TabItem value="simple" label="Simple Version"> + +Create a basic training run: + +```python +client.create_training_run( + run_id="simple_training_run_4", +) +``` + +</TabItem> +<TabItem value="detailed" label="Detailed Version"> + +For a production run, you'll want to include metrics, parameters, and other metadata: + +```python +client.create_training_run( + run_id="simple_training_run_4", + properties=models.DataProcessInstancePropertiesClass( + name="Simple Training Run 4", + created=models.AuditStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + customProperties={"team": "forecasting"}, + ), + training_run_properties=models.MLTrainingRunPropertiesClass( + id="simple_training_run_4", + outputUrls=["s3://my-bucket/output"], + trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], + hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], + externalUrl="https:localhost:5000", + ), + run_result=RunResultType.FAILURE, + start_timestamp=1628580000000, + end_timestamp=1628580001000, +) +``` + +</TabItem> +</Tabs> + +Check your training run: + +<Tabs> +<TabItem value="UI" label="UI"> + +View the training run details in DataHub UI: + +<p align="center"> + <img width="70%" 
src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-empty.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Query your training run information: + +```graphql +query { + dataProcessInstance( + urn:"urn:li:dataProcessInstance:simple_training_run_4" + ) { + name + created { + time + } + properties { + customProperties + } + } +} +``` + +See the run details in the response: + +```json +{ + "data": { + "dataProcessInstance": { + "name": "Simple Training Run 4", + "created": { + "time": 1628580000000 + }, + "properties": { + "customProperties": { + "team": "forecasting" + } + } + } + } +} +``` + +</TabItem> +</Tabs> + +## Define Entity Relationships + +Now comes the important part - connecting all these components together. By establishing relationships between your ML assets, you'll be able to: +- Track model lineage (which data and runs produced which models) +- Monitor model evolution over time +- Understand dependencies between different components +- Enable comprehensive searching and filtering + +### Add Model To Model Group + +Connect your model to its group to organize related model versions together: + +```python +client.add_model_to_model_group(model_urn=model_urn, group_urn=model_group_urn) +``` + +After connecting your model to a group, you'll be able to: +- View all versions of a model in one place +- Track model evolution over time +- Compare different versions easily +- Manage model lifecycles better + +<Tabs> +<TabItem value="UI" label="UI"> + +In **Model Group** view, you can see the model versions under the **Models** section: + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-group-with-model.png"/> +</p> + +In **Model** page , you can see the group it belongs to under the **Group** tab: +<p align="center"> + <img width="70%" 
src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-with-model-group.png"/> +</p> + +</TabItem> + +<TabItem value="graphql" label="GraphQL"> + +You can query the model's group using `groups` field to see the relationship: + +```graphql +query { + mlModel( + urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" + ) { + name + properties { + groups { + urn + properties { + name + } + } + } + } +} +``` + +Expected response: + +```json +{ + "data": { + "mlModel": { + "name": "arima_model", + "properties": { + "groups": [ + { + "urn": "urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_model_group,PROD)", + "properties": { + "name": "Airline Forecast Model Group" + } + } + ] + } + } + } +} +``` + +</TabItem> +</Tabs> + +### Add Run To Experiment + +Organize your training runs by adding them to an experiment: + +```python +client.add_run_to_experiment(run_urn=run_urn, experiment_urn=experiment_urn) +``` + +This connection enables you to: +- Group related training attempts together +- Compare runs within the same experiment +- Track progress toward your ML goals +- Share results with your team + +<Tabs> +<TabItem value="UI" label="UI"> + +Under **Entities** tab in the **Experiment** page, you can see the runs associated with the experiment: + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/experiment-with-run.png"/> +</p> + +In the **Run** page, you can see the experiment it belongs to. 
+ +<p align="center"> + <img width="40%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-with-experiment.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> + +Query the run's experiment through `parentContainers`: + +```graphql +query { + dataProcessInstance( + urn:"urn:li:dataProcessInstance:simple_training_run" + ) { + name + parentContainers { + containers { + urn + properties { + name + } + } + } + } +} +``` + +See the relationship details: + +```json +{ + "data": { + "dataProcessInstance": { + "name": "Simple Training Run", + "parentContainers": { + "containers": [ + { + "urn": "urn:li:container:airline_forecast_experiment", + "properties": { + "name": "Airline Forecast Experiment" + } + } + ] +} + } + } +} +``` + +</TabItem> +</Tabs> + +### Add Run To Model + +Link a training run to the model it produced: + +```python +client.add_run_to_model(model_urn=model_urn, run_urn=run_urn) +``` + +This connection helps you: +- Track which training runs produced which models +- Understand model provenance +- Debug model issues by examining training history +- Monitor model evolution + +<Tabs> +<TabItem value="UI" label="UI"> + +In the **Model** page, you can see the runs that produced the model as **Source Run** under the **Summary** tab: + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-with-source-run.png"/> +</p> + +In the **Run** page, you can see the related model under the **Lineage** tab: + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model.png"/> +</p> +<p align="center"> + <img width="50%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model-graph.png"/> +</p> + +</TabItem> + +<TabItem 
value="graphql" label="GraphQL"> + +You can query the model's training jobs using `trainingJobs` to see the relationship: + +```graphql +query { + mlModel( + urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" + ) { + name + properties { + mlModelLineageInfo { + trainingJobs + } + } + } +} +``` + +Check the relationship in the response: + +```json +{ + "data": { + "mlModel": { + "name": "arima_model", + "properties": { + "mlModelLineageInfo": { + "trainingJobs": [ + "urn:li:dataProcessInstance:simple_training_run_test" + ] + } + } + } + } +} +``` + +</TabItem> +</Tabs> + +### Add Run To Model Group + +Connect a training run directly to a model group: + +```python +client.add_run_to_model_group(model_group_urn=model_group_urn, run_urn=run_urn) +``` + +After establishing this connection, you'll be able to: +- View model groups in the run's lineage tab +- Query training jobs at the group level + +<Tabs> +<TabItem value="UI" label="UI"> + +In the **Run** page, you can see the model groups associated with the group under the **Lineage** tab: +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model-group.png"/> +</p> +<p align="center"> + <img width="50%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model-group-graph.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> + +You can query the model groups's training jobs using `trainingJobs` to see the relationship: + + +```graphql +query { + mlModelGroup( + urn:"urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_model_group,PROD)" + ) { + name + properties { + mlModelLineageInfo { + trainingJobs + } + } + } +} +``` + +Verify the relationship: + +```json +{ + "data": { + "mlModelGroup": { + "name": "airline_forecast_model_group", + "properties": { + "mlModelLineageInfo": { + "trainingJobs": [ + 
"urn:li:dataProcessInstance:simple_training_run_test" + ] + } + } + } + } +} +``` + +</TabItem> +</Tabs> + +### Add Dataset To Run + +Track the data used in your training runs: + +```python +client.add_input_datasets_to_run( + run_urn=run_urn, + dataset_urns=[str(input_dataset_urn)] +) + +client.add_output_datasets_to_run( + run_urn=run_urn, + dataset_urns=[str(output_dataset_urn)] +) +``` + +This connection enables you to: +- Track data lineage for your models +- Understand data dependencies +- Ensure reproducibility of your training runs +- Monitor data quality impacts on model performance + +You can verify the relationship in the **Lineage** Tab either in **DataSet** page or **Run** page. +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-dataset-graph.png"/> +</p> + +## Full Overview + +This is how your ML system looks after connecting all the components. + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/lineage-full.png"/> +</p> + +Now you have a complete lineage view of your ML assets -- from training runs to models to datasets. + +You can check the complete script [here](). + +## What's Next? 
+ +To see this integration in action and learn about real-world use cases: +- Watch our [Townhall demo](https://youtu.be/_WUoVqkF2Zo?feature=shared&t=1932) on MLflow integration with DataHub +- Check out the discussion in our [Slack community](https://slack.datahubproject.io) +- Readh our [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md) for more details \ No newline at end of file From cbe567cc7450969585bd6f47934d29478631706f Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Wed, 29 Jan 2025 18:35:20 +0900 Subject: [PATCH 3/6] fix some sections --- docs/api/tutorials/ml_v2.md | 186 +++++++++++++++--------------------- 1 file changed, 76 insertions(+), 110 deletions(-) diff --git a/docs/api/tutorials/ml_v2.md b/docs/api/tutorials/ml_v2.md index d41a5926f04a8a..df24a797584b09 100644 --- a/docs/api/tutorials/ml_v2.md +++ b/docs/api/tutorials/ml_v2.md @@ -3,14 +3,11 @@ import TabItem from '@theme/TabItem'; # ML System with DataHub -## Why Would You Integrate ML System with DataHub? +## Why Integrate Your ML System with DataHub? As a data practitioner, keeping track of your ML experiments, models, and their relationships can be challenging. DataHub makes this easier by providing a central place to organize and track your ML assets. -This guide will show you how to integrate your ML workflows with DataHub. -With this integration, you can easily find and share ML models across your organization. -Your team can track how models evolve over time and understand how training data connects to each model. -Most importantly, it enables seamless collaboration on ML projects by making everything discoverable and connected. +This guide will show you how to integrate your ML workflows with DataHub. With this integration, you can easily find and share ML models across your organization, track how models evolve over time, and understand how training data connects to each model. 
Most importantly, it enables seamless collaboration on ML projects by making everything discoverable and connected. ## Goals Of This Guide @@ -23,10 +20,10 @@ In this guide, you'll learn how to: Here's what you need to know about the key components, based on MLflow's terminology: -- **Experiments** are collections of **training runs** for the same project, like all attempts to build a churn predictor. -- **Training Runs** are attempts to train a **model** within an **experiment**, capturing your parameters and results. -- **Model** organize related model versions together, like all versions of your churn predictor. -- **Model Versions** are successful training runs that you've registered for production use. +- **Experiments** are collections of training runs for the same project, like all attempts to build a churn predictor +- **Training Runs** are attempts to train a model within an experiment, capturing parameters and results +- **Models** organize related model versions together, like all versions of your churn predictor +- **Model Versions** are successful training runs registered for production use <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/concept-diagram.png"/> @@ -40,7 +37,7 @@ The hierarchy works like this: :::note Terminology Here's how DataHub and MLflow terms map to each other. -For more details, see the [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md).: +For more details, see the [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md): | DataHub | MLflow | Description | |---------|---------|-------------| @@ -52,11 +49,11 @@ For more details, see the [MLflow integration doc](/docs/generated/ingestion/sou ## Basic Setup -For this tutorial, you need to deploy DataHub Quickstart locally. -For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart.md). 
+To follow this tutorial, you'll need DataHub Quickstart deployed locally. +For detailed steps, see the [Datahub Quickstart Guide](/docs/quickstart.md). -Next, you need to set up the Python client for DataHub. -Create a token in DataHub UI and replace `<your_token>` with your token in the code below: +Next, set up the Python client for DataHub. +Create a token in DataHub UI and replace `<your_token>` with your token: ```python from mlflow_dh_client import MLflowDatahubClient @@ -65,20 +62,21 @@ client = MLflowDatahubClient(token="<your_token>") ``` :::note Verifying via GraphQL -In this Guide, we'll show you how to verify your changes using GraphQL queries. -You can run these queries in the DataHub UI -- just go to `https://localhost:9002/api/graphiql` and paste the query. +Throughout this guide, we'll show how to verify changes using GraphQL queries. +You can run these queries in the DataHub UI at `https://localhost:9002/api/graphiql`. ::: + ## Create Simple ML Entities -In this section, we'll create the basic building blocks of your ML system. These components will help you organize your ML work and make it discoverable by your team. +Let's create the basic building blocks of your ML system. These components will help you organize your ML work and make it discoverable by your team. ### Create Model Group -A model group is like a folder that contains different versions of a similar model. For example, all versions of your "Customer Churn Predictor" would go in one group. +A model group contains different versions of a similar model. For example, all versions of your "Customer Churn Predictor" would go in one group. 
<Tabs> <TabItem value="simple" label="Simple Version"> -Here's how to create a basic model group with just an identifier: +Create a basic model group with just an identifier: ```python client.create_model_group( @@ -88,9 +86,7 @@ client.create_model_group( </TabItem> <TabItem value="detailed" label="Detailed Version"> - -For production use, you can add rich metadata like descriptions, creation timestamps, and team information: - +Add rich metadata like descriptions, creation timestamps, and team information: ```python client.create_model_group( @@ -112,7 +108,7 @@ Let's verify that our model group was created: <Tabs> <TabItem value="UI" label="UI"> -You can see your new model group in the DataHub UI: +See your new model group in the DataHub UI: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-group-empty.png"/> @@ -120,7 +116,7 @@ You can see your new model group in the DataHub UI: </TabItem> <TabItem value="graphql" label="GraphQL"> -You can also query your model group using GraphQL to check its properties: +Query your model group to check its properties: ```graphql query { @@ -133,7 +129,7 @@ query { } ``` -The response should show your model group's details: +The response will show your model group's details: ```json { @@ -151,13 +147,11 @@ The response should show your model group's details: ### Create Model -Now let's create a specific model version. This represents a trained model that you might deploy. +Next, let's create a specific model version that represents a trained model ready for deployment. 
<Tabs> <TabItem value="simple" label="Simple Version"> - -Here's the minimum needed to create a model (note that version is required): - +Create a model with just the required version: ```python client.create_model( @@ -168,9 +162,7 @@ client.create_model( </TabItem> <TabItem value="detailed" label="Detailed Version"> - -For a production model, you'll want to include metrics, parameters, and other metadata: - +Include metrics, parameters, and metadata for production use: ```python client.create_model( @@ -208,7 +200,7 @@ Let's verify our model: <Tabs> <TabItem value="UI" label="UI"> -You can view your model's details in the DataHub UI. +Check your model's details in the DataHub UI: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-empty.png"/> @@ -216,7 +208,7 @@ You can view your model's details in the DataHub UI. </TabItem> <TabItem value="graphql" label="GraphQL"> -Here's how to query your model's information using GraphQL: +Query your model's information: ```graphql query { @@ -234,7 +226,7 @@ query { } ``` -You should see details about your model: +The response will show your model's details: ```json { @@ -257,11 +249,11 @@ You should see details about your model: ### Create Experiment -An experiment helps you organize multiple training runs for a specific project. +An experiment helps organize multiple training runs for a specific project. 
<Tabs> <TabItem value="simple" label="Simple Version"> -Create a basic experiment with just an ID: +Create a basic experiment: ```python client.create_experiment( @@ -271,7 +263,7 @@ client.create_experiment( </TabItem> <TabItem value="detailed" label="Detailed Version"> -Add more context to your experiment with metadata: +Add context and metadata: ```python client.create_experiment( @@ -297,7 +289,7 @@ Verify your experiment: <Tabs> <TabItem value="UI" label="UI"> -View your experiment's details in the UI: +See your experiment's details in the UI: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/experiment-empty.png"/> @@ -321,7 +313,7 @@ query { } ``` -Check the experiment's details in the response: +Check the response: ```json { @@ -344,11 +336,10 @@ Check the experiment's details in the response: ### Create Training Run -A training run captures everything about a specific model training attempt. +A training run captures all details about a specific model training attempt. 
<Tabs> <TabItem value="simple" label="Simple Version"> - Create a basic training run: ```python @@ -359,8 +350,7 @@ client.create_training_run( </TabItem> <TabItem value="detailed" label="Detailed Version"> - -For a production run, you'll want to include metrics, parameters, and other metadata: +Include metrics, parameters, and other important metadata: ```python client.create_training_run( @@ -388,12 +378,11 @@ client.create_training_run( </TabItem> </Tabs> -Check your training run: +Verify your training run: <Tabs> <TabItem value="UI" label="UI"> - -View the training run details in DataHub UI: +View the run details in the UI: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-empty.png"/> @@ -401,7 +390,7 @@ View the training run details in DataHub UI: </TabItem> <TabItem value="graphql" label="GraphQL"> -Query your training run information: +Query your training run: ```graphql query { @@ -419,7 +408,7 @@ query { } ``` -See the run details in the response: +Check the response: ```json { @@ -444,45 +433,33 @@ See the run details in the response: ## Define Entity Relationships -Now comes the important part - connecting all these components together. By establishing relationships between your ML assets, you'll be able to: -- Track model lineage (which data and runs produced which models) -- Monitor model evolution over time -- Understand dependencies between different components -- Enable comprehensive searching and filtering +Now let's connect these components to create a comprehensive ML system. These connections enable you to track model lineage, monitor model evolution, understand dependencies, and search effectively across your ML assets. 
### Add Model To Model Group -Connect your model to its group to organize related model versions together: +Connect your model to its group: ```python client.add_model_to_model_group(model_urn=model_urn, group_urn=model_group_urn) ``` -After connecting your model to a group, you'll be able to: -- View all versions of a model in one place -- Track model evolution over time -- Compare different versions easily -- Manage model lifecycles better - <Tabs> <TabItem value="UI" label="UI"> -In **Model Group** view, you can see the model versions under the **Models** section: +View model versions in the **Model Group** under the **Models** section: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-group-with-model.png"/> </p> -In **Model** page , you can see the group it belongs to under the **Group** tab: +Find group information in the **Model** page under the **Group** tab: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-with-model-group.png"/> </p> - </TabItem> <TabItem value="graphql" label="GraphQL"> - -You can query the model's group using `groups` field to see the relationship: +Query the model-group relationship: ```graphql query { @@ -502,7 +479,7 @@ query { } ``` -Expected response: +Check the response: ```json { @@ -529,37 +506,29 @@ Expected response: ### Add Run To Experiment -Organize your training runs by adding them to an experiment: +Connect a training run to its experiment: ```python client.add_run_to_experiment(run_urn=run_urn, experiment_urn=experiment_urn) ``` -This connection enables you to: -- Group related training attempts together -- Compare runs within the same experiment -- Track progress toward your ML goals -- Share results with your team - <Tabs> <TabItem value="UI" label="UI"> -Under **Entities** tab in the **Experiment** page, you can see the 
runs associated with the experiment: +Find your runs in the **Experiment** page under the **Entities** tab: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/experiment-with-run.png"/> </p> -In the **Run** page, you can see the experiment it belongs to. - +See the experiment details in the **Run** page: <p align="center"> <img width="40%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-with-experiment.png"/> </p> </TabItem> <TabItem value="graphql" label="GraphQL"> - -Query the run's experiment through `parentContainers`: +Query the run-experiment relationship: ```graphql query { @@ -579,7 +548,7 @@ query { } ``` -See the relationship details: +View the relationship details: ```json { @@ -595,7 +564,7 @@ See the relationship details: } } ] -} + } } } } @@ -606,28 +575,28 @@ See the relationship details: ### Add Run To Model -Link a training run to the model it produced: +Connect a training run to its resulting model: ```python client.add_run_to_model(model_urn=model_urn, run_urn=run_urn) ``` -This connection helps you: -- Track which training runs produced which models +This relationship enables you to: +- Track which runs produced each model - Understand model provenance -- Debug model issues by examining training history +- Debug model issues - Monitor model evolution <Tabs> <TabItem value="UI" label="UI"> -In the **Model** page, you can see the runs that produced the model as **Source Run** under the **Summary** tab: +Find the source run in the **Model** page under the **Summary** tab: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-with-source-run.png"/> </p> -In the **Run** page, you can see the related model under the **Lineage** tab: +See related models in the **Run** page under the **Lineage** tab: <p 
align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model.png"/> @@ -639,8 +608,7 @@ In the **Run** page, you can see the related model under the **Lineage** tab: </TabItem> <TabItem value="graphql" label="GraphQL"> - -You can query the model's training jobs using `trainingJobs` to see the relationship: +Query the model's training jobs: ```graphql query { @@ -657,7 +625,7 @@ query { } ``` -Check the relationship in the response: +View the relationship: ```json { @@ -681,20 +649,22 @@ Check the relationship in the response: ### Add Run To Model Group -Connect a training run directly to a model group: +Create a direct connection between a run and a model group: ```python client.add_run_to_model_group(model_group_urn=model_group_urn, run_urn=run_urn) ``` -After establishing this connection, you'll be able to: -- View model groups in the run's lineage tab +This connection lets you: +- View model groups in the run's lineage - Query training jobs at the group level +- Track training history for model families <Tabs> <TabItem value="UI" label="UI"> -In the **Run** page, you can see the model groups associated with the group under the **Lineage** tab: +See model groups in the **Run** page under the **Lineage** tab: + <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model-group.png"/> </p> @@ -704,9 +674,7 @@ In the **Run** page, you can see the model groups associated with the group unde </TabItem> <TabItem value="graphql" label="GraphQL"> - -You can query the model groups's training jobs using `trainingJobs` to see the relationship: - +Query the model group's training jobs: ```graphql query { @@ -723,7 +691,7 @@ query { } ``` -Verify the relationship: +Check the relationship: ```json { @@ -747,7 +715,7 @@ Verify the relationship: ### Add Dataset To Run -Track the data 
used in your training runs: +Track input and output datasets for your training runs: ```python client.add_input_datasets_to_run( @@ -761,32 +729,30 @@ client.add_output_datasets_to_run( ) ``` -This connection enables you to: -- Track data lineage for your models +These connections help you: +- Track data lineage - Understand data dependencies -- Ensure reproducibility of your training runs -- Monitor data quality impacts on model performance +- Ensure reproducibility +- Monitor data quality impacts -You can verify the relationship in the **Lineage** Tab either in **DataSet** page or **Run** page. +Find dataset relationships in the **Lineage** tab of either the **Dataset** or **Run** page: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-dataset-graph.png"/> </p> ## Full Overview -This is how your ML system looks after connecting all the components. +Here's your complete ML system with all components connected: <p align="center"> <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/lineage-full.png"/> </p> -Now you have a complete lineage view of your ML assets -- from training runs to models to datasets. - -You can check the complete script [here](). +You now have a complete lineage view of your ML assets, from training data through runs to production models! ## What's Next? 
To see this integration in action and learn about real-world use cases: - Watch our [Townhall demo](https://youtu.be/_WUoVqkF2Zo?feature=shared&t=1932) on MLflow integration with DataHub -- Check out the discussion in our [Slack community](https://slack.datahubproject.io) -- Readh our [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md) for more details \ No newline at end of file +- Join our [Slack community](https://slack.datahubproject.io) for discussions +- Read our [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md) for more details \ No newline at end of file From f34d5df8255535d9effab39874fde96e3b4f0196 Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Wed, 29 Jan 2025 19:10:22 +0900 Subject: [PATCH 4/6] fix linting --- metadata-ingestion/examples/ml/mlflow_dh_client.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/metadata-ingestion/examples/ml/mlflow_dh_client.py b/metadata-ingestion/examples/ml/mlflow_dh_client.py index f5581591e2fd84..2ee9dc4f966722 100644 --- a/metadata-ingestion/examples/ml/mlflow_dh_client.py +++ b/metadata-ingestion/examples/ml/mlflow_dh_client.py @@ -56,10 +56,7 @@ def _create_timestamp( time=timestamp or int(time.time() * 1000), actor="urn:li:corpuser:datahub" ) - def _emit_mcps( - self, - mcps: Union[List[MetadataChangeProposalWrapper], MetadataChangeProposalWrapper], - ) -> None: + def _emit_mcps(self, mcps: Union[MetadataChangeProposalWrapper, List[Any]]) -> None: """Helper to emit MCPs with proper connection handling""" if not isinstance(mcps, list): mcps = [mcps] From a555ea580310c75bd09e4c50d80c4ba9ec97a3e4 Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Thu, 30 Jan 2025 11:12:11 +0900 Subject: [PATCH 5/6] update docs/ code feedbacks --- docs-website/sidebars.js | 12 +- docs/api/tutorials/ml.md | 866 ++++++++++-------- docs/api/tutorials/ml_feature_store.md | 505 ++++++++++ docs/api/tutorials/ml_v2.md | 758 --------------- 
.../dh_ai_client.py} | 10 +- .../dh_ai_client_sample.py} | 10 +- 6 files changed, 1037 insertions(+), 1124 deletions(-) create mode 100644 docs/api/tutorials/ml_feature_store.md delete mode 100644 docs/api/tutorials/ml_v2.md rename metadata-ingestion/examples/{ml/mlflow_dh_client.py => ai/dh_ai_client.py} (98%) rename metadata-ingestion/examples/{ml/mlflow_dh_client_sample.py => ai/dh_ai_client_sample.py} (93%) diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 97f19f071df0dc..e6aedabc37a19f 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -932,8 +932,16 @@ module.exports = { "docs/api/tutorials/domains", "docs/api/tutorials/forms", "docs/api/tutorials/lineage", - "docs/api/tutorials/ml", - "docs/api/tutorials/ml_v2", + { + type: "doc", + id: "docs/api/tutorials/ml", + label: "AI/ML Integration", + }, + { + type: "doc", + id: "docs/api/tutorials/ml_feature_store", + label: "Feature Store", + }, "docs/api/tutorials/owners", "docs/api/tutorials/structured-properties", "docs/api/tutorials/tags", diff --git a/docs/api/tutorials/ml.md b/docs/api/tutorials/ml.md index e88c941c904670..73a86824c9e60e 100644 --- a/docs/api/tutorials/ml.md +++ b/docs/api/tutorials/ml.md @@ -1,146 +1,230 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# ML System +# AI/ML Framework Integration with DataHub -## Why Would You Integrate ML System with DataHub? +## Why Integrate Your AI/ML System with DataHub? -Machine learning systems have become a crucial feature in modern data stacks. -However, the relationships between the different components of a machine learning system, such as features, models, and feature tables, can be complex. -DataHub makes these relationships discoverable and facilitate utilization by other members of the organization. +As a data practitioner, keeping track of your AI experiments, models, and their relationships can be challenging. 
+DataHub makes this easier by providing a central place to organize and track your AI assets. -For technical details on ML entities, please refer to the following docs: +This guide will show you how to integrate your AI workflows with DataHub. +With integrations for popular ML platforms like MLflow and Amazon SageMaker, DataHub enables you to easily find and share AI models across your organization, track how models evolve over time, and understand how training data connects to each model. +Most importantly, it enables seamless collaboration on AI projects by making everything discoverable and connected. -- [MlFeature](/docs/generated/metamodel/entities/mlFeature.md) -- [MlPrimaryKey](/docs/generated/metamodel/entities/mlPrimaryKey.md) -- [MlFeatureTable](/docs/generated/metamodel/entities/mlFeatureTable.md) -- [MlModel](/docs/generated/metamodel/entities/mlModel.md) -- [MlModelGroup](/docs/generated/metamodel/entities/mlModelGroup.md) +## Goals Of This Guide -### Goal Of This Guide +In this guide, you'll learn how to: +- Create your basic AI components (models, experiments, runs) +- Connect these components to build a complete AI system +- Track relationships between models, data, and experiments -This guide will show you how to +## Core AI Concepts -- Create ML entities: MlFeature, MlFeatureTable, MlModel, MlModelGroup, MlPrimaryKey -- Read ML entities: MlFeature, MlFeatureTable, MlModel, MlModelGroup, MlPrimaryKey -- Attach MlModel to MlFeature -- Attach MlFeatures to MlFeatureTable -- Attached MlFeatures to upstream Datasets that power them +Here's what you need to know about the key components in DataHub: -## Prerequisites +- **Experiments** are collections of training runs for the same project, like all attempts to build a churn predictor +- **Training Runs** are attempts to train a model within an experiment, capturing parameters and results +- **Model Groups** organize related models together, like all versions of your churn predictor +- **Models** are 
successful training runs registered for production use -For this tutorial, you need to deploy DataHub Quickstart and ingest sample data. -For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart.md). +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/concept-diagram-dh-term.png"/> +</p> -## Create ML Entities +The hierarchy works like this: +1. Every run belongs to an experiment +2. Successful runs can be registered as models +3. Models belong to a model group +4. Not every run becomes a model -### Create MlFeature +:::note Terminology Mapping +Different AI platforms (MLflow, Amazon SageMaker) have their own terminology. +To keep things consistent, we'll use DataHub's terms throughout this guide. +Here's how DataHub's terminology maps to these platforms: -An ML Feature represents an instance of a feature that can be used across different machine learning models. Features are organized into Feature Tables to be consumed by machine learning models. For example, if we were modeling features for a Users Feature Table, the Features would be `age`, `sign_up_date`, `active_in_past_30_days` and so forth.Using Features in DataHub allows users to see the sources a feature was generated from and how a feature is used to train models. 
+| DataHub | Description                         | MLflow | SageMaker |
+|---------|-------------------------------------|---------|-----------|
+| ML Model Group | Collection of related models | Model | Model Group |
+| ML Model | Versioned artifact in a model group | Model Version | Model Version |
+| ML Training Run | Single training attempt | Run | Training Job |
+| ML Experiment | Collection of training runs | Experiment | Experiment |
+:::

-<Tabs>
-<TabItem value="python" label="Python" default>
+For platform-specific details, see our integration guides for [MLflow](/docs/generated/ingestion/sources/mlflow.md) and [Amazon SageMaker](/docs/generated/ingestion/sources/sagemaker.md).
+
+## Basic Setup
+
+To follow this tutorial, you'll need DataHub Quickstart deployed locally.
+For detailed steps, see the [DataHub Quickstart Guide](/docs/quickstart.md).
+
+Next, set up the Python client for DataHub using `DataHubAIClient`.
+
+Create a token in DataHub UI and replace `<your_token>` with your token:

```python
-{{ inline /metadata-ingestion/examples/library/create_mlfeature.py show_path_as_comment }}
+from dh_ai_client import DataHubAIClient
+
+client = DataHubAIClient(token="<your_token>", server_url="http://localhost:9002")
```

-Note that when creating a feature, you create upstream lineage to the data warehouse using `sources`.
+:::note Verifying via GraphQL
+Throughout this guide, we'll show how to verify changes using GraphQL queries.
+You can run these queries in the DataHub UI at `http://localhost:9002/api/graphiql`.
+:::

-</TabItem>
-</Tabs>
+## Create Simple AI Assets

-### Create MlPrimaryKey
+Let's create the basic building blocks of your ML system. These components will help you organize your AI work and make it discoverable by your team.

-An ML Primary Key represents a specific element of a Feature Table that indicates what group the other features belong to. 
For example, if a Feature Table contained features for Users, the ML Primary Key would likely be `user_id` or some similar unique identifier for a user. Using ML Primary Keys in DataHub allow users to indicate how ML Feature Tables are structured. +### Create a Model Group + +A model group contains different versions of a similar model. For example, all versions of your "Customer Churn Predictor" would go in one group. <Tabs> -<TabItem value="python" label="Python" default> +<TabItem value="basic" label="Basic"> +Create a basic model group with just an identifier: ```python -{{ inline /metadata-ingestion/examples/library/create_mlprimarykey.py show_path_as_comment }} +client.create_model_group( + group_id="airline_forecast_models_group", +) ``` -Note that when creating a primary key, you create upstream lineage to the data warehouse using `sources`. +</TabItem> +<TabItem value="advanced" label="Advanced"> +Add rich metadata like descriptions, creation timestamps, and team information: + +```python +client.create_model_group( + group_id="airline_forecast_models_group", + properties=models.MLModelGroupPropertiesClass( + name="Airline Forecast Models Group", + description="Group of models for airline passenger forecasting", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), +) +``` </TabItem> </Tabs> -### Create MlFeatureTable - -A feature table represents a group of similar Features that can all be used together to train a model. For example, if there was a Users Feature Table, it would contain documentation around how to use the Users collection of Features and references to each Feature and ML Primary Key contained within it. 
+Let's verify that our model group was created: <Tabs> -<TabItem value="python" label="Python" default> +<TabItem value="UI" label="UI"> +See your new model group in the DataHub UI: -```python -{{ inline /metadata-ingestion/examples/library/create_mlfeature_table.py show_path_as_comment }} +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/model-group-empty.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Query your model group to check its properties: + +```graphql +query { + mlModelGroup( + urn:"urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_models_group,PROD)" + ) { + name + description + } +} ``` -Note that when creating a feature table, you connect the table to its features and primary key using `mlFeatures` and `mlPrimaryKeys`. +The response will show your model group's details: + +```json +{ + "data": { + "mlModelGroup": { + "name": "airline_forecast_models_group", + "description": "Group of models for airline passenger forecasting" + } + } +} +``` </TabItem> </Tabs> -### Create MlModel +### Create a Model -An ML Model in Acryl represents an individual version of a trained Machine Learning Model. Another way to think about the ML Model entity is as an istance of a training run. An ML Model entity tracks the exact ML Features used in that instance of training, along with the training results. This entity does not represents all versions of a ML Model. For example, if we train a model for homepage customization on a certain day, that would be a ML Model in DataHub. If you re-train the model the next day off of new data or with different parameters, that would produce a second ML Model entity. +Next, let's create a specific model version that represents a trained model ready for deployment. 
<Tabs> -<TabItem value="python" label="Python" default> +<TabItem value="basic" label="Basic"> +Create a model with just the required version: ```python -{{ inline /metadata-ingestion/examples/library/create_mlmodel.py show_path_as_comment }} +client.create_model( + model_id="arima_model", + version="1.0", +) ``` -Note that when creating a model, you link it to a list of features using `mlFeatures`. This indicates how the individual instance of the model was trained. -Additionally, you can access the relationship to model groups with `groups`. An ML Model is connected to the warehouse tables it depends on via its dependency on the ML Features it reads from. - </TabItem> -</Tabs> - -### Create MlModelGroup - -An ML Model Group represents the grouping of all training runs of a single Machine Learning model category. It will store documentation about the group of ML Models, along with references to each individual ML Model instance. - -<Tabs> -<TabItem value="python" label="Python" default> +<TabItem value="advanced" label="Advanced"> +Include metrics, parameters, and metadata for production use: ```python -{{ inline /metadata-ingestion/examples/library/create_mlmodel_group.py show_path_as_comment }} +client.create_model( + model_id="arima_model", + properties=models.MLModelPropertiesClass( + name="ARIMA Model", + description="ARIMA model for airline passenger forecasting", + customProperties={"team": "forecasting"}, + trainingMetrics=[ + models.MLMetricClass(name="accuracy", value="0.9"), + models.MLMetricClass(name="precision", value="0.8"), + ], + hyperParams=[ + models.MLHyperParamClass(name="learning_rate", value="0.01"), + models.MLHyperParamClass(name="batch_size", value="32"), + ], + externalUrl="https:localhost:5000", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + tags=["forecasting", "arima"], + ), + version="1.0", + 
alias="champion", +) ``` </TabItem> </Tabs> -### Expected Outcome of creating entities +Let's verify our model: -You can search the entities in DataHub UI. - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/feature-table-created.png"/> -</p> +<Tabs> +<TabItem value="UI" label="UI"> +Check your model's details in the DataHub UI: <p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/model-group-created.png"/> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/model-empty.png"/> </p> +</TabItem> -## Read ML Entities - -### Read MLFeature - -<Tabs> -<TabItem value="graphql" label="GraphQL" default> +<TabItem value="graphql" label="GraphQL"> +Query your model's information: -```json +```graphql query { - mlFeature(urn: "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_LIST_feature)"){ + mlModel( + urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" + ) { name - featureNamespace description - properties { - description - dataType + versionProperties { version { versionTag } @@ -149,472 +233,534 @@ query { } ``` -Expected response: +The response will show your model's details: ```json { "data": { - "mlFeature": { - "name": "test_BOOL_LIST_feature", - "featureNamespace": "test_feature_table_all_feature_dtypes", - "description": null, - "properties": { - "description": null, - "dataType": "SEQUENCE", - "version": null + "mlModel": { + "name": "arima_model", + "description": "ARIMA model for airline passenger forecasting", + "versionProperties": { + "version": { + "versionTag": "1.0" + } } } - }, - "extensions": {} + } } ``` </TabItem> -<TabItem value="curl" label="Curl" default> +</Tabs> -```json -curl --location --request POST 'http://localhost:8080/api/graphql' \ ---header 'Authorization: Bearer <my-access-token>' \ 
---header 'Content-Type: application/json' \ ---data-raw '{ - "query": "{ mlFeature(urn: \"urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_LIST_feature)\") { name featureNamespace description properties { description dataType version { versionTag } } } }" -}' -``` +### Create an Experiment -Expected response: +An experiment helps organize multiple training runs for a specific project. -```json -{ - "data": { - "mlFeature": { - "name": "test_BOOL_LIST_feature", - "featureNamespace": "test_feature_table_all_feature_dtypes", - "description": null, - "properties": { - "description": null, - "dataType": "SEQUENCE", - "version": null - } - } - }, - "extensions": {} -} +<Tabs> +<TabItem value="basic" label="Basic"> +Create a basic experiment: + +```python +client.create_experiment( + experiment_id="airline_forecast_experiment", +) ``` </TabItem> -<TabItem value="python" label="Python"> +<TabItem value="advanced" label="Advanced"> +Add context and metadata: ```python -{{ inline /metadata-ingestion/examples/library/read_mlfeature.py show_path_as_comment }} +client.create_experiment( + experiment_id="airline_forecast_experiment", + properties=models.ContainerPropertiesClass( + name="Airline Forecast Experiment", + description="Experiment to forecast airline passenger numbers", + customProperties={"team": "forecasting"}, + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), +) ``` </TabItem> </Tabs> -### Read MlPrimaryKey +Verify your experiment: <Tabs> -<TabItem value="graphql" label="GraphQL" default> +<TabItem value="UI" label="UI"> +See your experiment's details in the UI: -```json +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/experiment-empty.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Query your 
experiment's information: + +```graphql query { - mlPrimaryKey(urn: "urn:li:mlPrimaryKey:(user_features,user_id)"){ + container( + urn:"urn:li:container:airline_forecast_experiment" + ) { name - featureNamespace description - dataType properties { - description - dataType - version { - versionTag - } + customProperties } } } ``` -Expected response: +Check the response: ```json { "data": { - "mlPrimaryKey": { - "name": "user_id", - "featureNamespace": "user_features", - "description": "User's internal ID", - "dataType": "ORDINAL", + "container": { + "name": "Airline Forecast Experiment", + "description": "Experiment to forecast airline passenger numbers", "properties": { - "description": "User's internal ID", - "dataType": "ORDINAL", - "version": null + "customProperties": { + "team": "forecasting" + } } } - }, - "extensions": {} + } } ``` </TabItem> -<TabItem value="curl" label="Curl" default> +</Tabs> -```json -curl --location --request POST 'http://localhost:8080/api/graphql' \ ---header 'Authorization: Bearer <my-access-token>' \ ---header 'Content-Type: application/json' \ ---data-raw '{ - "query": "query { mlPrimaryKey(urn: \"urn:li:mlPrimaryKey:(user_features,user_id)\"){ name featureNamespace description dataType properties { description dataType version { versionTag } } }}" -}' -``` +### Create a Training Run -Expected response: +A training run captures all details about a specific model training attempt. 
-```json -{ - "data": { - "mlPrimaryKey": { - "name": "user_id", - "featureNamespace": "user_features", - "description": "User's internal ID", - "dataType": "ORDINAL", - "properties": { - "description": "User's internal ID", - "dataType": "ORDINAL", - "version": null - } - } - }, - "extensions": {} -} +<Tabs> +<TabItem value="basic" label="Basic"> +Create a basic training run: + +```python +client.create_training_run( + run_id="simple_training_run_4", +) ``` </TabItem> -<TabItem value="python" label="Python"> +<TabItem value="advanced" label="Advanced"> +Include metrics, parameters, and other important metadata: ```python -{{ inline /metadata-ingestion/examples/library/read_mlprimarykey.py show_path_as_comment }} +client.create_training_run( + run_id="simple_training_run_4", + properties=models.DataProcessInstancePropertiesClass( + name="Simple Training Run 4", + created=models.AuditStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + customProperties={"team": "forecasting"}, + ), + training_run_properties=models.MLTrainingRunPropertiesClass( + id="simple_training_run_4", + outputUrls=["s3://my-bucket/output"], + trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], + hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], + externalUrl="https:localhost:5000", + ), + run_result=RunResultType.FAILURE, + start_timestamp=1628580000000, + end_timestamp=1628580001000, +) ``` </TabItem> </Tabs> -### Read MLFeatureTable +Verify your training run: <Tabs> -<TabItem value="graphql" label="GraphQL" default> +<TabItem value="UI" label="UI"> +View the run details in the UI: -```json +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/run-empty.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Query your training run: + +```graphql query { - mlFeatureTable(urn: 
"urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)"){ + dataProcessInstance( + urn:"urn:li:dataProcessInstance:simple_training_run_4" + ) { name - description - platform { - name + created { + time } properties { - description - mlFeatures { - name - } + customProperties } } } ``` -Expected Response: +Check the response: ```json { "data": { - "mlFeatureTable": { - "name": "test_feature_table_all_feature_dtypes", - "description": null, - "platform": { - "name": "feast" + "dataProcessInstance": { + "name": "Simple Training Run 4", + "created": { + "time": 1628580000000 }, "properties": { - "description": null, - "mlFeatures": [ - { - "name": "test_BOOL_LIST_feature" - }, - ...{ - "name": "test_STRING_feature" - } - ] + "customProperties": { + "team": "forecasting" + } } } - }, - "extensions": {} + } } ``` </TabItem> -<TabItem value="curl" label="Curl"> +</Tabs> -```json -curl --location --request POST 'http://localhost:8080/api/graphql' \ ---header 'Authorization: Bearer <my-access-token>' \ ---header 'Content-Type: application/json' \ ---data-raw '{ - "query": "{ mlFeatureTable(urn: \"urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)\") { name description platform { name } properties { description mlFeatures { name } } } }" -}' -``` +## Define Entity Relationships -Expected Response: +Now let's connect these components to create a comprehensive ML system. These connections enable you to track model lineage, monitor model evolution, understand dependencies, and search effectively across your ML assets. 
-```json -{ - "data": { - "mlFeatureTable": { - "name": "test_feature_table_all_feature_dtypes", - "description": null, - "platform": { - "name": "feast" - }, - "properties": { - "description": null, - "mlFeatures": [ - { - "name": "test_BOOL_LIST_feature" - }, - ...{ - "name": "test_STRING_feature" - } - ] - } - } - }, - "extensions": {} -} -``` +### Add Model To Model Group -</TabItem> -<TabItem value="python" label="Python"> +Connect your model to its group: ```python -{{ inline /metadata-ingestion/examples/library/read_mlfeature_table.py show_path_as_comment }} +client.add_model_to_model_group(model_urn=model_urn, group_urn=model_group_urn) ``` -</TabItem> -</Tabs> +<Tabs> +<TabItem value="UI" label="UI"> -### Read MLModel +View model versions in the **Model Group** under the **Models** section: -<Tabs> -<TabItem value="graphql" label="GraphQL" default> +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/model-group-with-model.png"/> +</p> -```json +Find group information in the **Model** page under the **Group** tab: +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/model-with-model-group.png"/> +</p> +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Query the model-group relationship: + +```graphql query { - mlModel(urn: "urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)"){ + mlModel( + urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" + ) { name - description properties { - description - version - type - mlFeatures groups { urn - name + properties { + name + } } } } } ``` -Expected Response: +Check the response: ```json { "data": { "mlModel": { - "name": "scienceModel", - "description": "A sample model for predicting some outcome.", + "name": "arima_model", "properties": { - "description": "A sample model for predicting some outcome.", - "version": null, 
- "type": "Naive Bayes classifier", - "mlFeatures": null, - "groups": [] + "groups": [ + { + "urn": "urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_model_group,PROD)", + "properties": { + "name": "Airline Forecast Model Group" + } + } + ] } } - }, - "extensions": {} + } } ``` </TabItem> -<TabItem value="curl" label="Curl" default> +</Tabs> -```json -curl --location --request POST 'http://localhost:8080/api/graphql' \ ---header 'Authorization: Bearer <my-access-token>' \ ---header 'Content-Type: application/json' \ ---data-raw '{ - "query": "{ mlModel(urn: \"urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)\") { name description properties { description version type mlFeatures groups { urn name } } } }" -}' -``` +### Add Run To Experiment -Expected Response: +Connect a training run to its experiment: -```json -{ - "data": { - "mlModel": { - "name": "scienceModel", - "description": "A sample model for predicting some outcome.", - "properties": { - "description": "A sample model for predicting some outcome.", - "version": null, - "type": "Naive Bayes classifier", - "mlFeatures": null, - "groups": [] - } - } - }, - "extensions": {} -} +```python +client.add_run_to_experiment(run_urn=run_urn, experiment_urn=experiment_urn) ``` -</TabItem> -<TabItem value="python" label="Python"> +<Tabs> +<TabItem value="UI" label="UI"> -```python -{{ inline /metadata-ingestion/examples/library/read_mlmodel.py show_path_as_comment }} -``` +Find your runs in the **Experiment** page under the **Entities** tab: -</TabItem> -</Tabs> +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/experiment-with-run.png"/> +</p> -### Read MLModelGroup +See the experiment details in the **Run** page: +<p align="center"> + <img width="40%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/run-with-experiment.png"/> +</p> +</TabItem> -<Tabs> 
-<TabItem value="graphql" label="GraphQL" default> +<TabItem value="graphql" label="GraphQL"> +Query the run-experiment relationship: -```json +```graphql query { - mlModelGroup(urn: "urn:li:mlModelGroup:(urn:li:dataPlatform:science,my-model-group,PROD)"){ + dataProcessInstance( + urn:"urn:li:dataProcessInstance:simple_training_run" + ) { name - description - platform { - name - } - properties { - description + parentContainers { + containers { + urn + properties { + name + } + } } } } ``` -Expected Response: (Note that this entity does not exist in the sample ingestion and you might want to create this entity first.) +View the relationship details: ```json { "data": { - "mlModelGroup": { - "name": "my-model-group", - "description": "my model group", - "platform": { - "name": "science" - }, - "properties": { - "description": "my model group" + "dataProcessInstance": { + "name": "Simple Training Run", + "parentContainers": { + "containers": [ + { + "urn": "urn:li:container:airline_forecast_experiment", + "properties": { + "name": "Airline Forecast Experiment" + } + } + ] } } - }, - "extensions": {} + } } ``` </TabItem> -<TabItem value="curl" label="Curl"> +</Tabs> -```json -curl --location --request POST 'http://localhost:8080/api/graphql' \ ---header 'Authorization: Bearer <my-access-token>' \ ---header 'Content-Type: application/json' \ ---data-raw '{ - "query": "{ mlModelGroup(urn: \"urn:li:mlModelGroup:(urn:li:dataPlatform:science,my-model-group,PROD)\") { name description platform { name } properties { description } } }" -}' +### Add Run To Model + +Connect a training run to its resulting model: + +```python +client.add_run_to_model(model_urn=model_urn, run_urn=run_urn) +``` + +This relationship enables you to: +- Track which runs produced each model +- Understand model provenance +- Debug model issues +- Monitor model evolution + +<Tabs> +<TabItem value="UI" label="UI"> + +Find the source run in the **Model** page under the **Summary** tab: + +<p 
align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/model-with-source-run.png"/> +</p> + +See related models in the **Run** page under the **Lineage** tab: + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/run-lineage-model.png"/> +</p> +<p align="center"> + <img width="50%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/run-lineage-model-graph.png"/> +</p> + +</TabItem> + +<TabItem value="graphql" label="GraphQL"> +Query the model's training jobs: + +```graphql +query { + mlModel( + urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" + ) { + name + properties { + mlModelLineageInfo { + trainingJobs + } + } + } +} ``` -Expected Response: (Note that this entity does not exist in the sample ingestion and you might want to create this entity first.) +View the relationship: ```json { "data": { - "mlModelGroup": { - "name": "my-model-group", - "description": "my model group", - "platform": { - "name": "science" - }, + "mlModel": { + "name": "arima_model", "properties": { - "description": "my model group" + "mlModelLineageInfo": { + "trainingJobs": [ + "urn:li:dataProcessInstance:simple_training_run_test" + ] + } } } - }, - "extensions": {} + } } ``` </TabItem> -<TabItem value="python" label="Python"> +</Tabs> -```python -{{ inline /metadata-ingestion/examples/library/read_mlmodel_group.py show_path_as_comment }} -``` +### Add Run To Model Group -</TabItem> -</Tabs> +Create a direct connection between a run and a model group: -## Add ML Entities +```python +client.add_run_to_model_group(model_group_urn=model_group_urn, run_urn=run_urn) +``` -### Add MlFeature to MlFeatureTable +This connection lets you: +- View model groups in the run's lineage +- Query training jobs at the group level +- Track training history for model families <Tabs> 
-<TabItem value="python" label="Python"> +<TabItem value="UI" label="UI"> -```python -{{ inline /metadata-ingestion/examples/library/add_mlfeature_to_mlfeature_table.py show_path_as_comment }} -``` +See model groups in the **Run** page under the **Lineage** tab: +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/run-lineage-model-group.png"/> +</p> +<p align="center"> + <img width="50%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/run-lineage-model-group-graph.png"/> +</p> </TabItem> -</Tabs> -### Add MlFeature to MLModel +<TabItem value="graphql" label="GraphQL"> +Query the model group's training jobs: -<Tabs> -<TabItem value="python" label="Python"> +```graphql +query { + mlModelGroup( + urn:"urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_model_group,PROD)" + ) { + name + properties { + mlModelLineageInfo { + trainingJobs + } + } + } +} +``` -```python -{{ inline /metadata-ingestion/examples/library/add_mlfeature_to_mlmodel.py show_path_as_comment }} +Check the relationship: + +```json +{ + "data": { + "mlModelGroup": { + "name": "airline_forecast_model_group", + "properties": { + "mlModelLineageInfo": { + "trainingJobs": [ + "urn:li:dataProcessInstance:simple_training_run_test" + ] + } + } + } + } +} ``` </TabItem> </Tabs> -### Add MLGroup To MLModel +### Add Dataset To Run -<Tabs> -<TabItem value="python" label="Python"> +Track input and output datasets for your training runs: ```python -{{ inline /metadata-ingestion/examples/library/add_mlgroup_to_mlmodel.py show_path_as_comment }} -``` +client.add_input_datasets_to_run( + run_urn=run_urn, + dataset_urns=[str(input_dataset_urn)] +) -</TabItem> -</Tabs> - -### Expected Outcome of Adding ML Entities +client.add_output_datasets_to_run( + run_urn=run_urn, + dataset_urns=[str(output_dataset_urn)] +) +``` -You can access to `Features` or `Group` Tab of each entity 
to view the added entities. +These connections help you: +- Track data lineage +- Understand data dependencies +- Ensure reproducibility +- Monitor data quality impacts +Find dataset relationships in the **Lineage** tab of either the **Dataset** or **Run** page: <p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/feature-added-to-model.png"/> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/run-lineage-dataset-graph.png"/> </p> +## Full Overview + +Here's your complete ML system with all components connected: + <p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/model-group-added-to-model.png"/> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/ml/lineage-full.png"/> </p> + +You now have a complete lineage view of your ML assets, from training data through runs to production models! + +## What's Next? + +To see these integrations in action: +- Watch our [Townhall demo](https://youtu.be/_WUoVqkF2Zo?feature=shared&t=1932) showcasing the MLflow integration +- Explore our detailed documentation: + - [MLflow Integration Guide](/docs/generated/ingestion/sources/mlflow.md) + - [Amazon SageMaker Integration Guide](/docs/generated/ingestion/sources/sagemaker.md) \ No newline at end of file diff --git a/docs/api/tutorials/ml_feature_store.md b/docs/api/tutorials/ml_feature_store.md new file mode 100644 index 00000000000000..ef84f19b03fc5f --- /dev/null +++ b/docs/api/tutorials/ml_feature_store.md @@ -0,0 +1,505 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Feature Store Integration With DataHub + +## Why Would You Integrate Feature Store with DataHub? 
+ +Feature Store is a data management layer that stores, organizes, and manages features for machine learning models. It is a centralized repository for features that can be used across different AI/ML models. +By integrating Feature Store with DataHub, you can track the lineage of features used in AI/ML models, understand how features are generated, and how they are used to train models. + +For technical details on feature store entities, please refer to the following docs: + +- [MlFeature](/docs/generated/metamodel/entities/mlFeature.md) +- [MlPrimaryKey](/docs/generated/metamodel/entities/mlPrimaryKey.md) +- [MlFeatureTable](/docs/generated/metamodel/entities/mlFeatureTable.md) + +### Goal Of This Guide + +This guide will show you how to + +- Create feature store entities: MlFeature, MlFeatureTable, MlPrimaryKey +- Read feature store entities: MlFeature, MlFeatureTable, MlPrimaryKey +- Attach MlModel to MlFeature +- Attach MlFeatures to MlFeatureTable +- Attached MlFeatures to upstream Datasets that power them + +## Prerequisites + +For this tutorial, you need to deploy DataHub Quickstart and ingest sample data. +For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart.md). + +## Create ML Entities + +:::note +For creating MLModels and MLGroups, please refer to [AI/ML Integration Guide](/docs/api/tutorials/ml.md). +::: + +### Create MlFeature + +An ML Feature represents an instance of a feature that can be used across different machine learning models. Features are organized into Feature Tables to be consumed by machine learning models. For example, if we were modeling features for a Users Feature Table, the Features would be `age`, `sign_up_date`, `active_in_past_30_days` and so forth.Using Features in DataHub allows users to see the sources a feature was generated from and how a feature is used to train models. 
+ +<Tabs> +<TabItem value="python" label="Python" default> + +```python +{{ inline /metadata-ingestion/examples/library/create_mlfeature.py show_path_as_comment }} +``` + +Note that when creating a feature, you create upstream lineage to the data warehouse using `sources`. + +</TabItem> +</Tabs> + +### Create MlPrimaryKey + +An ML Primary Key represents a specific element of a Feature Table that indicates what group the other features belong to. For example, if a Feature Table contained features for Users, the ML Primary Key would likely be `user_id` or some similar unique identifier for a user. Using ML Primary Keys in DataHub allow users to indicate how ML Feature Tables are structured. + +<Tabs> +<TabItem value="python" label="Python" default> + +```python +{{ inline /metadata-ingestion/examples/library/create_mlprimarykey.py show_path_as_comment }} +``` + +Note that when creating a primary key, you create upstream lineage to the data warehouse using `sources`. + +</TabItem> +</Tabs> + +### Create MlFeatureTable + +A feature table represents a group of similar Features that can all be used together to train a model. For example, if there was a Users Feature Table, it would contain documentation around how to use the Users collection of Features and references to each Feature and ML Primary Key contained within it. + +<Tabs> +<TabItem value="python" label="Python" default> + +```python +{{ inline /metadata-ingestion/examples/library/create_mlfeature_table.py show_path_as_comment }} +``` + +Note that when creating a feature table, you connect the table to its features and primary key using `mlFeatures` and `mlPrimaryKeys`. + +</TabItem> +</Tabs> + + +### Expected Outcome of creating entities + +You can search the entities in DataHub UI. 
+ +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/feature-table-created.png"/> +</p> + +## Read ML Entities + +### Read MLFeature + +<Tabs> +<TabItem value="graphql" label="GraphQL" default> + +```json +query { + mlFeature(urn: "urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_LIST_feature)"){ + name + featureNamespace + description + properties { + description + dataType + version { + versionTag + } + } + } +} +``` + +Expected response: + +```json +{ + "data": { + "mlFeature": { + "name": "test_BOOL_LIST_feature", + "featureNamespace": "test_feature_table_all_feature_dtypes", + "description": null, + "properties": { + "description": null, + "dataType": "SEQUENCE", + "version": null + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="curl" label="Curl" default> + +```json +curl --location --request POST 'http://localhost:8080/api/graphql' \ +--header 'Authorization: Bearer <my-access-token>' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "query": "{ mlFeature(urn: \"urn:li:mlFeature:(test_feature_table_all_feature_dtypes,test_BOOL_LIST_feature)\") { name featureNamespace description properties { description dataType version { versionTag } } } }" +}' +``` + +Expected response: + +```json +{ + "data": { + "mlFeature": { + "name": "test_BOOL_LIST_feature", + "featureNamespace": "test_feature_table_all_feature_dtypes", + "description": null, + "properties": { + "description": null, + "dataType": "SEQUENCE", + "version": null + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="python" label="Python"> + +```python +{{ inline /metadata-ingestion/examples/library/read_mlfeature.py show_path_as_comment }} +``` + +</TabItem> +</Tabs> + +### Read MlPrimaryKey + +<Tabs> +<TabItem value="graphql" label="GraphQL" default> + +```json +query { + mlPrimaryKey(urn: "urn:li:mlPrimaryKey:(user_features,user_id)"){ + name + 
featureNamespace + description + dataType + properties { + description + dataType + version { + versionTag + } + } + } +} +``` + +Expected response: + +```json +{ + "data": { + "mlPrimaryKey": { + "name": "user_id", + "featureNamespace": "user_features", + "description": "User's internal ID", + "dataType": "ORDINAL", + "properties": { + "description": "User's internal ID", + "dataType": "ORDINAL", + "version": null + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="curl" label="Curl" default> + +```json +curl --location --request POST 'http://localhost:8080/api/graphql' \ +--header 'Authorization: Bearer <my-access-token>' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "query": "query { mlPrimaryKey(urn: \"urn:li:mlPrimaryKey:(user_features,user_id)\"){ name featureNamespace description dataType properties { description dataType version { versionTag } } }}" +}' +``` + +Expected response: + +```json +{ + "data": { + "mlPrimaryKey": { + "name": "user_id", + "featureNamespace": "user_features", + "description": "User's internal ID", + "dataType": "ORDINAL", + "properties": { + "description": "User's internal ID", + "dataType": "ORDINAL", + "version": null + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="python" label="Python"> + +```python +{{ inline /metadata-ingestion/examples/library/read_mlprimarykey.py show_path_as_comment }} +``` + +</TabItem> +</Tabs> + +### Read MLFeatureTable + +<Tabs> +<TabItem value="graphql" label="GraphQL" default> + +```json +query { + mlFeatureTable(urn: "urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)"){ + name + description + platform { + name + } + properties { + description + mlFeatures { + name + } + } + } +} +``` + +Expected Response: + +```json +{ + "data": { + "mlFeatureTable": { + "name": "test_feature_table_all_feature_dtypes", + "description": null, + "platform": { + "name": "feast" + }, + "properties": { + "description": null, 
+ "mlFeatures": [ + { + "name": "test_BOOL_LIST_feature" + }, + ...{ + "name": "test_STRING_feature" + } + ] + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="curl" label="Curl"> + +```json +curl --location --request POST 'http://localhost:8080/api/graphql' \ +--header 'Authorization: Bearer <my-access-token>' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "query": "{ mlFeatureTable(urn: \"urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)\") { name description platform { name } properties { description mlFeatures { name } } } }" +}' +``` + +Expected Response: + +```json +{ + "data": { + "mlFeatureTable": { + "name": "test_feature_table_all_feature_dtypes", + "description": null, + "platform": { + "name": "feast" + }, + "properties": { + "description": null, + "mlFeatures": [ + { + "name": "test_BOOL_LIST_feature" + }, + ...{ + "name": "test_STRING_feature" + } + ] + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="python" label="Python"> + +```python +{{ inline /metadata-ingestion/examples/library/read_mlfeature_table.py show_path_as_comment }} +``` + +</TabItem> +</Tabs> + +### Read MLModel + +<Tabs> +<TabItem value="graphql" label="GraphQL" default> + +```json +query { + mlModel(urn: "urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)"){ + name + description + properties { + description + version + type + mlFeatures + groups { + urn + name + } + } + } +} +``` + +Expected Response: + +```json +{ + "data": { + "mlModel": { + "name": "scienceModel", + "description": "A sample model for predicting some outcome.", + "properties": { + "description": "A sample model for predicting some outcome.", + "version": null, + "type": "Naive Bayes classifier", + "mlFeatures": null, + "groups": [] + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="curl" label="Curl" default> + +```json +curl --location --request POST 'http://localhost:8080/api/graphql' \ 
+--header 'Authorization: Bearer <my-access-token>' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "query": "{ mlModel(urn: \"urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)\") { name description properties { description version type mlFeatures groups { urn name } } } }" +}' +``` + +Expected Response: + +```json +{ + "data": { + "mlModel": { + "name": "scienceModel", + "description": "A sample model for predicting some outcome.", + "properties": { + "description": "A sample model for predicting some outcome.", + "version": null, + "type": "Naive Bayes classifier", + "mlFeatures": null, + "groups": [] + } + } + }, + "extensions": {} +} +``` + +</TabItem> +<TabItem value="python" label="Python"> + +```python +{{ inline /metadata-ingestion/examples/library/read_mlmodel.py show_path_as_comment }} +``` + +</TabItem> +</Tabs> + +## Add ML Entities + +### Add MlFeature to MlFeatureTable + +<Tabs> +<TabItem value="python" label="Python"> + +```python +{{ inline /metadata-ingestion/examples/library/add_mlfeature_to_mlfeature_table.py show_path_as_comment }} +``` + +</TabItem> +</Tabs> + +### Add MlFeature to MLModel + +<Tabs> +<TabItem value="python" label="Python"> + +```python +{{ inline /metadata-ingestion/examples/library/add_mlfeature_to_mlmodel.py show_path_as_comment }} +``` + +</TabItem> +</Tabs> + +### Add MLGroup To MLModel + +<Tabs> +<TabItem value="python" label="Python"> + +```python +{{ inline /metadata-ingestion/examples/library/add_mlgroup_to_mlmodel.py show_path_as_comment }} +``` + +</TabItem> +</Tabs> + +### Expected Outcome of Adding ML Entities + +You can access to `Features` or `Group` Tab of each entity to view the added entities. 
+ +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/feature-added-to-model.png"/> +</p> + +<p align="center"> + <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/model-group-added-to-model.png"/> +</p> diff --git a/docs/api/tutorials/ml_v2.md b/docs/api/tutorials/ml_v2.md deleted file mode 100644 index df24a797584b09..00000000000000 --- a/docs/api/tutorials/ml_v2.md +++ /dev/null @@ -1,758 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -# ML System with DataHub - -## Why Integrate Your ML System with DataHub? - -As a data practitioner, keeping track of your ML experiments, models, and their relationships can be challenging. DataHub makes this easier by providing a central place to organize and track your ML assets. - -This guide will show you how to integrate your ML workflows with DataHub. With this integration, you can easily find and share ML models across your organization, track how models evolve over time, and understand how training data connects to each model. Most importantly, it enables seamless collaboration on ML projects by making everything discoverable and connected. 
- -## Goals Of This Guide - -In this guide, you'll learn how to: -- Create your basic ML components (models, experiments, runs) -- Connect these components to build a complete ML system -- Track relationships between models, data, and experiments - -## Core ML Concepts - -Here's what you need to know about the key components, based on MLflow's terminology: - -- **Experiments** are collections of training runs for the same project, like all attempts to build a churn predictor -- **Training Runs** are attempts to train a model within an experiment, capturing parameters and results -- **Models** organize related model versions together, like all versions of your churn predictor -- **Model Versions** are successful training runs registered for production use - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/concept-diagram.png"/> -</p> - -The hierarchy works like this: -1. Every run belongs to an experiment -2. Successful runs can become model versions -3. Model versions belong to a model group -4. Not every run becomes a model version - -:::note Terminology -Here's how DataHub and MLflow terms map to each other. -For more details, see the [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md): - -| DataHub | MLflow | Description | -|---------|---------|-------------| -| ML Model Group | Model | Collection of related model versions | -| ML Model | Model Version | Specific version of a trained model | -| ML Training Run | Run | Single training attempt | -| ML Experiment | Experiment | Project workspace | -::: - -## Basic Setup - -To follow this tutorial, you'll need DataHub Quickstart deployed locally. -For detailed steps, see the [Datahub Quickstart Guide](/docs/quickstart.md). - -Next, set up the Python client for DataHub. 
-Create a token in DataHub UI and replace `<your_token>` with your token: - -```python -from mlflow_dh_client import MLflowDatahubClient - -client = MLflowDatahubClient(token="<your_token>") -``` - -:::note Verifying via GraphQL -Throughout this guide, we'll show how to verify changes using GraphQL queries. -You can run these queries in the DataHub UI at `https://localhost:9002/api/graphiql`. -::: - -## Create Simple ML Entities - -Let's create the basic building blocks of your ML system. These components will help you organize your ML work and make it discoverable by your team. - -### Create Model Group - -A model group contains different versions of a similar model. For example, all versions of your "Customer Churn Predictor" would go in one group. - -<Tabs> -<TabItem value="simple" label="Simple Version"> -Create a basic model group with just an identifier: - -```python -client.create_model_group( - group_id="airline_forecast_models_group", -) -``` - -</TabItem> -<TabItem value="detailed" label="Detailed Version"> -Add rich metadata like descriptions, creation timestamps, and team information: - -```python -client.create_model_group( - group_id="airline_forecast_models_group", - properties=models.MLModelGroupPropertiesClass( - name="Airline Forecast Models Group", - description="Group of models for airline passenger forecasting", - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - ), -) -``` - -</TabItem> -</Tabs> - -Let's verify that our model group was created: - -<Tabs> -<TabItem value="UI" label="UI"> -See your new model group in the DataHub UI: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-group-empty.png"/> -</p> -</TabItem> - -<TabItem value="graphql" label="GraphQL"> -Query your model group to check its properties: - -```graphql -query { - mlModelGroup( - 
urn:"urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_models_group,PROD)" - ) { - name - description - } -} -``` - -The response will show your model group's details: - -```json -{ - "data": { - "mlModelGroup": { - "name": "airline_forecast_models_group", - "description": "Group of models for airline passenger forecasting" - } - } -} -``` - -</TabItem> -</Tabs> - -### Create Model - -Next, let's create a specific model version that represents a trained model ready for deployment. - -<Tabs> -<TabItem value="simple" label="Simple Version"> -Create a model with just the required version: - -```python -client.create_model( - model_id="arima_model", - version="1.0", -) -``` - -</TabItem> -<TabItem value="detailed" label="Detailed Version"> -Include metrics, parameters, and metadata for production use: - -```python -client.create_model( - model_id="arima_model", - properties=models.MLModelPropertiesClass( - name="ARIMA Model", - description="ARIMA model for airline passenger forecasting", - customProperties={"team": "forecasting"}, - trainingMetrics=[ - models.MLMetricClass(name="accuracy", value="0.9"), - models.MLMetricClass(name="precision", value="0.8"), - ], - hyperParams=[ - models.MLHyperParamClass(name="learning_rate", value="0.01"), - models.MLHyperParamClass(name="batch_size", value="32"), - ], - externalUrl="https:localhost:5000", - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - lastModified=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - tags=["forecasting", "arima"], - ), - version="1.0", - alias="champion", -) -``` - -</TabItem> -</Tabs> - -Let's verify our model: - -<Tabs> -<TabItem value="UI" label="UI"> -Check your model's details in the DataHub UI: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-empty.png"/> -</p> -</TabItem> - -<TabItem value="graphql" 
label="GraphQL"> -Query your model's information: - -```graphql -query { - mlModel( - urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" - ) { - name - description - versionProperties { - version { - versionTag - } - } - } -} -``` - -The response will show your model's details: - -```json -{ - "data": { - "mlModel": { - "name": "arima_model", - "description": "ARIMA model for airline passenger forecasting", - "versionProperties": { - "version": { - "versionTag": "1.0" - } - } - } - } -} -``` - -</TabItem> -</Tabs> - -### Create Experiment - -An experiment helps organize multiple training runs for a specific project. - -<Tabs> -<TabItem value="simple" label="Simple Version"> -Create a basic experiment: - -```python -client.create_experiment( - experiment_id="airline_forecast_experiment", -) -``` - -</TabItem> -<TabItem value="detailed" label="Detailed Version"> -Add context and metadata: - -```python -client.create_experiment( - experiment_id="airline_forecast_experiment", - properties=models.ContainerPropertiesClass( - name="Airline Forecast Experiment", - description="Experiment to forecast airline passenger numbers", - customProperties={"team": "forecasting"}, - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - lastModified=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - ), -) -``` - -</TabItem> -</Tabs> - -Verify your experiment: - -<Tabs> -<TabItem value="UI" label="UI"> -See your experiment's details in the UI: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/experiment-empty.png"/> -</p> -</TabItem> - -<TabItem value="graphql" label="GraphQL"> -Query your experiment's information: - -```graphql -query { - container( - urn:"urn:li:container:airline_forecast_experiment" - ) { - name - description - properties { - customProperties - } - } -} -``` - -Check the response: - 
-```json -{ - "data": { - "container": { - "name": "Airline Forecast Experiment", - "description": "Experiment to forecast airline passenger numbers", - "properties": { - "customProperties": { - "team": "forecasting" - } - } - } - } -} -``` - -</TabItem> -</Tabs> - -### Create Training Run - -A training run captures all details about a specific model training attempt. - -<Tabs> -<TabItem value="simple" label="Simple Version"> -Create a basic training run: - -```python -client.create_training_run( - run_id="simple_training_run_4", -) -``` - -</TabItem> -<TabItem value="detailed" label="Detailed Version"> -Include metrics, parameters, and other important metadata: - -```python -client.create_training_run( - run_id="simple_training_run_4", - properties=models.DataProcessInstancePropertiesClass( - name="Simple Training Run 4", - created=models.AuditStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - customProperties={"team": "forecasting"}, - ), - training_run_properties=models.MLTrainingRunPropertiesClass( - id="simple_training_run_4", - outputUrls=["s3://my-bucket/output"], - trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], - hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], - externalUrl="https:localhost:5000", - ), - run_result=RunResultType.FAILURE, - start_timestamp=1628580000000, - end_timestamp=1628580001000, -) -``` - -</TabItem> -</Tabs> - -Verify your training run: - -<Tabs> -<TabItem value="UI" label="UI"> -View the run details in the UI: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-empty.png"/> -</p> -</TabItem> - -<TabItem value="graphql" label="GraphQL"> -Query your training run: - -```graphql -query { - dataProcessInstance( - urn:"urn:li:dataProcessInstance:simple_training_run_4" - ) { - name - created { - time - } - properties { - customProperties - } - } -} -``` - -Check the 
response: - -```json -{ - "data": { - "dataProcessInstance": { - "name": "Simple Training Run 4", - "created": { - "time": 1628580000000 - }, - "properties": { - "customProperties": { - "team": "forecasting" - } - } - } - } -} -``` - -</TabItem> -</Tabs> - -## Define Entity Relationships - -Now let's connect these components to create a comprehensive ML system. These connections enable you to track model lineage, monitor model evolution, understand dependencies, and search effectively across your ML assets. - -### Add Model To Model Group - -Connect your model to its group: - -```python -client.add_model_to_model_group(model_urn=model_urn, group_urn=model_group_urn) -``` - -<Tabs> -<TabItem value="UI" label="UI"> - -View model versions in the **Model Group** under the **Models** section: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-group-with-model.png"/> -</p> - -Find group information in the **Model** page under the **Group** tab: -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-with-model-group.png"/> -</p> -</TabItem> - -<TabItem value="graphql" label="GraphQL"> -Query the model-group relationship: - -```graphql -query { - mlModel( - urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" - ) { - name - properties { - groups { - urn - properties { - name - } - } - } - } -} -``` - -Check the response: - -```json -{ - "data": { - "mlModel": { - "name": "arima_model", - "properties": { - "groups": [ - { - "urn": "urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_model_group,PROD)", - "properties": { - "name": "Airline Forecast Model Group" - } - } - ] - } - } - } -} -``` - -</TabItem> -</Tabs> - -### Add Run To Experiment - -Connect a training run to its experiment: - -```python -client.add_run_to_experiment(run_urn=run_urn, 
experiment_urn=experiment_urn) -``` - -<Tabs> -<TabItem value="UI" label="UI"> - -Find your runs in the **Experiment** page under the **Entities** tab: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/experiment-with-run.png"/> -</p> - -See the experiment details in the **Run** page: -<p align="center"> - <img width="40%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-with-experiment.png"/> -</p> -</TabItem> - -<TabItem value="graphql" label="GraphQL"> -Query the run-experiment relationship: - -```graphql -query { - dataProcessInstance( - urn:"urn:li:dataProcessInstance:simple_training_run" - ) { - name - parentContainers { - containers { - urn - properties { - name - } - } - } - } -} -``` - -View the relationship details: - -```json -{ - "data": { - "dataProcessInstance": { - "name": "Simple Training Run", - "parentContainers": { - "containers": [ - { - "urn": "urn:li:container:airline_forecast_experiment", - "properties": { - "name": "Airline Forecast Experiment" - } - } - ] - } - } - } -} -``` - -</TabItem> -</Tabs> - -### Add Run To Model - -Connect a training run to its resulting model: - -```python -client.add_run_to_model(model_urn=model_urn, run_urn=run_urn) -``` - -This relationship enables you to: -- Track which runs produced each model -- Understand model provenance -- Debug model issues -- Monitor model evolution - -<Tabs> -<TabItem value="UI" label="UI"> - -Find the source run in the **Model** page under the **Summary** tab: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/model-with-source-run.png"/> -</p> - -See related models in the **Run** page under the **Lineage** tab: - -<p align="center"> - <img width="70%" 
src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model.png"/> -</p> -<p align="center"> - <img width="50%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model-graph.png"/> -</p> - -</TabItem> - -<TabItem value="graphql" label="GraphQL"> -Query the model's training jobs: - -```graphql -query { - mlModel( - urn:"urn:li:mlModel:(urn:li:dataPlatform:mlflow,arima_model,PROD)" - ) { - name - properties { - mlModelLineageInfo { - trainingJobs - } - } - } -} -``` - -View the relationship: - -```json -{ - "data": { - "mlModel": { - "name": "arima_model", - "properties": { - "mlModelLineageInfo": { - "trainingJobs": [ - "urn:li:dataProcessInstance:simple_training_run_test" - ] - } - } - } - } -} -``` - -</TabItem> -</Tabs> - -### Add Run To Model Group - -Create a direct connection between a run and a model group: - -```python -client.add_run_to_model_group(model_group_urn=model_group_urn, run_urn=run_urn) -``` - -This connection lets you: -- View model groups in the run's lineage -- Query training jobs at the group level -- Track training history for model families - -<Tabs> -<TabItem value="UI" label="UI"> - -See model groups in the **Run** page under the **Lineage** tab: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model-group.png"/> -</p> -<p align="center"> - <img width="50%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-model-group-graph.png"/> -</p> -</TabItem> - -<TabItem value="graphql" label="GraphQL"> -Query the model group's training jobs: - -```graphql -query { - mlModelGroup( - urn:"urn:li:mlModelGroup:(urn:li:dataPlatform:mlflow,airline_forecast_model_group,PROD)" - ) { - name - properties { - mlModelLineageInfo { - trainingJobs - } - 
} - } -} -``` - -Check the relationship: - -```json -{ - "data": { - "mlModelGroup": { - "name": "airline_forecast_model_group", - "properties": { - "mlModelLineageInfo": { - "trainingJobs": [ - "urn:li:dataProcessInstance:simple_training_run_test" - ] - } - } - } - } -} -``` - -</TabItem> -</Tabs> - -### Add Dataset To Run - -Track input and output datasets for your training runs: - -```python -client.add_input_datasets_to_run( - run_urn=run_urn, - dataset_urns=[str(input_dataset_urn)] -) - -client.add_output_datasets_to_run( - run_urn=run_urn, - dataset_urns=[str(output_dataset_urn)] -) -``` - -These connections help you: -- Track data lineage -- Understand data dependencies -- Ensure reproducibility -- Monitor data quality impacts - -Find dataset relationships in the **Lineage** tab of either the **Dataset** or **Run** page: -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/run-lineage-dataset-graph.png"/> -</p> - -## Full Overview - -Here's your complete ML system with all components connected: - -<p align="center"> - <img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/add-img-for-ml/imgs/apis/tutorials/ml/lineage-full.png"/> -</p> - -You now have a complete lineage view of your ML assets, from training data through runs to production models! - -## What's Next? 
- -To see this integration in action and learn about real-world use cases: -- Watch our [Townhall demo](https://youtu.be/_WUoVqkF2Zo?feature=shared&t=1932) on MLflow integration with DataHub -- Join our [Slack community](https://slack.datahubproject.io) for discussions -- Read our [MLflow integration doc](/docs/generated/ingestion/sources/mlflow.md) for more details \ No newline at end of file diff --git a/metadata-ingestion/examples/ml/mlflow_dh_client.py b/metadata-ingestion/examples/ai/dh_ai_client.py similarity index 98% rename from metadata-ingestion/examples/ml/mlflow_dh_client.py rename to metadata-ingestion/examples/ai/dh_ai_client.py index 2ee9dc4f966722..96adb260da4c45 100644 --- a/metadata-ingestion/examples/ml/mlflow_dh_client.py +++ b/metadata-ingestion/examples/ai/dh_ai_client.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) -class MLflowDatahubClient: +class DatahubAIClient: """Client for creating and managing MLflow metadata in DataHub.""" def __init__( @@ -36,7 +36,13 @@ def __init__( server_url: str = "http://localhost:8080", platform: str = "mlflow", ) -> None: - """Initialize the MLflow DataHub client.""" + """Initialize the DataHub AI client. 
+ + Args: + token: DataHub access token + server_url: DataHub server URL (defaults to http://localhost:8080) + platform: Platform name (defaults to mlflow) + """ self.token = token self.server_url = server_url self.platform = platform diff --git a/metadata-ingestion/examples/ml/mlflow_dh_client_sample.py b/metadata-ingestion/examples/ai/dh_ai_client_sample.py similarity index 93% rename from metadata-ingestion/examples/ml/mlflow_dh_client_sample.py rename to metadata-ingestion/examples/ai/dh_ai_client_sample.py index 867f118fa88392..291cfb2ff1d619 100644 --- a/metadata-ingestion/examples/ml/mlflow_dh_client_sample.py +++ b/metadata-ingestion/examples/ai/dh_ai_client_sample.py @@ -1,6 +1,6 @@ import argparse -from mlflow_dh_client import MLflowDatahubClient +from dh_ai_client import DatahubAIClient import datahub.metadata.schema_classes as models from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import RunResultType @@ -9,9 +9,15 @@ # Example usage parser = argparse.ArgumentParser() parser.add_argument("--token", required=True, help="DataHub access token") + parser.add_argument( + "--server_url", + required=False, + default="http://localhost:8080", + help="DataHub server URL (defaults to http://localhost:8080)", + ) args = parser.parse_args() - client = MLflowDatahubClient(token=args.token) + client = DatahubAIClient(token=args.token, server_url=args.server_url) # Create model group model_group_urn = client.create_model_group( From 88599d06b93fb5933b041aafab8969517ab4920f Mon Sep 17 00:00:00 2001 From: yoonhyejin <0327jane@gmail.com> Date: Thu, 30 Jan 2025 12:50:51 +0900 Subject: [PATCH 6/6] fix typo --- docs/api/tutorials/ml.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/api/tutorials/ml.md b/docs/api/tutorials/ml.md index 73a86824c9e60e..1a46f48d5aa4be 100644 --- a/docs/api/tutorials/ml.md +++ b/docs/api/tutorials/ml.md @@ -58,14 +58,14 @@ For platform-specific details, see our integration guides for [MLflow](/docs/gen 
To follow this tutorial, you'll need DataHub Quickstart deployed locally. For detailed steps, see the [Datahub Quickstart Guide](/docs/quickstart.md). -Next, set up the Python client for DataHub using `DataHubAIClient`. +Next, set up the Python client for DataHub using `DatahubAIClient` defined [here](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/ai/dh_ai_client.py). Create a token in DataHub UI and replace `<your_token>` with your token: ```python -from dh_ai_client import DataHubAIClient +from dh_ai_client import DatahubAIClient -client = DataHubAIClient(token="<your_token>", server_url="http://localhost:9002") +client = DatahubAIClient(token="<your_token>", server_url="http://localhost:9002") ``` :::note Verifying via GraphQL @@ -757,6 +757,7 @@ Here's your complete ML system with all components connected: You now have a complete lineage view of your ML assets, from training data through runs to production models! +You can check out the full code for this tutorial [here](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/ai/dh_ai_client_sample.py). ## What's Next? To see these integrations in action: