diff --git a/README.rst b/README.rst
index c9e26b9e34..0180e0be69 100644
--- a/README.rst
+++ b/README.rst
@@ -95,15 +95,17 @@ Here's a list of `good first issues `_ for more details.
diff --git a/daft/__init__.py b/daft/__init__.py
index b67ef86f8f..9c910c0bf4 100644
--- a/daft/__init__.py
+++ b/daft/__init__.py
@@ -43,7 +43,6 @@ def refresh_logger() -> None:
 
 __version__ = get_version()
 
-
 ###
 # Initialize analytics
 ###
diff --git a/daft/context.py b/daft/context.py
index 33fb502530..78ac508dda 100644
--- a/daft/context.py
+++ b/daft/context.py
@@ -10,6 +10,7 @@
 from daft.daft import set_runner_native as _set_runner_native
 from daft.daft import set_runner_py as _set_runner_py
 from daft.daft import set_runner_ray as _set_runner_ray
+from daft.scarf_telemetry import scarf_telemetry
 
 if TYPE_CHECKING:
     from daft.runners.runner import Runner
@@ -67,12 +68,16 @@ def set_runner_ray(
     max_task_backlog: int | None = None,
     force_client_mode: bool = False,
 ) -> DaftContext:
+    # Scarf Analytics
+    scarf_telemetry(runner="ray")
+
     py_ctx = _set_runner_ray(
         address=address,
         noop_if_initialized=noop_if_initialized,
         max_task_backlog=max_task_backlog,
         force_client_mode=force_client_mode,
     )
+
     return DaftContext._from_native(py_ctx)
 
 
@@ -84,9 +89,13 @@ def set_runner_py(use_thread_pool: bool | None = None) -> DaftContext:
 
     Returns:
         DaftContext: Daft context after setting the Py runner
     """
+    # Scarf Analytics
+    scarf_telemetry(runner="py")
+
     py_ctx = _set_runner_py(
         use_thread_pool=use_thread_pool,
     )
+
     return DaftContext._from_native(py_ctx)
@@ -98,7 +107,11 @@ def set_runner_native() -> DaftContext:
 
     Returns:
         DaftContext: Daft context after setting the native runner
     """
+    # Scarf Analytics
+    scarf_telemetry(runner="native")
+
     py_ctx = _set_runner_native()
+
     return DaftContext._from_native(py_ctx)
diff --git a/daft/scarf_telemetry.py b/daft/scarf_telemetry.py
new file mode 100644
index 0000000000..6018c99e46
--- /dev/null
+++ b/daft/scarf_telemetry.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+import os
+import platform
+import urllib.parse
+import urllib.request
+
+from daft import get_build_type, get_version
+
+
+def scarf_telemetry(runner: str) -> tuple[str | None, str | None]:
+    """Track analytics for Daft usage via Scarf.
+
+    Args:
+        runner (str): The runner being used (py, ray, or native)
+
+    Returns:
+        tuple[str | None, str | None]: Response status and runner type, or (None, None) if analytics is disabled or the request fails
+    """
+    version = get_version()
+    build_type = get_build_type()
+    scarf_opt_out = os.getenv("SCARF_NO_ANALYTICS") == "true" or os.getenv("DO_NOT_TRACK") == "true"
+
+    # Skip analytics for dev builds or if the user opted out
+    if build_type == "dev" or scarf_opt_out:
+        return None, None
+
+    try:
+        python_version = ".".join(platform.python_version().split(".")[:2])
+
+        # Report only coarse, non-identifying metadata
+        params = {
+            "version": version,
+            "platform": platform.system(),
+            "python": python_version,
+            "arch": platform.machine(),
+            "runner": runner,
+        }
+
+        # Prepare the query string
+        query_string = urllib.parse.urlencode(params)
+
+        # Make the GET request
+        url = f"https://daft.gateway.scarf.sh/daft-runner?{query_string}"
+        with urllib.request.urlopen(url) as response:
+            return f"Response status: {response.status}", runner
+
+    except Exception as e:
+        # Swallow all errors so telemetry can never break user code
+        return f"Analytics error: {e!s}", None
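Reviewer note — a minimal sketch (not part of this diff) of how the opt-out in `scarf_telemetry` above is expected to behave, assuming a local build where `daft.scarf_telemetry` is importable. Both `SCARF_NO_ANALYTICS` and `DO_NOT_TRACK` are read at call time, so setting either to `"true"` short-circuits the function before any network request:

```python
import os

from daft.scarf_telemetry import scarf_telemetry

# Either env var disables the ping (dev builds skip it unconditionally).
os.environ["DO_NOT_TRACK"] = "true"
status, runner = scarf_telemetry(runner="native")
assert status is None and runner is None
```

On a release build with neither variable set, the call instead returns a status string such as `"Response status: 200"` together with the runner name.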
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 2639ded130..4e553a0ae1 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -3,20 +3,17 @@
 
 # Project Information
 site_name: Daft Documentation
+site_author: Eventual
 site_url: https://www.getdaft.io/projects/docs/en/stable/
-site_description: >-
+site_description: |
   Welcome to Daft Documentation! Daft is a unified data engine for data engineering, analytics, and ML/AI.
+copyright: '© Copyright 2025, Eventual'
 
 # Repository
 repo_name: Daft
 repo_url: https://github.com/Eventual-Inc/Daft
 docs_dir: mkdocs
 
-# Scarf pixel for tracking analytics
-# image:
-#   referrerpolicy: "no-referrer-when-downgrade"
-#   src: "https://static.scarf.sh/a.png?x-pxid=c9065f3a-a090-4243-8f69-145d5de7bfca"
-
 # Sitemap
 nav:
   - Daft User Guide:
@@ -97,15 +94,36 @@ theme:
 
 # Additional Configuration
 extra:
+  analytics:
+    provider: google
+    property: G-YN4QSRPV0K
+    feedback:
+      title: Was this page helpful?
+      ratings:
+        - icon: material/emoticon-happy-outline
+          name: This page was helpful
+          data: 1
+          note: >-
+            Thanks for your feedback!
+        - icon: material/emoticon-sad-outline
+          name: This page could be improved
+          data: 0
+          note: >-
+            Thanks for your feedback! Help us improve this page by
+            submitting an issue on our Daft repo.
   social:
     - icon: fontawesome/brands/github
-      link: https://github.com/squidfunk
+      link: https://github.com/Eventual-Inc/Daft
     - icon: fontawesome/brands/slack
       link: https://join.slack.com/t/dist-data/shared_invite/zt-2e77olvxw-uyZcPPV1SRchhi8ah6ZCtg
     - icon: fontawesome/brands/linkedin
-      link: https://www.linkedin.com/company/eventualcomputing/
+      link: https://www.linkedin.com/showcase/daft-dataframe/
     - icon: fontawesome/brands/x-twitter
       link: https://x.com/daft_dataframe
+    - icon: fontawesome/brands/youtube
+      link: https://www.youtube.com/@daftdf
+    - icon: simple/substack
+      link: https://blog.getdaft.io/
 
 # This is a macro you should use to refer to paths
 # When referring to methods, the syntax is {{ api_path }}/path/to/method
@@ -148,3 +166,4 @@ plugins:
   - mkdocs-simple-hooks:
       hooks:
         on_pre_build: "docs.hooks:make_api_docs"
+  - social
diff --git a/docs/mkdocs/index.md b/docs/mkdocs/index.md
index 7cd27d03cc..6b6c642297 100644
--- a/docs/mkdocs/index.md
+++ b/docs/mkdocs/index.md
@@ -37,7 +37,7 @@ Daft is a unified data engine for **data engineering, analytics, and ML/AI**. It
 Daft boasts strong integrations with technologies common across these workloads:
 
 * **Cloud Object Storage:** Record-setting I/O performance for integrations with S3 cloud storage, [battle-tested at exabyte-scale at Amazon](https://aws.amazon.com/blogs/opensource/amazons-exabyte-scale-migration-from-apache-spark-to-ray-on-amazon-ec2/)
-* **ML/AI Python Ecosystem:** First-class integrations with [PyTorch](https://pytorch.org/>) and [NumPy](https://numpy.org/) for efficient interoperability with your ML/AI stack
+* **ML/AI Python Ecosystem:** First-class integrations with [PyTorch](https://pytorch.org/) and [NumPy](https://numpy.org/) for efficient interoperability with your ML/AI stack
 * **Data Catalogs/Table Formats:** Capabilities to effectively query table formats such as [Apache Iceberg](https://iceberg.apache.org/), [Delta Lake](https://delta.io/) and [Apache Hudi](https://hudi.apache.org/)
 * **Seamless Data Interchange:** Zero-copy integration with [Apache Arrow](https://arrow.apache.org/docs/index.html)
 * **Multimodal/ML Data:** Native functionality for data modalities such as tensors, images, URLs, long-form text and embeddings
diff --git a/docs/mkdocs/integrations/delta_lake.md b/docs/mkdocs/integrations/delta_lake.md
index 4cf08162a2..fa8c8f901d 100644
--- a/docs/mkdocs/integrations/delta_lake.md
+++ b/docs/mkdocs/integrations/delta_lake.md
@@ -119,7 +119,7 @@ Here are Delta Lake features that are on our roadmap. Please let us know if you
 
 1. Read support for [deletion vectors](https://docs.delta.io/latest/delta-deletion-vectors.html) ([issue](https://github.com/Eventual-Inc/Daft/issues/1954)).
 
-2. Read support for [column mappings](https://docs.delta.io/latest/delta-column-mapping.html>) ([issue](https://github.com/Eventual-Inc/Daft/issues/1955)).
+2. Read support for [column mappings](https://docs.delta.io/latest/delta-column-mapping.html) ([issue](https://github.com/Eventual-Inc/Daft/issues/1955)).
 
 3. Writing new Delta Lake tables ([issue](https://github.com/Eventual-Inc/Daft/issues/1967)).
diff --git a/docs/mkdocs/resources/telemetry.md b/docs/mkdocs/resources/telemetry.md
index 0113203c87..6138ea07e9 100644
--- a/docs/mkdocs/resources/telemetry.md
+++ b/docs/mkdocs/resources/telemetry.md
@@ -1,12 +1,17 @@
 # Telemetry
 
-To help core developers improve Daft, we collect non-identifiable statistics on Daft usage in order to better understand how Daft is used, common bugs and performance bottlenecks.
+To help core developers improve Daft, we collect non-identifiable statistics on Daft usage in order to better understand how Daft is used, common bugs, and performance bottlenecks. Data is collected from a combination of our own analytics and [Scarf](https://scarf.sh).
 
 We take the privacy of our users extremely seriously, and telemetry in Daft is built to be:
 
-1. Easy to opt-out: to disable telemetry, set the following environment variable: `DAFT_ANALYTICS_ENABLED=0`
-2. Non-identifiable: events are keyed by a session ID which is generated on import of Daft
-3. Metadata-only: we do not collect any of our users' proprietary code or data
+1. Easy to opt out: To disable telemetry, set the following environment variables:
+
+    - `DAFT_ANALYTICS_ENABLED=0`
+
+    - `SCARF_NO_ANALYTICS=true` or `DO_NOT_TRACK=true`
+
+2. Non-identifiable: Events are keyed by a session ID which is generated on import of Daft
+3. Metadata-only: We do not collect any of our users' proprietary code or data
 
 We **do not** sell or buy any of the data that is collected in telemetry.
 
@@ -14,7 +19,7 @@ We **do not** sell or buy any of the data that is collected in telemetry.
 
 ## What data do we collect?
 
-To audit what data is collected, please see the implementation of `AnalyticsClient` in the `daft.analytics` module.
+To audit what data is collected, please see the implementation of `AnalyticsClient` in the `daft.analytics` module, as well as `scarf_telemetry.py`.
 
 In short, we collect the following:
diff --git a/docs/sphinx/source/_templates/sections/header.html b/docs/sphinx/source/_templates/sections/header.html
index 6bbcbe5bc1..c71ccaeba2 100644
--- a/docs/sphinx/source/_templates/sections/header.html
+++ b/docs/sphinx/source/_templates/sections/header.html
@@ -1,4 +1,5 @@
+<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=c9065f3a-a090-4243-8f69-145d5de7bfca" />
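Reviewer note — a hedged usage sketch (not part of this diff) of the new entry points: each `set_runner_*` setter in `daft/context.py` now fires at most one Scarf ping before constructing the context, and because `scarf_telemetry` catches every exception, the setters behave the same with or without network access:

```python
import daft.context

# Pings Scarf once (unless opted out or on a dev build),
# then sets up the native runner exactly as before.
ctx = daft.context.set_runner_native()
```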