Add union-dataset support to the direct_query_driver system.

This is a major refactor of the many classes involved in translating butler queries to SQL, including some renaming to reflect new roles. - The low-level SqlBuilder and SqlJoiner classes have been renamed to SqlSelectBuilder and SqlJoinsBuilder, and some of SqlJoinsBuilder has been factored out into a base class, SqlColumns. - The QueryPlan objects have been split up into "analysis" objects that are still mostly plan-like, and QueryBuilder objects that have both that planning information and one or more SqlSelectBuilder objects and a Postprocessing inside them. - The new QueryBuilder objects are a hierarchy: there's a QueryBuilder abstract base class and two derived classes: SingleSelectQueryBuilder is a refactoring of the code path we had before, while UnionQueryBuilder is a UNION ALL over dataset types. - DirectQueryDriver.build_query is still the main entry point, and it's now where the overview docs for the system live. It delegates to methods on the QueryBuilder objects to handle the differences in the single-select vs. union cases, and those delegate back to other DirectQueryDriver methods for logic that's the same between the two cases.
lsst · Oct 22, 2024 · 56cbe4a · 56cbe4a
1 parent 4818ac7
commit 56cbe4a
Show file tree

Hide file tree

Showing 10 changed files with 2,040 additions and 1,006 deletions.
diff --git a/python/lsst/daf/butler/direct_query_driver/__init__.py b/python/lsst/daf/butler/direct_query_driver/__init__.py
@@ -27,4 +27,4 @@
 
 from ._driver import DirectQueryDriver
 from ._postprocessing import Postprocessing
-from ._query_builder import QueryBuilder, QueryJoiner
+from ._sql_builders import SqlJoinsBuilder, SqlSelectBuilder
diff --git a/python/lsst/daf/butler/direct_query_driver/_driver.py b/python/lsst/daf/butler/direct_query_driver/_driver.py
diff --git a/...butler/direct_query_driver/_query_plan.py → ...er/direct_query_driver/_query_analysis.py b/...butler/direct_query_driver/_query_plan.py → ...er/direct_query_driver/_query_analysis.py
@@ -28,34 +28,37 @@
 from __future__ import annotations
 
 __all__ = (
-    "QueryPlan",
-    "QueryJoinsPlan",
-    "QueryProjectionPlan",
-    "QueryFindFirstPlan",
+    "QueryJoinsAnalysis",
+    "QueryFindFirstAnalysis",
     "ResolvedDatasetSearch",
+    "QueryCollectionAnalysis",
 )
 
 import dataclasses
-from collections.abc import Iterator
-from typing import TYPE_CHECKING
+from collections.abc import Iterator, Mapping
+from types import EllipsisType
+from typing import TYPE_CHECKING, Generic, TypeVar
 
 from ..dimensions import DimensionElement, DimensionGroup
 from ..queries import tree as qt
+from ..registry import CollectionSummary
 from ..registry.interfaces import CollectionRecord
 
 if TYPE_CHECKING:
     from ._postprocessing import Postprocessing
-    from ._query_builder import QueryBuilder
+    from ._sql_builders import SqlSelectBuilder
+
+_T = TypeVar("_T")
 
 
 @dataclasses.dataclass
-class ResolvedDatasetSearch:
+class ResolvedDatasetSearch(Generic[_T]):
     """A struct describing a dataset search joined into a query, after
     resolving its collection search path.
     """
 
-    name: str
-    """Name of the dataset type."""
+    name: _T
+    """Name or names of the dataset type(s)."""
 
     dimensions: DimensionGroup
     """Dimensions of the dataset type."""
@@ -76,15 +79,17 @@ class ResolvedDatasetSearch:
     `~CollectionType.CALIBRATION` collection, `False` otherwise.
 
     Since only calibration datasets can be present in
-    `~CollectionType.CALIBRATION` collections, this also
+    `~CollectionType.CALIBRATION` collections, this also indicates that the
+    dataset type is a calibration.
     """
 
 
 @dataclasses.dataclass
-class QueryJoinsPlan:
+class QueryJoinsAnalysis:
     """A struct describing the "joins" section of a butler query.
 
-    See `QueryPlan` and `QueryPlan.joins` for additional information.
+    See `DirectQueryDriver.build_query` for an overview of how queries are
+    transformed into SQL, and the role this object plays in that.
     """
 
     predicate: qt.Predicate
@@ -100,7 +105,7 @@ class QueryJoinsPlan:
     materializations: dict[qt.MaterializationKey, DimensionGroup] = dataclasses.field(default_factory=dict)
     """Materializations to join into the query."""
 
-    datasets: dict[str, ResolvedDatasetSearch] = dataclasses.field(default_factory=dict)
+    datasets: dict[str, ResolvedDatasetSearch[str]] = dataclasses.field(default_factory=dict)
     """Dataset searches to join into the query."""
 
     data_coordinate_uploads: dict[qt.DataCoordinateUploadKey, DimensionGroup] = dataclasses.field(
@@ -116,14 +121,20 @@ class QueryJoinsPlan:
     def __post_init__(self) -> None:
         self.predicate.gather_required_columns(self.columns)
 
-    def iter_mandatory(self) -> Iterator[DimensionElement]:
+    def iter_mandatory(self, union_dataset_dimensions: DimensionGroup | None) -> Iterator[DimensionElement]:
         """Return an iterator over the dimension elements that must be joined
         into the query.
 
         These elements either provide "field" (non-key) columns or define
         relationships that result rows must be consistent with.  They do not
         necessarily include all dimension keys in `columns`, since each of
         those can typically be included in a query in multiple different ways.
+
+        Parameters
+        ----------
+        union_dataset_dimensions : `DimensionGroup` or `None`
+            Dimensions of the union dataset types, or `None` if this is not
+            a union dataset query.
         """
         for element_name in self.columns.dimensions.elements:
             element = self.columns.dimensions.universe[element_name]
@@ -148,6 +159,11 @@ def iter_mandatory(self) -> Iterator[DimensionElement]:
                     for dataset_spec in self.datasets.values()
                 ):
                     continue
+                if (
+                    union_dataset_dimensions is not None
+                    and element.minimal_group.names <= union_dataset_dimensions.required
+                ):
+                    continue
                 # Materializations have all key columns for their dimensions.
                 if any(
                     element in materialization_dimensions.names
@@ -158,133 +174,77 @@ def iter_mandatory(self) -> Iterator[DimensionElement]:
 
 
 @dataclasses.dataclass
-class QueryProjectionPlan:
-    """A struct describing the "projection" stage of a butler query.
-
-    This struct evaluates to `True` in boolean contexts if either
-    `needs_dimension_distinct` or `needs_dataset_distinct` are `True`.  In
-    other cases the projection is effectively a no-op, because the
-    "joins"-stage rows are already unique.
-
-    See `QueryPlan` and `QueryPlan.projection` for additional information.
-    """
-
-    columns: qt.ColumnSet
-    """The columns present in the query after the projection is applied.
+class QueryFindFirstAnalysis(Generic[_T]):
+    """A struct describing the "find-first" stage of a butler query.
 
-    This is always a subset of `QueryJoinsPlan.columns`.
+    See `DirectQueryDriver.build_query` for an overview of how queries are
+    transformed into SQL, and the role this object plays in that.
     """
 
-    needs_dimension_distinct: bool = False
-    """If `True`, the projection's dimensions do not include all dimensions in
-    the "joins" stage, and hence a SELECT DISTINCT [ON] or GROUP BY must be
-    used to make post-projection rows unique.
-    """
+    search: ResolvedDatasetSearch[_T]
+    """Information about the dataset type or types being searched for."""
 
-    needs_dataset_distinct: bool = False
-    """If `True`, the projection columns do not include collection-specific
-    dataset fields that were present in the "joins" stage, and hence a SELECT
-    DISTINCT [ON] or GROUP BY must be added to make post-projection rows
-    unique.
-    """
+    @property
+    def dataset_type(self) -> _T:
+        """Name(s) of the dataset type(s)."""
+        return self.search.name
 
     def __bool__(self) -> bool:
-        return self.needs_dimension_distinct or self.needs_dataset_distinct
+        return len(self.search.collection_records) > 1
 
-    find_first_dataset: str | None = None
-    """If not `None`, this is a find-first query for this dataset.
 
-    This is set even if the find-first search is trivial because there is only
-    one resolved collection.
+@dataclasses.dataclass
+class QueryCollectionAnalysis:
+    """A struct containing information about all of the collections that appear
+    in a butler query.
     """
 
+    collection_records: Mapping[str, CollectionRecord]
+    """All collection records, keyed by collection name.
 
-@dataclasses.dataclass
-class QueryFindFirstPlan:
-    """A struct describing the "find-first" stage of a butler query.
-
-    See `QueryPlan` and `QueryPlan.find_first` for additional information.
+    This includes CHAINED collections.
     """
 
-    search: ResolvedDatasetSearch
-    """Information about the dataset being searched for."""
-
-    @property
-    def dataset_type(self) -> str:
-        """Name of the dataset type."""
-        return self.search.name
+    calibration_dataset_types: set[str | EllipsisType] = dataclasses.field(default_factory=set)
+    """A set of the names of all calibration dataset types.
 
-    def __bool__(self) -> bool:
-        return len(self.search.collection_records) > 1
+    If ``...`` appears in the set, the dataset type union includes at least one
+    calibration dataset type.
+    """
 
+    summaries_by_dataset_type: dict[str | EllipsisType, list[tuple[CollectionRecord, CollectionSummary]]] = (
+        dataclasses.field(default_factory=dict)
+    )
+    """Collection records and summaries, in search order, keyed by dataset type
+    name.
 
-@dataclasses.dataclass
-class QueryPlan:
-    """A struct that aggregates information about a complete butler query.
-
-    Notes
-    -----
-    Butler queries are transformed into a combination of SQL and Python-side
-    postprocessing in three stages, with each corresponding to an attributes of
-    this class and a method of `DirectQueryDriver`
-
-    - In the `joins` stage (`~DirectQueryDriver.apply_query_joins`), we define
-      the main SQL FROM and WHERE clauses, by joining all tables needed to
-      bring in any columns, or constrain the keys of its rows.
-
-    - In the `projection` stage (`~DirectQueryDriver.apply_query_projection`),
-      we select only the columns needed for the query's result rows (including
-      columns needed only by postprocessing and ORDER BY, as well those needed
-      by the objects returned to users).  If the result rows are not naturally
-      unique given what went into the query in the "joins" stage, the
-      projection involves a SELECT DISTINCT [ON] or GROUP BY to make them
-      unique, and in a few rare cases uses aggregate functions with GROUP BY.
-
-    - In the `find_first` stage (`~DirectQueryDriver.apply_query_find_first`),
-      we use a window function (PARTITION BY) subquery to find only the first
-      dataset in the collection search path for each data ID.  This stage does
-      nothing if there is no find-first dataset search, or if the search is
-      trivial because there is only one collection.
-
-    In `DirectQueryDriver.build_query`, a `QueryPlan` instance is constructed
-    via `DirectQueryDriver.analyze_query`, which also returns an initial
-    `QueryBuilder`.  After this point the plans are considered frozen, and the
-    nested plan attributes are then passed to each of the corresponding
-    `DirectQueryDriver` methods along with the builder, which is mutated (and
-    occasionally replaced) into the complete SQL/postprocessing form of the
-    query.
+    CHAINED collections are flattened out in the nested lists.  Lists have been
+    filtered to be consistent with the dataset types in the summaries, but not
+    necessarily the governor dimensions in the summaries.
     """
 
-    joins: QueryJoinsPlan
-    """Description of the "joins" stage of query construction."""
-
-    projection: QueryProjectionPlan
-    """Description of the "projection" stage of query construction."""
 
-    find_first: QueryFindFirstPlan | None
-    """Description of the "find_first" stage of query construction.
+@dataclasses.dataclass
+class QueryTreeAnalysis:
+    """A struct aggregating all analysis results derived from the query tree.
 
-    This attribute is `None` if there is no find-first search at all, and
-    `False` in boolean contexts if the search is trivial because there is only
-    one collection after the collections have been resolved.
+    See `DirectQueryDriver.build_query` for an overview of how queries are
+    transformed into SQL, and the role this object plays in that.
     """
 
-    final_columns: qt.ColumnSet
-    """The columns included in the SELECT clause of the complete SQL query
-    that is actually executed.
-
-    This is a subset of `QueryProjectionPlan.columns` that differs only in
-    columns used by the `find_first` stage or an ORDER BY expression.
+    joins: QueryJoinsAnalysis
+    """Analysis of the "joins" stage, including all joins and columns needed by
+    ``tree``.  Additional columns will be added to this plan later.
+    """
 
-    Like all other `.queries.tree.ColumnSet` attributes, it does not include
-    fields added directly to `QueryBuilder.special`, which may also be added
-    to the SELECT clause.
+    union_datasets: list[ResolvedDatasetSearch[list[str]]]
+    """Resolved dataset searches that expand `QueryTree.any_dataset` out
+    into groups of dataset types with the same collection search path.
     """
 
-    builder: QueryBuilder
-    """Under-construction SQL query associated with this plan."""
+    initial_select_builder: SqlSelectBuilder
+    """In-progress SQL query builder, initialized with just spatial and
+    temporal overlaps."""
 
     postprocessing: Postprocessing
-    """Struct representing post-query processing in Python, which may require
-    additional columns in the query results.
-    """
+    """Struct representing post-query processing to be done in Python."""