Should I avoid chaining inner joins? Typical star schema, example code inside #4031
-
Hey all, I have a typical star schema in a sqlite .db: one fact table and several dimension tables.
Dimensions are typical: unique across values and names. The fact table is typical for a data hoarder: don't delete anything, and take the max created_at over the dimension partition. With interactive mode off, and ignoring the max-over-partition bit for now, I'm chaining inner joins like so:

fact_sale = conn.table("fact_sale")
dim_date = conn.table("dim_date")
dim_retailer = conn.table("dim_retailer")
dim_store = conn.table("dim_store")
dim_product = conn.table("dim_product")
dim_unit = conn.table("dim_unit")
sdq = (
fact_sale
.inner_join(dim_date, predicates=(fact_sale['date_id'] == dim_date['date_id']))
.inner_join(dim_store, predicates=(fact_sale['store_id'] == dim_store['store_id']))
.inner_join(dim_product, predicates=(fact_sale['product_id'] == dim_product['product_id']))
.inner_join(dim_unit, predicates=(fact_sale['unit_id'] == dim_unit['unit_id']))
)

Upon executing `sdq.head().execute()`, I get:

---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/base.py:1808, in Connection._execute_context(self, dialect, constructor, statement, parameters, execution_options, *args, **kw)
1807 if not evt_handled:
-> 1808 self.dialect.do_execute(
1809 cursor, statement, parameters, context
1810 )
1812 if self._has_events or self.engine._has_events:
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/default.py:732, in DefaultDialect.do_execute(self, cursor, statement, parameters, context)
731 def do_execute(self, cursor, statement, parameters, context=None):
--> 732 cursor.execute(statement, parameters)
OperationalError: no such column: main.fact_sale.store_id
The above exception was the direct cause of the following exception:
OperationalError Traceback (most recent call last)
Input In [17], in <cell line: 1>()
----> 1 sdq.head().execute()
File ~/dev/repos/ibis/ibis/expr/types/core.py:270, in Expr.execute(self, limit, timecontext, params, **kwargs)
245 def execute(
246 self,
247 limit: int | str | None = 'default',
(...)
250 **kwargs: Any,
251 ):
252 """Execute an expression against its backend if one exists.
253
254 Parameters
(...)
268 Mapping of scalar parameter expressions to value
269 """
--> 270 return self._find_backend().execute(
271 self, limit=limit, timecontext=timecontext, params=params, **kwargs
272 )
File ~/dev/repos/ibis/ibis/backends/base/sql/__init__.py:144, in BaseSQLBackend.execute(self, expr, params, limit, **kwargs)
142 sql = query_ast.compile()
143 self._log(sql)
--> 144 cursor = self.raw_sql(sql, **kwargs)
145 schema = self.ast_schema(query_ast, **kwargs)
146 result = self.fetch_from_cursor(cursor, schema)
File ~/dev/repos/ibis/ibis/backends/base/sql/__init__.py:96, in BaseSQLBackend.raw_sql(self, query, results)
76 """Execute a query string.
77
78 Could have unexpected results if the query modifies the behavior of
(...)
89 Backend cursor
90 """
91 # TODO results is unused, it can be removed
92 # (requires updating Impala tests)
93 # TODO `self.con` is assumed to be defined in subclasses, but there
94 # is nothing that enforces it. We should find a way to make sure
95 # `self.con` is always a DBAPI2 connection, or raise an error
---> 96 cursor = self.con.execute(query) # type: ignore
97 if cursor:
98 return cursor
File <string>:2, in execute(self, statement, *multiparams, **params)
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/util/deprecations.py:401, in _decorate_with_warning.<locals>.warned(fn, *args, **kwargs)
399 if not skip_warning:
400 _warn_with_version(message, version, wtype, stacklevel=3)
--> 401 return fn(*args, **kwargs)
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/base.py:3152, in Engine.execute(self, statement, *multiparams, **params)
3134 """Executes the given construct and returns a
3135 :class:`_engine.CursorResult`.
3136
(...)
3149
3150 """
3151 connection = self.connect(close_with_result=True)
-> 3152 return connection.execute(statement, *multiparams, **params)
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/base.py:1295, in Connection.execute(self, statement, *multiparams, **params)
1291 util.raise_(
1292 exc.ObjectNotExecutableError(statement), replace_context=err
1293 )
1294 else:
-> 1295 return meth(self, multiparams, params, _EMPTY_EXECUTION_OPTS)
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/sql/elements.py:325, in ClauseElement._execute_on_connection(self, connection, multiparams, params, execution_options, _force)
321 def _execute_on_connection(
322 self, connection, multiparams, params, execution_options, _force=False
323 ):
324 if _force or self.supports_execution:
--> 325 return connection._execute_clauseelement(
326 self, multiparams, params, execution_options
327 )
328 else:
329 raise exc.ObjectNotExecutableError(self)
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/base.py:1487, in Connection._execute_clauseelement(self, elem, multiparams, params, execution_options)
1475 compiled_cache = execution_options.get(
1476 "compiled_cache", self.engine._compiled_cache
1477 )
1479 compiled_sql, extracted_params, cache_hit = elem._compile_w_cache(
1480 dialect=dialect,
1481 compiled_cache=compiled_cache,
(...)
1485 linting=self.dialect.compiler_linting | compiler.WARN_LINTING,
1486 )
-> 1487 ret = self._execute_context(
1488 dialect,
1489 dialect.execution_ctx_cls._init_compiled,
1490 compiled_sql,
1491 distilled_params,
1492 execution_options,
1493 compiled_sql,
1494 distilled_params,
1495 elem,
1496 extracted_params,
1497 cache_hit=cache_hit,
1498 )
1499 if has_events:
1500 self.dispatch.after_execute(
1501 self,
1502 elem,
(...)
1506 ret,
1507 )
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/base.py:1851, in Connection._execute_context(self, dialect, constructor, statement, parameters, execution_options, *args, **kw)
1848 branched.close()
1850 except BaseException as e:
-> 1851 self._handle_dbapi_exception(
1852 e, statement, parameters, cursor, context
1853 )
1855 return result
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/base.py:2032, in Connection._handle_dbapi_exception(self, e, statement, parameters, cursor, context)
2030 util.raise_(newraise, with_traceback=exc_info[2], from_=e)
2031 elif should_wrap:
-> 2032 util.raise_(
2033 sqlalchemy_exception, with_traceback=exc_info[2], from_=e
2034 )
2035 else:
2036 util.raise_(exc_info[1], with_traceback=exc_info[2])
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/util/compat.py:207, in raise_(***failed resolving arguments***)
204 exception.__cause__ = replace_context
206 try:
--> 207 raise exception
208 finally:
209 # credit to
210 # https://cosmicpercolator.com/2016/01/13/exception-leaks-in-python-2-and-3/
211 # as the __traceback__ object creates a cycle
212 del exception, replace_context, from_, with_traceback
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/base.py:1808, in Connection._execute_context(self, dialect, constructor, statement, parameters, execution_options, *args, **kw)
1806 break
1807 if not evt_handled:
-> 1808 self.dialect.do_execute(
1809 cursor, statement, parameters, context
1810 )
1812 if self._has_events or self.engine._has_events:
1813 self.dispatch.after_cursor_execute(
1814 self,
1815 cursor,
(...)
1819 context.executemany,
1820 )
File ~/dev/envs/dev/lib/python3.8/site-packages/sqlalchemy/engine/default.py:732, in DefaultDialect.do_execute(self, cursor, statement, parameters, context)
731 def do_execute(self, cursor, statement, parameters, context=None):
--> 732 cursor.execute(statement, parameters)
OperationalError: (sqlite3.OperationalError) no such column: main.fact_sale.store_id
[SQL: SELECT t0.fact_id, t0.date_id_x, t0.store_id_x, t0.product_id_x, t0.unit_id AS unit_id_x, t0.value, t0.created_at, t0.date_id_y, t0.date_value, t0.store_id_y, t0.store_name, t0.retailer_id, t0.product_id_y, t0.product_name, t1.unit_id AS unit_id_y, t1.unit_name
FROM (SELECT t2.fact_id AS fact_id, t2.date_id_x AS date_id_x, t2.store_id_x AS store_id_x, t2.product_id AS product_id_x, t2.unit_id AS unit_id, t2.value AS value, t2.created_at AS created_at, t2.date_id_y AS date_id_y, t2.date_value AS date_value, t2.store_id_y AS store_id_y, t2.store_name AS store_name, t2.retailer_id AS retailer_id, t3.product_id AS product_id_y, t3.product_name AS product_name
FROM (SELECT t4.fact_id AS fact_id, t4.date_id_x AS date_id_x, t4.store_id AS store_id_x, t4.product_id AS product_id, t4.unit_id AS unit_id, t4.value AS value, t4.created_at AS created_at, t4.date_id_y AS date_id_y, t4.date_value AS date_value, t5.store_id AS store_id_y, t5.store_name AS store_name, t5.retailer_id AS retailer_id
FROM (SELECT t6.fact_id AS fact_id, t6.date_id AS date_id_x, t6.store_id AS store_id, t6.product_id AS product_id, t6.unit_id AS unit_id, t6.value AS value, t6.created_at AS created_at, t7.date_id AS date_id_y, t7.date_value AS date_value
FROM main.fact_sale AS t6 JOIN main.dim_date AS t7 ON t6.date_id = t7.date_id) AS t4 JOIN main.dim_store AS t5 ON main.fact_sale.store_id = t5.store_id) AS t2 JOIN main.dim_product AS t3 ON main.fact_sale.product_id = t3.product_id) AS t0 JOIN main.dim_unit AS t1 ON main.fact_sale.unit_id = t1.unit_id
LIMIT ? OFFSET ?]
[parameters: (5, 0)]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

I think this is caused by suffixes being auto-applied on column-name intersections and not auto-removed/consolidated between table expressions. Expected behavior: chaining inner joins over one another lumps the join-key columns into one name and does not add a suffix unless told to (a la pandas' default merge behavior — IIRC, fail on duplicate names, no suffix for columns in the difference). For reference, this works:

date_join = fact_sale.inner_join(dim_date, predicates=(fact_sale['date_id'] == dim_date['date_id']))
date_join.inner_join(dim_store, predicates=(date_join['store_id'] == dim_store['store_id'])).head().execute()

I'd just expect to be able to do this in one statement, i.e.:

sdq = (
fact_sale
.inner_join(dim_date, predicates=(fact_sale['date_id'] == dim_date['date_id']))
.inner_join(dim_store, predicates=(fact_sale['store_id'] == dim_store['store_id']))
.inner_join(dim_product, predicates=(fact_sale['product_id'] == dim_product['product_id']))
.inner_join(dim_unit, predicates=(fact_sale['unit_id'] == dim_unit['unit_id']))
)

Should I just adjust my behavior / is this expected, or is this a bug?

edit: just realized it could be that I'm referencing
Beta Was this translation helpful? Give feedback.
Replies: 2 comments
-
From a SQL perspective, I'd write the query (sans max-over-partition) like so:

SELECT
fs.*
,date_value
,store_name
,retailer_id
,product_name
,unit_name
FROM
fact_sale fs
JOIN
dim_date dd
ON dd.date_id = fs.date_id
JOIN
dim_store ds
ON ds.store_id = fs.store_id
JOIN
dim_product dp
ON dp.product_id = fs.product_id
JOIN
dim_unit du
ON du.unit_id = fs.unit_id

Which is why I expect the fact_sale columns to cascade down into the second and subsequent joins.
Beta Was this translation helpful? Give feedback.
-
As of 9.0: no, you should not avoid chaining joins — you should chain as much as you like. This can be done now on `main`.
The generated SQL is also optimal :)
|
Beta Was this translation helpful? Give feedback.
As of 9.0: no you should not avoid chaining joins, you should chain as much as you like
This can be done now on
main
due to the efforts of everyone in the-epic-split
: