From d5f21b29b6a0489374af4a2c88a26d3eb77f333a Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 9 Aug 2023 18:24:52 -0700 Subject: [PATCH 01/10] feat: a native SQLAlchemy dialect for Superset --- setup.cfg | 2 +- setup.py | 9 +- superset/config.py | 2 + superset/db_engine_specs/superset.py | 39 ++ superset/extensions/metadb.py | 409 ++++++++++++++++++ superset/security/analytics_db_safety.py | 4 + .../unit_tests/extensions/test_sqlalchemy.py | 206 +++++++++ 7 files changed, 668 insertions(+), 3 deletions(-) create mode 100644 superset/db_engine_specs/superset.py create mode 100644 superset/extensions/metadb.py create mode 100644 tests/unit_tests/extensions/test_sqlalchemy.py diff --git a/setup.cfg b/setup.cfg index bc704c5b6cd8c..4340cf50c47d0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,7 +30,7 @@ combine_as_imports = true include_trailing_comma = true line_length = 88 known_first_party = superset -known_third_party =alembic,apispec,backoff,cachelib,celery,click,colorama,cron_descriptor,croniter,cryptography,dateutil,deprecation,flask,flask_appbuilder,flask_babel,flask_caching,flask_compress,flask_jwt_extended,flask_login,flask_migrate,flask_sqlalchemy,flask_talisman,flask_testing,flask_wtf,freezegun,geohash,geopy,holidays,humanize,isodate,jinja2,jwt,markdown,markupsafe,marshmallow,msgpack,nh3,numpy,pandas,parameterized,parsedatetime,pgsanity,pkg_resources,polyline,prison,progress,pyarrow,sqlalchemy_bigquery,pyhive,pyparsing,pytest,pytest_mock,pytz,redis,requests,selenium,setuptools,simplejson,slack,sqlalchemy,sqlalchemy_utils,sqlparse,typing_extensions,urllib3,werkzeug,wtforms,wtforms_json,yaml +known_third_party =alembic,apispec,backoff,cachelib,celery,click,colorama,cron_descriptor,croniter,cryptography,dateutil,deprecation,flask,flask_appbuilder,flask_babel,flask_caching,flask_compress,flask_jwt_extended,flask_login,flask_migrate,flask_sqlalchemy,flask_talisman,flask_testing,flask_wtf,freezegun,geohash,geopy,holidays,humanize,isodate,jinja2,jwt,markdown,markupsafe,marshmallow,msgpack,nh3,numpy,pandas,parameterized,parsedatetime,pgsanity,pkg_resources,polyline,prison,progress,pyarrow,sqlalchemy_bigquery,pyhive,pyparsing,pytest,pytest_mock,pytz,redis,requests,selenium,setuptools,shillelagh,simplejson,slack,sqlalchemy,sqlalchemy_utils,sqlparse,typing_extensions,urllib3,werkzeug,wtforms,wtforms_json,yaml multi_line_output = 3 order_by_type = false diff --git a/setup.py b/setup.py index 24b1b890d5565..0566bcec72e14 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,10 @@ def get_git_sha() -> str: "sqlalchemy.dialects": [ "postgres.psycopg2 = sqlalchemy.dialects.postgresql:dialect", "postgres = sqlalchemy.dialects.postgresql:dialect", + "superset = superset.extensions.metadb:SupersetAPSWDialect", + ], + "shillelagh.adapter": [ + "superset=superset.extensions.metadb:SupersetShillelaghAdapter" ], }, install_requires=[ @@ -115,6 +119,7 @@ def get_git_sha() -> str: "PyJWT>=2.4.0, <3.0", "redis>=4.5.4, <5.0", "selenium>=3.141.0, <4.10.0", + "shillelagh>=1.2.6,<2.0", "shortid", "sshtunnel>=0.4.0, <0.5", "simplejson>=3.15.0", @@ -158,7 +163,7 @@ def get_git_sha() -> str: "excel": ["xlrd>=1.2.0, <1.3"], "firebird": ["sqlalchemy-firebird>=0.7.0, <0.8"], "firebolt": ["firebolt-sqlalchemy>=0.0.1"], - "gsheets": ["shillelagh[gsheetsapi]>=1.0.14, <2"], + "gsheets": ["shillelagh[gsheetsapi]>=1.2.6, <2"], "hana": ["hdbcli==2.4.162", "sqlalchemy_hana==0.4.0"], "hive": ["pyhive[hive]>=0.6.5", "tableschema", "thrift>=0.14.1, <1.0.0"], "impala": ["impyla>0.16.2, <0.17"], @@ -181,7 +186,7 @@ def 
get_git_sha() -> str: "redshift": ["sqlalchemy-redshift>=0.8.1, < 0.9"], "rockset": ["rockset-sqlalchemy>=0.0.1, <1.0.0"], "shillelagh": [ - "shillelagh[datasetteapi,gsheetsapi,socrata,weatherapi]>=1.1.1, <2" + "shillelagh[datasetteapi,gsheetsapi,socrata,weatherapi]>=1.2.6,<2" ], "snowflake": ["snowflake-sqlalchemy>=1.2.4, <2"], "spark": ["pyhive[hive]>=0.6.5", "tableschema", "thrift>=0.14.1, <1.0.0"], diff --git a/superset/config.py b/superset/config.py index 75fda6eb37a1c..ecebf8356e06d 100644 --- a/superset/config.py +++ b/superset/config.py @@ -480,6 +480,8 @@ class D3Format(TypedDict, total=False): # or to disallow users from viewing other users profile page # Do not show user info or profile in the menu "MENU_HIDE_USER_INFO": False, + # Allows users to add a ``superset://`` DB that can query across databases. + "ENABLE_SUPERSET_META_DB": False, } # ------------------------------ diff --git a/superset/db_engine_specs/superset.py b/superset/db_engine_specs/superset.py new file mode 100644 index 0000000000000..329a149d40c2e --- /dev/null +++ b/superset/db_engine_specs/superset.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +A native Superset database. +""" + +from superset.db_engine_specs.shillelagh import ShillelaghEngineSpec + + +class SupersetEngineSpec(ShillelaghEngineSpec): + """ + Internal engine for Superset + + This DB engine spec is a meta-database. It uses the shillelagh library + to build a DB that can operate across different Superset databases. + """ + + engine = "superset" + engine_name = "Superset native database" + drivers = {"": "Native driver"} + default_driver = "" + sqlalchemy_uri_placeholder = "superset://" + + supports_file_upload = False diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py new file mode 100644 index 0000000000000..18d150b76cb59 --- /dev/null +++ b/superset/extensions/metadb.py @@ -0,0 +1,409 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +A SQLAlchemy dialect for querying across Superset databases. 
+ +The dialect ``superset://`` allows users to query any table in any database that has been +configured in Superset, eg: + + > SELECT * FROM "superset.examples.birth_names"; + +The syntax for tables is: + + superset.database[[.catalog].schema].table + +The dialect is built on top of Shillelagh, a framework for building DB API 2.0 libraries +and SQLAlchemy dialects based on SQLite. SQLite will parse the SQL, and pass the filters +to the adapter. The adapter builds a SQLAlchemy query object reading data from the table +and applying any filters (as well as sorting, limiting, and offsetting). + +Note that no aggregation is done on the database. Aggregations and other operations like +joins and unions are done in memory, using the SQLite engine. +""" + + +import datetime +import operator +import urllib.parse +from collections.abc import Iterator +from functools import wraps +from typing import Any, Callable, cast, Optional, TypeVar + +from shillelagh.adapters.base import Adapter +from shillelagh.backends.apsw.dialects.base import APSWDialect +from shillelagh.exceptions import ProgrammingError +from shillelagh.fields import ( + Blob, + Boolean, + Date, + DateTime, + Field, + Float, + Integer, + Order, + String, + Time, +) +from shillelagh.filters import Equal, Filter, Range +from shillelagh.typing import RequestedOrder, Row +from sqlalchemy import MetaData, Table +from sqlalchemy.engine.url import URL +from sqlalchemy.exc import NoSuchTableError +from sqlalchemy.sql import Select, select + +from superset import db, security_manager, sql_parse + + +# pylint: disable=abstract-method +class SupersetAPSWDialect(APSWDialect): + + """ + A SQLAlchemy dialect for an internal Superset engine. + + This dialect allows query to be executed across different Superset + databases. For example, to read data from the `birth_names` table in the + `examples` databases: + + >>> engine = create_engine('superset://') + >>> conn = engine.connect() + >>> results = conn.execute('SELECT * FROM "superset.examples.birth_names"') + + Queries can also join data across different Superset databases. + + The dialect is built in top of the shillelagh library, leveraging SQLite to + create virtual tables on-the-fly proxying Superset tables. The + `SupersetShillelaghAdapter` adapter is responsible for returning data when a + Superset table is accessed. + """ + + name = "superset" + + def create_connect_args(self, url: URL) -> tuple[tuple[()], dict[str, Any]]: + """ + A custom Shillelagh SQLAlchemy dialect with a single adapter configured. + """ + return ( + (), + { + "path": ":memory:", + "adapters": ["superset"], + "adapter_kwargs": {}, + "safe": True, + "isolation_level": self.isolation_level, + }, + ) + + +# pylint: disable=invalid-name +F = TypeVar("F", bound=Callable[..., Any]) + + +def check_dml(method: F) -> F: + """ + Decorator that prevents DML against databases where it's not allowed. + """ + + @wraps(method) + def wrapper(self: "SupersetShillelaghAdapter", *args: Any, **kwargs: Any) -> Any: + # pylint: disable=protected-access + if not self._allow_dml: + raise ProgrammingError(f'DML not enabled in database "{self.database}"') + return method(self, *args, **kwargs) + + return cast(F, wrapper) + + +def has_rowid(method: F) -> F: + """ + Decorator that prevents updates/deletes on tables without a rowid. 
+ """ + + @wraps(method) + def wrapper(self: "SupersetShillelaghAdapter", *args: Any, **kwargs: Any) -> Any: + # pylint: disable=protected-access + if not self._rowid: + raise ProgrammingError( + "Can only modify data in a table with a single, integer, primary key" + ) + return method(self, *args, **kwargs) + + return cast(F, wrapper) + + +# pylint: disable=too-many-instance-attributes +class SupersetShillelaghAdapter(Adapter): + + """ + A shillelagh adapter for Superset tables. + + Shillelagh adapters are responsible for fetching data from a given resource, + allowing it to be represented as a virtual table in SQLite. This one works + as a proxy to Superset tables. + """ + + # no access to the filesystem + safe = True + + supports_limit = True + supports_offset = True + + type_map: dict[Any, type[Field]] = { + bool: Boolean, + float: Float, + int: Integer, + str: String, + datetime.date: Date, + datetime.datetime: DateTime, + datetime.time: Time, + } + + @staticmethod + def supports(uri: str, fast: bool = True, **kwargs: Any) -> bool: + """ + Return if a table is supported by the adapter. + + An URL for a table has the format superset.database[[.catalog].schema].table, + eg, `superset.examples.birth_names`. + """ + parts = [urllib.parse.unquote(part) for part in uri.split(".")] + return 3 <= len(parts) <= 5 and parts[0] == "superset" + + @staticmethod + def parse_uri(uri: str) -> tuple[str, Optional[str], Optional[str], str]: + """ + Parse the SQLAlchemy URI into arguments for the class. + + This splits the URI into database, catalog, schema, and table name: + + >>> SupersetShillelaghAdapter.parse_uri('superset.examples.birth_names') + ('examples', None, None, 'birth_names') + + """ + parts = [urllib.parse.unquote(part) for part in uri.split(".")] + if len(parts) == 3: + return parts[1], None, None, parts[2] + if len(parts) == 4: + return parts[1], None, parts[2], parts[3] + return tuple(parts[1:]) # type: ignore + + def __init__( + self, + database: str, + catalog: Optional[str], + schema: Optional[str], + table: str, + *args: Any, + **kwargs: Any, + ): + super().__init__(*args, **kwargs) + + self.database = database + self.catalog = catalog + self.schema = schema + self.table = table + + # If the table has a single integer primary key we use that as the row ID in order + # to perform updates and deletes. Otherwise we can only do inserts and selects. + self._rowid: Optional[str] = None + + # Does the database allow DML? + self._allow_dml: bool = False + + # Read column information from the database, and store it for later. + self._set_columns() + + @classmethod + def get_field(cls, python_type: Any) -> Field: + """ + Convert a Python type into a Shillelagh field. + """ + class_ = cls.type_map.get(python_type, Blob) + return class_(filters=[Equal, Range], order=Order.ANY, exact=True) + + def _set_columns(self) -> None: + """ + Inspect the table and get its columns. + + This is done on initialization because it's expensive. 
+ """ + # pylint: disable=import-outside-toplevel + from superset.models.core import Database + + database = ( + db.session.query(Database).filter_by(database_name=self.database).first() + ) + if database is None: + raise ProgrammingError(f"Database not found: {self.database}") + self._allow_dml = database.allow_dml + + # verify permissions + table = sql_parse.Table(self.table, self.schema, self.catalog) + security_manager.raise_for_access(database=database, table=table) + + # fetch column names and types + self.engine_context = database.get_sqla_engine_with_context + metadata = MetaData() + with self.engine_context() as engine: + try: + self._table = Table( + self.table, + metadata, + autoload=True, + autoload_with=engine, + ) + except NoSuchTableError as ex: + raise ProgrammingError(f"Table does not exist: {self.table}") from ex + + # find row ID column; we can only update/delete data into a table with a + # single integer primary key + primary_keys = [ + column for column in list(self._table.primary_key) if column.primary_key + ] + if len(primary_keys) == 1 and primary_keys[0].type.python_type == int: + self._rowid = primary_keys[0].name + + self.columns = { + column.name: self.get_field(column.type.python_type) + for column in self._table.c + } + + def get_columns(self) -> dict[str, Field]: + """ + Return table columns. + """ + return self.columns + + def _build_sql( + self, + bounds: dict[str, Filter], + order: list[tuple[str, RequestedOrder]], + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> Select: + """ + Build SQLAlchemy query object. + """ + query = select([self._table]) + + for column_name, filter_ in bounds.items(): + column = self._table.c[column_name] + if isinstance(filter_, Equal): + query = query.where(column == filter_.value) + elif isinstance(filter_, Range): + if filter_.start is not None: + op = operator.ge if filter_.include_start else operator.gt + query = query.where(op(column, filter_.start)) + if filter_.end is not None: + op = operator.le if filter_.include_end else operator.lt + query = query.where(op(column, filter_.end)) + else: + raise ProgrammingError(f"Invalid filter: {filter_}") + + for column_name, requested_order in order: + column = self._table.c[column_name] + if requested_order == Order.DESCENDING: + column = column.desc() + query = query.order_by(column) + + if limit: + query = query.limit(limit) + if offset: + query = query.offset(offset) + + return query + + def get_data( # pylint: disable=arguments-differ + self, + bounds: dict[str, Filter], + order: list[tuple[str, RequestedOrder]], + limit: Optional[int] = None, + offset: Optional[int] = None, + **kwargs: Any, + ) -> Iterator[Row]: + """ + Return data for a `SELECT` statement. + """ + query = self._build_sql(bounds, order, limit, offset) + + with self.engine_context() as engine: + connection = engine.connect() + rows = connection.execute(query) + for i, row in enumerate(rows): + data = dict(zip(self.columns, row)) + data["rowid"] = data[self._rowid] if self._rowid else i + yield data + + @check_dml + def insert_row(self, row: Row) -> int: + """ + Insert a single row. 
+ """ + row_id: Optional[int] = row.pop("rowid") + if row_id and self._rowid: + if row.get(self._rowid) != row_id: + raise ProgrammingError(f"Invalid rowid specified: {row_id}") + row[self._rowid] = row_id + + query = self._table.insert().values(**row) + + with self.engine_context() as engine: + connection = engine.connect() + result = connection.execute(query) + + # return rowid + if self._rowid: + return result.inserted_primary_key[0] + + query = self._table.count() + return connection.execute(query).scalar() + + @check_dml + @has_rowid + def delete_row(self, row_id: int) -> None: + """ + Delete a single row given its row ID. + """ + query = self._table.delete().where(self._table.c[self._rowid] == row_id) + + with self.engine_context() as engine: + connection = engine.connect() + connection.execute(query) + + @check_dml + @has_rowid + def update_row(self, row_id: int, row: Row) -> None: + """ + Update a single row given its row ID. + + Note that the updated row might have a new row ID. + """ + new_row_id: Optional[int] = row.pop("rowid") + if new_row_id: + if row.get(self._rowid) != new_row_id: + raise ProgrammingError(f"Invalid rowid specified: {new_row_id}") + row[self._rowid] = new_row_id + + query = ( + self._table.update() + .where(self._table.c[self._rowid] == row_id) + .values(**row) + ) + + with self.engine_context() as engine: + connection = engine.connect() + connection.execute(query) diff --git a/superset/security/analytics_db_safety.py b/superset/security/analytics_db_safety.py index 29583b5255201..49882817ec781 100644 --- a/superset/security/analytics_db_safety.py +++ b/superset/security/analytics_db_safety.py @@ -20,6 +20,7 @@ from sqlalchemy.engine.url import URL from sqlalchemy.exc import NoSuchModuleError +from superset import feature_flag_manager from superset.errors import ErrorLevel, SupersetError, SupersetErrorType from superset.exceptions import SupersetSecurityException @@ -34,6 +35,9 @@ def check_sqlalchemy_uri(uri: URL) -> None: + if not not feature_flag_manager.is_feature_enabled("ENABLE_SUPERSET_META_DB"): + BLOCKLIST.add(re.compile(r"superset$")) + for blocklist_regex in BLOCKLIST: if not re.match(blocklist_regex, uri.drivername): continue diff --git a/tests/unit_tests/extensions/test_sqlalchemy.py b/tests/unit_tests/extensions/test_sqlalchemy.py new file mode 100644 index 0000000000000..13ecce7c7c24e --- /dev/null +++ b/tests/unit_tests/extensions/test_sqlalchemy.py @@ -0,0 +1,206 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=redefined-outer-name, import-outside-toplevel, unused-argument + +import os +from collections.abc import Iterator +from typing import TYPE_CHECKING + +import pytest +from pytest_mock import MockFixture +from sqlalchemy.engine import create_engine +from sqlalchemy.exc import ProgrammingError +from sqlalchemy.orm.session import Session + +from superset.errors import ErrorLevel, SupersetError, SupersetErrorType +from superset.exceptions import SupersetSecurityException + +if TYPE_CHECKING: + from superset.models.core import Database + + +@pytest.fixture +def database1(session: Session) -> Iterator["Database"]: + from superset.models.core import Database + + engine = session.connection().engine + Database.metadata.create_all(engine) # pylint: disable=no-member + + database = Database( + database_name="database1", + sqlalchemy_uri="sqlite:///database1.db", + allow_dml=True, + ) + session.add(database) + session.commit() + + yield database + + session.delete(database) + session.commit() + os.unlink("database1.db") + + +@pytest.fixture +def table1(session: Session, database1: "Database") -> Iterator[None]: + with database1.get_sqla_engine_with_context() as engine: + conn = engine.connect() + conn.execute("CREATE TABLE table1 (a INTEGER NOT NULL PRIMARY KEY, b INTEGER)") + conn.execute("INSERT INTO table1 (a, b) VALUES (1, 10), (2, 20)") + session.commit() + + yield + + conn.execute("DROP TABLE table1") + session.commit() + + +@pytest.fixture +def database2(session: Session) -> Iterator["Database"]: + from superset.models.core import Database + + database = Database( + database_name="database2", + sqlalchemy_uri="sqlite:///database2.db", + allow_dml=False, + ) + session.add(database) + session.commit() + + yield database + + session.delete(database) + session.commit() + os.unlink("database2.db") + + +@pytest.fixture +def table2(session: Session, database2: "Database") -> Iterator[None]: + with database2.get_sqla_engine_with_context() as engine: + conn = engine.connect() + conn.execute("CREATE TABLE table2 (a INTEGER NOT NULL PRIMARY KEY, b TEXT)") + conn.execute("INSERT INTO table2 (a, b) VALUES (1, 'ten'), (2, 'twenty')") + session.commit() + + yield + + conn.execute("DROP TABLE table2") + session.commit() + + +def test_superset(mocker: MockFixture, app_context: None, table1: None) -> None: + """ + Simple test querying a table. + """ + mocker.patch("superset.extensions.metadb.security_manager") + + engine = create_engine("superset://") + conn = engine.connect() + results = conn.execute('SELECT * FROM "superset.database1.table1"') + assert list(results) == [(1, 10), (2, 20)] + + +def test_superset_joins( + mocker: MockFixture, + app_context: None, + table1: None, + table2: None, +) -> None: + """ + A test joining across databases. + """ + mocker.patch("superset.extensions.metadb.security_manager") + + engine = create_engine("superset://") + conn = engine.connect() + results = conn.execute( + """ + SELECT t1.b, t2.b + FROM "superset.database1.table1" AS t1 + JOIN "superset.database2.table2" AS t2 + ON t1.a = t2.a + """ + ) + assert list(results) == [(10, "ten"), (20, "twenty")] + + +def test_dml( + mocker: MockFixture, + app_context: None, + table1: None, + table2: None, +) -> None: + """ + DML tests. + + Test that we can update/delete data, only if DML is enabled. 
+ """ + mocker.patch("superset.extensions.metadb.security_manager") + + engine = create_engine("superset://") + conn = engine.connect() + + conn.execute('INSERT INTO "superset.database1.table1" (a, b) VALUES (3, 30)') + results = conn.execute('SELECT * FROM "superset.database1.table1"') + assert list(results) == [(1, 10), (2, 20), (3, 30)] + conn.execute('UPDATE "superset.database1.table1" SET b=35 WHERE a=3') + results = conn.execute('SELECT * FROM "superset.database1.table1"') + assert list(results) == [(1, 10), (2, 20), (3, 35)] + conn.execute('DELETE FROM "superset.database1.table1" WHERE b>20') + results = conn.execute('SELECT * FROM "superset.database1.table1"') + assert list(results) == [(1, 10), (2, 20)] + + with pytest.raises(ProgrammingError) as excinfo: + conn.execute( + """INSERT INTO "superset.database2.table2" (a, b) VALUES (3, 'thirty')""" + ) + assert str(excinfo.value).strip() == ( + "(shillelagh.exceptions.ProgrammingError) DML not enabled in database " + '"database2"\n[SQL: INSERT INTO "superset.database2.table2" (a, b) ' + "VALUES (3, 'thirty')]\n(Background on this error at: " + "https://sqlalche.me/e/14/f405)" + ) + + +def test_security_manager(mocker: MockFixture, app_context: None, table1: None) -> None: + """ + Test that we use the security manager to check for permissions. + """ + security_manager = mocker.MagicMock() + mocker.patch( + "superset.extensions.metadb.security_manager", + new=security_manager, + ) + security_manager.raise_for_access.side_effect = SupersetSecurityException( + SupersetError( + error_type=SupersetErrorType.TABLE_SECURITY_ACCESS_ERROR, + message=( + "You need access to the following tables: `table1`,\n " + "`all_database_access` or `all_datasource_access` permission" + ), + level=ErrorLevel.ERROR, + ) + ) + + engine = create_engine("superset://") + conn = engine.connect() + with pytest.raises(SupersetSecurityException) as excinfo: + conn.execute('SELECT * FROM "superset.database1.table1"') + assert str(excinfo.value) == ( + "You need access to the following tables: `table1`,\n " + "`all_database_access` or `all_datasource_access` permission" + ) From 7d6e93ca05e504de44dd42fed517ad272b48c98c Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 9 Aug 2023 18:49:19 -0700 Subject: [PATCH 02/10] Run pip-compile-multi --no-upgrade --- requirements/base.txt | 27 ++++++++++++++++++++++++--- requirements/development.txt | 6 +----- requirements/local.txt | 2 +- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 820faca0e491a..3bb6d766f0335 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -13,6 +13,8 @@ amqp==5.1.0 # via kombu apispec[yaml]==6.3.0 # via flask-appbuilder +apsw==3.42.0.1 + # via shillelagh async-timeout==4.0.2 # via redis attrs==23.1.0 @@ -33,10 +35,14 @@ cachelib==0.4.1 # via apache-superset celery==5.2.2 # via apache-superset +certifi==2023.7.22 + # via requests cffi==1.15.1 # via # cryptography # pynacl +charset-normalizer==3.2.0 + # via requests click==8.1.3 # via # apache-superset @@ -123,7 +129,9 @@ geographiclib==1.52 geopy==2.2.0 # via apache-superset greenlet==2.0.2 - # via sqlalchemy + # via + # shillelagh + # sqlalchemy gunicorn==20.1.0 # via apache-superset hashids==1.3.1 @@ -133,11 +141,14 @@ holidays==0.28 humanize==3.11.0 # via apache-superset idna==3.2 - # via email-validator + # via + # email-validator + # requests importlib-metadata==6.6.0 # via # apache-superset # flask + # shillelagh importlib-resources==5.12.0 # via limits 
isodate==0.6.0 @@ -205,6 +216,7 @@ packaging==23.1 # deprecation # limits # marshmallow + # shillelagh pandas[performance]==2.0.3 # via apache-superset paramiko==2.11.0 @@ -244,6 +256,7 @@ python-dateutil==2.8.2 # flask-appbuilder # holidays # pandas + # shillelagh python-dotenv==0.19.0 # via apache-superset python-editor==1.0.4 @@ -262,10 +275,14 @@ pyyaml==6.0.1 # apispec redis==4.5.4 # via apache-superset +requests==2.31.0 + # via shillelagh rich==13.3.4 # via flask-limiter selenium==3.141.0 # via apache-superset +shillelagh==1.2.6 + # via apache-superset shortid==0.1.2 # via apache-superset simplejson==3.17.3 @@ -287,6 +304,7 @@ sqlalchemy==1.4.36 # flask-appbuilder # flask-sqlalchemy # marshmallow-sqlalchemy + # shillelagh # sqlalchemy-utils sqlalchemy-utils==0.38.3 # via @@ -303,10 +321,13 @@ typing-extensions==4.4.0 # apache-superset # flask-limiter # limits + # shillelagh tzdata==2023.3 # via pandas urllib3==1.26.6 - # via selenium + # via + # requests + # selenium vine==5.0.0 # via # amqp diff --git a/requirements/development.txt b/requirements/development.txt index 7155b2e1a5885..fa5ac1192701a 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -6,7 +6,7 @@ # pip-compile-multi # -r base.txt --e file:. +-e file:///Users/beto/Projects/github/superset # via # -r requirements/base.in # -r requirements/development.in @@ -26,12 +26,8 @@ botocore==1.29.130 # s3transfer cached-property==1.5.2 # via tableschema -certifi==2023.5.7 - # via requests chardet==5.1.0 # via tabulator -charset-normalizer==3.1.0 - # via requests decorator==5.1.1 # via ipython dill==0.3.6 diff --git a/requirements/local.txt b/requirements/local.txt index c4bd3cd599b36..a78dbde0d9363 100644 --- a/requirements/local.txt +++ b/requirements/local.txt @@ -6,7 +6,7 @@ # pip-compile-multi # -r development.txt --e file:. +-e file:///Users/beto/Projects/github/superset # via # -r requirements/base.in # -r requirements/development.in From ed1dddda0e047c16381cff9ad93077555bf28b04 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 9 Aug 2023 18:49:33 -0700 Subject: [PATCH 03/10] Add a limit --- requirements/development.txt | 2 +- requirements/local.txt | 2 +- superset/config.py | 3 +++ superset/extensions/metadb.py | 4 ++++ superset/security/analytics_db_safety.py | 2 +- 5 files changed, 10 insertions(+), 3 deletions(-) diff --git a/requirements/development.txt b/requirements/development.txt index fa5ac1192701a..cab908f5e77db 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -6,7 +6,7 @@ # pip-compile-multi # -r base.txt --e file:///Users/beto/Projects/github/superset +-e file:. # via # -r requirements/base.in # -r requirements/development.in diff --git a/requirements/local.txt b/requirements/local.txt index a78dbde0d9363..c4bd3cd599b36 100644 --- a/requirements/local.txt +++ b/requirements/local.txt @@ -6,7 +6,7 @@ # pip-compile-multi # -r development.txt --e file:///Users/beto/Projects/github/superset +-e file:. # via # -r requirements/base.in # -r requirements/development.in diff --git a/superset/config.py b/superset/config.py index ecebf8356e06d..f0a7c087f3da9 100644 --- a/superset/config.py +++ b/superset/config.py @@ -873,6 +873,9 @@ class D3Format(TypedDict, total=False): # the SQL Lab UI DEFAULT_SQLLAB_LIMIT = 1000 +# The limit for the Superset Meta DB when the feature flag ENABLE_SUPERSET_META_DB is on +SUPERSET_META_DB_LIMIT: int | None = 1000 + # Adds a warning message on sqllab save query and schedule query modals. 
SQLLAB_SAVE_WARNING_MESSAGE = None SQLLAB_SCHEDULE_WARNING_MESSAGE = None diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py index 18d150b76cb59..9d53ca13b49e0 100644 --- a/superset/extensions/metadb.py +++ b/superset/extensions/metadb.py @@ -44,6 +44,7 @@ from functools import wraps from typing import Any, Callable, cast, Optional, TypeVar +from flask import current_app from shillelagh.adapters.base import Adapter from shillelagh.backends.apsw.dialects.base import APSWDialect from shillelagh.exceptions import ProgrammingError @@ -338,6 +339,9 @@ def get_data( # pylint: disable=arguments-differ """ Return data for a `SELECT` statement. """ + app_limit = current_app.config["SUPERSET_META_DB_LIMIT"] + limit = app_limit if limit is None else min(limit, app_limit) + query = self._build_sql(bounds, order, limit, offset) with self.engine_context() as engine: diff --git a/superset/security/analytics_db_safety.py b/superset/security/analytics_db_safety.py index 49882817ec781..73db564c60f5a 100644 --- a/superset/security/analytics_db_safety.py +++ b/superset/security/analytics_db_safety.py @@ -35,7 +35,7 @@ def check_sqlalchemy_uri(uri: URL) -> None: - if not not feature_flag_manager.is_feature_enabled("ENABLE_SUPERSET_META_DB"): + if not feature_flag_manager.is_feature_enabled("ENABLE_SUPERSET_META_DB"): BLOCKLIST.add(re.compile(r"superset$")) for blocklist_regex in BLOCKLIST: From 505b46ff86c513899a001394d73d02efd0192165 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Thu, 10 Aug 2023 10:24:17 -0700 Subject: [PATCH 04/10] Add a test for limit --- superset/db_engine_specs/__init__.py | 4 +++- superset/db_engine_specs/superset.py | 2 +- tests/unit_tests/extensions/test_sqlalchemy.py | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/superset/db_engine_specs/__init__.py b/superset/db_engine_specs/__init__.py index 16f44ade7ac8f..ee75265b6b339 100644 --- a/superset/db_engine_specs/__init__.py +++ b/superset/db_engine_specs/__init__.py @@ -41,7 +41,7 @@ from sqlalchemy.engine.default import DefaultDialect from sqlalchemy.engine.url import URL -from superset import app +from superset import app, feature_flag_manager from superset.db_engine_specs.base import BaseEngineSpec logger = logging.getLogger(__name__) @@ -172,6 +172,8 @@ def get_available_engine_specs() -> dict[type[BaseEngineSpec], set[str]]: # do not add denied db engine specs to available list dbs_denylist = app.config["DBS_AVAILABLE_DENYLIST"] + if not feature_flag_manager.is_feature_enabled("ENABLE_SUPERSET_META_DB"): + dbs_denylist["superset"] = {""} dbs_denylist_engines = dbs_denylist.keys() if ( diff --git a/superset/db_engine_specs/superset.py b/superset/db_engine_specs/superset.py index 329a149d40c2e..318af04d13f23 100644 --- a/superset/db_engine_specs/superset.py +++ b/superset/db_engine_specs/superset.py @@ -31,7 +31,7 @@ class SupersetEngineSpec(ShillelaghEngineSpec): """ engine = "superset" - engine_name = "Superset native database" + engine_name = "Superset meta database" drivers = {"": "Native driver"} default_driver = "" sqlalchemy_uri_placeholder = "superset://" diff --git a/tests/unit_tests/extensions/test_sqlalchemy.py b/tests/unit_tests/extensions/test_sqlalchemy.py index 13ecce7c7c24e..9ac01b49096fa 100644 --- a/tests/unit_tests/extensions/test_sqlalchemy.py +++ b/tests/unit_tests/extensions/test_sqlalchemy.py @@ -114,6 +114,22 @@ def test_superset(mocker: MockFixture, app_context: None, table1: None) -> None: assert list(results) == [(1, 10), (2, 20)] +def 
test_superset_limit(mocker: MockFixture, app_context: None, table1: None) -> None: + """ + Simple that limit is applied when querying a table. + """ + mocker.patch( + "superset.extensions.metadb.current_app.config", + {"SUPERSET_META_DB_LIMIT": 1}, + ) + mocker.patch("superset.extensions.metadb.security_manager") + + engine = create_engine("superset://") + conn = engine.connect() + results = conn.execute('SELECT * FROM "superset.database1.table1"') + assert list(results) == [(1, 10)] + + def test_superset_joins( mocker: MockFixture, app_context: None, From 7b4277f49d1dd6df2c3950c61bd82210e9f0a0f8 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Thu, 10 Aug 2023 11:40:49 -0700 Subject: [PATCH 05/10] Remove superset prefix --- superset/extensions/metadb.py | 92 +++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py index 9d53ca13b49e0..23bd69e428062 100644 --- a/superset/extensions/metadb.py +++ b/superset/extensions/metadb.py @@ -14,18 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - """ A SQLAlchemy dialect for querying across Superset databases. The dialect ``superset://`` allows users to query any table in any database that has been configured in Superset, eg: - > SELECT * FROM "superset.examples.birth_names"; + > SELECT * FROM "examples.birth_names"; The syntax for tables is: - superset.database[[.catalog].schema].table + database[[.catalog].schema].table The dialect is built on top of Shillelagh, a framework for building DB API 2.0 libraries and SQLAlchemy dialects based on SQLite. SQLite will parse the SQL, and pass the filters @@ -36,13 +35,14 @@ joins and unions are done in memory, using the SQLite engine. """ +from __future__ import annotations import datetime import operator import urllib.parse from collections.abc import Iterator from functools import wraps -from typing import Any, Callable, cast, Optional, TypeVar +from typing import Any, Callable, cast, TypeVar from flask import current_app from shillelagh.adapters.base import Adapter @@ -67,7 +67,7 @@ from sqlalchemy.exc import NoSuchTableError from sqlalchemy.sql import Select, select -from superset import db, security_manager, sql_parse +from superset import db, feature_flag_manager, security_manager, sql_parse # pylint: disable=abstract-method @@ -82,7 +82,7 @@ class SupersetAPSWDialect(APSWDialect): >>> engine = create_engine('superset://') >>> conn = engine.connect() - >>> results = conn.execute('SELECT * FROM "superset.examples.birth_names"') + >>> results = conn.execute('SELECT * FROM "examples.birth_names"') Queries can also join data across different Superset databases. 
@@ -103,7 +103,7 @@ def create_connect_args(self, url: URL) -> tuple[tuple[()], dict[str, Any]]: { "path": ":memory:", "adapters": ["superset"], - "adapter_kwargs": {}, + "adapter_kwargs": {"superset": {"prefix": None}}, "safe": True, "isolation_level": self.isolation_level, }, @@ -120,7 +120,7 @@ def check_dml(method: F) -> F: """ @wraps(method) - def wrapper(self: "SupersetShillelaghAdapter", *args: Any, **kwargs: Any) -> Any: + def wrapper(self: SupersetShillelaghAdapter, *args: Any, **kwargs: Any) -> Any: # pylint: disable=protected-access if not self._allow_dml: raise ProgrammingError(f'DML not enabled in database "{self.database}"') @@ -135,7 +135,7 @@ def has_rowid(method: F) -> F: """ @wraps(method) - def wrapper(self: "SupersetShillelaghAdapter", *args: Any, **kwargs: Any) -> Any: + def wrapper(self: SupersetShillelaghAdapter, *args: Any, **kwargs: Any) -> Any: # pylint: disable=protected-access if not self._rowid: raise ProgrammingError( @@ -174,53 +174,63 @@ class SupersetShillelaghAdapter(Adapter): } @staticmethod - def supports(uri: str, fast: bool = True, **kwargs: Any) -> bool: + def supports( + uri: str, + fast: bool = True, + prefix: str | None = "superset", + **kwargs: Any, + ) -> bool: """ Return if a table is supported by the adapter. - An URL for a table has the format superset.database[[.catalog].schema].table, + An URL for a table has the format [prefix.]database[[.catalog].schema].table, eg, `superset.examples.birth_names`. - """ - parts = [urllib.parse.unquote(part) for part in uri.split(".")] - return 3 <= len(parts) <= 5 and parts[0] == "superset" - @staticmethod - def parse_uri(uri: str) -> tuple[str, Optional[str], Optional[str], str]: + When using the Superset SQLAlchemy and DB engine spec the prefix is dropped, so + that tables should have the format database[[.catalog].schema].table. """ - Parse the SQLAlchemy URI into arguments for the class. + parts = [urllib.parse.unquote(part) for part in uri.split(".")] - This splits the URI into database, catalog, schema, and table name: + if prefix is not None: + if parts.pop(0) != prefix: + return False - >>> SupersetShillelaghAdapter.parse_uri('superset.examples.birth_names') - ('examples', None, None, 'birth_names') + return 2 <= len(parts) <= 4 + @staticmethod + def parse_uri(uri: str) -> tuple[str]: """ - parts = [urllib.parse.unquote(part) for part in uri.split(".")] - if len(parts) == 3: - return parts[1], None, None, parts[2] - if len(parts) == 4: - return parts[1], None, parts[2], parts[3] - return tuple(parts[1:]) # type: ignore + Pass URI through unmodified. + """ + return (uri,) def __init__( self, - database: str, - catalog: Optional[str], - schema: Optional[str], - table: str, + uri: str, + prefix: str | None = "superset", *args: Any, **kwargs: Any, ): + if not feature_flag_manager.is_feature_enabled("ENABLE_SUPERSET_META_DB"): + raise ProgrammingError("Superset meta database is disabled") + super().__init__(*args, **kwargs) - self.database = database - self.catalog = catalog - self.schema = schema - self.table = table + parts = [urllib.parse.unquote(part) for part in uri.split(".")] + + if prefix is not None: + if prefix != parts.pop(0): + raise ProgrammingError("Invalid prefix") + self.prefix = prefix + + self.database = parts.pop(0) + self.table = parts.pop(-1) + self.schema = parts.pop(-1) if parts else None + self.catalog = parts.pop(-1) if parts else None # If the table has a single integer primary key we use that as the row ID in order # to perform updates and deletes. 
Otherwise we can only do inserts and selects.
-        self._rowid: Optional[str] = None
+        self._rowid: str | None = None
 
         # Does the database allow DML?
         self._allow_dml: bool = False
@@ -293,8 +303,8 @@ def _build_sql(
         self,
         bounds: dict[str, Filter],
         order: list[tuple[str, RequestedOrder]],
-        limit: Optional[int] = None,
-        offset: Optional[int] = None,
+        limit: int | None = None,
+        offset: int | None = None,
     ) -> Select:
         """
         Build SQLAlchemy query object.
         """
@@ -332,8 +342,8 @@ def get_data( # pylint: disable=arguments-differ
         self,
         bounds: dict[str, Filter],
         order: list[tuple[str, RequestedOrder]],
-        limit: Optional[int] = None,
-        offset: Optional[int] = None,
+        limit: int | None = None,
+        offset: int | None = None,
         **kwargs: Any,
     ) -> Iterator[Row]:
         """
@@ -357,7 +367,7 @@ def insert_row(self, row: Row) -> int:
         """
         Insert a single row.
         """
-        row_id: Optional[int] = row.pop("rowid")
+        row_id: int | None = row.pop("rowid")
         if row_id and self._rowid:
             if row.get(self._rowid) != row_id:
                 raise ProgrammingError(f"Invalid rowid specified: {row_id}")
@@ -396,7 +406,7 @@ def update_row(self, row_id: int, row: Row) -> None:
 
         Note that the updated row might have a new row ID.
         """
-        new_row_id: Optional[int] = row.pop("rowid")
+        new_row_id: int | None = row.pop("rowid")
         if new_row_id:
             if row.get(self._rowid) != new_row_id:
                 raise ProgrammingError(f"Invalid rowid specified: {new_row_id}")

From 337228c3e79e776ab3abc499a7fcdae5b5c4026e Mon Sep 17 00:00:00 2001
From: Beto Dealmeida
Date: Thu, 10 Aug 2023 12:05:23 -0700
Subject: [PATCH 06/10] Add docs

---
 docs/docs/databases/meta-database.mdx | 42 +++++++++++++++++++++++++++
 superset/config.py | 6 +++-
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 docs/docs/databases/meta-database.mdx

diff --git a/docs/docs/databases/meta-database.mdx b/docs/docs/databases/meta-database.mdx
new file mode 100644
index 0000000000000..3040ae05d753c
--- /dev/null
+++ b/docs/docs/databases/meta-database.mdx
@@ -0,0 +1,42 @@
+---
+title: Querying across databases
+hide_title: true
+sidebar_position: 42
+version: 1
+---
+
+## Querying across databases
+
+Superset offers an experimental feature for querying across different databases. This is done via a special database called "Superset meta database" that uses the "superset://" SQLAlchemy URI. When using this database, it's possible to query any table in any of the configured databases using the following syntax:
+
+```sql
+SELECT * FROM "database name.[[catalog.].schema].table name";
+```
+
+For example:
+
+```sql
+SELECT * FROM "examples.birth_names";
+```
+
+Spaces are allowed, but periods in the names must be replaced by `%2E`. Eg:
+
+```sql
+SELECT * FROM "Superset meta database.examples%2Ebirth_names";
+```
+
+The query above returns the same rows as `SELECT * FROM "examples.birth_names"`, and also shows that the meta database can query tables from any database — even itself!
+
+## Considerations
+
+Before enabling this feature, there are a few considerations that you should keep in mind. First, the meta database enforces permissions on the queried tables, so users should only have access via the meta database to tables that they originally have access to. Nevertheless, the meta database is a new surface for potential attacks, and bugs could allow users to see data they should not.
+
+Second, there are performance considerations. The meta database will push any filtering, sorting, and limiting to the underlying databases, but any aggregations and joins will happen in memory in the process running the query. Because of this, it's recommended to run the database in async mode, so that queries are executed in Celery workers instead of the web workers. Additionally, it's possible to specify a hard limit on how many rows are returned from the underlying databases.
+
+## Enabling the meta database
+
+To enable the Superset meta database, first you need to set the `ENABLE_SUPERSET_META_DB` feature flag to true. Then, add a new database of type "Superset meta database" with the SQLAlchemy URI "superset://".
+
+If you enable DML in the meta database, users will be able to run DML queries on underlying databases **as long as DML is also enabled in them**. This allows users to run queries that move data across databases.
+
+Second, you might want to change the value of `SUPERSET_META_DB_LIMIT`. The default value is 1000, and defines how many rows are read from each database before any aggregations and joins are executed. You can also set this value to `None` if you only have small tables.
diff --git a/superset/config.py b/superset/config.py
index f0a7c087f3da9..d2f0e89653e72 100644
--- a/superset/config.py
+++ b/superset/config.py
@@ -480,7 +480,11 @@ class D3Format(TypedDict, total=False):
     # or to disallow users from viewing other users profile page
     # Do not show user info or profile in the menu
     "MENU_HIDE_USER_INFO": False,
-    # Allows users to add a ``superset://`` DB that can query across databases.
+    # Allows users to add a ``superset://`` DB that can query across databases. This is
+    # an experimental feature with potential security and performance risks, so use with
+    # caution. If the feature is enabled, you can also set a limit for how much data is
+    # returned from each database in the ``SUPERSET_META_DB_LIMIT`` configuration value
+    # in this file.
     "ENABLE_SUPERSET_META_DB": False,
 }
 

From 82b37a1dedfe940585dd2a8e038f558a2ecccb36 Mon Sep 17 00:00:00 2001
From: Beto Dealmeida
Date: Thu, 10 Aug 2023 20:42:24 -0700
Subject: [PATCH 07/10] Fix tests and lint

---
 superset/db_engine_specs/__init__.py | 1 +
 superset/extensions/metadb.py | 6 ++--
 .../unit_tests/extensions/test_sqlalchemy.py | 34 +++++++++++--------
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/superset/db_engine_specs/__init__.py b/superset/db_engine_specs/__init__.py
index ee75265b6b339..d4ec199133d38 100644
--- a/superset/db_engine_specs/__init__.py
+++ b/superset/db_engine_specs/__init__.py
@@ -120,6 +120,7 @@ def get_engine_spec(backend: str, driver: Optional[str] = None) -> type[BaseEngi
 }
 
 
+# pylint: disable=too-many-branches
 def get_available_engine_specs() -> dict[type[BaseEngineSpec], set[str]]:
     """
     Return available engine specs and installed drivers for them.
diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py index 23bd69e428062..c774c82366f57 100644 --- a/superset/extensions/metadb.py +++ b/superset/extensions/metadb.py @@ -110,7 +110,6 @@ def create_connect_args(self, url: URL) -> tuple[tuple[()], dict[str, Any]]: ) -# pylint: disable=invalid-name F = TypeVar("F", bound=Callable[..., Any]) @@ -208,13 +207,12 @@ def __init__( self, uri: str, prefix: str | None = "superset", - *args: Any, **kwargs: Any, ): if not feature_flag_manager.is_feature_enabled("ENABLE_SUPERSET_META_DB"): raise ProgrammingError("Superset meta database is disabled") - super().__init__(*args, **kwargs) + super().__init__(**kwargs) parts = [urllib.parse.unquote(part) for part in uri.split(".")] @@ -338,7 +336,7 @@ def _build_sql( return query - def get_data( # pylint: disable=arguments-differ + def get_data( self, bounds: dict[str, Filter], order: list[tuple[str, RequestedOrder]], diff --git a/tests/unit_tests/extensions/test_sqlalchemy.py b/tests/unit_tests/extensions/test_sqlalchemy.py index 9ac01b49096fa..5a71529bfe94f 100644 --- a/tests/unit_tests/extensions/test_sqlalchemy.py +++ b/tests/unit_tests/extensions/test_sqlalchemy.py @@ -28,6 +28,7 @@ from superset.errors import ErrorLevel, SupersetError, SupersetErrorType from superset.exceptions import SupersetSecurityException +from tests.unit_tests.conftest import with_feature_flags if TYPE_CHECKING: from superset.models.core import Database @@ -102,6 +103,7 @@ def table2(session: Session, database2: "Database") -> Iterator[None]: session.commit() +@with_feature_flags(ENABLE_SUPERSET_META_DB=True) def test_superset(mocker: MockFixture, app_context: None, table1: None) -> None: """ Simple test querying a table. @@ -110,10 +112,11 @@ def test_superset(mocker: MockFixture, app_context: None, table1: None) -> None: engine = create_engine("superset://") conn = engine.connect() - results = conn.execute('SELECT * FROM "superset.database1.table1"') + results = conn.execute('SELECT * FROM "database1.table1"') assert list(results) == [(1, 10), (2, 20)] +@with_feature_flags(ENABLE_SUPERSET_META_DB=True) def test_superset_limit(mocker: MockFixture, app_context: None, table1: None) -> None: """ Simple that limit is applied when querying a table. 
@@ -126,10 +129,11 @@ def test_superset_limit(mocker: MockFixture, app_context: None, table1: None) -> engine = create_engine("superset://") conn = engine.connect() - results = conn.execute('SELECT * FROM "superset.database1.table1"') + results = conn.execute('SELECT * FROM "database1.table1"') assert list(results) == [(1, 10)] +@with_feature_flags(ENABLE_SUPERSET_META_DB=True) def test_superset_joins( mocker: MockFixture, app_context: None, @@ -146,14 +150,15 @@ def test_superset_joins( results = conn.execute( """ SELECT t1.b, t2.b - FROM "superset.database1.table1" AS t1 - JOIN "superset.database2.table2" AS t2 + FROM "database1.table1" AS t1 + JOIN "database2.table2" AS t2 ON t1.a = t2.a """ ) assert list(results) == [(10, "ten"), (20, "twenty")] +@with_feature_flags(ENABLE_SUPERSET_META_DB=True) def test_dml( mocker: MockFixture, app_context: None, @@ -170,28 +175,27 @@ def test_dml( engine = create_engine("superset://") conn = engine.connect() - conn.execute('INSERT INTO "superset.database1.table1" (a, b) VALUES (3, 30)') - results = conn.execute('SELECT * FROM "superset.database1.table1"') + conn.execute('INSERT INTO "database1.table1" (a, b) VALUES (3, 30)') + results = conn.execute('SELECT * FROM "database1.table1"') assert list(results) == [(1, 10), (2, 20), (3, 30)] - conn.execute('UPDATE "superset.database1.table1" SET b=35 WHERE a=3') - results = conn.execute('SELECT * FROM "superset.database1.table1"') + conn.execute('UPDATE "database1.table1" SET b=35 WHERE a=3') + results = conn.execute('SELECT * FROM "database1.table1"') assert list(results) == [(1, 10), (2, 20), (3, 35)] - conn.execute('DELETE FROM "superset.database1.table1" WHERE b>20') - results = conn.execute('SELECT * FROM "superset.database1.table1"') + conn.execute('DELETE FROM "database1.table1" WHERE b>20') + results = conn.execute('SELECT * FROM "database1.table1"') assert list(results) == [(1, 10), (2, 20)] with pytest.raises(ProgrammingError) as excinfo: - conn.execute( - """INSERT INTO "superset.database2.table2" (a, b) VALUES (3, 'thirty')""" - ) + conn.execute("""INSERT INTO "database2.table2" (a, b) VALUES (3, 'thirty')""") assert str(excinfo.value).strip() == ( "(shillelagh.exceptions.ProgrammingError) DML not enabled in database " - '"database2"\n[SQL: INSERT INTO "superset.database2.table2" (a, b) ' + '"database2"\n[SQL: INSERT INTO "database2.table2" (a, b) ' "VALUES (3, 'thirty')]\n(Background on this error at: " "https://sqlalche.me/e/14/f405)" ) +@with_feature_flags(ENABLE_SUPERSET_META_DB=True) def test_security_manager(mocker: MockFixture, app_context: None, table1: None) -> None: """ Test that we use the security manager to check for permissions. 
@@ -215,7 +219,7 @@ def test_security_manager(mocker: MockFixture, app_context: None, table1: None)
     engine = create_engine("superset://")
     conn = engine.connect()
     with pytest.raises(SupersetSecurityException) as excinfo:
-        conn.execute('SELECT * FROM "superset.database1.table1"')
+        conn.execute('SELECT * FROM "database1.table1"')
     assert str(excinfo.value) == (
         "You need access to the following tables: `table1`,\n "
         "`all_database_access` or `all_datasource_access` permission"
     )

From a364add1040539db3ce5d7d8f985cceca5dfdcbe Mon Sep 17 00:00:00 2001
From: Beto Dealmeida
Date: Thu, 10 Aug 2023 21:12:59 -0700
Subject: [PATCH 08/10] Add allowed_dbs

---
 docs/docs/databases/meta-database.mdx | 6 +++++
 superset/extensions/metadb.py | 16 ++++++++++++-
 .../unit_tests/extensions/test_sqlalchemy.py | 24 +++++++++++++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/docs/docs/databases/meta-database.mdx b/docs/docs/databases/meta-database.mdx
index 3040ae05d753c..ae5b055470144 100644
--- a/docs/docs/databases/meta-database.mdx
+++ b/docs/docs/databases/meta-database.mdx
@@ -40,3 +40,9 @@ To enable the Superset meta database, first you need to set the `ENABLE_SUPERSET
 If you enable DML in the meta database, users will be able to run DML queries on underlying databases **as long as DML is also enabled in them**. This allows users to run queries that move data across databases.
 
 Second, you might want to change the value of `SUPERSET_META_DB_LIMIT`. The default value is 1000, and defines how many rows are read from each database before any aggregations and joins are executed. You can also set this value to `None` if you only have small tables.
+
+Additionally, you might want to restrict the databases to which the meta database has access. This can be done in the database configuration, under "Advanced" -> "Other" -> "ENGINE PARAMETERS", by adding:
+
+```json
+{"allowed_dbs":["Google Sheets","examples"]}
+```
diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py
index c774c82366f57..4430573cada8c 100644
--- a/superset/extensions/metadb.py
+++ b/superset/extensions/metadb.py
@@ -94,6 +94,11 @@ class SupersetAPSWDialect(APSWDialect):
 
     name = "superset"
 
+    def __init__(self, allowed_dbs: list[str] | None = None, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+
+        self.allowed_dbs = allowed_dbs
+
     def create_connect_args(self, url: URL) -> tuple[tuple[()], dict[str, Any]]:
         """
         A custom Shillelagh SQLAlchemy dialect with a single adapter configured.
@@ -103,7 +108,12 @@ def create_connect_args(self, url: URL) -> tuple[tuple[()], dict[str, Any]]: { "path": ":memory:", "adapters": ["superset"], - "adapter_kwargs": {"superset": {"prefix": None}}, + "adapter_kwargs": { + "superset": { + "prefix": None, + "allowed_dbs": self.allowed_dbs, + } + }, "safe": True, "isolation_level": self.isolation_level, }, @@ -177,6 +187,7 @@ def supports( uri: str, fast: bool = True, prefix: str | None = "superset", + allowed_dbs: list[str] | None = None, **kwargs: Any, ) -> bool: """ @@ -194,6 +205,9 @@ def supports( if parts.pop(0) != prefix: return False + if allowed_dbs is not None and parts[0] not in allowed_dbs: + return False + return 2 <= len(parts) <= 4 @staticmethod diff --git a/tests/unit_tests/extensions/test_sqlalchemy.py b/tests/unit_tests/extensions/test_sqlalchemy.py index 5a71529bfe94f..cc738fd6c6b1e 100644 --- a/tests/unit_tests/extensions/test_sqlalchemy.py +++ b/tests/unit_tests/extensions/test_sqlalchemy.py @@ -224,3 +224,27 @@ def test_security_manager(mocker: MockFixture, app_context: None, table1: None) "You need access to the following tables: `table1`,\n " "`all_database_access` or `all_datasource_access` permission" ) + + +@with_feature_flags(ENABLE_SUPERSET_META_DB=True) +def test_allowed_dbs(mocker: MockFixture, app_context: None, table1: None) -> None: + """ + Test that DBs can be restricted. + """ + mocker.patch("superset.extensions.metadb.security_manager") + + engine = create_engine("superset://", allowed_dbs=["database1"]) + conn = engine.connect() + + results = conn.execute('SELECT * FROM "database1.table1"') + assert list(results) == [(1, 10), (2, 20)] + + with pytest.raises(ProgrammingError) as excinfo: + conn.execute('SELECT * FROM "database2.table2"') + assert str(excinfo.value) == ( + """ +(shillelagh.exceptions.ProgrammingError) Unsupported table: database2.table2 +[SQL: SELECT * FROM "database2.table2"] +(Background on this error at: https://sqlalche.me/e/14/f405) + """.strip() + ) From 0b211b341663e175154209f050e935069a23269d Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Thu, 10 Aug 2023 21:48:31 -0700 Subject: [PATCH 09/10] Handle SUPERSET_META_DB_LIMIT=None --- superset/extensions/metadb.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py index 4430573cada8c..09c48f46759a9 100644 --- a/superset/extensions/metadb.py +++ b/superset/extensions/metadb.py @@ -343,9 +343,9 @@ def _build_sql( column = column.desc() query = query.order_by(column) - if limit: + if limit is not None: query = query.limit(limit) - if offset: + if offset is not None: query = query.offset(offset) return query @@ -361,8 +361,11 @@ def get_data( """ Return data for a `SELECT` statement. 
""" - app_limit = current_app.config["SUPERSET_META_DB_LIMIT"] - limit = app_limit if limit is None else min(limit, app_limit) + app_limit: int | None = current_app.config["SUPERSET_META_DB_LIMIT"] + if limit is None: + limit = app_limit + elif app_limit is not None: + limit = min(limit, app_limit) query = self._build_sql(bounds, order, limit, offset) From 8e2b9710889567c7015568105c9ddaedbe6f5804 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Fri, 11 Aug 2023 08:05:37 -0700 Subject: [PATCH 10/10] Schema fixes --- superset/extensions/metadb.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/superset/extensions/metadb.py b/superset/extensions/metadb.py index 09c48f46759a9..79a3c446c44c8 100644 --- a/superset/extensions/metadb.py +++ b/superset/extensions/metadb.py @@ -41,7 +41,7 @@ import operator import urllib.parse from collections.abc import Iterator -from functools import wraps +from functools import partial, wraps from typing import Any, Callable, cast, TypeVar from flask import current_app @@ -240,6 +240,9 @@ def __init__( self.schema = parts.pop(-1) if parts else None self.catalog = parts.pop(-1) if parts else None + if self.catalog: + raise NotImplementedError("Catalogs are not currently supported") + # If the table has a single integer primary key we use that as the row ID in order # to perform updates and deletes. Otherwise we can only do inserts and selects. self._rowid: str | None = None @@ -278,14 +281,20 @@ def _set_columns(self) -> None: table = sql_parse.Table(self.table, self.schema, self.catalog) security_manager.raise_for_access(database=database, table=table) + # store this callable for later whenever we need an engine + self.engine_context = partial( + database.get_sqla_engine_with_context, + self.schema, + ) + # fetch column names and types - self.engine_context = database.get_sqla_engine_with_context metadata = MetaData() with self.engine_context() as engine: try: self._table = Table( self.table, metadata, + schema=self.schema, autoload=True, autoload_with=engine, )