diff --git a/README.md b/README.md index 757c0fb50364e..3588d9941913b 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,7 @@ Here are some of the major database solutions that are supported: yugabyte databend starrocks + doris

**A more comprehensive list of supported databases** along with the configuration instructions can be found [here](https://superset.apache.org/docs/databases/installing-database-drivers). diff --git a/docs/docs/databases/doris.mdx b/docs/docs/databases/doris.mdx new file mode 100644 index 0000000000000..62c16afeb3e1a --- /dev/null +++ b/docs/docs/databases/doris.mdx @@ -0,0 +1,26 @@ +--- +title: Apache Doris +hide_title: true +sidebar_position: 5 +version: 1 +--- + +## Doris + +The [pydoris](https://pypi.org/project/pydoris/) library is the recommended way to connect to Apache Doris through SQLAlchemy. + +You'll need the following setting values to form the connection string: + +- **User**: User Name +- **Password**: Password +- **Host**: Doris FE Host +- **Port**: Doris FE port +- **Catalog**: Catalog Name +- **Database**: Database Name + + +Here's what the connection string looks like: + +``` +doris://<User>:<Password>@<Host>:<Port>/<Catalog>.<Database> +``` diff --git a/docs/docs/databases/installing-database-drivers.mdx b/docs/docs/databases/installing-database-drivers.mdx index f698b7ab8ee2a..f11b4ec5eb722 100644 --- a/docs/docs/databases/installing-database-drivers.mdx +++ b/docs/docs/databases/installing-database-drivers.mdx @@ -25,6 +25,7 @@ Some of the recommended packages are shown below. Please refer to [setup.py](htt | Database | PyPI package | Connection String | | --------------------------------------------------------- | ---------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | | [Amazon Athena](/docs/databases/athena) | `pip install pyathena[pandas]` , `pip install PyAthenaJDBC` | `awsathena+rest://{aws_access_key_id}:{aws_secret_access_key}@athena.{region_name}.amazonaws.com/{schema_name}?s3_staging_dir={s3_staging_dir}&... 
` | +| [Apache Doris](/docs/databases/doris) | `pip install pydoris` | `doris://<User>:<Password>@<Host>:<Port>/<Catalog>.<Database>` | | [Amazon DynamoDB](/docs/databases/dynamodb) | `pip install pydynamodb` | `dynamodb://{access_key_id}:{secret_access_key}@dynamodb.{region_name}.amazonaws.com?connector=superset` | | [Amazon Redshift](/docs/databases/redshift) | `pip install sqlalchemy-redshift` | ` redshift+psycopg2://:@:5439/` | | [Apache Drill](/docs/databases/drill) | `pip install sqlalchemy-drill` | `drill+sadrill:// For JDBC drill+jdbc://` | diff --git a/docs/src/resources/data.js b/docs/src/resources/data.js index a07be552673ef..42cf835a495b4 100644 --- a/docs/src/resources/data.js +++ b/docs/src/resources/data.js @@ -117,4 +117,9 @@ export const Databases = [ href: 'https://www.microsoft.com/en-us/sql-server', imgName: 'msql.png', }, + { + title: 'Apache Doris', + href: 'https://doris.apache.org/', + imgName: 'doris.png', + }, ]; diff --git a/docs/static/img/databases/doris.png b/docs/static/img/databases/doris.png new file mode 100644 index 0000000000000..4d88f2a36cf72 Binary files /dev/null and b/docs/static/img/databases/doris.png differ diff --git a/setup.py b/setup.py index e4d437b4d1077..29df567e04a36 100644 --- a/setup.py +++ b/setup.py @@ -205,6 +205,7 @@ def get_git_sha() -> str: "vertica": ["sqlalchemy-vertica-python>=0.5.9, < 0.6"], "netezza": ["nzalchemy>=11.0.2"], "starrocks": ["starrocks>=1.0.0"], + "doris": ["pydoris>=1.0.0, <2.0.0"], }, python_requires="~=3.9", author="Apache Software Foundation", diff --git a/superset-frontend/src/assets/images/doris.png b/superset-frontend/src/assets/images/doris.png new file mode 100644 index 0000000000000..4d88f2a36cf72 Binary files /dev/null and b/superset-frontend/src/assets/images/doris.png differ diff --git a/superset/db_engine_specs/doris.py b/superset/db_engine_specs/doris.py new file mode 100644 index 0000000000000..e502f5bda2be7 --- /dev/null +++ b/superset/db_engine_specs/doris.py @@ -0,0 +1,278 @@ +# Licensed to the Apache Software 
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import logging
import re
from re import Pattern
from typing import Any, Optional
from urllib import parse

from flask_babel import gettext as __
from sqlalchemy import Float, Integer, Numeric, String, TEXT, types
from sqlalchemy.engine.url import URL
from sqlalchemy.sql.type_api import TypeEngine

from superset.db_engine_specs.mysql import MySQLEngineSpec
from superset.errors import SupersetErrorType
from superset.utils.core import GenericDataType

# Regular expressions to catch custom errors.
#
# Each named group must match the ``%(name)s`` placeholder used by the
# corresponding message in ``DorisEngineSpec.custom_errors``; an unnamed
# ``(?P...)`` group is not valid ``re`` syntax and fails at import time.
CONNECTION_ACCESS_DENIED_REGEX = re.compile(
    "Access denied for user '(?P<username>.*?)'"
)
CONNECTION_INVALID_HOSTNAME_REGEX = re.compile(
    "Unknown Doris server host '(?P<hostname>.*?)'"
)
CONNECTION_UNKNOWN_DATABASE_REGEX = re.compile("Unknown database '(?P<database>.*?)'")
CONNECTION_HOST_DOWN_REGEX = re.compile(
    "Can't connect to Doris server on '(?P<hostname>.*?)'"
)
SYNTAX_ERROR_REGEX = re.compile(
    "check the manual that corresponds to your MySQL server "
    "version for the right syntax to use near '(?P<server_error>.*)"
)

logger = logging.getLogger(__name__)


class TINYINT(Integer):
    """SQLAlchemy type for the Doris ``TINYINT`` column type."""

    __visit_name__ = "TINYINT"


class LARGEINT(Integer):
    """SQLAlchemy type for the Doris ``LARGEINT`` column type."""

    __visit_name__ = "LARGEINT"


class DOUBLE(Float):
    """SQLAlchemy type for the Doris ``DOUBLE`` column type."""

    __visit_name__ = "DOUBLE"


# NOTE: the four aggregate-state types below derive from ``Numeric`` but are
# reported as ``GenericDataType.STRING`` by ``column_type_mappings``.
class HLL(Numeric):
    """SQLAlchemy type for the Doris ``HLL`` column type."""

    __visit_name__ = "HLL"


class BITMAP(Numeric):
    """SQLAlchemy type for the Doris ``BITMAP`` column type."""

    __visit_name__ = "BITMAP"


class QuantileState(Numeric):
    """SQLAlchemy type for the Doris ``QUANTILE_STATE`` column type."""

    __visit_name__ = "QUANTILE_STATE"


class AggState(Numeric):
    """SQLAlchemy type for the Doris ``AGG_STATE`` column type."""

    __visit_name__ = "AGG_STATE"


class ARRAY(TypeEngine):
    """SQLAlchemy type for the Doris ``ARRAY`` column type."""

    __visit_name__ = "ARRAY"

    @property
    def python_type(self) -> Optional[type[list[Any]]]:
        """Return ``list`` as the Python-side type."""
        return list


class MAP(TypeEngine):
    """SQLAlchemy type for the Doris ``MAP`` column type."""

    __visit_name__ = "MAP"

    @property
    def python_type(self) -> Optional[type[dict[Any, Any]]]:
        """Return ``dict`` as the Python-side type."""
        return dict


class STRUCT(TypeEngine):
    """SQLAlchemy type for the Doris ``STRUCT`` column type."""

    __visit_name__ = "STRUCT"

    @property
    def python_type(self) -> Optional[type[Any]]:
        """Return ``None``: no Python-side type is declared for structs."""
        return None


class DorisEngineSpec(MySQLEngineSpec):
    """
    Engine spec for Apache Doris, built on top of the MySQL engine spec.

    The database part of a Doris SQLAlchemy URI has the form
    ``catalog.database``; the methods below read and rewrite the part after
    the dot as the schema.
    """

    engine = "pydoris"
    engine_aliases = {"doris"}
    engine_name = "Apache Doris"
    max_column_name_length = 64
    default_driver = "pydoris"
    sqlalchemy_uri_placeholder = (
        "doris://user:password@host:port/catalog.db[?key=value&key=value...]"
    )
    encryption_parameters = {"ssl": "0"}
    supports_dynamic_schema = True

    # (pattern, SQLAlchemy type instance, generic type) triples.
    # NOTE(review): ``^datetime.*`` is listed before ``^date.*``, which would
    # also match datetime types -- presumably the first match wins; confirm
    # against the base class lookup before reordering.
    column_type_mappings = (  # type: ignore
        (
            re.compile(r"^tinyint", re.IGNORECASE),
            TINYINT(),
            GenericDataType.NUMERIC,
        ),
        (
            re.compile(r"^largeint", re.IGNORECASE),
            LARGEINT(),
            GenericDataType.NUMERIC,
        ),
        (
            re.compile(r"^decimal.*", re.IGNORECASE),
            types.DECIMAL(),
            GenericDataType.NUMERIC,
        ),
        (
            re.compile(r"^double", re.IGNORECASE),
            DOUBLE(),
            GenericDataType.NUMERIC,
        ),
        (
            re.compile(r"^varchar(\((\d+)\))*$", re.IGNORECASE),
            types.VARCHAR(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^char(\((\d+)\))*$", re.IGNORECASE),
            types.CHAR(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^json.*", re.IGNORECASE),
            types.JSON(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^binary.*", re.IGNORECASE),
            types.BINARY(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^quantile_state", re.IGNORECASE),
            QuantileState(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^agg_state.*", re.IGNORECASE),
            AggState(),
            GenericDataType.STRING,
        ),
        (re.compile(r"^hll", re.IGNORECASE), HLL(), GenericDataType.STRING),
        (
            re.compile(r"^bitmap", re.IGNORECASE),
            BITMAP(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^array.*", re.IGNORECASE),
            ARRAY(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^map.*", re.IGNORECASE),
            MAP(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^struct.*", re.IGNORECASE),
            STRUCT(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^datetime.*", re.IGNORECASE),
            types.DATETIME(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^date.*", re.IGNORECASE),
            types.DATE(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^text.*", re.IGNORECASE),
            TEXT(),
            GenericDataType.STRING,
        ),
        (
            re.compile(r"^string.*", re.IGNORECASE),
            String(),
            GenericDataType.STRING,
        ),
    )

    # Maps a driver error pattern to (message template, error type, extra).
    # The ``%(name)s`` placeholders are filled from the pattern's named groups.
    custom_errors: dict[Pattern[str], tuple[str, SupersetErrorType, dict[str, Any]]] = {
        CONNECTION_ACCESS_DENIED_REGEX: (
            __('Either the username "%(username)s" or the password is incorrect.'),
            SupersetErrorType.CONNECTION_ACCESS_DENIED_ERROR,
            {"invalid": ["username", "password"]},
        ),
        CONNECTION_INVALID_HOSTNAME_REGEX: (
            __('Unknown Doris server host "%(hostname)s".'),
            SupersetErrorType.CONNECTION_INVALID_HOSTNAME_ERROR,
            {"invalid": ["host"]},
        ),
        CONNECTION_HOST_DOWN_REGEX: (
            __('The host "%(hostname)s" might be down and can\'t be reached.'),
            SupersetErrorType.CONNECTION_HOST_DOWN_ERROR,
            {"invalid": ["host", "port"]},
        ),
        CONNECTION_UNKNOWN_DATABASE_REGEX: (
            __('Unable to connect to database "%(database)s".'),
            SupersetErrorType.CONNECTION_UNKNOWN_DATABASE_ERROR,
            {"invalid": ["database"]},
        ),
        SYNTAX_ERROR_REGEX: (
            __(
                'Please check your query for syntax errors near "%(server_error)s". '
                "Then, try running your query again."
            ),
            SupersetErrorType.SYNTAX_ERROR,
            {},
        ),
    }

    @classmethod
    def adjust_engine_params(
        cls,
        uri: URL,
        connect_args: dict[str, Any],
        catalog: Optional[str] = None,
        schema: Optional[str] = None,
    ) -> tuple[URL, dict[str, Any]]:
        """
        Point the URI at ``schema`` while keeping the configured catalog.

        The URI is only rewritten when both ``schema`` and a URI database are
        present. The catalog is the part of the URI database before the first
        dot; when the database contains no dot the ``internal`` catalog is
        used. The schema is percent-encoded before being written to the URI.

        :param uri: SQLAlchemy URI of the database
        :param connect_args: connection arguments, returned unchanged
        :param catalog: not used by this implementation
        :param schema: schema to switch to, if any
        :return: the (possibly rewritten) URI and the connection arguments
        """
        database = uri.database
        if schema and database:
            schema = parse.quote(schema, safe="")
            if "." in database:
                database = database.split(".")[0] + "." + schema
            else:
                database = "internal." + schema
            uri = uri.set(database=database)

        return uri, connect_args

    @classmethod
    def get_schema_from_engine_params(
        cls,
        sqlalchemy_uri: URL,
        connect_args: dict[str, Any],
    ) -> Optional[str]:
        """
        Return the configured schema.

        For doris the SQLAlchemy URI looks like this:

            doris://localhost:9030/catalog.database

        :param sqlalchemy_uri: SQLAlchemy URI of the database
        :param connect_args: connection arguments (not used)
        :return: the percent-decoded part after the catalog, or ``None`` when
            the URI has no database or the database has no ``catalog.`` prefix
        """
        # ``database`` is ``None`` for URIs such as ``doris://host:9030``;
        # treat that the same as a database without a schema part.
        database = (sqlalchemy_uri.database or "").strip("/")

        if "." not in database:
            return None

        return parse.unquote(database.split(".")[1])
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Any, Optional

import pytest
from sqlalchemy import JSON, types
from sqlalchemy.engine.url import make_url

from superset.db_engine_specs.doris import (
    AggState,
    ARRAY,
    BITMAP,
    DOUBLE,
    HLL,
    LARGEINT,
    MAP,
    QuantileState,
    STRUCT,
    TINYINT,
)
from superset.utils.core import GenericDataType
from tests.unit_tests.db_engine_specs.utils import assert_column_spec


@pytest.mark.parametrize(
    "native_type,sqla_type,attrs,generic_type,is_dttm",
    [
        # Numeric
        ("tinyint", TINYINT, None, GenericDataType.NUMERIC, False),
        ("largeint", LARGEINT, None, GenericDataType.NUMERIC, False),
        ("decimal(38,18)", types.DECIMAL, None, GenericDataType.NUMERIC, False),
        ("decimalv3(38,18)", types.DECIMAL, None, GenericDataType.NUMERIC, False),
        ("double", DOUBLE, None, GenericDataType.NUMERIC, False),
        # String
        ("char(10)", types.CHAR, None, GenericDataType.STRING, False),
        ("varchar(65533)", types.VARCHAR, None, GenericDataType.STRING, False),
        ("binary", types.BINARY, None, GenericDataType.STRING, False),
        ("text", types.TEXT, None, GenericDataType.STRING, False),
        ("string", types.String, None, GenericDataType.STRING, False),
        # Date
        ("datetimev2", types.DateTime, None, GenericDataType.STRING, False),
        ("datev2", types.Date, None, GenericDataType.STRING, False),
        # Complex type
        ("array", ARRAY, None, GenericDataType.STRING, False),
        ("map", MAP, None, GenericDataType.STRING, False),
        ("struct", STRUCT, None, GenericDataType.STRING, False),
        ("json", JSON, None, GenericDataType.STRING, False),
        ("jsonb", JSON, None, GenericDataType.STRING, False),
        ("bitmap", BITMAP, None, GenericDataType.STRING, False),
        ("hll", HLL, None, GenericDataType.STRING, False),
        ("quantile_state", QuantileState, None, GenericDataType.STRING, False),
        ("agg_state", AggState, None, GenericDataType.STRING, False),
    ],
)
def test_get_column_spec(
    native_type: str,
    sqla_type: type[types.TypeEngine],
    attrs: Optional[dict[str, Any]],
    generic_type: GenericDataType,
    is_dttm: bool,
) -> None:
    """
    Test that each Doris native type maps to the expected column spec.
    """
    from superset.db_engine_specs.doris import DorisEngineSpec as spec

    assert_column_spec(spec, native_type, sqla_type, attrs, generic_type, is_dttm)


@pytest.mark.parametrize(
    "sqlalchemy_uri,connect_args,schema,return_schema,return_connect_args",
    [
        # No schema requested: the URI database must come back untouched.
        (
            "doris://user:password@host/db1",
            {"param1": "some_value"},
            None,
            "db1",
            {"param1": "some_value"},
        ),
        (
            "pydoris://user:password@host/db1",
            {"param1": "some_value"},
            None,
            "db1",
            {"param1": "some_value"},
        ),
        (
            "doris://user:password@host/catalog1.db1",
            {"param1": "some_value"},
            None,
            "catalog1.db1",
            {"param1": "some_value"},
        ),
        (
            "pydoris://user:password@host/catalog1.db1",
            {"param1": "some_value"},
            None,
            "catalog1.db1",
            {"param1": "some_value"},
        ),
        # Schema requested, no catalog in the URI: "internal" catalog is used.
        (
            "doris://user:password@host/db1",
            {"param1": "some_value"},
            "db2",
            "internal.db2",
            {"param1": "some_value"},
        ),
        # Schema requested, catalog in the URI: the catalog is preserved.
        (
            "doris://user:password@host/catalog1.db1",
            {"param1": "some_value"},
            "db2",
            "catalog1.db2",
            {"param1": "some_value"},
        ),
        # The schema is percent-encoded before being written to the URI.
        (
            "doris://user:password@host/catalog1.db1",
            {"param1": "some_value"},
            "db/2",
            "catalog1.db%2F2",
            {"param1": "some_value"},
        ),
    ],
)
def test_adjust_engine_params(
    sqlalchemy_uri: str,
    connect_args: dict[str, Any],
    schema: Optional[str],
    return_schema: str,
    return_connect_args: dict[str, Any],
) -> None:
    """
    Test the ``adjust_engine_params`` method, with and without a schema.

    ``return_schema`` is the expected database part of the returned URI.
    """
    from superset.db_engine_specs.doris import DorisEngineSpec

    url = make_url(sqlalchemy_uri)
    returned_url, returned_connect_args = DorisEngineSpec.adjust_engine_params(
        url, connect_args, schema=schema
    )
    assert returned_url.database == return_schema
    assert returned_connect_args == return_connect_args


def test_get_schema_from_engine_params() -> None:
    """
    Test the ``get_schema_from_engine_params`` method.
    """
    from superset.db_engine_specs.doris import DorisEngineSpec

    # ``catalog.database``: the part after the dot is the schema.
    assert (
        DorisEngineSpec.get_schema_from_engine_params(
            make_url("doris://localhost:9030/hive.test"),
            {},
        )
        == "test"
    )

    # No dot in the database: no schema can be derived.
    assert (
        DorisEngineSpec.get_schema_from_engine_params(
            make_url("doris://localhost:9030/hive"),
            {},
        )
        is None
    )