Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(BigQuery): explicitly quote columns in select_star #16822

Merged
merged 5 commits into from
Oct 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 104 additions & 16 deletions superset/db_engine_specs/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
from flask_babel import gettext as __
from marshmallow import fields, Schema
from marshmallow.exceptions import ValidationError
from sqlalchemy import literal_column
from sqlalchemy import column
from sqlalchemy.engine.base import Engine
from sqlalchemy.engine.url import make_url
from sqlalchemy.sql.expression import ColumnClause
from sqlalchemy.sql import sqltypes
from typing_extensions import TypedDict

from superset.databases.schemas import encrypted_field_properties, EncryptedField
Expand Down Expand Up @@ -281,20 +282,6 @@ def extra_table_metadata(
"clustering": {"cols": cluster_columns},
}

@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
"""
BigQuery dialect requires us to not use backtick in the fieldname which are
nested.
Using literal_column handles that issue.
https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column
Also explicility specifying column names so we don't encounter duplicate
column names in the result.
"""
return [
literal_column(c["name"]).label(c["name"].replace(".", "__")) for c in cols
]
Comment on lines -284 to -296
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was implemented in 2018 (#5655) and is no longer needed. I tested loading the preview for a table with nested records and it works fine when the _get_fields method is removed, each part gets quoted separately:

Screenshot 2021-09-27 at 16-40-41  DEV  Superset

Note that the data preview query quotes the parts correctly, even though it fails (for an unrelated reason).


@classmethod
def epoch_to_dttm(cls) -> str:
return "TIMESTAMP_SECONDS({col})"
Expand Down Expand Up @@ -421,3 +408,104 @@ def parameters_json_schema(cls) -> Any:
ma_plugin.converter.add_attribute_function(encrypted_field_properties)
spec.components.schema(cls.__name__, schema=cls.parameters_schema)
return spec.to_dict()["components"]["schemas"][cls.__name__]

@classmethod
def select_star( # pylint: disable=too-many-arguments
cls,
database: "Database",
table_name: str,
engine: Engine,
schema: Optional[str] = None,
limit: int = 100,
show_cols: bool = False,
indent: bool = True,
latest_partition: bool = True,
cols: Optional[List[Dict[str, Any]]] = None,
) -> str:
"""
Remove array structures from `SELECT *`.

BigQuery supports structures and arrays of structures, eg:

author STRUCT<name STRING, email STRING>
trailer ARRAY<STRUCT<key STRING, value STRING>>

When loading metadata for a table each key in the struct is displayed as a
separate pseudo-column, eg:

- author
- author.name
- author.email
- trailer
- trailer.key
- trailer.value

When generating the `SELECT *` statement we want to remove any keys from
structs inside an array, since selecting them results in an error. The correct
select statement should look like this:

SELECT
`author`,
`author`.`name`,
`author`.`email`,
`trailer`
FROM
table

Selecting `trailer.key` or `trailer.value` results in an error, as opposed to
selecting `author.name`, since they are keys in a structure inside an array.

This method removes any array pseudo-columns.
"""
if cols:
# For arrays of structs, remove the child columns, otherwise the query
# will fail.
array_prefixes = {
col["name"] for col in cols if isinstance(col["type"], sqltypes.ARRAY)
}
cols = [
col
for col in cols
if "." not in col["name"]
or col["name"].split(".")[0] not in array_prefixes
]

return super().select_star(
database,
table_name,
engine,
schema,
limit,
show_cols,
indent,
latest_partition,
cols,
)

@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
"""
Label columns using their fully qualified name.

BigQuery supports columns of type `struct`, which are basically dictionaries.
When loading metadata for a table with struct columns, each key in the struct
is displayed as a separate pseudo-column, eg:

author STRUCT<name STRING, email STRING>

Will be shown as 3 columns:

- author
- author.name
- author.email

If we select those fields:

SELECT `author`, `author`.`name`, `author`.`email` FROM table

The resulting columns will be called "author", "name", and "email", This may
result in a clash with other columns. To prevent that, we explicitly label
the columns using their fully qualified name, so we end up with "author",
"author__name" and "author__email", respectively.
"""
return [column(c["name"]).label(c["name"].replace(".", "__")) for c in cols]
4 changes: 2 additions & 2 deletions superset/result_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import pandas as pd
import pyarrow as pa

from superset import db_engine_specs
from superset.db_engine_specs import BaseEngineSpec
from superset.typing import DbapiDescription, DbapiResult
from superset.utils import core as utils

Expand Down Expand Up @@ -76,7 +76,7 @@ def __init__( # pylint: disable=too-many-locals
self,
data: DbapiResult,
cursor_description: DbapiDescription,
db_engine_spec: Type[db_engine_specs.BaseEngineSpec],
db_engine_spec: Type[BaseEngineSpec],
):
self.db_engine_spec = db_engine_spec
data = data or []
Expand Down
190 changes: 190 additions & 0 deletions tests/unit_tests/db_engine_specs/test_bigquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=unused-argument, import-outside-toplevel, protected-access

from flask.ctx import AppContext
from pytest_mock import MockFixture
from sqlalchemy import select
from sqlalchemy.dialects.sqlite.base import SQLiteDialect
from sqlalchemy.sql import sqltypes


def test_get_fields(app_context: AppContext) -> None:
"""
Test the custom ``_get_fields`` method.

The method adds custom labels (aliases) to the columns to prevent
collision when referencing record fields. Eg, if we had these two
columns:

name STRING
project STRUCT<name STRING>

One could write this query:

SELECT
`name`,
`project`.`name`
FROM
the_table

But then both columns would get aliased as "name".

The custom method will replace the fields so that the final query
looks like this:

SELECT
`name` AS `name`,
`project`.`name` AS project__name
FROM
the_table

"""
from superset.db_engine_specs.bigquery import BigQueryEngineSpec

columns = [{"name": "limit"}, {"name": "name"}, {"name": "project.name"}]
fields = BigQueryEngineSpec._get_fields(columns)

# generic SQL
query = select(fields)
assert (
str(query)
== 'SELECT "limit" AS "limit", name AS name, "project.name" AS project__name'
)

# BigQuery-specific SQL
try:
from pybigquery.sqlalchemy_bigquery import BigQueryDialect
except ModuleNotFoundError:
return

assert str(query.compile(dialect=BigQueryDialect())) == (
"SELECT `limit` AS `limit`, `name` AS `name`, "
"`project`.`name` AS `project__name`"
)


def test_select_star(mocker: MockFixture, app_context: AppContext) -> None:
"""
Test the ``select_star`` method.

The method removes pseudo-columns from structures inside arrays. While these
pseudo-columns show up as "columns" for metadata reasons, we can't select them
in the query, as opposed to fields from non-array structures.
"""
from superset.db_engine_specs.bigquery import BigQueryEngineSpec

cols = [
{
"name": "trailer",
"type": sqltypes.ARRAY(sqltypes.JSON()),
"nullable": True,
"comment": None,
"default": None,
"precision": None,
"scale": None,
"max_length": None,
},
{
"name": "trailer.key",
"type": sqltypes.String(),
"nullable": True,
"comment": None,
"default": None,
"precision": None,
"scale": None,
"max_length": None,
},
{
"name": "trailer.value",
"type": sqltypes.String(),
"nullable": True,
"comment": None,
"default": None,
"precision": None,
"scale": None,
"max_length": None,
},
{
"name": "trailer.email",
"type": sqltypes.String(),
"nullable": True,
"comment": None,
"default": None,
"precision": None,
"scale": None,
"max_length": None,
},
]

# mock the database so we can compile the query
database = mocker.MagicMock()
database.compile_sqla_query = lambda query: str(
query.compile(dialect=SQLiteDialect())
)

# use SQLite dialect so we don't need the BQ dependency
engine = mocker.MagicMock()
engine.dialect = SQLiteDialect()

sql = BigQueryEngineSpec.select_star(
database=database,
table_name="my_table",
engine=engine,
schema=None,
limit=100,
show_cols=True,
indent=True,
latest_partition=False,
cols=cols,
)
assert (
sql
== """SELECT trailer AS trailer
FROM my_table
LIMIT ?
OFFSET ?"""
)

# BigQuery-specific SQL
try:
from pybigquery.sqlalchemy_bigquery import BigQueryDialect
except ModuleNotFoundError:
return

database.compile_sqla_query = lambda query: str(
query.compile(dialect=BigQueryDialect())
)
engine.dialect = BigQueryDialect()

sql = BigQueryEngineSpec.select_star(
database=database,
table_name="my_table",
engine=engine,
schema=None,
limit=100,
show_cols=True,
indent=True,
latest_partition=False,
cols=cols,
)
assert (
sql
== """SELECT `trailer` AS `trailer`
FROM `my_table`
LIMIT :param_1"""
)