Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(gsheets): file upload #24921

Merged
merged 7 commits into from
Aug 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements/development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ pylint==2.17.4
# via -r requirements/development.in
python-ldap==3.4.3
# via -r requirements/development.in
requests==2.30.0
requests==2.31.0
# via
# pydruid
# tableschema
Expand Down
2 changes: 1 addition & 1 deletion requirements/testing.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#
-r development.in
-r integration.in
-e file:.[bigquery,hive,presto,prophet,trino]
-e file:.[bigquery,hive,presto,prophet,trino,gsheets]
docker
flask-testing
freezegun
Expand Down
7 changes: 6 additions & 1 deletion requirements/testing.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SHA1:78fe89f88adf34ac75513d363d7d9d0b5cc8cd1c
# SHA1:78d0270a4f583095e0587aa21f57fc2ff7fe8b84
#
# This file is autogenerated by pip-compile-multi
# To update, run:
Expand All @@ -12,6 +12,8 @@
# -r requirements/base.in
# -r requirements/development.in
# -r requirements/testing.in
apsw==3.42.0.1
# via shillelagh
cmdstanpy==1.1.0
# via prophet
contourpy==1.0.7
Expand Down Expand Up @@ -50,6 +52,7 @@ google-auth==2.17.3
# google-cloud-core
# pandas-gbq
# pydata-google-auth
# shillelagh
# sqlalchemy-bigquery
google-auth-oauthlib==1.0.0
# via
Expand Down Expand Up @@ -142,6 +145,8 @@ rfc3339-validator==0.1.4
# via openapi-schema-validator
rsa==4.9
# via google-auth
shillelagh[gsheetsapi]==1.2.6
# via apache-superset
sqlalchemy-bigquery==1.6.1
# via apache-superset
statsd==4.0.1
Expand Down
162 changes: 147 additions & 15 deletions superset/db_engine_specs/gsheets.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,44 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import annotations

import json
import logging
import re
from re import Pattern
from typing import Any, Optional, TYPE_CHECKING
from typing import Any, TYPE_CHECKING

import pandas as pd
from apispec import APISpec
from apispec.ext.marshmallow import MarshmallowPlugin
from flask import g
from flask_babel import gettext as __
from marshmallow import fields, Schema
from marshmallow.exceptions import ValidationError
from requests import Session
from sqlalchemy.engine import create_engine
from sqlalchemy.engine.url import URL
from typing_extensions import TypedDict

from superset import security_manager
from superset import db, security_manager
from superset.constants import PASSWORD_MASK
from superset.databases.schemas import encrypted_field_properties, EncryptedString
from superset.db_engine_specs.shillelagh import ShillelaghEngineSpec
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.exceptions import SupersetException

if TYPE_CHECKING:
from superset.models.core import Database
from superset.sql_parse import Table

# Module logger; no name is passed to ``getLogger``, so this is the root logger.
_logger = logging.getLogger()

# Placeholder spreadsheet URL. ``df_to_sql`` hands it to Shillelagh when the
# target table has no catalog entry yet, because any GSheets URL is enough to
# obtain a working session.
EXAMPLE_GSHEETS_URL = (
"https://docs.google.com/spreadsheets/d/"
"1LcWZMsdCl92g7nA-D6qGRqg1T5TiHyuKJUY1u9XAnsk/edit#gid=0"
)

SYNTAX_ERROR_REGEX = re.compile('SQLError: near "(?P<server_error>.*?)": syntax error')

Expand All @@ -57,7 +71,7 @@ class GSheetsParametersSchema(Schema):

class GSheetsParametersType(TypedDict):
service_account_info: str
catalog: Optional[dict[str, str]]
catalog: dict[str, str] | None

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: why move away from Optional? I see Optional being used in several other TypedDict classes, so I want to confirm whether this is a pattern we aim to follow moving forward. Thanks.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We bumped the minimum Python version we support in Superset (I think to 3.9?), and in it we can use the syntax foo | None instead of Optional[foo]. This way we don't need to import Optional, and the type declaration looks more natural (for example, it's similar to Typescript's foo | null for an optional foo).

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh cool! Good to know moving forward then. Thanks @betodealmeida



class GSheetsPropertiesType(TypedDict):
Expand Down Expand Up @@ -88,14 +102,14 @@ class GSheetsEngineSpec(ShillelaghEngineSpec):
),
}

supports_file_upload = False
supports_file_upload = True

@classmethod
def get_url_for_impersonation(
cls,
url: URL,
impersonate_user: bool,
username: Optional[str],
username: str | None,
) -> URL:
if impersonate_user and username is not None:
user = security_manager.find_user(username=username)
Expand All @@ -107,9 +121,9 @@ def get_url_for_impersonation(
@classmethod
def extra_table_metadata(
cls,
database: "Database",
database: Database,
table_name: str,
schema_name: Optional[str],
schema_name: str | None,
) -> dict[str, Any]:
with database.get_raw_connection(schema=schema_name) as conn:
cursor = conn.cursor()
Expand All @@ -126,17 +140,16 @@ def extra_table_metadata(
def build_sqlalchemy_uri(
cls,
_: GSheetsParametersType,
encrypted_extra: Optional[ # pylint: disable=unused-argument
dict[str, Any]
] = None,
encrypted_extra: None # pylint: disable=unused-argument
| (dict[str, Any]) = None,
) -> str:
return "gsheets://"

@classmethod
def get_parameters_from_uri(
cls,
uri: str, # pylint: disable=unused-argument
encrypted_extra: Optional[dict[str, Any]] = None,
encrypted_extra: dict[str, Any] | None = None,
) -> Any:
# Building parameters from encrypted_extra and uri
if encrypted_extra:
Expand All @@ -145,7 +158,7 @@ def get_parameters_from_uri(
raise ValidationError("Invalid service credentials")

@classmethod
def mask_encrypted_extra(cls, encrypted_extra: Optional[str]) -> Optional[str]:
def mask_encrypted_extra(cls, encrypted_extra: str | None) -> str | None:
if encrypted_extra is None:
return encrypted_extra

Expand All @@ -162,9 +175,7 @@ def mask_encrypted_extra(cls, encrypted_extra: Optional[str]) -> Optional[str]:
return json.dumps(config)

@classmethod
def unmask_encrypted_extra(
cls, old: Optional[str], new: Optional[str]
) -> Optional[str]:
def unmask_encrypted_extra(cls, old: str | None, new: str | None) -> str | None:
"""
Reuse ``private_key`` if available and unchanged.
"""
Expand Down Expand Up @@ -299,3 +310,124 @@ def validate_parameters(
)
idx += 1
return errors

@staticmethod
def _do_post(
    session: Session,
    url: str,
    body: dict[str, Any],
    **kwargs: Any,
) -> dict[str, Any]:
    """
    POST to the Google API.

    Helper function that handles logging and error handling.

    :param session: session used to send the request
    :param url: endpoint to POST to
    :param body: request payload, sent JSON-encoded
    :param kwargs: extra keyword arguments forwarded to ``session.post``
    :returns: the decoded JSON payload of the response
    :raises SupersetException: if the response body is not valid JSON, or if
        the payload carries an ``error`` entry
    """
    _logger.info("POST %s", url)
    _logger.debug(body)
    response = session.post(
        url,
        json=body,
        **kwargs,
    )

    try:
        payload = response.json()
    except ValueError as ex:
        # JSON decode errors are ``ValueError`` subclasses; surface them as a
        # Superset error instead of leaking a raw decode exception.
        raise SupersetException("Invalid JSON response from the Google API") from ex
    _logger.debug(payload)

    if "error" in payload:
        error = payload["error"]
        # The error entry is normally a dict with a ``message`` key, but do not
        # crash with KeyError/TypeError if it is a bare string or lacks it.
        message = error.get("message") if isinstance(error, dict) else None
        raise SupersetException(message or str(error))

    return payload

@classmethod
def df_to_sql(  # pylint: disable=too-many-locals
    cls,
    database: Database,
    table: Table,
    df: pd.DataFrame,
    to_sql_kwargs: dict[str, Any],
) -> None:
    """
    Create a new sheet and update the DB catalog.

    Since Google Sheets is not a database, uploading a file is slightly different
    from other traditional databases. To create a table with a given name we first
    create a spreadsheet with the contents of the dataframe, and we later update the
    database catalog to add a mapping between the desired table name and the URL of
    the new sheet.

    If the table already exists and the user wants it replaced we clear all the
    cells in the existing sheet before uploading the new data. Appending to an
    existing table is not supported because we can't ensure that the schemas match.

    The dataframe's column names are written as the first row of the sheet so
    that they are not lost on upload.

    :param database: database whose ``extra`` holds the GSheets catalog
    :param table: target table; ``table.table`` is the catalog key and, for a
        new spreadsheet, its title
    :param df: data to upload
    :param to_sql_kwargs: only ``if_exists`` is consulted, and only when the
        table is already in the catalog
    :raises SupersetException: if the table exists and ``if_exists`` is
        ``"append"`` or ``"fail"``, or if a Google API call reports an error
    """
    # pylint: disable=import-outside-toplevel
    from shillelagh.backends.apsw.dialects.base import get_adapter_for_table_name

    # grab the existing catalog, if any
    extra = database.get_extra()
    engine_params = extra.setdefault("engine_params", {})
    catalog = engine_params.setdefault("catalog", {})

    # sanity checks
    spreadsheet_url = catalog.get(table.table)
    if spreadsheet_url:
        if_exists = to_sql_kwargs.get("if_exists")
        if if_exists == "append":
            # no way we're going to append a dataframe to a spreadsheet, that's
            # never going to work
            raise SupersetException("Append operation not currently supported")
        if if_exists == "fail":
            raise SupersetException("Table already exists")
        # "replace" -- or no "if_exists" at all -- falls through and the
        # existing sheet is cleared below.

    # get the Google session from the Shillelagh adapter
    with cls.get_engine(database) as engine, engine.connect() as conn:
        # any GSheets URL will work to get a working session
        adapter = get_adapter_for_table_name(
            conn,
            spreadsheet_url or EXAMPLE_GSHEETS_URL,
        )
        session = adapter._get_session()  # pylint: disable=protected-access

    # clear existing sheet, or create a new one
    if spreadsheet_url:
        spreadsheet_id = adapter._spreadsheet_id  # pylint: disable=protected-access
        range_ = adapter._sheet_name  # pylint: disable=protected-access
        url = (
            "https://sheets.googleapis.com/v4/spreadsheets/"
            f"{spreadsheet_id}/values/{range_}:clear"
        )
        cls._do_post(session, url, {})
    else:
        payload = cls._do_post(
            session,
            "https://sheets.googleapis.com/v4/spreadsheets",
            {"properties": {"title": table.table}},
        )
        spreadsheet_id = payload["spreadsheetId"]
        range_ = payload["sheets"][0]["properties"]["title"]
        spreadsheet_url = payload["spreadsheetUrl"]

    # insert data; ``df.values`` holds only the cells, so the header row has
    # to be prepended explicitly or the column names never reach the sheet
    rows = [df.columns.tolist(), *df.fillna("").values.tolist()]
    body = {
        "range": range_,
        "majorDimension": "ROWS",
        "values": rows,
    }
    url = (
        "https://sheets.googleapis.com/v4/spreadsheets/"
        f"{spreadsheet_id}/values/{range_}:append"
    )
    cls._do_post(
        session,
        url,
        body,
        params={"valueInputOption": "USER_ENTERED"},
    )

    # update catalog
    catalog[table.table] = spreadsheet_url
    database.extra = json.dumps(extra)
    db.session.add(database)
    db.session.commit()
2 changes: 1 addition & 1 deletion tests/integration_tests/databases/api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3153,7 +3153,7 @@ def test_available(self, app, get_available_engine_specs):
"preferred": False,
"sqlalchemy_uri_placeholder": "gsheets://",
"engine_information": {
"supports_file_upload": False,
"supports_file_upload": True,
"disable_ssh_tunneling": True,
},
},
Expand Down
4 changes: 2 additions & 2 deletions tests/unit_tests/databases/api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_database_connection(
"driver": "gsheets",
"engine_information": {
"disable_ssh_tunneling": True,
"supports_file_upload": False,
"supports_file_upload": True,
},
"expose_in_sqllab": True,
"extra": '{\n "metadata_params": {},\n "engine_params": {},\n "metadata_cache_timeout": {},\n "schemas_allowed_for_file_upload": []\n}\n',
Expand Down Expand Up @@ -249,7 +249,7 @@ def test_database_connection(
"driver": "gsheets",
"engine_information": {
"disable_ssh_tunneling": True,
"supports_file_upload": False,
"supports_file_upload": True,
},
"expose_in_sqllab": True,
"force_ctas_schema": None,
Expand Down
Loading