Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Fix] Handle non-JSON errors gracefully #741

Merged
merged 15 commits into from
Aug 30, 2024
147 changes: 17 additions & 130 deletions databricks/sdk/core.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import re
import urllib.parse
from datetime import timedelta
from json import JSONDecodeError
from types import TracebackType
from typing import Any, BinaryIO, Iterator, Type
from urllib.parse import urlencode
Expand All @@ -12,8 +10,8 @@
from .config import *
# To preserve backwards compatibility (as these definitions were previously in this module)
from .credentials_provider import *
from .errors import DatabricksError, error_mapper
from .errors.private_link import _is_private_link_redirect
from .errors import DatabricksError, get_api_error
from .logger import RoundTrip
from .oauth import retrieve_token
from .retries import retried

Expand Down Expand Up @@ -262,134 +260,23 @@ def _perform(self,
auth=auth,
stream=raw,
timeout=self._http_timeout_seconds)
try:
self._record_request_log(response, raw=raw or data is not None or files is not None)
if not response.ok: # internally calls response.raise_for_status()
# TODO: experiment with traceback pruning for better readability
# See https://stackoverflow.com/a/58821552/277035
payload = response.json()
raise self._make_nicer_error(response=response, **payload) from None
# Private link failures happen via a redirect to the login page. From a requests-perspective, the request
# is successful, but the response is not what we expect. We need to handle this case separately.
if _is_private_link_redirect(response):
raise self._make_nicer_error(response=response) from None
return response
except requests.exceptions.JSONDecodeError:
message = self._make_sense_from_html(response.text)
if not message:
message = response.reason
raise self._make_nicer_error(response=response, message=message) from None

@staticmethod
def _make_sense_from_html(txt: str) -> str:
matchers = [r'<pre>(.*)</pre>', r'<title>(.*)</title>']
for attempt in matchers:
expr = re.compile(attempt, re.MULTILINE)
match = expr.search(txt)
if not match:
continue
return match.group(1).strip()
return txt

def _make_nicer_error(self, *, response: requests.Response, **kwargs) -> DatabricksError:
status_code = response.status_code
message = kwargs.get('message', 'request failed')
is_http_unauthorized_or_forbidden = status_code in (401, 403)
is_too_many_requests_or_unavailable = status_code in (429, 503)
if is_http_unauthorized_or_forbidden:
message = self._cfg.wrap_debug_info(message)
if is_too_many_requests_or_unavailable:
kwargs['retry_after_secs'] = self._parse_retry_after(response)
kwargs['message'] = message
return error_mapper(response, kwargs)
Comment on lines -283 to -304
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are moved to databricks/sdk/error/parser.py, albeit in different form.


def _record_request_log(self, response: requests.Response, raw=False):
self._record_request_log(response, raw=raw or data is not None or files is not None)
error = get_api_error(response)
if error is not None:
status_code = response.status_code
is_http_unauthorized_or_forbidden = status_code in (401, 403)
is_too_many_requests_or_unavailable = status_code in (429, 503)
if is_http_unauthorized_or_forbidden:
error.message = self._cfg.wrap_debug_info(error.message)
if is_too_many_requests_or_unavailable:
error.retry_after_secs = self._parse_retry_after(response)
raise error from None
Comment on lines +266 to +273
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These used to be added in _make_nicer_error. Now, _get_api_error is basically identical to GetApiError in the Go SDK. This divergent codepath remains thus in the ApiClient to prevent any breaking changes.

return response

def _record_request_log(self, response: requests.Response, raw: bool = False) -> None:
if not logger.isEnabledFor(logging.DEBUG):
return
request = response.request
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation and remaining functions are moved to databricks/sdk/logger/round_trip_logger.py.

url = urllib.parse.urlparse(request.url)
query = ''
if url.query:
query = f'?{urllib.parse.unquote(url.query)}'
sb = [f'{request.method} {urllib.parse.unquote(url.path)}{query}']
if self._cfg.debug_headers:
if self._cfg.host:
sb.append(f'> * Host: {self._cfg.host}')
for k, v in request.headers.items():
sb.append(f'> * {k}: {self._only_n_bytes(v, self._debug_truncate_bytes)}')
if request.body:
sb.append("> [raw stream]" if raw else self._redacted_dump("> ", request.body))
sb.append(f'< {response.status_code} {response.reason}')
if raw and response.headers.get('Content-Type', None) != 'application/json':
# Raw streams with `Transfer-Encoding: chunked` do not have `Content-Type` header
sb.append("< [raw stream]")
elif response.content:
sb.append(self._redacted_dump("< ", response.content))
logger.debug("\n".join(sb))

@staticmethod
def _mask(m: Dict[str, any]):
for k in m:
if k in {'bytes_value', 'string_value', 'token_value', 'value', 'content'}:
m[k] = "**REDACTED**"

@staticmethod
def _map_keys(m: Dict[str, any]) -> List[str]:
keys = list(m.keys())
keys.sort()
return keys

@staticmethod
def _only_n_bytes(j: str, num_bytes: int = 96) -> str:
diff = len(j.encode('utf-8')) - num_bytes
if diff > 0:
return f"{j[:num_bytes]}... ({diff} more bytes)"
return j

def _recursive_marshal_dict(self, m, budget) -> dict:
out = {}
self._mask(m)
for k in sorted(m.keys()):
raw = self._recursive_marshal(m[k], budget)
out[k] = raw
budget -= len(str(raw))
return out

def _recursive_marshal_list(self, s, budget) -> list:
out = []
for i in range(len(s)):
if i > 0 >= budget:
out.append("... (%d additional elements)" % (len(s) - len(out)))
break
raw = self._recursive_marshal(s[i], budget)
out.append(raw)
budget -= len(str(raw))
return out

def _recursive_marshal(self, v: any, budget: int) -> any:
if isinstance(v, dict):
return self._recursive_marshal_dict(v, budget)
elif isinstance(v, list):
return self._recursive_marshal_list(v, budget)
elif isinstance(v, str):
return self._only_n_bytes(v, self._debug_truncate_bytes)
else:
return v

def _redacted_dump(self, prefix: str, body: str) -> str:
if len(body) == 0:
return ""
try:
# Unmarshal body into primitive types.
tmp = json.loads(body)
max_bytes = 96
if self._debug_truncate_bytes > max_bytes:
max_bytes = self._debug_truncate_bytes
# Re-marshal body taking redaction and character limit into account.
raw = self._recursive_marshal(tmp, max_bytes)
return "\n".join([f'{prefix}{line}' for line in json.dumps(raw, indent=2).split("\n")])
except JSONDecodeError:
return f'{prefix}[non-JSON document of {len(body)} bytes]'
logger.debug(RoundTrip(response, self._cfg.debug_headers, self._debug_truncate_bytes, raw).generate())


class StreamingResponse(BinaryIO):
Expand Down
3 changes: 2 additions & 1 deletion databricks/sdk/errors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .base import DatabricksError, ErrorDetail
from .mapper import error_mapper
from .mapper import _error_mapper
from .parser import get_api_error
from .platform import *
from .private_link import PrivateLinkValidationError
from .sdk import *
34 changes: 32 additions & 2 deletions databricks/sdk/errors/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional

Expand Down Expand Up @@ -41,9 +42,38 @@ def __init__(self,
retry_after_secs: int = None,
details: List[Dict[str, any]] = None,
**kwargs):
"""

:param message:
:param error_code:
:param detail: [Deprecated]
:param status: [Deprecated]
:param scimType: [Deprecated]
:param error: [Deprecated]
:param retry_after_secs:
:param details:
:param kwargs:
"""
# SCIM-specific parameters are deprecated
if detail:
warnings.warn(
"The 'detail' parameter of DatabricksError is deprecated and will be removed in a future version."
)
if scimType:
warnings.warn(
"The 'scimType' parameter of DatabricksError is deprecated and will be removed in a future version."
)
if status:
warnings.warn(
"The 'status' parameter of DatabricksError is deprecated and will be removed in a future version."
)

# API 1.2-specific parameters are deprecated
if error:
# API 1.2 has different response format, let's adapt
message = error
warnings.warn(
"The 'error' parameter of DatabricksError is deprecated and will be removed in a future version."
)

if detail:
# Handle SCIM error message details
# @see https://tools.ietf.org/html/rfc7644#section-3.7.3
Expand Down
6 changes: 1 addition & 5 deletions databricks/sdk/errors/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
from databricks.sdk.errors.base import DatabricksError

from .overrides import _ALL_OVERRIDES
from .private_link import (_get_private_link_validation_error,
_is_private_link_redirect)


def error_mapper(response: requests.Response, raw: dict) -> DatabricksError:
def _error_mapper(response: requests.Response, raw: dict) -> DatabricksError:
for override in _ALL_OVERRIDES:
if override.matches(response, raw):
return override.custom_error(**raw)
Expand All @@ -23,8 +21,6 @@ def error_mapper(response: requests.Response, raw: dict) -> DatabricksError:
# where there's a default exception class per HTTP status code, and we do
# rely on Databricks platform exception mapper to do the right thing.
return platform.STATUS_CODE_MAPPING[status_code](**raw)
if _is_private_link_redirect(response):
return _get_private_link_validation_error(response.url)
Comment on lines -26 to -27
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved to _get_api_error.


# backwards-compatible error creation for cases like using older versions of
# the SDK on way never releases of the platform.
Expand Down
146 changes: 146 additions & 0 deletions databricks/sdk/errors/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import abc
import json
import logging
import re
from typing import Optional

import requests

from ..logger import RoundTrip
from .base import DatabricksError
from .mapper import _error_mapper
from .private_link import (_get_private_link_validation_error,
_is_private_link_redirect)


class _ErrorParser(abc.ABC):
"""A parser for errors from the Databricks REST API."""

@abc.abstractmethod
def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
"""Parses an error from the Databricks REST API. If the error cannot be parsed, returns None."""


class _EmptyParser(_ErrorParser):
"""A parser that handles empty responses."""

def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
if len(response_body) == 0:
return {'message': response.reason}
return None


class _StandardErrorParser(_ErrorParser):
"""
Parses errors from the Databricks REST API using the standard error format.
"""

def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
try:
payload_str = response_body.decode('utf-8')
resp: dict = json.loads(payload_str)
except json.JSONDecodeError as e:
logging.debug('_StandardErrorParser: unable to deserialize response as json', exc_info=e)
return None

error_args = {
'message': resp.get('message', 'request failed'),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is taken from _make_nicer_error().

'error_code': resp.get('error_code'),
'details': resp.get('details'),
}

# Handle API 1.2-style errors
if 'error' in resp:
error_args['message'] = resp['error']

# Handle SCIM Errors
detail = resp.get('detail')
status = resp.get('status')
scim_type = resp.get('scimType')
if detail:
# Handle SCIM error message details
# @see https://tools.ietf.org/html/rfc7644#section-3.7.3
error_args[
'message'] = f"{scim_type} {error_args.get('message', 'SCIM API Internal Error')}".strip(" ")
error_args['error_code'] = f"SCIM_{status}"
Comment on lines +52 to +65
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is moved here from the DatabricksError constructor.

return error_args


class _StringErrorParser(_ErrorParser):
"""
Parses errors from the Databricks REST API in the format "ERROR_CODE: MESSAGE".
"""

__STRING_ERROR_REGEX = re.compile(r'([A-Z_]+): (.*)')

def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
payload_str = response_body.decode('utf-8')
match = self.__STRING_ERROR_REGEX.match(payload_str)
if not match:
logging.debug('_StringErrorParser: unable to parse response as string')
renaudhartert-db marked this conversation as resolved.
Show resolved Hide resolved
return None
error_code, message = match.groups()
return {'error_code': error_code, 'message': message, 'status': response.status_code, }


class _HtmlErrorParser(_ErrorParser):
"""
Parses errors from the Databricks REST API in HTML format.
"""

__HTML_ERROR_REGEXES = [re.compile(r'<pre>(.*)</pre>'), re.compile(r'<title>(.*)</title>'), ]

def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
payload_str = response_body.decode('utf-8')
for regex in self.__HTML_ERROR_REGEXES:
match = regex.search(payload_str)
if match:
message = match.group(1) if match.group(1) else response.reason
return {
'status': response.status_code,
'message': message,
'error_code': response.reason.upper().replace(' ', '_')
}
logging.debug('_HtmlErrorParser: no <pre> tag found in error response')
return None
Comment on lines +91 to +105
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic is meant to replicate _make_sense_from_html.



# A list of ErrorParsers that are tried in order to parse an API error from a response body. Most errors should be
# parsable by the _StandardErrorParser, but additional parsers can be added here for specific error formats. The order
# of the parsers is not important, as the set of errors that can be parsed by each parser should be disjoint.
_error_parsers = [_EmptyParser(), _StandardErrorParser(), _StringErrorParser(), _HtmlErrorParser(), ]


def _unknown_error(response: requests.Response) -> str:
"""A standard error message that can be shown when an API response cannot be parsed.

This error message includes a link to the issue tracker for the SDK for users to report the issue to us.
"""
request_log = RoundTrip(response, debug_headers=True, debug_truncate_bytes=10 * 1024).generate()
return (
'This is likely a bug in the Databricks SDK for Python or the underlying '
'API. Please report this issue with the following debugging information to the SDK issue tracker at '
f'https://github.com/databricks/databricks-sdk-go/issues. Request log:```{request_log}```')


def get_api_error(response: requests.Response) -> Optional[DatabricksError]:
"""
Handles responses from the REST API and returns a DatabricksError if the response indicates an error.
:param response: The response from the REST API.
:return: A DatabricksError if the response indicates an error, otherwise None.
"""
if not response.ok:
content = response.content
for parser in _error_parsers:
try:
error_args = parser.parse_error(response, content)
if error_args:
return _error_mapper(response, error_args)
except Exception as e:
logging.debug(f'Error parsing response with {parser}, continuing', exc_info=e)
return _error_mapper(response, {'message': 'unable to parse response. ' + _unknown_error(response)})

# Private link failures happen via a redirect to the login page. From a requests-perspective, the request
# is successful, but the response is not what we expect. We need to handle this case separately.
if _is_private_link_redirect(response):
return _get_private_link_validation_error(response.url)
Loading
Loading