databricks · mgyucht · Aug 30, 2024 · Aug 29, 2024 · Aug 29, 2024 · Aug 30, 2024
diff --git a/databricks/sdk/core.py b/databricks/sdk/core.py
@@ -1,7 +1,5 @@
 import re
-import urllib.parse
 from datetime import timedelta
-from json import JSONDecodeError
 from types import TracebackType
 from typing import Any, BinaryIO, Iterator, Type
 from urllib.parse import urlencode
@@ -12,8 +10,8 @@
 from .config import *
 # To preserve backwards compatibility (as these definitions were previously in this module)
 from .credentials_provider import *
-from .errors import DatabricksError, error_mapper
-from .errors.private_link import _is_private_link_redirect
+from .errors import DatabricksError, get_api_error
+from .logger import RoundTrip
 from .oauth import retrieve_token
 from .retries import retried
 
@@ -262,134 +260,23 @@ def _perform(self,
                                          auth=auth,
                                          stream=raw,
                                          timeout=self._http_timeout_seconds)
-        try:
-            self._record_request_log(response, raw=raw or data is not None or files is not None)
-            if not response.ok: # internally calls response.raise_for_status()
-                # TODO: experiment with traceback pruning for better readability
-                # See https://stackoverflow.com/a/58821552/277035
-                payload = response.json()
-                raise self._make_nicer_error(response=response, **payload) from None
-            # Private link failures happen via a redirect to the login page. From a requests-perspective, the request
-            # is successful, but the response is not what we expect. We need to handle this case separately.
-            if _is_private_link_redirect(response):
-                raise self._make_nicer_error(response=response) from None
-            return response
-        except requests.exceptions.JSONDecodeError:
-            message = self._make_sense_from_html(response.text)
-            if not message:
-                message = response.reason
-            raise self._make_nicer_error(response=response, message=message) from None
-
-    @staticmethod
-    def _make_sense_from_html(txt: str) -> str:
-        matchers = [r'<pre>(.*)</pre>', r'<title>(.*)</title>']
-        for attempt in matchers:
-            expr = re.compile(attempt, re.MULTILINE)
-            match = expr.search(txt)
-            if not match:
-                continue
-            return match.group(1).strip()
-        return txt
-
-    def _make_nicer_error(self, *, response: requests.Response, **kwargs) -> DatabricksError:
-        status_code = response.status_code
-        message = kwargs.get('message', 'request failed')
-        is_http_unauthorized_or_forbidden = status_code in (401, 403)
-        is_too_many_requests_or_unavailable = status_code in (429, 503)
-        if is_http_unauthorized_or_forbidden:
-            message = self._cfg.wrap_debug_info(message)
-        if is_too_many_requests_or_unavailable:
-            kwargs['retry_after_secs'] = self._parse_retry_after(response)
-        kwargs['message'] = message
-        return error_mapper(response, kwargs)
-
-    def _record_request_log(self, response: requests.Response, raw=False):
+        self._record_request_log(response, raw=raw or data is not None or files is not None)
+        error = get_api_error(response)
+        if error is not None:
+            status_code = response.status_code
+            is_http_unauthorized_or_forbidden = status_code in (401, 403)
+            is_too_many_requests_or_unavailable = status_code in (429, 503)
+            if is_http_unauthorized_or_forbidden:
+                error.message = self._cfg.wrap_debug_info(error.message)
+            if is_too_many_requests_or_unavailable:
+                error.retry_after_secs = self._parse_retry_after(response)
+            raise error from None
+        return response
+
+    def _record_request_log(self, response: requests.Response, raw: bool = False) -> None:
         if not logger.isEnabledFor(logging.DEBUG):
             return
-        request = response.request
-        url = urllib.parse.urlparse(request.url)
-        query = ''
-        if url.query:
-            query = f'?{urllib.parse.unquote(url.query)}'
-        sb = [f'{request.method} {urllib.parse.unquote(url.path)}{query}']
-        if self._cfg.debug_headers:
-            if self._cfg.host:
-                sb.append(f'> * Host: {self._cfg.host}')
-            for k, v in request.headers.items():
-                sb.append(f'> * {k}: {self._only_n_bytes(v, self._debug_truncate_bytes)}')
-        if request.body:
-            sb.append("> [raw stream]" if raw else self._redacted_dump("> ", request.body))
-        sb.append(f'< {response.status_code} {response.reason}')
-        if raw and response.headers.get('Content-Type', None) != 'application/json':
-            # Raw streams with `Transfer-Encoding: chunked` do not have `Content-Type` header
-            sb.append("< [raw stream]")
-        elif response.content:
-            sb.append(self._redacted_dump("< ", response.content))
-        logger.debug("\n".join(sb))
-
-    @staticmethod
-    def _mask(m: Dict[str, any]):
-        for k in m:
-            if k in {'bytes_value', 'string_value', 'token_value', 'value', 'content'}:
-                m[k] = "**REDACTED**"
-
-    @staticmethod
-    def _map_keys(m: Dict[str, any]) -> List[str]:
-        keys = list(m.keys())
-        keys.sort()
-        return keys
-
-    @staticmethod
-    def _only_n_bytes(j: str, num_bytes: int = 96) -> str:
-        diff = len(j.encode('utf-8')) - num_bytes
-        if diff > 0:
-            return f"{j[:num_bytes]}... ({diff} more bytes)"
-        return j
-
-    def _recursive_marshal_dict(self, m, budget) -> dict:
-        out = {}
-        self._mask(m)
-        for k in sorted(m.keys()):
-            raw = self._recursive_marshal(m[k], budget)
-            out[k] = raw
-            budget -= len(str(raw))
-        return out
-
-    def _recursive_marshal_list(self, s, budget) -> list:
-        out = []
-        for i in range(len(s)):
-            if i > 0 >= budget:
-                out.append("... (%d additional elements)" % (len(s) - len(out)))
-                break
-            raw = self._recursive_marshal(s[i], budget)
-            out.append(raw)
-            budget -= len(str(raw))
-        return out
-
-    def _recursive_marshal(self, v: any, budget: int) -> any:
-        if isinstance(v, dict):
-            return self._recursive_marshal_dict(v, budget)
-        elif isinstance(v, list):
-            return self._recursive_marshal_list(v, budget)
-        elif isinstance(v, str):
-            return self._only_n_bytes(v, self._debug_truncate_bytes)
-        else:
-            return v
-
-    def _redacted_dump(self, prefix: str, body: str) -> str:
-        if len(body) == 0:
-            return ""
-        try:
-            # Unmarshal body into primitive types.
-            tmp = json.loads(body)
-            max_bytes = 96
-            if self._debug_truncate_bytes > max_bytes:
-                max_bytes = self._debug_truncate_bytes
-            # Re-marshal body taking redaction and character limit into account.
-            raw = self._recursive_marshal(tmp, max_bytes)
-            return "\n".join([f'{prefix}{line}' for line in json.dumps(raw, indent=2).split("\n")])
-        except JSONDecodeError:
-            return f'{prefix}[non-JSON document of {len(body)} bytes]'
+        logger.debug(RoundTrip(response, self._cfg.debug_headers, self._debug_truncate_bytes, raw).generate())
 
 
 class StreamingResponse(BinaryIO):

diff --git a/databricks/sdk/errors/__init__.py b/databricks/sdk/errors/__init__.py
@@ -1,5 +1,6 @@
 from .base import DatabricksError, ErrorDetail
-from .mapper import error_mapper
+from .mapper import _error_mapper
+from .parser import get_api_error
 from .platform import *
 from .private_link import PrivateLinkValidationError
 from .sdk import *
diff --git a/databricks/sdk/errors/base.py b/databricks/sdk/errors/base.py
@@ -1,4 +1,5 @@
 import re
+import warnings
 from dataclasses import dataclass
 from typing import Dict, List, Optional
 
@@ -41,9 +42,38 @@ def __init__(self,
                  retry_after_secs: int = None,
                  details: List[Dict[str, any]] = None,
                  **kwargs):
+        """
+
+        :param message:
+        :param error_code:
+        :param detail: [Deprecated]
+        :param status: [Deprecated]
+        :param scimType: [Deprecated]
+        :param error: [Deprecated]
+        :param retry_after_secs:
+        :param details:
+        :param kwargs:
+        """
+        # SCIM-specific parameters are deprecated
+        if detail:
+            warnings.warn(
+                "The 'detail' parameter of DatabricksError is deprecated and will be removed in a future version."
+            )
+        if scimType:
+            warnings.warn(
+                "The 'scimType' parameter of DatabricksError is deprecated and will be removed in a future version."
+            )
+        if status:
+            warnings.warn(
+                "The 'status' parameter of DatabricksError is deprecated and will be removed in a future version."
+            )
+
+        # API 1.2-specific parameters are deprecated
         if error:
-            # API 1.2 has different response format, let's adapt
-            message = error
+            warnings.warn(
+                "The 'error' parameter of DatabricksError is deprecated and will be removed in a future version."
+            )
+
         if detail:
             # Handle SCIM error message details
             # @see https://tools.ietf.org/html/rfc7644#section-3.7.3

diff --git a/databricks/sdk/errors/mapper.py b/databricks/sdk/errors/mapper.py
@@ -4,11 +4,9 @@
 from databricks.sdk.errors.base import DatabricksError
 
 from .overrides import _ALL_OVERRIDES
-from .private_link import (_get_private_link_validation_error,
-                           _is_private_link_redirect)
 
 
-def error_mapper(response: requests.Response, raw: dict) -> DatabricksError:
+def _error_mapper(response: requests.Response, raw: dict) -> DatabricksError:
     for override in _ALL_OVERRIDES:
         if override.matches(response, raw):
             return override.custom_error(**raw)
@@ -23,8 +21,6 @@ def error_mapper(response: requests.Response, raw: dict) -> DatabricksError:
         # where there's a default exception class per HTTP status code, and we do
         # rely on Databricks platform exception mapper to do the right thing.
         return platform.STATUS_CODE_MAPPING[status_code](**raw)
-    if _is_private_link_redirect(response):
-        return _get_private_link_validation_error(response.url)
 
     # backwards-compatible error creation for cases like using older versions of
     # the SDK on way never releases of the platform.

diff --git a/databricks/sdk/errors/parser.py b/databricks/sdk/errors/parser.py
@@ -0,0 +1,146 @@
+import abc
+import json
+import logging
+import re
+from typing import Optional
+
+import requests
+
+from ..logger import RoundTrip
+from .base import DatabricksError
+from .mapper import _error_mapper
+from .private_link import (_get_private_link_validation_error,
+                           _is_private_link_redirect)
+
+
+class _ErrorParser(abc.ABC):
+    """A parser for errors from the Databricks REST API."""
+
+    @abc.abstractmethod
+    def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
+        """Parses an error from the Databricks REST API. If the error cannot be parsed, returns None."""
+
+
+class _EmptyParser(_ErrorParser):
+    """A parser that handles empty responses."""
+
+    def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
+        if len(response_body) == 0:
+            return {'message': response.reason}
+        return None
+
+
+class _StandardErrorParser(_ErrorParser):
+    """
+    Parses errors from the Databricks REST API using the standard error format.
+    """
+
+    def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
+        try:
+            payload_str = response_body.decode('utf-8')
+            resp: dict = json.loads(payload_str)
+        except json.JSONDecodeError as e:
+            logging.debug('_StandardErrorParser: unable to deserialize response as json', exc_info=e)
+            return None
+
+        error_args = {
+            'message': resp.get('message', 'request failed'),
+            'error_code': resp.get('error_code'),
+            'details': resp.get('details'),
+        }
+
+        # Handle API 1.2-style errors
+        if 'error' in resp:
+            error_args['message'] = resp['error']
+
+        # Handle SCIM Errors
+        detail = resp.get('detail')
+        status = resp.get('status')
+        scim_type = resp.get('scimType')
+        if detail:
+            # Handle SCIM error message details
+            # @see https://tools.ietf.org/html/rfc7644#section-3.7.3
+            error_args[
+                'message'] = f"{scim_type} {error_args.get('message', 'SCIM API Internal Error')}".strip(" ")
+            error_args['error_code'] = f"SCIM_{status}"
+        return error_args
+
+
+class _StringErrorParser(_ErrorParser):
+    """
+    Parses errors from the Databricks REST API in the format "ERROR_CODE: MESSAGE".
+    """
+
+    __STRING_ERROR_REGEX = re.compile(r'([A-Z_]+): (.*)')
+
+    def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
+        payload_str = response_body.decode('utf-8')
+        match = self.__STRING_ERROR_REGEX.match(payload_str)
+        if not match:
+            logging.debug('_StringErrorParser: unable to parse response as string')
+            return None
+        error_code, message = match.groups()
+        return {'error_code': error_code, 'message': message, 'status': response.status_code, }
+
+
+class _HtmlErrorParser(_ErrorParser):
+    """
+    Parses errors from the Databricks REST API in HTML format.
+    """
+
+    __HTML_ERROR_REGEXES = [re.compile(r'<pre>(.*)</pre>'), re.compile(r'<title>(.*)</title>'), ]
+
+    def parse_error(self, response: requests.Response, response_body: bytes) -> Optional[dict]:
+        payload_str = response_body.decode('utf-8')
+        for regex in self.__HTML_ERROR_REGEXES:
+            match = regex.search(payload_str)
+            if match:
+                message = match.group(1) if match.group(1) else response.reason
+                return {
+                    'status': response.status_code,
+                    'message': message,
+                    'error_code': response.reason.upper().replace(' ', '_')
+                }
+        logging.debug('_HtmlErrorParser: no <pre> tag found in error response')
+        return None
+
+
+# A list of ErrorParsers that are tried in order to parse an API error from a response body. Most errors should be
+# parsable by the _StandardErrorParser, but additional parsers can be added here for specific error formats. The order
+# of the parsers is not important, as the set of errors that can be parsed by each parser should be disjoint.
+_error_parsers = [_EmptyParser(), _StandardErrorParser(), _StringErrorParser(), _HtmlErrorParser(), ]
+
+
+def _unknown_error(response: requests.Response) -> str:
+    """A standard error message that can be shown when an API response cannot be parsed.
+
+    This error message includes a link to the issue tracker for the SDK for users to report the issue to us.
+    """
+    request_log = RoundTrip(response, debug_headers=True, debug_truncate_bytes=10 * 1024).generate()
+    return (
+        'This is likely a bug in the Databricks SDK for Python or the underlying '
+        'API. Please report this issue with the following debugging information to the SDK issue tracker at '
+        f'https://github.com/databricks/databricks-sdk-go/issues. Request log:```{request_log}```')
+
+
+def get_api_error(response: requests.Response) -> Optional[DatabricksError]:
+    """
+    Handles responses from the REST API and returns a DatabricksError if the response indicates an error.
+    :param response: The response from the REST API.
+    :return: A DatabricksError if the response indicates an error, otherwise None.
+    """
+    if not response.ok:
+        content = response.content
+        for parser in _error_parsers:
+            try:
+                error_args = parser.parse_error(response, content)
+                if error_args:
+                    return _error_mapper(response, error_args)
+            except Exception as e:
+                logging.debug(f'Error parsing response with {parser}, continuing', exc_info=e)
+        return _error_mapper(response, {'message': 'unable to parse response. ' + _unknown_error(response)})
+
+    # Private link failures happen via a redirect to the login page. From a requests-perspective, the request
+    # is successful, but the response is not what we expect. We need to handle this case separately.
+    if _is_private_link_redirect(response):
+        return _get_private_link_validation_error(response.url)