From 514ecf85822c748047646aefd965e619a1bef59f Mon Sep 17 00:00:00 2001 From: Lingling <55448354+linglp@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:10:27 -0500 Subject: [PATCH] [SCHEMATIC-240] stripped google sheet information from open telemetry span status and message (#1573) * catch google http error and sanitized error message * remove unnecessary code * remove print * remove unused imports * add comment * add a custom log processor to remove sensitive information in the log * add span processor * use export * patch the _readable_span method in span class and remove processor * add test; move function to util * add module docstring and fix syntax * remove try except in code block * remove unnecessary import * added the space back * revert space changes * remove sensitive info in status description * modify attribute directly * remove unused imports * strip sensitive info in log in a consistent way * remove unused import * update regex to accommodate different version of google sheet * update comments --- schematic/__init__.py | 74 ++++++++++++++++++- .../utils/remove_sensitive_data_utils.py | 45 +++++++++++ tests/unit/test_filter_sensitive_data.py | 41 ++++++++++ 3 files changed, 157 insertions(+), 3 deletions(-) create mode 100644 schematic/utils/remove_sensitive_data_utils.py create mode 100644 tests/unit/test_filter_sensitive_data.py diff --git a/schematic/__init__.py b/schematic/__init__.py index 6cc29fb5c..7fbbb7e60 100644 --- a/schematic/__init__.py +++ b/schematic/__init__.py @@ -8,7 +8,12 @@ from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.flask import FlaskInstrumentor -from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs import ( + LoggerProvider, + LoggingHandler, + LogRecordProcessor, + LogRecord, +) from opentelemetry.sdk._logs.export import
class CustomFilter(LogRecordProcessor):
    """Log record processor that redacts sensitive data from log bodies.

    Only the body of records passing through ``emit`` is rewritten;
    ``force_flush`` and ``shutdown`` are delegated to the exporter supplied
    at construction time.
    """

    def __init__(self, exporter):
        """Store the exporter that flush and shutdown calls are forwarded to.

        Args:
            exporter: object exposing ``force_flush(timeout_millis)`` and
                ``shutdown()``.
        """
        self._exporter = exporter
        self._shutdown = False

    def emit(self, log_record: LogRecord) -> None:
        """Redact Google API URLs from the log body before it is exported.

        Args:
            log_record: wrapper whose ``log_record.body`` holds the message.
                NOTE(review): the annotation says ``LogRecord`` but the
                ``.log_record`` access suggests the SDK's wrapper type
                (``LogData``) -- confirm against the installed SDK version.
        """
        record = log_record.log_record
        body = record.body
        # The body is not guaranteed to be a string. A substring test on an
        # int raises TypeError, and on a dict/list it silently becomes a
        # key/membership test, so only string bodies are inspected.
        if isinstance(body, str) and "googleapis" in body:
            record.body = redact_string(body)

    def force_flush(self, timeout_millis=30000) -> bool:
        """Forward the flush request to the wrapped exporter.

        Args:
            timeout_millis: maximum time to wait, in milliseconds.

        Returns:
            bool: whatever the exporter's ``force_flush`` returns.
        """
        return self._exporter.force_flush(timeout_millis)

    def shutdown(self) -> None:
        """Shut down the wrapped exporter; repeated calls are no-ops."""
        # Guard against double shutdown so the exporter is only closed once
        # by this processor.
        if self._shutdown:
            return
        self._shutdown = True
        self._exporter.shutdown()
# Keep a handle on the SDK's own implementation so the patched method can
# delegate to it once the span has been sanitized.
original_function_readable_span = SpanSdk._readable_span


def _readable_span_alternate(self: SpanSdk) -> ReadableSpan:
    """Replacement for ``Span._readable_span`` that redacts sensitive data.

    The span's status description, event names and event attributes are
    sanitized in place, then the original SDK method builds the readable span.

    Args:
        self (SpanSdk): the span instance whose readable copy is requested

    Returns:
        ReadableSpan: readable span produced by the original method from the
        sanitized span
    """
    # Remove sensitive information from the span status description
    # to prevent exposing sensitive details in the statusMessage on SigNoz.
    status = self._status
    # A missing description must stay None: str(None) would otherwise put the
    # literal text "None" on every error span without a description.
    if (
        status.status_code == trace.StatusCode.ERROR
        and status.description is not None
    ):
        status._description = redact_string(str(status.description))

    # Remove sensitive information in event names and attributes
    # to prevent exposing sensitive details in exception traces and messages.
    for event in self._events:
        event._name = redact_string(event.name)
        attributes = event.attributes
        # Events without attributes have nothing to redact.
        if attributes is not None:
            event._attributes = redacted_sensitive_data_in_exception(attributes)
    return original_function_readable_span(self)
"""Utilities for removing sensitive data from strings."""
import re
from typing import Any, Dict, Mapping, Optional

# Patterns are compiled once at import time instead of on every call.
# The key names the replacement token: "google_sheets" becomes
# "[REDACTED_GOOGLE_SHEETS]".
_SENSITIVE_PATTERNS = {
    "google_sheets": re.compile(
        r"https://sheets\.googleapis\.com/v\d+/spreadsheets/[\w-]+"
    ),
}

# Exception attributes whose values are scanned for sensitive data.
_REDACTED_EXCEPTION_KEYS = frozenset({"exception.message", "exception.stacktrace"})


def redact_string(value: str) -> str:
    """Replace every sensitive pattern found in a string with a placeholder.

    Args:
        value (str): a string that may contain sensitive data

    Returns:
        str: the string with each match replaced by ``[REDACTED_<NAME>]``
    """
    redacted = value
    for pattern_name, pattern in _SENSITIVE_PATTERNS.items():
        redacted = pattern.sub(f"[REDACTED_{pattern_name.upper()}]", redacted)
    return redacted


def redacted_sensitive_data_in_exception(
    exception_attributes: Optional[Mapping[str, Any]]
) -> Dict[str, Any]:
    """Return a copy of exception attributes with sensitive data redacted.

    Only ``exception.message`` and ``exception.stacktrace`` are scanned; all
    other keys, and any non-string values, are copied through unchanged. The
    input mapping is never modified.

    Args:
        exception_attributes (dict): a mapping of exception attributes; an
            empty mapping or ``None`` yields an empty dict

    Returns:
        dict: a new dictionary of exception attributes with sensitive data
        redacted
    """
    if not exception_attributes:
        return {}
    return {
        key: (
            redact_string(value)
            if key in _REDACTED_EXCEPTION_KEYS and isinstance(value, str)
            else value
        )
        for key, value in exception_attributes.items()
    }
"""Unit tests for the sensitive-data redaction helpers."""
import pytest

from schematic.utils.remove_sensitive_data_utils import (
    redact_string,
    redacted_sensitive_data_in_exception,
)

# Shared fixtures: an error line that embeds a Google Sheets API URL, and the
# same line after redaction.
_ERROR_PREFIX = "googleapiclient.errors.HttpError: "
_RAW_ERROR = (
    _ERROR_PREFIX
    + "<HttpError 400 when requesting "
    + "https://sheets.googleapis.com/v4/spreadsheets/11234budyhf>"
)
_REDACTED_ERROR = (
    _ERROR_PREFIX + "<HttpError 400 when requesting [REDACTED_GOOGLE_SHEETS]>"
)
_TRACEBACK = (
    'Traceback (most recent call last):\n File "", line 1, in \n'
    ' File "", line 1, in \n'
    ' File "/usr/local/lib/python3.7/dist-packages/googleapiclient/_helpers.py",'
    " line 134, in positional_wrapper\n return wrapped(*args, **kwargs)\n"
    ' File "/usr/local/lib/python3.7/dist-packages/googleapiclient/http.py",'
    " line 905, in execute\n raise HttpError(resp, content, uri=self.uri)\n"
)


class TestFilterSensitiveData:
    """Checks that Google Sheets URLs are stripped from strings and exceptions."""

    @pytest.mark.parametrize(
        "test_google_sheet",
        [
            # Each case uses a different API version and spreadsheet id so the
            # regex is exercised beyond a single hard-coded URL.
            _ERROR_PREFIX
            + "<HttpError 400 when requesting "
            + "https://sheets.googleapis.com/v4/spreadsheets/11234budyhf>",
            _ERROR_PREFIX
            + "<HttpError 400 when requesting "
            + "https://sheets.googleapis.com/v3/spreadsheets/abc_DEF-456>",
            _ERROR_PREFIX
            + "<HttpError 400 when requesting "
            + "https://sheets.googleapis.com/v10/spreadsheets/x>",
        ],
    )
    def test_redact_string(self, test_google_sheet) -> None:
        """The sheet URL is replaced regardless of API version or id."""
        # given a string with sensitive data, make sure that they are redacted
        redacted_data = redact_string(test_google_sheet)
        assert redacted_data == _REDACTED_ERROR

    def test_redacted_sensitive_data_in_exception(self) -> None:
        """Message and stacktrace are redacted; other keys pass through."""
        # given a dictionary of exception attributes, make sure that sensitive data is redacted
        exception_attributes = {
            "exception.message": _RAW_ERROR,
            "exception.stacktrace": _TRACEBACK + _RAW_ERROR,
            "exception.type": "googleapiclient.errors.HttpError",
        }
        redacted_exception_attributes = redacted_sensitive_data_in_exception(
            exception_attributes
        )
        assert redacted_exception_attributes["exception.message"] == _REDACTED_ERROR
        assert (
            redacted_exception_attributes["exception.stacktrace"]
            == _TRACEBACK + _REDACTED_ERROR
        )
        # keys outside message/stacktrace are copied unchanged
        assert (
            redacted_exception_attributes["exception.type"]
            == "googleapiclient.errors.HttpError"
        )