Skip to content

Commit

Permalink
[SCHEMATIC-240] stripped google sheet information from open telemetry…
Browse files Browse the repository at this point in the history
… span status and message (#1573)

* catch google http error and sanitize error message

* remove unnecessary code

* remove print

* remove unused imports

* add comment

* add a custom log processor to remove sensitive information in the log

* add span processor

* use export

* patch the _readable_span method in span class and remove processor

* add test; move function to util

* add module docstring and fix syntax

* remove try except in code block

* remove unnecessary import

* added the space back

* revert space changes

* remove sensitive info in status description

* modify attribute directly

* remove unused imports

* strip sensitive info in log in a consistent way

* remove unused import

* update regex to accommodate different versions of google sheet

* update comments
  • Loading branch information
linglp authored Feb 24, 2025
1 parent 200244f commit 514ecf8
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 3 deletions.
74 changes: 71 additions & 3 deletions schematic/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs import (
LoggerProvider,
LoggingHandler,
LogRecordProcessor,
LogRecord,
)
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import (
DEPLOYMENT_ENVIRONMENT,
Expand All @@ -17,9 +22,14 @@
SERVICE_VERSION,
Resource,
)
from opentelemetry.sdk.trace import TracerProvider, SpanProcessor
from opentelemetry.sdk.trace import (
TracerProvider,
SpanProcessor,
ReadableSpan,
Span as SpanSdk,
)
from opentelemetry.trace import Span, SpanContext, get_current_span
from opentelemetry.sdk.trace.export import BatchSpanProcessor, Span
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.sampling import ALWAYS_OFF
from synapseclient import Synapse, USER_AGENT
from werkzeug import Request
Expand All @@ -28,6 +38,10 @@
from schematic.loader import LOADER
from schematic.version import __version__
from dotenv import load_dotenv
from schematic.utils.remove_sensitive_data_utils import (
redact_string,
redacted_sensitive_data_in_exception,
)

Synapse.allow_client_caching(False)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -95,6 +109,29 @@ def force_flush(self, timeout_millis: int = 30000) -> None:
pass


class CustomFilter(LogRecordProcessor):
    """A custom log record processor that redacts sensitive data from log messages.

    The processor only rewrites the body of the record it is given; it does not
    export anything itself. The exporter is kept so that ``force_flush`` and
    ``shutdown`` can be forwarded to it.
    """

    def __init__(self, exporter):
        """Store the exporter that flush and shutdown requests are forwarded to.

        Args:
            exporter: the log exporter; must provide ``force_flush`` and ``shutdown``
        """
        self._exporter = exporter
        self._shutdown = False

    def emit(self, log_record: LogRecord) -> None:
        """Redact sensitive data in the log message (body) in place.

        Args:
            log_record: the object handed to the processor; the record that is
                modified is read from its ``log_record`` attribute
        """
        record = log_record.log_record
        body = record.body
        # A log body is not guaranteed to be text. Only strings can be searched
        # and redacted; anything else is left untouched instead of raising a
        # TypeError in the logging path.
        if isinstance(body, str) and "googleapis" in body:
            record.body = redact_string(body)

    def force_flush(self, timeout_millis=30000) -> bool:
        """Forward the flush request to the exporter and return its result."""
        return self._exporter.force_flush(timeout_millis)

    def shutdown(self) -> None:
        """Mark this processor as shut down and shut down the exporter."""
        self._shutdown = True
        self._exporter.shutdown()


def create_telemetry_session() -> requests.Session:
"""
Create a requests session with authorization enabled if environment variables are set.
Expand Down Expand Up @@ -159,11 +196,41 @@ def set_up_tracing(session: requests.Session) -> None:
attribute_propagator = AttributePropagatingSpanProcessor(["user.id"])
trace.get_tracer_provider().add_span_processor(attribute_propagator)
exporter = OTLPSpanExporter(session=session)
# Overwrite the _readable_span method to redact sensitive data
SpanSdk._readable_span = _readable_span_alternate
trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(exporter))
else:
trace.set_tracer_provider(TracerProvider(sampler=ALWAYS_OFF))


# Keep a reference to the SDK's own _readable_span implementation so that
# _readable_span_alternate can delegate to it once the span has been redacted.
original_function_readable_span = SpanSdk._readable_span


def _readable_span_alternate(self: SpanSdk) -> ReadableSpan:
    """Alternative to ``SpanSdk._readable_span`` that redacts sensitive data first.

    The span's status description, event names and event attributes are
    redacted in place, then the original ``_readable_span`` implementation is
    called to build the result.

    Args:
        self (SpanSdk): the span instance that is being converted
    Returns:
        ReadableSpan: the readable span produced by the original implementation
            for the redacted span
    """
    # Remove sensitive information from the span status description
    # to prevent exposing sensitive details in the statusMessage on SigNoz
    status = self._status
    if (
        status.status_code == trace.StatusCode.ERROR
        # A missing description must stay missing: str(None) would otherwise
        # store the literal text "None" as the status message.
        and status.description is not None
    ):
        status._description = redact_string(str(status.description))

    # Remove sensitive information in event attributes
    # to prevent exposing sensitive details in exception traces and messages
    for event in self._events:
        event._name = redact_string(event.name)
        attributes = event.attributes
        # Events without attributes have nothing to redact; skipping them
        # avoids calling .items() on None inside the helper.
        if attributes is not None:
            event._attributes = redacted_sensitive_data_in_exception(attributes)
    return original_function_readable_span(self)


def set_up_logging(session: requests.Session) -> None:
"""Set up logging to export to OTLP."""
logging_export = os.environ.get("LOGGING_EXPORT_FORMAT", None)
Expand All @@ -184,6 +251,7 @@ def set_up_logging(session: requests.Session) -> None:
set_logger_provider(logger_provider=logger_provider)

exporter = OTLPLogExporter(session=session)
logger_provider.add_log_record_processor(CustomFilter(exporter))
logger_provider.add_log_record_processor(BatchLogRecordProcessor(exporter))
handler = LoggingHandler(level=logging.NOTSET, logger_provider=logger_provider)
logging.getLogger().addHandler(handler)
Expand Down
45 changes: 45 additions & 0 deletions schematic/utils/remove_sensitive_data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""remove sensitive data from a string utils"""
from typing import Dict
import re


# Compiled once at import time instead of on every call. Each key names the
# placeholder ("[REDACTED_<KEY>]") that replaces whatever its pattern matches.
_SENSITIVE_PATTERNS = {
    "google_sheets": re.compile(
        r"https://sheets\.googleapis\.com/v\d+/spreadsheets/[\w-]+"
    ),
}


def redact_string(value: str) -> str:
    """remove sensitive data from a string

    Every match of a known sensitive pattern is replaced with a placeholder of
    the form ``[REDACTED_<PATTERN NAME>]``; text that matches no pattern is
    returned unchanged.

    Args:
        value (str): a string that may contain sensitive data
    Returns:
        str: the string with sensitive data replaced by placeholders
    """
    redacted = value
    for pattern_name, pattern in _SENSITIVE_PATTERNS.items():
        redacted = pattern.sub(f"[REDACTED_{pattern_name.upper()}]", redacted)
    return redacted


def redacted_sensitive_data_in_exception(
    exception_attributes: Dict[str, str]
) -> Dict[str, str]:
    """Return a copy of the exception attributes with sensitive data redacted.

    Only the exception message and stacktrace entries are passed through
    ``redact_string``; every other entry is copied as is.

    Args:
        exception_attributes (dict): a dictionary of exception attributes
    Returns:
        dict: a new dictionary with the same keys and redacted values
    """
    keys_to_redact = ("exception.message", "exception.stacktrace")
    return {
        name: redact_string(content) if name in keys_to_redact else content
        for name, content in exception_attributes.items()
    }
41 changes: 41 additions & 0 deletions tests/unit/test_filter_sensitive_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pytest
from schematic.utils.remove_sensitive_data_utils import (
redact_string,
redacted_sensitive_data_in_exception,
)


class TestFilterSensitiveData:
    """Unit tests for the helpers that strip sensitive data from strings."""

    @pytest.mark.parametrize(
        "test_google_sheet",
        [
            "googleapiclient.errors.HttpError: <HttpError 400 when requesting https://sheets.googleapis.com/v4/spreadsheets/11234budyhf:batchUpdate?fields=%2A&alt=json returned abc>",
            "googleapiclient.errors.HttpError: <HttpError 400 when requesting https://sheets.googleapis.com/v3/spreadsheets/11234budyhf:batchUpdate?fields=%2A&alt=json returned abc>",
            "googleapiclient.errors.HttpError: <HttpError 400 when requesting https://sheets.googleapis.com/v10/spreadsheets/11234budyhf:batchUpdate?fields=%2A&alt=json returned abc>",
        ],
    )
    def test_redact_string(self, test_google_sheet) -> None:
        """The sheet URL is replaced by the same placeholder for every API version."""
        expected = "googleapiclient.errors.HttpError: <HttpError 400 when requesting [REDACTED_GOOGLE_SHEETS]:batchUpdate?fields=%2A&alt=json returned abc>"

        assert redact_string(test_google_sheet) == expected

    def test_redacted_sensitive_data_in_exception(self) -> None:
        """Both the exception message and the stacktrace come back redacted."""
        raw_message = "googleapiclient.errors.HttpError: <HttpError 400 when requesting https://sheets.googleapis.com/v4/spreadsheets/11234budyhf:batchUpdate?fields=%2A&alt=json returned>"
        raw_stacktrace = 'Traceback (most recent call last):\n File "<stdin>", line 1, in <module>\n File "<string>", line 1, in <module>\n File "/usr/local/lib/python3.7/dist-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper\n return wrapped(*args, **kwargs)\n File "/usr/local/lib/python3.7/dist-packages/googleapiclient/http.py", line 905, in execute\n raise HttpError(resp, content, uri=self.uri)\ngoogleapiclient.errors.HttpError: <HttpError 400 when requesting https://sheets.googleapis.com/v4/spreadsheets/11234budyhf:batchUpdate?fields=%2A&alt=json returned>'
        expected_message = "googleapiclient.errors.HttpError: <HttpError 400 when requesting [REDACTED_GOOGLE_SHEETS]:batchUpdate?fields=%2A&alt=json returned>"
        expected_stacktrace = 'Traceback (most recent call last):\n File "<stdin>", line 1, in <module>\n File "<string>", line 1, in <module>\n File "/usr/local/lib/python3.7/dist-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper\n return wrapped(*args, **kwargs)\n File "/usr/local/lib/python3.7/dist-packages/googleapiclient/http.py", line 905, in execute\n raise HttpError(resp, content, uri=self.uri)\ngoogleapiclient.errors.HttpError: <HttpError 400 when requesting [REDACTED_GOOGLE_SHEETS]:batchUpdate?fields=%2A&alt=json returned>'

        result = redacted_sensitive_data_in_exception(
            {
                "exception.message": raw_message,
                "exception.stacktrace": raw_stacktrace,
            }
        )

        assert result["exception.message"] == expected_message
        assert result["exception.stacktrace"] == expected_stacktrace
)

0 comments on commit 514ecf8

Please sign in to comment.