diff --git a/jupyter_telemetry/_categories.py b/jupyter_telemetry/_categories.py new file mode 100644 index 0000000..9353319 --- /dev/null +++ b/jupyter_telemetry/_categories.py @@ -0,0 +1,184 @@ +from collections import deque + +from jsonschema import Draft7Validator, validators +from jsonschema.exceptions import ValidationError + + +class ExtractCategories(ValidationError): + """ + A special `jsonschema.ValidationError` that carries information about the + `categories` keyword, intended to be yielded whenever a `categories` keyword + is encountered during `jsonschema` JSON validation. + + The primary use case for this class is to make use of the JSON validation + mechanism implemented by `jsonschema` to extract all categories associated + with each property in a JSON instance based on a JSON schema. It is not + intended to be used as an actual validation error. + """ + + def __init__(self, property, categories, *args, **kwargs): + super(ValidationError, self).__init__(*args, **kwargs) + self.property = property + self.categories = categories + + +def extend_with_categories(validator_class): + """ + Extend a `jsonschema.IValidator` class so that it yields an `ExtractCategories` + whenever a `categories` keyword is encountered during JSON validation + + Parameters + ---------- + validator_class : jsonschema.IValidator + an existing validator class + + Returns + ------- + jsonschema.IValidator + a new `jsonschema.IValidator` class extending the one provided + + Examples + -------- + from jsonschema import Draft7Validator + + + CategoryExtractor = extend_with_categories(Draft7Validator) + """ + validate_properties = validator_class.VALIDATORS["properties"] + + def get_categories(validator, properties, instance, schema): + for property, subschema in properties.items(): + if "categories" in subschema: + yield ExtractCategories(property, subschema["categories"], message=None) + + for error in validate_properties( + validator, properties, instance, schema, + ): + yield 
error + + return validators.extend( + validator_class, {"properties": get_categories}, + ) + + +JSONSchemaValidator = Draft7Validator +CategoryExtractor = extend_with_categories(JSONSchemaValidator) + + +# Ignore categories under any of these jsonschema keywords +IGNORE_CATEGORIES_SCHEMA_KEYWORDS = { + 'if', 'not', 'anyOf', 'oneOf', 'then', 'else' +} + + +def extract_categories_from_errors(errors): + for e in errors: + if ( + isinstance(e, ExtractCategories) and + not any(p in IGNORE_CATEGORIES_SCHEMA_KEYWORDS + for p in e.absolute_schema_path) + ): + yield e + else: + yield from extract_categories_from_errors(e.context) + + +def extract_categories_from_event(event, schema): + """ + Generate a `dict` of `ExtractCategories` whose keys are pointers to the properties + + Parameters + ---------- + event : dict + A telemetry event + + schema : dict + A JSON schema + + Returns + ------- + dict + A mapping from properties in the event to their categories. + + In each entry, the key is a pointer to a property in the event + (in the form of a tuple) and the value is an `ExtractCategories` + containing the categories associated with that property. + """ + return { + tuple(c.absolute_path + deque([c.property])): c + for c in extract_categories_from_errors( + CategoryExtractor(schema).iter_errors(event) + ) + } + + +def filter_categories_from_event(event, schema, allowed_categories, allowed_properties): + """ + Filter properties from an event based on their categories. + + Only whitelisted properties and properties whose categories are allowed are kept. + + Parameters + ---------- + event : dict + The input telemetry event + + schema : dict + A JSON schema that makes use of the `categories` keyword to + specify what categories are associated with a certain property. + + allowed_categories : set + Specify which categories are allowed + + allowed_properties : set + Whitelist certain top level properties. 
+ + These properties are included in the output event even if not all of + their categories are allowed. + + Returns + ------- + dict + The output event after category filtering + + """ + categories = extract_categories_from_event(event, schema) + + # Top-level properties without declared categories are set to null + for property in event.keys(): + path = (property,) + if path not in categories: + event[property] = None + + # Allow only properties whose categories are all included in allowed_categories + # or whose top-level parent is included in allowed_properties + not_allowed = ( + c for p, c in categories.items() + if not (set(c.categories).issubset(allowed_categories) or + p[0] in allowed_properties) + ) + + for c in not_allowed: + # In case both a sub property and its parent, e.g. ['user', 'name'] and + # ['user'], do not have all the allowed categories and are to be removed, + # if the parent is removed first then attempting to access + # the descendant would either return None or raise an IndexError or + # KeyError. Just skip it. 
+ try: + item = deep_get(event, c.absolute_path) + except IndexError: + continue + except KeyError: + continue + + if item is not None: + item[c.property] = None + + return event + + +def deep_get(instance, path): + result = instance + while result is not None and path: + result = result[path.popleft()] + return result diff --git a/jupyter_telemetry/categories.py b/jupyter_telemetry/categories.py new file mode 100644 index 0000000..5683356 --- /dev/null +++ b/jupyter_telemetry/categories.py @@ -0,0 +1 @@ +from ._categories import JSONSchemaValidator, filter_categories_from_event # noqa diff --git a/jupyter_telemetry/eventlog.py b/jupyter_telemetry/eventlog.py index 73aa6c8..c4c8956 100644 --- a/jupyter_telemetry/eventlog.py +++ b/jupyter_telemetry/eventlog.py @@ -5,7 +5,6 @@ import logging from datetime import datetime -import jsonschema from pythonjsonlogger import jsonlogger try: from ruamel.yaml import YAML @@ -29,6 +28,8 @@ from .traits import Handlers, SchemaOptions from . import TELEMETRY_METADATA_VERSION +from .categories import JSONSchemaValidator, filter_categories_from_event + yaml = YAML(typ='safe') @@ -131,7 +132,7 @@ def register_schema(self, schema): """ # Check if our schema itself is valid # This throws an exception if it isn't valid - jsonschema.validators.validator_for(schema).check_schema(schema) + JSONSchemaValidator.check_schema(schema) # Check that the properties we require are present required_schema_fields = {'$id', 'version', 'properties'} @@ -225,7 +226,7 @@ def record_event(self, schema_name, version, event, timestamp_override=None): schema = self.schemas[(schema_name, version)] # Validate the event data. - jsonschema.validate(event, schema) + JSONSchemaValidator(schema).validate(event) # Generate the empty event capsule. 
if timestamp_override is None: @@ -244,21 +245,10 @@ def record_event(self, schema_name, version, event, timestamp_override=None): allowed_categories = self.get_allowed_categories(schema_name) allowed_properties = self.get_allowed_properties(schema_name) - # Iterate through the event properties, and only record the - # properties labelled with allowed_categories - for property_name, data in event.items(): - prop_categories = schema["properties"][property_name]["categories"] - # If the property is explicitly listed in - # the allowed_properties, then include it in the capsule - if property_name in allowed_properties: - capsule[property_name] = data - # All of the property categories must be listed in the the allowed - # categories for this property to be recorded. - elif any([cat in allowed_categories for cat in prop_categories]): - capsule[property_name] = data - # Else return that property with a value of null - else: - capsule[property_name] = None + filtered_event = filter_categories_from_event( + event, schema, allowed_categories, allowed_properties + ) + capsule.update(filtered_event) self.log.info(capsule) return capsule diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_allowed_schemas.py b/tests/test_allowed_schemas.py index 249f74e..6812b1e 100644 --- a/tests/test_allowed_schemas.py +++ b/tests/test_allowed_schemas.py @@ -1,6 +1,3 @@ -import io -import json -import logging from textwrap import dedent as _ from ruamel.yaml import YAML @@ -8,13 +5,16 @@ import pytest +from .utils import get_event_data + SCHEMA_ID = "test.event" VERSION = 1 + @pytest.fixture def schema(): - return { + return { '$id': SCHEMA_ID, 'title': 'Test Event', 'version': VERSION, @@ -103,13 +103,13 @@ def test_missing_categories_label(): assert 'All properties must have a "categories"' in str(err.value) - EVENT_DATA = { 'nothing-exciting': 'hello, world', 'id': 'test id', 'email': 'test@testemail.com', } + 
@pytest.mark.parametrize( 'allowed_schemas,expected_output', [ @@ -198,28 +198,13 @@ def test_missing_categories_label(): ] ) def test_allowed_schemas(schema, allowed_schemas, expected_output): - sink = io.StringIO() - - # Create a handler that captures+records events with allowed tags. - handler = logging.StreamHandler(sink) - - e = EventLog( - handlers=[handler], - allowed_schemas=allowed_schemas + event_data = get_event_data( + EVENT_DATA, + schema, + SCHEMA_ID, + VERSION, + allowed_schemas ) - e.register_schema(schema) - - event = { - 'nothing-exciting': 'hello, world', - 'id': 'test id', - 'email': 'test@testemail.com', - } - - # Record event and read output - e.record_event(SCHEMA_ID, VERSION, EVENT_DATA) - recorded_event = json.loads(sink.getvalue()) - event_data = {key: value for key, value in recorded_event.items() if not key.startswith('__')} # Verify that *exactly* the right properties are recorded. assert expected_output == event_data - diff --git a/tests/test_category_filtering.py b/tests/test_category_filtering.py new file mode 100644 index 0000000..ac86280 --- /dev/null +++ b/tests/test_category_filtering.py @@ -0,0 +1,508 @@ +from itertools import product + +import pytest + +from .utils import get_event_data + + +SCHEMA_ID = 'test.event' +VERSION = 1 + + +NESTED_CATEGORY_SCHEMA = { + '$id': SCHEMA_ID, + 'title': 'Test Event', + 'version': VERSION, + 'description': 'Test Event.', + 'type': 'object', + 'properties': { + 'nothing-exciting': { + 'description': 'a property with nothing exciting happening', + 'categories': ['unrestricted'], + 'type': 'string' + }, + 'user': { + 'description': 'user', + 'categories': ['user-identifier'], + 'type': 'object', + 'properties': { + 'email': { + 'description': 'email address', + 'categories': ['user-identifiable-information'], + 'type': 'string' + }, + 'id': { + 'description': 'user ID', + 'type': 'string' + } + } + } + } +} + + +NESTED_EVENT_DATA = { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 
'test id', + 'email': 'test@testemail.com', + } +} + + +NESTED_CATEGORY_TEST_CASES = [ + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': []}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': None + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['unrestricted']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': None + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['user-identifier']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 'test id', + 'email': None + } + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['user-identifiable-information']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': None + } + ), + ( + # User configuration for allowed_schemas + { + SCHEMA_ID: { + 'allowed_categories': [ + 'user-identifier', + 'user-identifiable-information' + ] + } + }, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 'test id', + 'email': 'test@testemail.com', + } + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_properties': ['user']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 'test id', + 'email': 'test@testemail.com', + } + } + ), +] + + +@pytest.mark.parametrize( + 'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES +) +def test_category_filtering(allowed_schemas, expected_output): + event_data = get_event_data( + NESTED_EVENT_DATA, + NESTED_CATEGORY_SCHEMA, + SCHEMA_ID, + VERSION, + allowed_schemas + ) + + # Verify that *exactly* the right properties are recorded. 
+ assert expected_output == event_data + + +NESTED_CATEGORY_ARRAY_SCHEMA = { + '$id': SCHEMA_ID, + 'title': 'Test Event', + 'version': VERSION, + 'description': 'Test Event.', + 'type': 'object', + 'properties': { + 'nothing-exciting': { + 'description': 'a property with nothing exciting happening', + 'categories': ['unrestricted'], + 'type': 'string' + }, + 'users': { + 'description': 'user', + 'categories': ['user-identifier'], + 'type': 'array', + 'items': { + 'properties': { + 'email': { + 'description': 'email address', + 'categories': ['user-identifiable-information'], + 'type': 'string' + }, + 'id': { + 'description': 'user ID', + 'type': 'string' + } + } + } + } + } +} + + +ARRAY_EVENT_DATA = { + 'nothing-exciting': 'hello, world', + 'users': [ + { + 'id': 'test id 0', + 'email': 'test0@testemail.com', + }, + { + 'id': 'test id 1', + 'email': 'test1@testemail.com', + } + ] +} + + +@pytest.mark.parametrize( + 'allowed_schemas,expected_output', + [ + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': []}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'users': None + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['unrestricted']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'users': None + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['user-identifier']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'users': [ + { + 'id': 'test id 0', + 'email': None, + }, + { + 'id': 'test id 1', + 'email': None, + } + ] + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['user-identifiable-information']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'users': None + } + ), + ( + # User configuration for allowed_schemas + { + 
SCHEMA_ID: { + 'allowed_categories': [ + 'user-identifier', + 'user-identifiable-information' + ] + } + }, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'users': [ + { + 'id': 'test id 0', + 'email': 'test0@testemail.com', + }, + { + 'id': 'test id 1', + 'email': 'test1@testemail.com', + } + ] + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_properties': ['users']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'users': [ + { + 'id': 'test id 0', + 'email': 'test0@testemail.com', + }, + { + 'id': 'test id 1', + 'email': 'test1@testemail.com', + } + ] + } + ), + ] +) +def test_array_category_filtering(allowed_schemas, expected_output): + event_data = get_event_data( + ARRAY_EVENT_DATA, + NESTED_CATEGORY_ARRAY_SCHEMA, + SCHEMA_ID, + VERSION, + allowed_schemas + ) + + # Verify that *exactly* the right properties are recorded. + assert expected_output == event_data + + +ADDITIONAL_PROP_EVENT_DATA = { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 'test id', + 'email': 'test@testemail.com', + }, + 'extra': 1234 +} + + +@pytest.mark.parametrize( + 'allowed_schemas,expected_output', + [ + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': []}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': None, + 'extra': None + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['unrestricted']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': None, + 'extra': None + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_categories': ['user-identifier']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 'test id', + 'email': None + }, + 'extra': None + } + ), + ( + # User configuration for allowed_schemas 
+ {SCHEMA_ID: {'allowed_categories': ['user-identifiable-information']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': None, + 'extra': None + } + ), + ( + # User configuration for allowed_schemas + { + SCHEMA_ID: { + 'allowed_categories': [ + 'user-identifier', + 'user-identifiable-information' + ] + } + }, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 'test id', + 'email': 'test@testemail.com', + }, + 'extra': None + } + ), + ( + # User configuration for allowed_schemas + {SCHEMA_ID: {'allowed_properties': ['user']}}, + # Expected properties in the recorded event + { + 'nothing-exciting': 'hello, world', + 'user': { + 'id': 'test id', + 'email': 'test@testemail.com', + }, + 'extra': None + } + ), + ] +) +def test_no_additional_properties(allowed_schemas, expected_output): + event_data = get_event_data( + ADDITIONAL_PROP_EVENT_DATA, + NESTED_CATEGORY_SCHEMA, + SCHEMA_ID, + VERSION, + allowed_schemas + ) + + # Verify that *exactly* the right properties are recorded. 
+ assert expected_output == event_data + + +NESTED_CATEGORY_SCHEMA_ALLOF = { + '$id': SCHEMA_ID, + 'title': 'Test Event', + 'version': VERSION, + 'description': 'Test Event.', + 'type': 'object', + 'properties': { + 'nothing-exciting': { + 'description': 'a property with nothing exciting happening', + 'categories': ['unrestricted'], + 'type': 'string' + }, + 'user': { + 'description': 'user', + 'categories': ['user-identifier'], + 'type': 'object', + 'allOf': [ + { + 'properties': { + 'email': { + 'description': 'email address', + 'categories': ['user-identifiable-information'], + 'type': 'string' + } + } + }, + { + 'properties': { + 'id': { + 'description': 'user ID', + 'type': 'string' + } + } + } + ] + } + } +} + + +NESTED_CATEGORY_SCHEMA_REF = { + '$id': SCHEMA_ID, + 'title': 'Test Event', + 'version': VERSION, + 'description': 'Test Event.', + 'type': 'object', + 'properties': { + 'nothing-exciting': { + 'description': 'a property with nothing exciting happening', + 'categories': ['unrestricted'], + 'type': 'string' + }, + 'user': { + 'description': 'user', + 'categories': ['user-identifier'], + 'type': 'object', + '$ref': '#/definitions/properties' + } + }, + 'definitions': { + 'properties': { + 'properties': { + 'email': { + 'description': 'email address', + 'categories': ['user-identifiable-information'], + 'type': 'string' + }, + 'id': { + 'description': 'user ID', + 'type': 'string' + } + } + } + } +} + + +@pytest.mark.parametrize( + 'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES +) +def test_category_filtering_ref(allowed_schemas, expected_output): + event_data = get_event_data( + NESTED_EVENT_DATA, + NESTED_CATEGORY_SCHEMA_REF, + SCHEMA_ID, + VERSION, + allowed_schemas + ) + + # Verify that *exactly* the right properties are recorded. 
+ assert expected_output == event_data + + +@pytest.mark.parametrize( + 'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES +) +def test_category_filtering_allof(allowed_schemas, expected_output): + event_data = get_event_data( + NESTED_EVENT_DATA, + NESTED_CATEGORY_SCHEMA_ALLOF, + SCHEMA_ID, + VERSION, + allowed_schemas + ) + + # Verify that *exactly* the right properties are recorded. + assert expected_output == event_data diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..2a0e0af --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,25 @@ +from copy import deepcopy +import io +import json +import logging + +from jupyter_telemetry.eventlog import EventLog + + +def get_event_data(event, schema, schema_id, version, allowed_schemas): + sink = io.StringIO() + + # Create a handler that captures+records events with allowed tags. + handler = logging.StreamHandler(sink) + + e = EventLog( + handlers=[handler], + allowed_schemas=allowed_schemas + ) + e.register_schema(schema) + + # Record event and read output + e.record_event(schema_id, version, deepcopy(event)) + + recorded_event = json.loads(sink.getvalue()) + return {key: value for key, value in recorded_event.items() if not key.startswith('__')}