#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import json
from copy import deepcopy
from enum import Enum
from functools import total_ordering
from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Type, Union

from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, SchemaInferenceError

JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]

schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}


@total_ordering
class ComparableType(Enum):
    NULL = 0
    BOOLEAN = 1
    INTEGER = 2
    NUMBER = 3
    STRING = 4
    OBJECT = 5

    def __lt__(self, other: Any) -> bool:
        if self.__class__ is other.__class__:
            return self.value < other.value  # type: ignore
        else:
            return NotImplemented
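
# A minimal illustration (not part of the module): @total_ordering derives the
# remaining comparison operators from __lt__ and __eq__, so type "width" can be
# compared directly:
#
#     assert ComparableType.INTEGER < ComparableType.NUMBER
#     assert ComparableType.STRING >= ComparableType.BOOLEAN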


TYPE_PYTHON_MAPPING: Mapping[str, Tuple[str, Optional[Type[Any]]]] = {
    "null": ("null", None),
    "array": ("array", list),
    "boolean": ("boolean", bool),
    "float": ("number", float),
    "integer": ("integer", int),
    "number": ("number", float),
    "object": ("object", dict),
    "string": ("string", str),
}
PYTHON_TYPE_MAPPING = {t: k for k, (_, t) in TYPE_PYTHON_MAPPING.items()}
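
# Note that "float" and "number" share the Python type float, so the reverse
# mapping keeps whichever key appears later in TYPE_PYTHON_MAPPING. As written:
#
#     assert PYTHON_TYPE_MAPPING[float] == "number"
#     assert PYTHON_TYPE_MAPPING[int] == "integer"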


def get_comparable_type(value: Any) -> Optional[ComparableType]:
    if value == "null":
        return ComparableType.NULL
    if value == "boolean":
        return ComparableType.BOOLEAN
    if value == "integer":
        return ComparableType.INTEGER
    if value == "number":
        return ComparableType.NUMBER
    if value == "string":
        return ComparableType.STRING
    if value == "object":
        return ComparableType.OBJECT
    return None


def get_inferred_type(value: Any) -> Optional[ComparableType]:
    if value is None:
        return ComparableType.NULL
    # bool is checked before int because bool is a subclass of int in Python.
    if isinstance(value, bool):
        return ComparableType.BOOLEAN
    if isinstance(value, int):
        return ComparableType.INTEGER
    if isinstance(value, float):
        return ComparableType.NUMBER
    if isinstance(value, str):
        return ComparableType.STRING
    if isinstance(value, dict):
        return ComparableType.OBJECT
    return None
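
# For example (illustrative only):
#
#     assert get_inferred_type(True) is ComparableType.BOOLEAN  # not INTEGER
#     assert get_inferred_type(3.14) is ComparableType.NUMBER
#     assert get_inferred_type([1, 2]) is None  # lists have no comparable type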


def merge_schemas(schema1: SchemaType, schema2: SchemaType) -> SchemaType:
    """
    Returns a new dictionary that contains schema1 and schema2.

    Schemas are merged as follows:
    - If a key is in one schema but not the other, add it to the merged schema with its existing type.
    - If a key is in both schemas but with different types, use the wider type.
    - If the type is an array in one schema and the two array definitions are not identical, raise an exception.
    - If the type is an object in both schemas but the object definitions differ, raise an exception.
    - If the type is an object in one schema but not in the other, raise an exception.

    In other words, we support merging
    - any atomic type with any other atomic type (choose the wider of the two)
    - an array or object with an identical array or object
    and nothing else.
    """
    for k, t in list(schema1.items()) + list(schema2.items()):
        if not isinstance(t, dict) or "type" not in t or not _is_valid_type(t["type"]):
            raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=k, type=t)

    merged_schema: Dict[str, Any] = deepcopy(schema1)  # type: ignore  # as of 2023-08-08, deepcopy can copy Mapping
    for k2, t2 in schema2.items():
        t1 = merged_schema.get(k2)
        if t1 is None:
            merged_schema[k2] = t2
        elif t1 == t2:
            continue
        else:
            merged_schema[k2] = _choose_wider_type(k2, t1, t2)

    return merged_schema
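
# A hypothetical usage sketch (not part of the module):
#
#     merged = merge_schemas(
#         {"a": {"type": "integer"}, "b": {"type": "string"}},
#         {"a": {"type": "number"}, "c": {"type": "boolean"}},
#     )
#     # merged == {
#     #     "a": {"type": "number"},   # integer widened to number
#     #     "b": {"type": "string"},   # only in schema1; carried over
#     #     "c": {"type": "boolean"},  # only in schema2; carried over
#     # }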


def _is_valid_type(t: JsonSchemaSupportedType) -> bool:
    return t == "array" or get_comparable_type(t) is not None


def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -> Mapping[str, Any]:
    t1_type = t1["type"]
    t2_type = t2["type"]

    if (t1_type == "array" or t2_type == "array") and t1 != t2:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            details="Cannot merge schema for unequal array types.",
            key=key,
            detected_types=f"{t1},{t2}",
        )
    # Schemas can still be merged if one of the two types is null: the merged key keeps the object type.
    elif (t1_type == "object" or t2_type == "object") and t1_type != "null" and t2_type != "null" and t1 != t2:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            details="Cannot merge schema for unequal object types.",
            key=key,
            detected_types=f"{t1},{t2}",
        )
    else:
        comparable_t1 = get_comparable_type(TYPE_PYTHON_MAPPING[t1_type][0])  # accessing the type_mapping value
        comparable_t2 = get_comparable_type(TYPE_PYTHON_MAPPING[t2_type][0])  # accessing the type_mapping value
        if not (comparable_t1 and comparable_t2):  # both types must be recognized for widening to be well-defined
            raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}")
        return max(
            [t1, t2], key=lambda x: ComparableType(get_comparable_type(TYPE_PYTHON_MAPPING[x["type"]][0]))
        )  # accessing the type_mapping value
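
# Illustrative example (not part of the module): the null guard above allows a
# nullable column to merge with an object column, picking the wider "object":
#
#     _choose_wider_type("a", {"type": "null"}, {"type": "object"})
#     # -> {"type": "object"}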


def is_equal_or_narrower_type(value: Any, expected_type: str) -> bool:
    if isinstance(value, list):
        # We do not compare lists directly; the individual items are compared.
        # If we hit this condition, it means that the expected type is not
        # compatible with the inferred type.
        return False

    inferred_type = get_inferred_type(value)
    expected = get_comparable_type(expected_type)

    # An unrecognized inferred or expected type cannot conform.
    if inferred_type is None or expected is None:
        return False

    return inferred_type <= expected
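
# Illustrative checks (not part of the module):
#
#     assert is_equal_or_narrower_type(1, "number")        # integer narrows into number
#     assert not is_equal_or_narrower_type("x", "integer") # string is wider than integer
#     assert not is_equal_or_narrower_type([1], "array")   # lists are rejected outright here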


def conforms_to_schema(record: Mapping[str, Any], schema: Mapping[str, Any]) -> bool:
    """
    Return true iff the record conforms to the supplied schema.

    The record conforms to the supplied schema iff:
    - All columns in the record are in the schema.
    - For every column in the record, that column's type is equal to or narrower than the same
      column's type in the schema.
    """
    schema_columns = set(schema.get("properties", {}).keys())
    record_columns = set(record.keys())

    if not record_columns.issubset(schema_columns):
        return False

    for column, definition in schema.get("properties", {}).items():
        expected_type = definition.get("type")
        value = record.get(column)

        if value is not None:
            if isinstance(expected_type, list):
                if not any(is_equal_or_narrower_type(value, e) for e in expected_type):
                    return False
            elif expected_type == "object":
                if not isinstance(value, dict):
                    return False
            elif expected_type == "array":
                if not isinstance(value, list):
                    return False
                array_type = definition.get("items", {}).get("type")
                if not all(is_equal_or_narrower_type(v, array_type) for v in value):
                    return False
            elif not is_equal_or_narrower_type(value, expected_type):
                return False

    return True
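
# A hypothetical conformance check (illustrative only):
#
#     schema = {"type": "object", "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}
#     assert conforms_to_schema({"id": 1, "name": "a"}, schema)
#     assert not conforms_to_schema({"id": "1", "name": "a"}, schema)  # string is wider than integer
#     assert not conforms_to_schema({"id": 1, "extra": True}, schema)  # "extra" is not in the schema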


def _parse_json_input(input_schema: Union[str, Mapping[str, str]]) -> Optional[Mapping[str, str]]:
    try:
        if isinstance(input_schema, str):
            schema: Mapping[str, str] = json.loads(input_schema)
        else:
            schema = input_schema

        if not all(isinstance(s, str) for s in schema.values()):
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details="Invalid input schema; nested schemas are not supported."
            )

    except json.decoder.JSONDecodeError:
        return None

    return schema
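
# For example (illustrative only):
#
#     _parse_json_input('{"col1": "integer"}')  # -> {"col1": "integer"}
#     _parse_json_input("not json")             # -> None (malformed JSON)
#     _parse_json_input('{"col1": {"a": 1}}')   # raises ConfigValidationError (nested schema)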


def type_mapping_to_jsonschema(input_schema: Optional[Union[str, Mapping[str, str]]]) -> Optional[Mapping[str, Any]]:
    """
    Return the user input schema (type mapping), transformed to JSON Schema format.

    Verify that the input schema:
    - is a key:value map
    - maps each column to a valid JsonSchema datatype
    """
    if not input_schema:
        return None

    result_schema = {}

    json_mapping = _parse_json_input(input_schema) or {}

    for col_name, type_name in json_mapping.items():
        col_name, type_name = col_name.strip(), type_name.strip()
        if not (col_name and type_name):
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
                details=f"Invalid input schema; expected mapping in the format column_name: type, got {input_schema}.",
            )

        _json_schema_type = TYPE_PYTHON_MAPPING.get(type_name.casefold())

        if not _json_schema_type:
            raise ConfigValidationError(
                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details=f"Invalid type '{type_name}' for property '{col_name}'."
            )

        json_schema_type = _json_schema_type[0]
        result_schema[col_name] = {"type": json_schema_type}

    return {"type": "object", "properties": result_schema}
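
# A hypothetical end-to-end example (illustrative only):
#
#     type_mapping_to_jsonschema('{"id": "integer", "score": "float"}')
#     # -> {"type": "object", "properties": {"id": {"type": "integer"}, "score": {"type": "number"}}}
#     # "float" is normalized to the JSON Schema type "number" via TYPE_PYTHON_MAPPING.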