Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support deserializing to memoryview #624

Merged
merged 4 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ Inspect
.. autoclass:: StrType
.. autoclass:: BytesType
.. autoclass:: ByteArrayType
.. autoclass:: MemoryViewType
.. autoclass:: DateTimeType
.. autoclass:: TimeType
.. autoclass:: DateType
Expand Down
17 changes: 17 additions & 0 deletions docs/source/supported-types.rst
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,23 @@ Bytes-like objects map to base64-encoded strings in JSON, YAML, and TOML. The
>>> msgspec.json.decode(msg, type=bytearray)
bytearray(b'\xf0\x9d\x84\x9e')


.. note::

For the ``msgpack`` protocol, `memoryview` objects will be decoded as
direct views into the larger buffer containing the input message being
decoded. This may be useful for implementing efficient zero-copy handling
of large binary messages, but is also a potential footgun. As long as a
decoded ``memoryview`` remains in memory, the input message buffer will
also be persisted, potentially resulting in unnecessarily large memory
usage. The usage of ``memoryview`` types in this manner is considered an
advanced topic, and should only be used when you know their usage will
result in a performance benefit.

For all other protocols, decoding into a `memoryview` will still result in a
copy, and will likely be slightly slower than decoding into a `bytes` object.


``datetime``
------------

Expand Down
157 changes: 118 additions & 39 deletions msgspec/_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -2499,32 +2499,33 @@ static PyTypeObject Field_Type = {
#define MS_TYPE_STR (1ull << 5)
#define MS_TYPE_BYTES (1ull << 6)
#define MS_TYPE_BYTEARRAY (1ull << 7)
#define MS_TYPE_DATETIME (1ull << 8)
#define MS_TYPE_DATE (1ull << 9)
#define MS_TYPE_TIME (1ull << 10)
#define MS_TYPE_TIMEDELTA (1ull << 11)
#define MS_TYPE_UUID (1ull << 12)
#define MS_TYPE_DECIMAL (1ull << 13)
#define MS_TYPE_EXT (1ull << 14)
#define MS_TYPE_STRUCT (1ull << 15)
#define MS_TYPE_STRUCT_ARRAY (1ull << 16)
#define MS_TYPE_STRUCT_UNION (1ull << 17)
#define MS_TYPE_STRUCT_ARRAY_UNION (1ull << 18)
#define MS_TYPE_ENUM (1ull << 19)
#define MS_TYPE_INTENUM (1ull << 20)
#define MS_TYPE_CUSTOM (1ull << 21)
#define MS_TYPE_CUSTOM_GENERIC (1ull << 22)
#define MS_TYPE_DICT ((1ull << 23) | (1ull << 24))
#define MS_TYPE_LIST (1ull << 25)
#define MS_TYPE_SET (1ull << 26)
#define MS_TYPE_FROZENSET (1ull << 27)
#define MS_TYPE_VARTUPLE (1ull << 28)
#define MS_TYPE_FIXTUPLE (1ull << 29)
#define MS_TYPE_INTLITERAL (1ull << 30)
#define MS_TYPE_STRLITERAL (1ull << 31)
#define MS_TYPE_TYPEDDICT (1ull << 32)
#define MS_TYPE_DATACLASS (1ull << 33)
#define MS_TYPE_NAMEDTUPLE (1ull << 34)
#define MS_TYPE_MEMORYVIEW (1ull << 8)
#define MS_TYPE_DATETIME (1ull << 9)
#define MS_TYPE_DATE (1ull << 10)
#define MS_TYPE_TIME (1ull << 11)
#define MS_TYPE_TIMEDELTA (1ull << 12)
#define MS_TYPE_UUID (1ull << 13)
#define MS_TYPE_DECIMAL (1ull << 14)
#define MS_TYPE_EXT (1ull << 15)
#define MS_TYPE_STRUCT (1ull << 16)
#define MS_TYPE_STRUCT_ARRAY (1ull << 17)
#define MS_TYPE_STRUCT_UNION (1ull << 18)
#define MS_TYPE_STRUCT_ARRAY_UNION (1ull << 19)
#define MS_TYPE_ENUM (1ull << 20)
#define MS_TYPE_INTENUM (1ull << 21)
#define MS_TYPE_CUSTOM (1ull << 22)
#define MS_TYPE_CUSTOM_GENERIC (1ull << 23)
#define MS_TYPE_DICT ((1ull << 24) | (1ull << 25))
#define MS_TYPE_LIST (1ull << 26)
#define MS_TYPE_SET (1ull << 27)
#define MS_TYPE_FROZENSET (1ull << 28)
#define MS_TYPE_VARTUPLE (1ull << 29)
#define MS_TYPE_FIXTUPLE (1ull << 30)
#define MS_TYPE_INTLITERAL (1ull << 31)
#define MS_TYPE_STRLITERAL (1ull << 32)
#define MS_TYPE_TYPEDDICT (1ull << 33)
#define MS_TYPE_DATACLASS (1ull << 34)
#define MS_TYPE_NAMEDTUPLE (1ull << 35)
/* Constraints */
#define MS_CONSTR_INT_MIN (1ull << 42)
#define MS_CONSTR_INT_MAX (1ull << 43)
Expand Down Expand Up @@ -3100,7 +3101,7 @@ typenode_simple_repr(TypeNode *self) {
if (self->types & (MS_TYPE_STR | MS_TYPE_ENUM | MS_TYPE_STRLITERAL)) {
if (!strbuilder_extend_literal(&builder, "str")) return NULL;
}
if (self->types & (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY)) {
if (self->types & (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY | MS_TYPE_MEMORYVIEW)) {
if (!strbuilder_extend_literal(&builder, "bytes")) return NULL;
}
if (self->types & MS_TYPE_DATETIME) {
Expand Down Expand Up @@ -3832,7 +3833,7 @@ typenode_collect_check_invariants(TypeNodeCollectState *state) {
if (ms_popcount(
state->types & (
MS_TYPE_STR | MS_TYPE_STRLITERAL | MS_TYPE_ENUM |
MS_TYPE_BYTES | MS_TYPE_BYTEARRAY |
MS_TYPE_BYTES | MS_TYPE_BYTEARRAY | MS_TYPE_MEMORYVIEW |
MS_TYPE_DATETIME | MS_TYPE_DATE | MS_TYPE_TIME |
MS_TYPE_TIMEDELTA | MS_TYPE_UUID | MS_TYPE_DECIMAL
)
Expand Down Expand Up @@ -4623,6 +4624,10 @@ typenode_collect_type(TypeNodeCollectState *state, PyObject *obj) {
state->types |= MS_TYPE_BYTEARRAY;
kind = CK_BYTES;
}
else if (t == (PyObject *)(&PyMemoryView_Type)) {
state->types |= MS_TYPE_MEMORYVIEW;
kind = CK_BYTES;
}
else if (t == (PyObject *)(PyDateTimeAPI->DateTimeType)) {
state->types |= MS_TYPE_DATETIME;
kind = CK_TIME;
Expand Down Expand Up @@ -14464,6 +14469,16 @@ mpack_decode_bin(
else if (type->types & MS_TYPE_UUID) {
return ms_decode_uuid_from_bytes(s, size, path);
}
else if (type->types & MS_TYPE_MEMORYVIEW) {
PyObject *view = PyMemoryView_GetContiguous(
self->buffer_obj, PyBUF_READ, 'C'
);
if (view == NULL) return NULL;
Py_buffer *buffer = PyMemoryView_GET_BUFFER(view);
buffer->buf = s;
buffer->len = size;
return view;
}

return ms_validation_error("bytes", type, path);
}
Expand Down Expand Up @@ -16494,11 +16509,19 @@ json_decode_binary(
if (out == NULL) return NULL;
bin_buffer = PyBytes_AS_STRING(out);
}
else {
else if (type->types & MS_TYPE_BYTEARRAY) {
out = PyByteArray_FromStringAndSize(NULL, bin_size);
if (out == NULL) return NULL;
bin_buffer = PyByteArray_AS_STRING(out);
}
else {
PyObject *temp = PyBytes_FromStringAndSize(NULL, bin_size);
if (temp == NULL) return NULL;
bin_buffer = PyBytes_AS_STRING(temp);
out = PyMemoryView_FromObject(temp);
Py_DECREF(temp);
if (out == NULL) return NULL;
}

int quad = 0;
uint8_t left_c = 0;
Expand Down Expand Up @@ -16577,7 +16600,11 @@ json_decode_string(JSONDecoderState *self, TypeNode *type, PathNode *path) {
else if (MS_UNLIKELY(type->types & MS_TYPE_DECIMAL)) {
return ms_decode_decimal(view, size, is_ascii, path, NULL);
}
else if (MS_UNLIKELY(type->types & (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY))) {
else if (
MS_UNLIKELY(type->types &
(MS_TYPE_BYTES | MS_TYPE_BYTEARRAY | MS_TYPE_MEMORYVIEW)
)
) {
return json_decode_binary(view, size, type, path);
}
else if (MS_UNLIKELY(type->types & (MS_TYPE_ENUM | MS_TYPE_STRLITERAL))) {
Expand Down Expand Up @@ -16636,7 +16663,7 @@ json_decode_dict_key_fallback(
else if (type->types & MS_TYPE_TIMEDELTA) {
return ms_decode_timedelta(view, size, type, path);
}
else if (type->types & MS_TYPE_BYTES) {
else if (type->types & (MS_TYPE_BYTES | MS_TYPE_MEMORYVIEW)) {
return json_decode_binary(view, size, type, path);
}
else {
Expand Down Expand Up @@ -19662,6 +19689,12 @@ convert_str_uncommon(
) {
return json_decode_binary(view, size, type, path);
}
else if (
(type->types & MS_TYPE_MEMORYVIEW)
&& !(self->builtin_types & MS_BUILTIN_MEMORYVIEW)
) {
return json_decode_binary(view, size, type, path);
}
return ms_validation_error("str", type, path);
}

Expand Down Expand Up @@ -19691,18 +19724,23 @@ static PyObject *
convert_bytes(
ConvertState *self, PyObject *obj, TypeNode *type, PathNode *path
) {
if (type->types & (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY)) {
if (type->types & (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY | MS_TYPE_MEMORYVIEW)) {
if (!ms_passes_bytes_constraints(PyBytes_GET_SIZE(obj), type, path)) {
return NULL;
}
if (type->types & MS_TYPE_BYTES) {
return PyBytes_FromObject(obj);
}
return PyByteArray_FromObject(obj);
else if (type->types & MS_TYPE_BYTEARRAY) {
return PyByteArray_FromObject(obj);
}
else {
return PyMemoryView_FromObject(obj);
}
}
if (
(type->types & MS_TYPE_UUID) &&
!(self->builtin_types & MS_BUILTIN_UUID)
(type->types & MS_TYPE_UUID) &&
!(self->builtin_types & MS_BUILTIN_UUID)
) {
return ms_decode_uuid_from_bytes(
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), path
Expand All @@ -19715,19 +19753,24 @@ static PyObject *
convert_bytearray(
ConvertState *self, PyObject *obj, TypeNode *type, PathNode *path
) {
if (type->types & (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY)) {
if (type->types & (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY | MS_TYPE_MEMORYVIEW)) {
if (!ms_passes_bytes_constraints(PyByteArray_GET_SIZE(obj), type, path)) {
return NULL;
}
if (type->types & MS_TYPE_BYTEARRAY) {
Py_INCREF(obj);
return obj;
}
return PyBytes_FromObject(obj);
else if (type->types & MS_TYPE_BYTES) {
return PyBytes_FromObject(obj);
}
else {
return PyMemoryView_FromObject(obj);
}
}
if (
(type->types & MS_TYPE_UUID) &&
!(self->builtin_types & MS_BUILTIN_UUID)
(type->types & MS_TYPE_UUID) &&
!(self->builtin_types & MS_BUILTIN_UUID)
) {
return ms_decode_uuid_from_bytes(
PyByteArray_AS_STRING(obj), PyByteArray_GET_SIZE(obj), path
Expand All @@ -19736,6 +19779,39 @@ convert_bytearray(
return ms_validation_error("bytes", type, path);
}

/* Convert a `memoryview` input object to the type requested by `type`.
 *
 * Handling, in order:
 * - If `type` accepts bytes, bytearray, or memoryview, the byte length of the
 *   view's underlying Py_buffer is first checked with
 *   `ms_passes_bytes_constraints`. A memoryview target then returns `obj`
 *   itself (as a new reference); a bytes target returns a new `bytes` built
 *   from `obj`; otherwise a new `bytearray` built from `obj` is returned.
 * - Else, if `type` accepts a UUID and UUID is not flagged in
 *   `self->builtin_types`, a contiguous read-only buffer is acquired from
 *   `obj` and handed to `ms_decode_uuid_from_bytes`.
 * - Else the result of `ms_validation_error("bytes", type, path)` is returned.
 *
 * Returns NULL if the constraint check fails or the buffer cannot be
 * acquired.
 */
static PyObject *
convert_memoryview(
    ConvertState *self, PyObject *obj, TypeNode *type, PathNode *path
) {
/* Every target type that takes the raw contents of the view. */
#define MS_CONVERT_MEMORYVIEW_TARGETS \
    (MS_TYPE_BYTES | MS_TYPE_BYTEARRAY | MS_TYPE_MEMORYVIEW)
    int wants_byteslike = (type->types & MS_CONVERT_MEMORYVIEW_TARGETS) != 0;
#undef MS_CONVERT_MEMORYVIEW_TARGETS

    if (wants_byteslike) {
        if (!ms_passes_bytes_constraints(
                PyMemoryView_GET_BUFFER(obj)->len, type, path)) {
            return NULL;
        }
        /* A memoryview target wins over bytes, which wins over bytearray. */
        if (type->types & MS_TYPE_MEMORYVIEW) {
            Py_INCREF(obj);
            return obj;
        }
        if (type->types & MS_TYPE_BYTES) {
            return PyBytes_FromObject(obj);
        }
        return PyByteArray_FromObject(obj);
    }

    /* Without a bytes-like target, only a non-builtin UUID can still match. */
    if (
        !(type->types & MS_TYPE_UUID) ||
        (self->builtin_types & MS_BUILTIN_UUID)
    ) {
        return ms_validation_error("bytes", type, path);
    }

    Py_buffer raw;
    if (PyObject_GetBuffer(obj, &raw, PyBUF_CONTIG_RO) < 0) {
        return NULL;
    }
    PyObject *uuid = ms_decode_uuid_from_bytes(raw.buf, raw.len, path);
    /* Release the buffer whether or not decoding succeeded. */
    PyBuffer_Release(&raw);
    return uuid;
}

static PyObject *
convert_datetime(
ConvertState *self, PyObject *obj, TypeNode *type, PathNode *path
Expand Down Expand Up @@ -20857,6 +20933,9 @@ convert(
else if (pytype == &PyByteArray_Type) {
return convert_bytearray(self, obj, type, path);
}
else if (pytype == &PyMemoryView_Type) {
return convert_memoryview(self, obj, type, path);
}
else if (pytype == PyDateTimeAPI->DateTimeType) {
return convert_datetime(self, obj, type, path);
}
Expand Down
2 changes: 1 addition & 1 deletion msgspec/_json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def to_schema(self, t: mi.Type, check_ref: bool = True) -> dict[str, Any]:
schema["minLength"] = t.min_length
if t.pattern is not None:
schema["pattern"] = t.pattern
elif isinstance(t, (mi.BytesType, mi.ByteArrayType)):
elif isinstance(t, (mi.BytesType, mi.ByteArrayType, mi.MemoryViewType)):
schema["type"] = "string"
schema["contentEncoding"] = "base64"
if t.max_length is not None:
Expand Down
20 changes: 20 additions & 0 deletions msgspec/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"StrType",
"BytesType",
"ByteArrayType",
"MemoryViewType",
"DateTimeType",
"TimeType",
"DateType",
Expand Down Expand Up @@ -222,6 +223,23 @@ class ByteArrayType(Type):
max_length: Union[int, None] = None


class MemoryViewType(Type):
    """A type corresponding to `memoryview`.

    Parameters
    ----------
    min_length: int, optional
        If set, an instance of this type must have length greater than or equal
        to ``min_length``.
    max_length: int, optional
        If set, an instance of this type must have length less than or equal
        to ``max_length``.
    """

    # Both bounds default to ``None``, i.e. the bound is not set.
    min_length: Union[int, None] = None
    max_length: Union[int, None] = None


class DateTimeType(Type):
"""A type corresponding to `datetime.datetime`.

Expand Down Expand Up @@ -803,6 +821,8 @@ def _translate_inner(
return BytesType(min_length=min_length, max_length=max_length)
elif t is bytearray:
return ByteArrayType(min_length=min_length, max_length=max_length)
elif t is memoryview:
return MemoryViewType(min_length=min_length, max_length=max_length)
elif t is datetime.datetime:
return DateTimeType(tz=tz)
elif t is datetime.time:
Expand Down
Loading
Loading