From fd6ecd1c926c1dd72080028be2bb124722edbb84 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Nov 2024 22:51:49 +0100 Subject: [PATCH 01/85] add functions for easy read-only data access --- src/zarr/api/asynchronous.py | 140 ++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 2 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index cd8c3543ca..729810db89 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -275,8 +275,8 @@ async def open( path : str or None, optional The path within the store to open. storage_options : dict - If using an fsspec URL to create the store, these will be passed to - the backend implementation. Ignored otherwise. + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. **kwargs Additional parameters are passed through to :func:`zarr.creation.open_array` or :func:`zarr.hierarchy.open_group`. @@ -313,6 +313,47 @@ async def open( return await open_group(store=store_path, zarr_format=zarr_format, **kwargs) +async def read( + *, + store: StoreLike | None = None, + zarr_format: ZarrFormat | None = None, + path: str | None = None, + storage_options: dict[str, Any] | None = None, + **kwargs: Any, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata] | AsyncGroup: + """Convenience function to open a group or array for reading. This function + wraps :func:`zarr.api.asynchronous.open` See the documentation of that function for details. + + Parameters + ---------- + store : Store or str, optional + Store or path to directory in file system or name of zip file. + zarr_format : {2, 3, None}, optional + The zarr format to require. The default value of None will first look for Zarr v3 data, + then Zarr v2 data, then fail if neither format is found. + path : str or None, optional + The path within the store to open. 
+ storage_options : dict, optional + If using an fsspec URL to create the store, this will be passed to + the backend implementation. Ignored otherwise. + **kwargs + Additional parameters are passed through to :func:`zarr.creation.open`. + + Returns + ------- + z : array or group + Return type depends on what exists in the given store. + """ + return await open( + store=store, + mode="r", + zarr_format=zarr_format, + path=path, + storage_options=storage_options, + **kwargs, + ) + + async def open_consolidated( *args: Any, use_consolidated: Literal[True] = True, **kwargs: Any ) -> AsyncGroup: @@ -709,6 +750,66 @@ async def open_group( ) +async def read_group( + store: StoreLike, + path: str | None = None, + zarr_format: ZarrFormat | None = None, + storage_options: dict[str, Any] | None = None, + use_consolidated: bool | str | None = None, +) -> AsyncGroup: + """Open a group for reading. This function wraps :func:`zarr.api.asynchronous.open_group` See + the documentation of that function for details. + + Parameters + ---------- + store : Store, str, or mapping, optional + Store or path to directory in file system or name of zip file. + + Strings are interpreted as paths on the local file system + and used as the ``root`` argument to :class:`zarr.store.LocalStore`. + + Dictionaries are used as the ``store_dict`` argument in + :class:`zarr.store.MemoryStore``. + path : str, optional + Group path within store. + zarr_format : {2, 3, None}, optional + The zarr format to require. The default value of None will first look for Zarr v3 data, + then Zarr v2 data, then fail if neither format is found. + storage_options : dict + If the store is backed by an fsspec-based implementation, then this dict will be passed to + the Store constructor for that implementation. Ignored otherwise. + use_consolidated : bool or str, default None + Whether to use consolidated metadata. 
+ + By default, consolidated metadata is used if it's present in the + store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file + for Zarr v2). + + To explicitly require consolidated metadata, set ``use_consolidated=True``, + which will raise an exception if consolidated metadata is not found. + + To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, + which will fall back to using the regular, non consolidated metadata. + + Zarr v2 allowed configuring the key storing the consolidated metadata + (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` + to load consolidated metadata from a non-default key. + + Returns + ------- + g : group + The new group. + """ + return await open_group( + store=store, + mode="r", + path=path, + storage_options=storage_options, + zarr_format=zarr_format, + use_consolidated=use_consolidated, + ) + + async def create( shape: ChunkCoords, *, # Note: this is a change from v2 @@ -893,6 +994,40 @@ async def create( ) +async def read_array( + store: StoreLike, + path: str | None = None, + zarr_format: ZarrFormat | None = None, + storage_options: dict[str, Any] | None = None, +) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: + """Create an array for reading. Wraps `:func:zarr.api.asynchronous.create`. + See the documentation of that function for details. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system or name of zip file. + path : str, optional + Path under which the array is stored. + zarr_format : {2, 3, None}, optional + The zarr format to require. The default value of ``None`` will first look for Zarr v3 data, + then Zarr v2 data, then fail if neither format is found. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + z : array + The array. 
+ """ + store_path = await make_store_path(store, path=path, mode="r", storage_options=storage_options) + return await AsyncArray.open( + store=store_path, + zarr_format=zarr_format, + ) + + async def empty( shape: ChunkCoords, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: @@ -1070,6 +1205,7 @@ async def open_array( store=store_path, zarr_format=zarr_format or _default_zarr_version(), overwrite=store_path.store.mode.overwrite, + storage_options=storage_options, **kwargs, ) raise From fa343f5a7eee2c3c22617b496eb67f2d9179e857 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Nov 2024 23:04:58 +0100 Subject: [PATCH 02/85] sync funcs --- src/zarr/api/synchronous.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 9dcd6fe2d5..17793916a5 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -90,6 +90,16 @@ def open( return Group(obj) +def read( + store: StoreLike, + *, + zarr_format: ZarrFormat | None = None, + path: str | None = None, + **kwargs: Any, +) -> Array | Group: + return open(store=store, mode="r", zarr_format=zarr_format, path=path, **kwargs) + + def open_consolidated(*args: Any, use_consolidated: Literal[True] = True, **kwargs: Any) -> Group: return Group( sync(async_api.open_consolidated(*args, use_consolidated=use_consolidated, **kwargs)) @@ -232,11 +242,32 @@ def open_group( ) +def read_group( + store: StoreLike | None = None, + path: str | None = None, + storage_options: dict[str, Any] | None = None, # not used in async api + zarr_format: ZarrFormat | None = None, + use_consolidated: bool | str | None = None, +) -> Group: + return open_group( + store=store, + path=path, + mode="r", + zarr_format=zarr_format, + use_consolidated=use_consolidated, + storage_options=storage_options, + ) + + # TODO: add type annotations for kwargs def create(*args: Any, **kwargs: Any) -> Array: return 
Array(sync(async_api.create(*args, **kwargs))) +def read_array(*args: Any, **kwargs: Any) -> Array: + return Array(sync(async_api.read_array(*args, **kwargs))) + + # TODO: add type annotations for kwargs def empty(shape: ChunkCoords, **kwargs: Any) -> Array: return Array(sync(async_api.empty(shape, **kwargs))) @@ -295,6 +326,7 @@ def zeros_like(a: async_api.ArrayLike, **kwargs: Any) -> Array: copy_store.__doc__ = async_api.copy_store.__doc__ load.__doc__ = async_api.load.__doc__ open.__doc__ = async_api.open.__doc__ +read.__doc__ = async_api.read.__doc__ open_consolidated.__doc__ = async_api.open_consolidated.__doc__ save.__doc__ = async_api.save.__doc__ save_array.__doc__ = async_api.save_array.__doc__ @@ -303,7 +335,9 @@ def zeros_like(a: async_api.ArrayLike, **kwargs: Any) -> Array: array.__doc__ = async_api.array.__doc__ group.__doc__ = async_api.group.__doc__ open_group.__doc__ = async_api.open_group.__doc__ +read_group.__doc__ = async_api.read_group.__doc__ create.__doc__ = async_api.create.__doc__ +read_array.__doc__ = async_api.read_array.__doc__ empty.__doc__ = async_api.empty.__doc__ empty_like.__doc__ = async_api.empty_like.__doc__ full.__doc__ = async_api.full.__doc__ From d95eba8b7424347355ab884aa3bb10abc6b3263f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Nov 2024 23:09:23 +0100 Subject: [PATCH 03/85] make read-only funcs top-level exports --- src/zarr/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 51116a929e..2ab461dcc8 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -19,6 +19,9 @@ open_consolidated, open_group, open_like, + read, + read_array, + read_group, save, save_array, save_group, @@ -46,6 +49,7 @@ "copy_all", "copy_store", "create", + "read_array", "empty", "empty_like", "full", @@ -55,9 +59,11 @@ "ones", "ones_like", "open", + "read", "open_array", "open_consolidated", "open_group", + "read_group", "open_like", "save", "save_array", From 
5d8445bd824eaf3fcc291e8a12397d741aa582cd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Nov 2024 17:38:59 +0100 Subject: [PATCH 04/85] add create_array, create_group, and tests --- src/zarr/api/asynchronous.py | 171 +++++++++++++++++++++++++++++++++++ src/zarr/api/synchronous.py | 32 ++++++- src/zarr/core/array.py | 7 +- src/zarr/core/group.py | 6 +- tests/test_api.py | 81 ++++++++++++++++- 5 files changed, 288 insertions(+), 9 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 729810db89..cab0b641d0 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -644,6 +644,54 @@ async def group( ) +async def create_group( + *, + store: StoreLike, + path: str | None = None, + overwrite: bool = False, + zarr_format: ZarrFormat | None = None, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> AsyncGroup: + """Create a group. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system. + path : str, optional + Group path within store. + overwrite : bool, optional + If True, pre-existing data at ``path`` will be deleted before + creating the group. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + g : group + The new group. + """ + + if zarr_format is None: + zarr_format = _default_zarr_version() + + # TODO: fix this when modes make sense. 
It should be `w` for overwriting, `w-` otherwise + mode: Literal["a"] = "a" + + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + + return await AsyncGroup.from_store( + store=store_path, + zarr_format=zarr_format, + exists_ok=overwrite, + attributes=attributes, + ) + + async def open_group( store: StoreLike | None = None, *, # Note: this is a change from v2 @@ -752,6 +800,7 @@ async def open_group( async def read_group( store: StoreLike, + *, path: str | None = None, zarr_format: ZarrFormat | None = None, storage_options: dict[str, Any] | None = None, @@ -810,6 +859,127 @@ async def read_group( ) +async def create_array( + store: str | StoreLike, + *, + shape: ChunkCoords, + chunks: ChunkCoords | None = None, # TODO: v2 allowed chunks=True + dtype: npt.DTypeLike | None = None, + compressor: dict[str, JSON] | None = None, # TODO: default and type change + fill_value: Any | None = 0, # TODO: need type + order: MemoryOrder | None = None, + overwrite: bool = False, + path: PathLike | None = None, + filters: list[dict[str, JSON]] | None = None, # TODO: type has changed + dimension_separator: Literal[".", "/"] | None = None, + zarr_format: ZarrFormat | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + **kwargs: Any, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Create an array. + + Parameters + ---------- + shape : int or tuple of ints + Array shape. + chunks : int or tuple of ints, optional + Chunk shape. If True, will be guessed from `shape` and `dtype`. 
If + False, will be set to `shape`, i.e., single chunk for the whole array. + If an int, the chunk size in each dimension will be given by the value + of `chunks`. Default is True. + dtype : str or dtype, optional + NumPy dtype. + compressor : Codec, optional + Primary compressor. + fill_value : object + Default value to use for uninitialized portions of the array. + order : {'C', 'F'}, optional + Memory layout to be used within each chunk. + Default is set in Zarr's config (`array.order`). + store : Store or str + Store or path to directory in file system or name of zip file. + overwrite : bool, optional + If True, delete all pre-existing data in `store` at `path` before + creating the array. + path : str, optional + Path under which array is stored. + filters : sequence of Codecs, optional + Sequence of filters to use to encode chunk data prior to compression. + dimension_separator : {'.', '/'}, optional + Separator placed between the dimensions of a chunk. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + z : array + The array. + """ + + if zarr_format is None: + zarr_format = _default_zarr_version() + + if zarr_format == 2 and chunks is None: + chunks = shape + elif zarr_format == 3 and chunk_shape is None: + if chunks is not None: + chunk_shape = chunks + chunks = None + else: + chunk_shape = shape + + if dimension_separator is not None: + if zarr_format == 3: + raise ValueError( + "dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead" + ) + else: + warnings.warn( + "dimension_separator is not yet implemented", + RuntimeWarning, + stacklevel=2, + ) + + # TODO: fix this when modes make sense. 
It should be `w` for overwriting, `w-` otherwise + mode: Literal["a"] = "a" + + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + + return await AsyncArray.create( + store_path, + shape=shape, + chunks=chunks, + dtype=dtype, + compressor=compressor, + fill_value=fill_value, + exists_ok=overwrite, + filters=filters, + dimension_separator=dimension_separator, + zarr_format=zarr_format, + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + attributes=attributes, + order=order, + **kwargs, + ) + + async def create( shape: ChunkCoords, *, # Note: this is a change from v2 @@ -996,6 +1166,7 @@ async def create( async def read_array( store: StoreLike, + *, path: str | None = None, zarr_format: ZarrFormat | None = None, storage_options: dict[str, Any] | None = None, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 17793916a5..62f5e41609 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -242,10 +242,34 @@ def open_group( ) +def create_group( + store: StoreLike, + *, + path: str | None = None, + zarr_format: ZarrFormat | None = None, + overwrite: bool = False, + attributes: dict[str, Any] | None = None, + storage_options: dict[str, Any] | None = None, +) -> Group: + return Group( + sync( + async_api.create_group( + store=store, + path=path, + overwrite=overwrite, + storage_options=storage_options, + zarr_format=zarr_format, + attributes=attributes, + ) + ) + ) + + def read_group( - store: StoreLike | None = None, + store: StoreLike, + *, path: str | None = None, - storage_options: dict[str, Any] | None = None, # not used in async api + storage_options: dict[str, Any] | None = None, zarr_format: ZarrFormat | None = None, use_consolidated: bool | str | None = None, ) -> Group: @@ -264,6 +288,10 @@ def create(*args: Any, **kwargs: Any) -> Array: return Array(sync(async_api.create(*args, **kwargs))) +def 
create_array(*args: Any, **kwargs: Any) -> Array: + return Array(sync(async_api.create_array(*args, **kwargs))) + + def read_array(*args: Any, **kwargs: Any) -> Array: return Array(sync(async_api.read_array(*args, **kwargs))) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index c3d6fca543..4daabffdfd 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import warnings from asyncio import gather from dataclasses import dataclass, field from itertools import starmap @@ -144,9 +145,9 @@ async def get_array_metadata( (store_path / ZATTRS_JSON).get(), ) if zarr_json_bytes is not None and zarray_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zarray objects exist") + # wwarn and favor v3 + msg = f"Both zarr.json (zarr v3) and .zarray (zarr v2) metadata objects exist at {store_path}." + warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: raise FileNotFoundError(store_path) # set zarr_format based on which keys were found diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 46f37700eb..976c347dd9 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -486,9 +486,9 @@ async def open( (store_path / str(consolidated_key)).get(), ) if zarr_json_bytes is not None and zgroup_bytes is not None: - # TODO: revisit this exception type - # alternatively, we could warn and favor v3 - raise ValueError("Both zarr.json and .zgroup objects exist") + # we could warn and favor v3 + msg = f"Both zarr.json (zarr v3) and .zgroup (zarr v2) metadata objects exist at {store_path}." 
+ warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zgroup_bytes is None: raise FileNotFoundError( f"could not find zarr.json or .zgroup objects in {store_path}" diff --git a/tests/test_api.py b/tests/test_api.py index 5b62e3a2fa..cb4bce97ff 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -13,10 +13,14 @@ from zarr.abc.store import Store from zarr.api.synchronous import ( create, + create_array, + create_group, group, load, open, open_group, + read_array, + read_group, save, save_array, save_group, @@ -27,7 +31,7 @@ from zarr.storage.memory import MemoryStore -def test_create_array(memory_store: Store) -> None: +def test_create(memory_store: Store) -> None: store = memory_store # create array @@ -48,6 +52,50 @@ def test_create_array(memory_store: Store) -> None: assert z.chunks == (40,) +# TODO: parametrize over everything this function takes +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_create_array(store: Store) -> None: + attrs = {"foo": 100} + shape = (10, 10) + path = "foo" + data_val = 1 + array_w = create_array(store, path=path, shape=shape, attributes=attrs) + array_w[:] = data_val + assert array_w.shape == shape + assert array_w.attrs == attrs + assert np.array_equal(array_w[:], np.zeros(shape, dtype=array_w.dtype) + data_val) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_read_array(store: Store) -> None: + shape = (10, 10) + data_val = 1 + path = "foo" + + zarr_format: ZarrFormat + + for zarr_format in (2, 3): + attrs = {"zarr_format": zarr_format} + node_w = create_array( + store, path=path, shape=shape, attributes=attrs, zarr_format=zarr_format + ) + node_w[:] = data_val + + # check that the correct array can be read when both v2 and v3 arrays are present + for zarr_format in (2, 3): + node_r = read_array(store, path=path, zarr_format=zarr_format) + + assert node_r.shape == shape + assert node_r.attrs == {"zarr_format": zarr_format} + assert np.array_equal(node_r[:], 
np.zeros(shape, dtype=node_r.dtype) + data_val) + + # check that reading without specifying the zarr_format returns the v3 node + with pytest.warns(UserWarning): + node_r = read_array(store, path=path) + + assert node_r.metadata.zarr_format == 3 + + @pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) @pytest.mark.parametrize("node_type", ["array", "group"]) def test_open_normalized_path( @@ -90,6 +138,16 @@ async def test_open_array(memory_store: MemoryStore) -> None: open(store="doesnotexist", mode="r") +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_group(store: Store, zarr_format: ZarrFormat) -> None: + attrs = {"foo": 100} + path = "node" + node = create_group(store, path=path, attributes=attrs, zarr_format=zarr_format) + assert isinstance(node, Group) + assert node.attrs == attrs + assert node.metadata.zarr_format == zarr_format + + async def test_open_group(memory_store: MemoryStore) -> None: store = memory_store @@ -112,6 +170,27 @@ async def test_open_group(memory_store: MemoryStore) -> None: # assert g.read_only +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_read_group(store: Store) -> None: + path = "foo" + + zarr_format: ZarrFormat + for zarr_format in (2, 3): + attrs = {"zarr_format": zarr_format} + _ = create_group(store, path=path, attributes=attrs, zarr_format=zarr_format) + + # check that the correct array can be read when both v2 and v3 arrays are present + for zarr_format in (2, 3): + node_r = read_group(store, path=path, zarr_format=zarr_format) + assert node_r.attrs == {"zarr_format": zarr_format} + + # check that reading without specifying the zarr_format returns the v3 node + with pytest.warns(UserWarning): + node_r = read_group(store, path=path) + + assert node_r.metadata.zarr_format == 3 + + @pytest.mark.parametrize("zarr_format", [None, 2, 3]) async def test_open_group_unspecified_version( tmpdir: pathlib.Path, zarr_format: ZarrFormat From 
95265712c5af1532f3c73cb7b2b4236d83002711 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Nov 2024 17:40:07 +0100 Subject: [PATCH 05/85] add top-level imports --- src/zarr/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 2ab461dcc8..86b1381c21 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -49,6 +49,8 @@ "copy_all", "copy_store", "create", + "create_array", + "create_group", "read_array", "empty", "empty_like", From de280a7d9f54ba8544eafb3831cfb5909ec31072 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Nov 2024 17:52:00 +0100 Subject: [PATCH 06/85] add test for top-level exports --- src/zarr/__init__.py | 2 ++ tests/test_zarr.py | 11 +++++++++++ 2 files changed, 13 insertions(+) create mode 100644 tests/test_zarr.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 86b1381c21..c07610b7e3 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -6,6 +6,8 @@ copy_all, copy_store, create, + create_array, + create_group, empty, empty_like, full, diff --git a/tests/test_zarr.py b/tests/test_zarr.py new file mode 100644 index 0000000000..2aa62e4231 --- /dev/null +++ b/tests/test_zarr.py @@ -0,0 +1,11 @@ +import zarr + + +def test_exports() -> None: + """ + Ensure that everything in __all__ can be imported. 
+ """ + from zarr import __all__ + + for export in __all__: + getattr(zarr, export) From d9878cf8330b13522ea47ee5ac6c8232bdf3a1a6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Nov 2024 17:56:47 +0100 Subject: [PATCH 07/85] add test for read --- tests/test_api.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index 74471b9eb2..91e49d1804 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -19,6 +19,7 @@ load, open, open_group, + read, read_array, read_group, save, @@ -52,6 +53,23 @@ def test_create(memory_store: Store) -> None: assert z.chunks == (40,) +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_read(store: Store) -> None: + """ + Test that the polymorphic read function works. + """ + # create an array and a group + _ = create_group(store=store, path="group", attributes={"node_type": "group"}) + _ = create_array(store=store, path="array", shape=(10, 10), attributes={"node_type": "array"}) + + group_r = read(store, path="group") + assert group_r.attrs == {"node_type": "group"} + + array_r = read(store, path="array") + assert array_r.attrs == {"node_type": "array"} + assert array_r.shape == (10, 10) + + # TODO: parametrize over everything this function takes @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_create_array(store: Store) -> None: From e5217ce5de074973451fe6d10a5e9b3b901a4770 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Nov 2024 17:58:52 +0100 Subject: [PATCH 08/85] add asserts --- tests/test_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index 91e49d1804..722716ebd9 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -63,9 +63,11 @@ def test_read(store: Store) -> None: _ = create_array(store=store, path="array", shape=(10, 10), attributes={"node_type": "array"}) group_r = read(store, path="group") + assert isinstance(group_r, Group) assert group_r.attrs 
== {"node_type": "group"} array_r = read(store, path="array") + assert isinstance(array_r, Array) assert array_r.attrs == {"node_type": "array"} assert array_r.shape == (10, 10) From 40cc7af4feceef1f7d8353ca50b0117322b79b90 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Tue, 5 Nov 2024 18:38:37 +0100 Subject: [PATCH 09/85] Apply suggestions from code review --- src/zarr/core/array.py | 2 +- src/zarr/core/group.py | 2 +- tests/test_api.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 944cf8e783..02ca49e4cc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -145,7 +145,7 @@ async def get_array_metadata( (store_path / ZATTRS_JSON).get(), ) if zarr_json_bytes is not None and zarray_bytes is not None: - # wwarn and favor v3 + # warn and favor v3 msg = f"Both zarr.json (zarr v3) and .zarray (zarr v2) metadata objects exist at {store_path}." warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index d3c58c9329..a19def337a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -492,7 +492,7 @@ async def open( (store_path / str(consolidated_key)).get(), ) if zarr_json_bytes is not None and zgroup_bytes is not None: - # we could warn and favor v3 + # warn and favor v3 msg = f"Both zarr.json (zarr v3) and .zgroup (zarr v2) metadata objects exist at {store_path}." warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zgroup_bytes is None: diff --git a/tests/test_api.py b/tests/test_api.py index 722716ebd9..c5d3874d82 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -56,7 +56,7 @@ def test_create(memory_store: Store) -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_read(store: Store) -> None: """ - Test that the polymorphic read function works. 
+ Test that the polymorphic read function can return an Array or a Group, depending on the path argument. """ # create an array and a group _ = create_group(store=store, path="group", attributes={"node_type": "group"}) From 4b45ebf0858fb2e2db86cb3019fd6fff0c04cfb7 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 13:44:58 +0100 Subject: [PATCH 10/85] handle sharding in create_array --- src/zarr/api/asynchronous.py | 157 ++++++++++++++++------------------- src/zarr/core/chunk_grids.py | 39 ++++++++- tests/test_api.py | 35 ++++++++ 3 files changed, 144 insertions(+), 87 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 5553855749..8ffd96d232 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -10,6 +10,7 @@ from zarr.core.array import Array, AsyncArray, get_array_metadata from zarr.core.buffer import NDArrayLike +from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import ( JSON, AccessModeLiteral, @@ -895,67 +896,32 @@ async def read_group( async def create_array( store: str | StoreLike, *, - shape: ChunkCoords, - chunks: ChunkCoords | None = None, # TODO: v2 allowed chunks=True - dtype: npt.DTypeLike | None = None, - compressor: dict[str, JSON] | None = None, # TODO: default and type change - fill_value: Any | None = 0, # TODO: need type - order: MemoryOrder | None = None, - overwrite: bool = False, path: PathLike | None = None, - filters: list[dict[str, JSON]] | None = None, # TODO: type has changed - dimension_separator: Literal[".", "/"] | None = None, - zarr_format: ZarrFormat | None = None, + shape: ChunkCoords, + dtype: npt.DTypeLike, + shard_shape: ChunkCoords | None | Literal["auto"] = "auto", + chunk_shape: ChunkCoords | Literal["auto"] = "auto", + filters: Iterable[dict[str, JSON] | Codec] = (), + compressors: Iterable[dict[str, JSON] | Codec] = (), + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", + zarr_format: 
ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + ) = ("default", "/"), dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - **kwargs: Any, + overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. Parameters ---------- - shape : int or tuple of ints - Array shape. - chunks : int or tuple of ints, optional - Chunk shape. If True, will be guessed from `shape` and `dtype`. If - False, will be set to `shape`, i.e., single chunk for the whole array. - If an int, the chunk size in each dimension will be given by the value - of `chunks`. Default is True. - dtype : str or dtype, optional - NumPy dtype. - compressor : Codec, optional - Primary compressor. - fill_value : object - Default value to use for uninitialized portions of the array. - order : {'C', 'F'}, optional - Memory layout to be used within each chunk. - Default is set in Zarr's config (`array.order`). - store : Store or str - Store or path to directory in file system or name of zip file. - overwrite : bool, optional - If True, delete all pre-existing data in `store` at `path` before - creating the array. - path : str, optional - Path under which array is stored. - filters : sequence of Codecs, optional - Sequence of filters to use to encode chunk data prior to compression. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - zarr_format : {2, 3, None}, optional - The zarr format to use when saving. - storage_options : dict - If using an fsspec URL to create the store, these will be passed to - the backend implementation. Ignored otherwise. 
+ Returns ------- @@ -966,51 +932,70 @@ async def create_array( if zarr_format is None: zarr_format = _default_zarr_version() - if zarr_format == 2 and chunks is None: - chunks = shape - elif zarr_format == 3 and chunk_shape is None: - if chunks is not None: - chunk_shape = chunks - chunks = None - else: - chunk_shape = shape - - if dimension_separator is not None: - if zarr_format == 3: - raise ValueError( - "dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead" - ) - else: - warnings.warn( - "dimension_separator is not yet implemented", - RuntimeWarning, - stacklevel=2, - ) + # TODO: figure out why putting these imports at top-level causes circular imports + from zarr.codecs.bytes import BytesCodec + from zarr.codecs.sharding import ShardingCodec # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise mode: Literal["a"] = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + sub_codecs = (*filters, BytesCodec(), *compressors) - return await AsyncArray.create( - store_path, - shape=shape, - chunks=chunks, - dtype=dtype, - compressor=compressor, - fill_value=fill_value, - exists_ok=overwrite, - filters=filters, - dimension_separator=dimension_separator, - zarr_format=zarr_format, - chunk_shape=chunk_shape, - chunk_key_encoding=chunk_key_encoding, - codecs=codecs, - dimension_names=dimension_names, - attributes=attributes, - order=order, - **kwargs, - ) + if zarr_format == 2: + if shard_shape is not None or shard_shape != "auto": + msg = ( + 'Zarr v2 arrays can only be created with `shard_shape` set to `None` or `"auto"`.' + f"Got `shard_shape={shard_shape}` instead." 
+ ) + + raise ValueError(msg) + compressor, *rest = compressors + filters = (*filters, *rest) + if dimension_names is not None: + raise ValueError("Zarr v2 arrays do not support dimension names.") + return await AsyncArray._create_v2( + shape=shape, + dtype=dtype, + chunks=chunk_shape, + dimension_separator="/", + fill_value=fill_value, + order=order, + filters=filters, + compressor=compressor, + attributes=attributes, + overwrite=overwrite, + ) + else: + shard_shape_parsed, chunk_shape_parsed = _auto_partition( + shape, dtype, shard_shape, chunk_shape + ) + if shard_shape_parsed is not None: + sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) + sharding_codec.validate( + shape=chunk_shape_parsed, + dtype=dtype, + chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), + ) + codecs = (sharding_codec,) + chunks_out = shard_shape_parsed + else: + chunks_out = chunk_shape_parsed + codecs = sub_codecs + + return await AsyncArray._create_v3( + store=store_path, + shape=shape, + dtype=dtype, + fill_value=fill_value, + attributes=attributes, + chunk_shape=chunks_out, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + order=order, + overwrite=overwrite, + ) async def create( diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index afecc6824f..2330188de1 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -7,7 +7,7 @@ from abc import abstractmethod from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np @@ -141,6 +141,43 @@ def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tupl return tuple(int(c) for c in chunks) +import numpy.typing as npt + + +def _auto_partition( + shape: tuple[int, ...], + dtype: npt.DTypeLike, + shard_shape: tuple[int, ...] | Literal["auto"] | None, + chunk_shape: tuple[int, ...] 
| Literal["auto"], +) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: + """ + Automatically determine the shard shape and chunk shape for a new array, given the shape and dtype of the array. + If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based + on the dtype and shape of the array. + If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape + of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, + given the dtype and shard shape. Otherwise, the chunks will be returned as-is. + """ + # no sharding + item_size = np.dtype(dtype).itemsize + if shard_shape is None: + _shards_out = None + if chunk_shape == "auto": + _chunks_out = _guess_chunks(shape, item_size) + else: + _chunks_out = chunk_shape + else: + if shard_shape == "auto": + _shards_out = _guess_chunks(shape, item_size) + else: + _shards_out = shard_shape + if chunk_shape == "auto": + _chunks_out = _guess_chunks(_shards_out, item_size) + else: + _chunks_out = chunk_shape + return _shards_out, _chunks_out + + @dataclass(frozen=True) class ChunkGrid(Metadata): @classmethod diff --git a/tests/test_api.py b/tests/test_api.py index 496ff9809a..073ce5180b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -26,6 +26,8 @@ save_array, save_group, ) +from zarr.codecs.transpose import TransposeCodec +from zarr.codecs.zstd import ZstdCodec from zarr.core.common import MemoryOrder, ZarrFormat from zarr.errors import MetadataValidationError from zarr.storage._utils import normalize_path @@ -1145,3 +1147,36 @@ def test_open_array_with_mode_r_plus(store: Store) -> None: assert isinstance(z2, Array) assert (z2[:] == 1).all() z2[:] = 3 + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_v3(store: MemoryStore) -> None: + # TODO: fill in + _ = zarr.create_array( + store=store, + dtype="uint8", + shape=(10,), + shard_shape=(4,), + 
chunk_shape=(4,), + zarr_format=3, + filters=(TransposeCodec(order=(0,)),), + compressors=(ZstdCodec(level=3),), + ) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_v2(store: MemoryStore) -> None: + from numcodecs import Delta, Zstd + + # TODO: fill in + dtype = "uint8" + _ = zarr.create_array( + store=store, + dtype=dtype, + shape=(10,), + shard_shape=(4,), + chunk_shape=(4,), + zarr_format=3, + filters=(Delta(dtype=dtype),), + compressors=(Zstd(level=3),), + ) From 7a5cbe797cd7ffcb6ad8ee4ea49aa38d8bcca848 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 10 Dec 2024 13:58:44 +0100 Subject: [PATCH 11/85] tweak --- src/zarr/api/asynchronous.py | 3 ++- src/zarr/core/chunk_grids.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index af99bbce57..cac64a69d9 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -977,6 +977,7 @@ async def create_array( if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") return await AsyncArray._create_v2( + store_path=store_path, shape=shape, dtype=dtype, chunks=chunk_shape, @@ -1006,7 +1007,7 @@ async def create_array( codecs = sub_codecs return await AsyncArray._create_v3( - store=store_path, + store_path=store_path, shape=shape, dtype=dtype, fill_value=fill_value, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index cfc489e9b9..ebc07cce6a 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -154,7 +154,7 @@ def _auto_partition( chunk_shape: tuple[int, ...] | Literal["auto"], ) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: """ - Automatically determine the shard shape and chunk shape for a new array, given the shape and dtype of the array. + Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. 
If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based on the dtype and shape of the array. If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape From 489e2a2722d27ba3958cf1a807f7251e0a22b2d2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 18 Dec 2024 16:02:30 +0100 Subject: [PATCH 12/85] make logic of _auto_partition better for shard shape --- src/zarr/core/chunk_grids.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ebc07cce6a..54ae781a81 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -25,7 +25,7 @@ if TYPE_CHECKING: from collections.abc import Iterator from typing import Self - + import numpy.typing as npt def _guess_chunks( shape: ShapeLike, @@ -144,8 +144,6 @@ def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tupl return tuple(int(c) for c in chunks) -import numpy.typing as npt - def _auto_partition( shape: tuple[int, ...], @@ -170,14 +168,17 @@ def _auto_partition( else: _chunks_out = chunk_shape else: - if shard_shape == "auto": - _shards_out = _guess_chunks(shape, item_size) - else: - _shards_out = shard_shape if chunk_shape == "auto": - _chunks_out = _guess_chunks(_shards_out, item_size) + # aim for a 1MiB chunk + _chunks_out = _guess_chunks(shape, item_size, max_bytes=1024**2) else: _chunks_out = chunk_shape + if shard_shape == "auto": + # TODO: fix me! 
this should be capped at some sane shard shape + _shards_out = tuple(c * 8 for c in _chunks_out) + else: + _shards_out = shard_shape + return _shards_out, _chunks_out From 05dd0d84fb1f3c0109e823d091f1334a2ae0ee4b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 18 Dec 2024 21:42:00 +0100 Subject: [PATCH 13/85] add dtype parsing, and tweak auto_partitioning func --- src/zarr/api/asynchronous.py | 7 ++++--- src/zarr/core/chunk_grids.py | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 065badc908..facbe10999 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -18,6 +18,7 @@ ChunkCoords, MemoryOrder, ZarrFormat, + parse_dtype, ) from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata @@ -972,7 +973,7 @@ async def create_array( store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) sub_codecs = (*filters, BytesCodec(), *compressors) - + _dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) if zarr_format == 2: if shard_shape is not None or shard_shape != "auto": msg = ( @@ -988,7 +989,7 @@ async def create_array( return await AsyncArray._create_v2( store_path=store_path, shape=shape, - dtype=dtype, + dtype=_dtype_parsed, chunks=chunk_shape, dimension_separator="/", fill_value=fill_value, @@ -1018,7 +1019,7 @@ async def create_array( return await AsyncArray._create_v3( store_path=store_path, shape=shape, - dtype=dtype, + dtype=_dtype_parsed, fill_value=fill_value, attributes=attributes, chunk_shape=chunks_out, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 54ae781a81..16727fbf8a 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -25,8 +25,10 @@ if TYPE_CHECKING: from collections.abc import Iterator from typing import Self + import numpy.typing as npt + def _guess_chunks( shape: 
ShapeLike, typesize: int, @@ -144,7 +146,6 @@ def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tupl return tuple(int(c) for c in chunks) - def _auto_partition( shape: tuple[int, ...], dtype: npt.DTypeLike, @@ -170,12 +171,13 @@ def _auto_partition( else: if chunk_shape == "auto": # aim for a 1MiB chunk - _chunks_out = _guess_chunks(shape, item_size, max_bytes=1024**2) + _chunks_out = _guess_chunks(shape, item_size, max_bytes=1024) else: _chunks_out = chunk_shape + if shard_shape == "auto": # TODO: fix me! this should be capped at some sane shard shape - _shards_out = tuple(c * 8 for c in _chunks_out) + _shards_out = tuple(c * 2 for c in _chunks_out) else: _shards_out = shard_shape From 3fbfc218359e3c6ec1971dab710109b6c0dc452b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Dec 2024 13:33:26 +0100 Subject: [PATCH 14/85] sketch of docstring; remove auto chunks / shard shape --- src/zarr/api/asynchronous.py | 38 +++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index facbe10999..0e20d3d455 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -931,8 +931,8 @@ async def create_array( path: PathLike | None = None, shape: ChunkCoords, dtype: npt.DTypeLike, - shard_shape: ChunkCoords | None | Literal["auto"] = "auto", - chunk_shape: ChunkCoords | Literal["auto"] = "auto", + chunk_shape: ChunkCoords, + shard_shape: ChunkCoords | None, filters: Iterable[dict[str, JSON] | Codec] = (), compressors: Iterable[dict[str, JSON] | Codec] = (), fill_value: Any | None = 0, @@ -953,7 +953,39 @@ async def create_array( Parameters ---------- - + store: str or Store + Store or path to directory in file system or name of zip file. + path: str or None, optional + The path within the store to open. + shape: ChunkCoords + Shape of the array. + dtype: npt.DTypeLike + Data type of the array. 
+ chunk_shape: ChunkCoords + Chunk shape of the array. + shard_shape: ChunkCoords | None + Shard shape of the array. + filters: Iterable[Codec], optional + List of filters to apply to the array. + compressors: Iterable[Codec], optional + List of compressors to apply to the array. + fill_value: Any, optional + Fill value for the array. + order: {"C", "F"}, optional + Memory layout of the array. + zarr_format: {2, 3}, optional + The zarr format to use when saving. + attributes: dict, optional + Attributes for the array. + chunk_key_encoding: ChunkKeyEncoding, optional + The chunk key encoding to use. + dimension_names: Iterable[str], optional + Dimension names for the array. + storage_options: dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite: bool, default False + Whether to overwrite an array with the same name in the store, if one exists. Returns ------- From 5025ad685fc8295451d328724f9155bbf27a19ee Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Dec 2024 13:40:31 +0100 Subject: [PATCH 15/85] tweak docstring --- src/zarr/api/asynchronous.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 064b16a18c..c27688aa06 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -956,7 +956,8 @@ async def create_array( store: str or Store Store or path to directory in file system or name of zip file. path: str or None, optional - The path within the store to open. + The name of the array within the store. If ``path`` is ``None``, the array will be located + at the root of the store. shape: ChunkCoords Shape of the array. 
dtype: npt.DTypeLike From 68465db57fcfdbcbfcf2467870d941bdbd623128 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 14:28:29 +0100 Subject: [PATCH 16/85] docstrings --- src/zarr/__init__.py | 6 +++--- src/zarr/api/asynchronous.py | 42 +++++++++++++++++++----------------- src/zarr/codecs/sharding.py | 4 +++- src/zarr/core/common.py | 2 +- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index c07610b7e3..3eb9170908 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -53,7 +53,6 @@ "create", "create_array", "create_group", - "read_array", "empty", "empty_like", "full", @@ -63,12 +62,13 @@ "ones", "ones_like", "open", - "read", "open_array", "open_consolidated", "open_group", - "read_group", "open_like", + "read", + "read_array", + "read_group", "save", "save_array", "save_group", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 4846a1b0bf..228adc511e 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -952,46 +952,48 @@ async def create_array( dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. Parameters ---------- - store: str or Store + store : str or Store Store or path to directory in file system or name of zip file. - path: str or None, optional + path : str or None, optional The name of the array within the store. If ``path`` is ``None``, the array will be located at the root of the store. - shape: ChunkCoords + shape : ChunkCoords Shape of the array. - dtype: npt.DTypeLike + dtype : npt.DTypeLike Data type of the array. - chunk_shape: ChunkCoords + chunk_shape : ChunkCoords Chunk shape of the array. 
- shard_shape: ChunkCoords | None - Shard shape of the array. - filters: Iterable[Codec], optional + shard_shape : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional List of filters to apply to the array. - compressors: Iterable[Codec], optional + compressors : Iterable[Codec], optional List of compressors to apply to the array. - fill_value: Any, optional + fill_value : Any, optional Fill value for the array. - order: {"C", "F"}, optional + order : {"C", "F"}, optional Memory layout of the array. - zarr_format: {2, 3}, optional + zarr_format : {2, 3}, optional The zarr format to use when saving. - attributes: dict, optional + attributes : dict, optional Attributes for the array. - chunk_key_encoding: ChunkKeyEncoding, optional + chunk_key_encoding : ChunkKeyEncoding, optional The chunk key encoding to use. - dimension_names: Iterable[str], optional + dimension_names : Iterable[str], optional Dimension names for the array. - storage_options: dict, optional + storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. - overwrite: bool, default False + overwrite : bool, default False Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration for the array. 
Returns ------- @@ -1026,7 +1028,7 @@ async def create_array( if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") if order is None: - order_parsed = zarr_config.get('array.order') + order_parsed = zarr_config.get("array.order") else: order_parsed = order return await AsyncArray._create_v2( @@ -1071,7 +1073,7 @@ async def create_array( codecs=codecs, dimension_names=dimension_names, overwrite=overwrite, - config=config_parsed + config=config_parsed, ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index a01145b3b2..d646423eaf 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -396,7 +396,9 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return replace(self, codecs=evolved_codecs) return self - def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: + def validate( + self, *, shape: ChunkCoords, dtype: np.dtype[np.generic], chunk_grid: ChunkGrid + ) -> None: if len(self.chunk_shape) != len(shape): raise ValueError( "The shard's `chunk_shape` and array's `shape` need to have the same number of dimensions." 
diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 3db00b1a06..9f51fcb7e6 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -167,7 +167,7 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[np.generic]: if dtype is str or dtype == "str": if zarr_format == 2: # special case as object From d7bb1215bd6672a39a3e186e077b8a369696ca5b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 14:37:29 +0100 Subject: [PATCH 17/85] ensure tests pass --- src/zarr/api/asynchronous.py | 10 +++++++--- tests/test_api.py | 25 ++++++++++++++++++++----- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 228adc511e..ca3aae0a11 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -936,7 +936,7 @@ async def create_array( shape: ChunkCoords, dtype: npt.DTypeLike, chunk_shape: ChunkCoords, - shard_shape: ChunkCoords | None, + shard_shape: ChunkCoords | None = None, filters: Iterable[dict[str, JSON] | Codec] = (), compressors: Iterable[dict[str, JSON] | Codec] = (), fill_value: Any | None = 0, @@ -1016,14 +1016,18 @@ async def create_array( _dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) config_parsed = parse_array_config(config) if zarr_format == 2: - if shard_shape is not None or shard_shape != "auto": + if shard_shape is not None: msg = ( 'Zarr v2 arrays can only be created with `shard_shape` set to `None` or `"auto"`.' f"Got `shard_shape={shard_shape}` instead." 
) raise ValueError(msg) - compressor, *rest = compressors + if len(tuple(compressors)) > 1: + compressor, *rest = compressors + else: + compressor = None + rest = () filters = (*filters, *rest) if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") diff --git a/tests/test_api.py b/tests/test_api.py index 64f15e63c5..fd8267de6e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -70,7 +70,14 @@ def test_read(store: Store) -> None: """ # create an array and a group _ = create_group(store=store, path="group", attributes={"node_type": "group"}) - _ = create_array(store=store, path="array", shape=(10, 10), attributes={"node_type": "array"}) + _ = create_array( + store=store, + path="array", + shape=(10, 10), + chunk_shape=(1, 1), + dtype="uint8", + attributes={"node_type": "array"}, + ) group_r = read(store, path="group") assert isinstance(group_r, Group) @@ -89,7 +96,9 @@ def test_create_array(store: Store) -> None: shape = (10, 10) path = "foo" data_val = 1 - array_w = create_array(store, path=path, shape=shape, attributes=attrs) + array_w = create_array( + store, path=path, shape=shape, attributes=attrs, chunk_shape=shape, dtype="uint8" + ) array_w[:] = data_val assert array_w.shape == shape assert array_w.attrs == attrs @@ -107,7 +116,13 @@ def test_read_array(store: Store) -> None: for zarr_format in (2, 3): attrs = {"zarr_format": zarr_format} node_w = create_array( - store, path=path, shape=shape, attributes=attrs, zarr_format=zarr_format + store, + path=path, + shape=shape, + attributes=attrs, + zarr_format=zarr_format, + chunk_shape=shape, + dtype="uint8", ) node_w[:] = data_val @@ -1214,9 +1229,9 @@ async def test_create_array_v2(store: MemoryStore) -> None: store=store, dtype=dtype, shape=(10,), - shard_shape=(4,), + shard_shape=None, chunk_shape=(4,), - zarr_format=3, + zarr_format=2, filters=(Delta(dtype=dtype),), compressors=(Zstd(level=3),), ) From 99cc8f57490eac695e967b7ed1b58f3a1cbe8336 Mon Sep 17 
00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 14:48:52 +0100 Subject: [PATCH 18/85] tuple -> list --- src/zarr/api/asynchronous.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index ca3aae0a11..b09d294b50 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1027,7 +1027,7 @@ async def create_array( compressor, *rest = compressors else: compressor = None - rest = () + rest = [] filters = (*filters, *rest) if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") From a39457f084391f83f912d8de86ec2077e46058c2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 16:00:45 +0100 Subject: [PATCH 19/85] allow data in create_array --- src/zarr/api/asynchronous.py | 155 +------------------------------- src/zarr/api/synchronous.py | 3 +- src/zarr/core/array.py | 166 ++++++++++++++++++++++++++++++++++- src/zarr/core/group.py | 147 +++++++++++++------------------ 4 files changed, 229 insertions(+), 242 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index b09d294b50..3f460cf9f2 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -10,9 +10,8 @@ from typing_extensions import deprecated from zarr.core.array import Array, AsyncArray, get_array_metadata -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import NDArrayLike -from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import ( JSON, AccessModeLiteral, @@ -929,158 +928,6 @@ async def read_group( ) -async def create_array( - store: str | StoreLike, - *, - path: PathLike | None = None, - shape: ChunkCoords, - dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, - shard_shape: ChunkCoords | None = None, - filters: 
Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), - fill_value: Any | None = 0, - order: MemoryOrder | None = "C", - zarr_format: ZarrFormat | None = 3, - attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = ("default", "/"), - dimension_names: Iterable[str] | None = None, - storage_options: dict[str, Any] | None = None, - overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, -) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an array. - - Parameters - ---------- - store : str or Store - Store or path to directory in file system or name of zip file. - path : str or None, optional - The name of the array within the store. If ``path`` is ``None``, the array will be located - at the root of the store. - shape : ChunkCoords - Shape of the array. - dtype : npt.DTypeLike - Data type of the array. - chunk_shape : ChunkCoords - Chunk shape of the array. - shard_shape : ChunkCoords, optional - Shard shape of the array. The default value of ``None`` results in no sharding at all. - filters : Iterable[Codec], optional - List of filters to apply to the array. - compressors : Iterable[Codec], optional - List of compressors to apply to the array. - fill_value : Any, optional - Fill value for the array. - order : {"C", "F"}, optional - Memory layout of the array. - zarr_format : {2, 3}, optional - The zarr format to use when saving. - attributes : dict, optional - Attributes for the array. - chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding to use. - dimension_names : Iterable[str], optional - Dimension names for the array. - storage_options : dict, optional - If using an fsspec URL to create the store, these will be passed to the backend implementation. - Ignored otherwise. 
- overwrite : bool, default False - Whether to overwrite an array with the same name in the store, if one exists. - config : ArrayConfig or ArrayConfigParams, optional - Runtime configuration for the array. - - Returns - ------- - z : array - The array. - """ - - if zarr_format is None: - zarr_format = _default_zarr_version() - - # TODO: figure out why putting these imports at top-level causes circular imports - from zarr.codecs.bytes import BytesCodec - from zarr.codecs.sharding import ShardingCodec - - # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise - mode: Literal["a"] = "a" - - store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) - sub_codecs = (*filters, BytesCodec(), *compressors) - _dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) - config_parsed = parse_array_config(config) - if zarr_format == 2: - if shard_shape is not None: - msg = ( - 'Zarr v2 arrays can only be created with `shard_shape` set to `None` or `"auto"`.' - f"Got `shard_shape={shard_shape}` instead." 
- ) - - raise ValueError(msg) - if len(tuple(compressors)) > 1: - compressor, *rest = compressors - else: - compressor = None - rest = [] - filters = (*filters, *rest) - if dimension_names is not None: - raise ValueError("Zarr v2 arrays do not support dimension names.") - if order is None: - order_parsed = zarr_config.get("array.order") - else: - order_parsed = order - return await AsyncArray._create_v2( - store_path=store_path, - shape=shape, - dtype=_dtype_parsed, - chunks=chunk_shape, - dimension_separator="/", - fill_value=fill_value, - order=order_parsed, - filters=filters, - compressor=compressor, - attributes=attributes, - overwrite=overwrite, - config=config_parsed, - ) - else: - shard_shape_parsed, chunk_shape_parsed = _auto_partition( - shape, dtype, shard_shape, chunk_shape - ) - if shard_shape_parsed is not None: - sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) - sharding_codec.validate( - shape=chunk_shape_parsed, - dtype=dtype, - chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), - ) - codecs = (sharding_codec,) - chunks_out = shard_shape_parsed - else: - chunks_out = chunk_shape_parsed - codecs = sub_codecs - - return await AsyncArray._create_v3( - store_path=store_path, - shape=shape, - dtype=_dtype_parsed, - fill_value=fill_value, - attributes=attributes, - chunk_shape=chunks_out, - chunk_key_encoding=chunk_key_encoding, - codecs=codecs, - dimension_names=dimension_names, - overwrite=overwrite, - config=config_parsed, - ) - - async def create( shape: ChunkCoords | int, *, # Note: this is a change from v2 diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 49625eb3e8..1dc61a9bba 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -5,6 +5,7 @@ from typing_extensions import deprecated import zarr.api.asynchronous as async_api +import zarr.core.array from zarr._compat import _deprecate_positional_args from zarr.core.array import Array, AsyncArray from 
zarr.core.group import Group @@ -727,7 +728,7 @@ def create( def create_array(*args: Any, **kwargs: Any) -> Array: - return Array(sync(async_api.create_array(*args, **kwargs))) + return Array(sync(zarr.core.array.create_array(*args, **kwargs))) def read_array(*args: Any, **kwargs: Any) -> Array: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fa5f3a9100..5432c447d8 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,6 +3,7 @@ import json import warnings from asyncio import gather +from collections.abc import Iterable from dataclasses import dataclass, field from itertools import starmap from logging import getLogger @@ -14,6 +15,7 @@ from zarr._compat import _deprecate_positional_args from zarr.abc.store import Store, set_or_delete +from zarr.api.asynchronous import PathLike, _default_zarr_version from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config @@ -24,7 +26,7 @@ NDBuffer, default_buffer_prototype, ) -from zarr.core.chunk_grids import RegularChunkGrid, normalize_chunks +from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, DefaultChunkKeyEncoding, @@ -3450,3 +3452,165 @@ def _get_default_codecs( dtype_key = "numeric" return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] + + +async def create_array( + store: str | StoreLike, + *, + path: PathLike | None = None, + shape: ChunkCoords, + dtype: npt.DTypeLike, + chunk_shape: ChunkCoords, + shard_shape: ChunkCoords | None = None, + filters: Iterable[dict[str, JSON] | Codec] = (), + compressors: Iterable[dict[str, JSON] | Codec] = (), + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | 
tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = ("default", "/"), + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, + data: np.ndarray | None = None, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Create an array. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file. + path : str or None, optional + The name of the array within the store. If ``path`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunk_shape : ChunkCoords + Chunk shape of the array. + shard_shape : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + List of filters to apply to the array. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + Memory layout of the array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional + The chunk key encoding to use. + dimension_names : Iterable[str], optional + Dimension names for the array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration for the array. + data : np.ndarray, optional + Initial data for the array. 
+ + Returns + ------- + z : array + The array. + """ + + if zarr_format is None: + zarr_format = _default_zarr_version() + + # TODO: figure out why putting these imports at top-level causes circular imports + from zarr.codecs.bytes import BytesCodec + from zarr.codecs.sharding import ShardingCodec + + # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise + mode: Literal["a"] = "a" + + store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) + sub_codecs = (*filters, BytesCodec(), *compressors) + _dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + config_parsed = parse_array_config(config) + result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] + if zarr_format == 2: + if shard_shape is not None: + msg = ( + 'Zarr v2 arrays can only be created with `shard_shape` set to `None` or `"auto"`.' + f"Got `shard_shape={shard_shape}` instead." + ) + + raise ValueError(msg) + if len(tuple(compressors)) > 1: + compressor, *rest = compressors + else: + compressor = None + rest = [] + filters = (*filters, *rest) + if dimension_names is not None: + raise ValueError("Zarr v2 arrays do not support dimension names.") + if order is None: + order_parsed = zarr_config.get("array.order") + else: + order_parsed = order + result = await AsyncArray._create_v2( + store_path=store_path, + shape=shape, + dtype=_dtype_parsed, + chunks=chunk_shape, + dimension_separator="/", + fill_value=fill_value, + order=order_parsed, + filters=filters, + compressor=compressor, + attributes=attributes, + overwrite=overwrite, + config=config_parsed, + ) + else: + shard_shape_parsed, chunk_shape_parsed = _auto_partition( + shape, dtype, shard_shape, chunk_shape + ) + if shard_shape_parsed is not None: + sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) + sharding_codec.validate( + shape=chunk_shape_parsed, + dtype=dtype, + chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), 
+ ) + codecs = (sharding_codec,) + chunks_out = shard_shape_parsed + else: + chunks_out = chunk_shape_parsed + codecs = sub_codecs + + result = await AsyncArray._create_v3( + store_path=store_path, + shape=shape, + dtype=_dtype_parsed, + fill_value=fill_value, + attributes=attributes, + chunk_shape=chunks_out, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + overwrite=overwrite, + config=config_parsed, + ) + + if data is not None: + await result.setitem( + selection=slice(None), value=data, prototype=default_buffer_prototype() + ) + return result diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index a9d2e67c16..f934065d96 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -18,7 +18,8 @@ from zarr.abc.metadata import Metadata from zarr.abc.store import Store, set_or_delete from zarr.core._info import GroupInfo -from zarr.core.array import Array, AsyncArray, _build_parents +from zarr.core.array import Array, AsyncArray, _build_parents, create_array +from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.attributes import Attributes from zarr.core.buffer import default_buffer_prototype from zarr.core.common import ( @@ -995,119 +996,93 @@ async def require_groups(self, *names: str) -> tuple[AsyncGroup, ...]: async def create_array( self, - name: str, + path: str, *, - shape: ShapeLike, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + shape: ChunkCoords, + dtype: npt.DTypeLike, + chunk_shape: ChunkCoords, + shard_shape: ChunkCoords | None = None, + filters: Iterable[dict[str, JSON] | Codec] = (), + compressors: Iterable[dict[str, JSON] | Codec] = (), + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", + zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | 
tuple[Literal["v2"], Literal[".", "/"]] | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + ) = ("default", "/"), dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ShapeLike | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, - data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Create a Zarr array within this AsyncGroup. - This method lightly wraps AsyncArray.create. + This method lightly wraps ``zarr.core.array.create_array``. Parameters ---------- - name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. - If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None - A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. - Default is ``("default", "/")``. - codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations of Codecs. The elements of - this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. - - If no codecs are provided, default codecs will be used: - - - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. 
- - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str] | None = None - The names of the dimensions of the array. V3 only. - chunks : ChunkCoords | None = None - The shape of the chunks of the array. - V2 only. V3 arrays should use ``chunk_shape`` instead. - If not specified, default are guessed based on the shape and dtype. - dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. - order : Literal["C", "F"] | None = None - The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). - filters : list[dict[str, JSON]] | None = None - Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) - compressor : dict[str, JSON] | None = None - The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. - - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - - For numeric arrays, the default is ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. - overwrite : bool = False - If True, a pre-existing array or group at the path of this array will - be overwritten. If False, the presence of a pre-existing array or group is - an error. + path : str + The name of the array relative to the group. If ``path`` is ``None``, the array will be located + at the root of the store. 
+ shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunk_shape : ChunkCoords + Chunk shape of the array. + shard_shape : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + List of filters to apply to the array. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + Memory layout of the array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional + The chunk key encoding to use. + dimension_names : Iterable[str], optional + Dimension names for the array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration for the array. 
Returns ------- AsyncArray """ - return await AsyncArray.create( - self.store_path / name, + return await create_array( + store=self.store_path, + path=path, shape=shape, dtype=dtype, chunk_shape=chunk_shape, + shard_shape=shard_shape, + filters=filters, + compressors=compressors, fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, dimension_names=dimension_names, - attributes=attributes, - chunks=chunks, - dimension_separator=dimension_separator, - order=order, - filters=filters, - compressor=compressor, + storage_options=storage_options, overwrite=overwrite, - zarr_format=self.metadata.zarr_format, - data=data, + config=config, ) @deprecated("Use AsyncGroup.create_array instead.") From 3f0a3e0aaf01c1b86d96bdbbee824ef398135e8c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 16:02:42 +0100 Subject: [PATCH 20/85] docstring --- src/zarr/core/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 5432c447d8..4a2e9b82bb 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3522,7 +3522,7 @@ async def create_array( Runtime configuration for the array. data : np.ndarray, optional Initial data for the array. 
- + Returns ------- z : array From 26ced00182f13500df726949901e70b19dafdec4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 16:14:31 +0100 Subject: [PATCH 21/85] remove auto_partition --- src/zarr/core/chunk_grids.py | 38 ------------------------------------ src/zarr/core/group.py | 2 +- 2 files changed, 1 insertion(+), 39 deletions(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 16727fbf8a..af32a09f51 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -146,44 +146,6 @@ def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tupl return tuple(int(c) for c in chunks) -def _auto_partition( - shape: tuple[int, ...], - dtype: npt.DTypeLike, - shard_shape: tuple[int, ...] | Literal["auto"] | None, - chunk_shape: tuple[int, ...] | Literal["auto"], -) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: - """ - Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. - If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based - on the dtype and shape of the array. - If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape - of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, - given the dtype and shard shape. Otherwise, the chunks will be returned as-is. - """ - # no sharding - item_size = np.dtype(dtype).itemsize - if shard_shape is None: - _shards_out = None - if chunk_shape == "auto": - _chunks_out = _guess_chunks(shape, item_size) - else: - _chunks_out = chunk_shape - else: - if chunk_shape == "auto": - # aim for a 1MiB chunk - _chunks_out = _guess_chunks(shape, item_size, max_bytes=1024) - else: - _chunks_out = chunk_shape - - if shard_shape == "auto": - # TODO: fix me! 
this should be capped at some sane shard shape - _shards_out = tuple(c * 2 for c in _chunks_out) - else: - _shards_out = shard_shape - - return _shards_out, _chunks_out - - @dataclass(frozen=True) class ChunkGrid(Metadata): @classmethod diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index f934065d96..47d7cde100 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -46,7 +46,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator, Generator, Iterable, Iterator from typing import Any - + from zarr.core.common import MemoryOrder from zarr.abc.codec import Codec from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_key_encodings import ChunkKeyEncoding From af55ac4baf09db8549c7fa4ac2530e8c0a203a8f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 16:15:06 +0100 Subject: [PATCH 22/85] make shape shapelike --- src/zarr/core/array.py | 7 ++++--- src/zarr/core/group.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4a2e9b82bb..afb06ec8ec 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3458,7 +3458,7 @@ async def create_array( store: str | StoreLike, *, path: PathLike | None = None, - shape: ChunkCoords, + shape: ShapeLike, dtype: npt.DTypeLike, chunk_shape: ChunkCoords, shard_shape: ChunkCoords | None = None, @@ -3543,6 +3543,7 @@ async def create_array( sub_codecs = (*filters, BytesCodec(), *compressors) _dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) config_parsed = parse_array_config(config) + shape_parsed = parse_shapelike(shape) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] if zarr_format == 2: if shard_shape is not None: @@ -3566,7 +3567,7 @@ async def create_array( order_parsed = order result = await AsyncArray._create_v2( store_path=store_path, - shape=shape, + shape=shape_parsed, dtype=_dtype_parsed, chunks=chunk_shape, dimension_separator="/", @@ -3597,7 
+3598,7 @@ async def create_array( result = await AsyncArray._create_v3( store_path=store_path, - shape=shape, + shape=shape_parsed, dtype=_dtype_parsed, fill_value=fill_value, attributes=attributes, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 47d7cde100..6240d241eb 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -46,10 +46,11 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator, Generator, Iterable, Iterator from typing import Any - from zarr.core.common import MemoryOrder + from zarr.abc.codec import Codec from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_key_encodings import ChunkKeyEncoding + from zarr.core.common import MemoryOrder logger = logging.getLogger("zarr.group") From 07f07eab2d1c9a1e4e1f0e3a49e8ef0836b94f71 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 16:39:12 +0100 Subject: [PATCH 23/85] use create_array everywhere in group class --- src/zarr/core/array.py | 19 +-- src/zarr/core/chunk_grids.py | 4 +- src/zarr/core/group.py | 294 ++++++++++++++--------------------- 3 files changed, 130 insertions(+), 187 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index afb06ec8ec..f668508a6e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -26,7 +26,7 @@ NDBuffer, default_buffer_prototype, ) -from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks +from zarr.core.chunk_grids import RegularChunkGrid, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, DefaultChunkKeyEncoding, @@ -3478,7 +3478,7 @@ async def create_array( storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, - data: np.ndarray | None = None, + data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. 
@@ -3580,20 +3580,17 @@ async def create_array( config=config_parsed, ) else: - shard_shape_parsed, chunk_shape_parsed = _auto_partition( - shape, dtype, shard_shape, chunk_shape - ) - if shard_shape_parsed is not None: - sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) + if shard_shape is not None: + sharding_codec = ShardingCodec(chunk_shape=chunk_shape, codecs=sub_codecs) sharding_codec.validate( - shape=chunk_shape_parsed, + shape=chunk_shape, dtype=dtype, - chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), + chunk_grid=RegularChunkGrid(chunk_shape=shard_shape), ) codecs = (sharding_codec,) - chunks_out = shard_shape_parsed + chunks_out = shard_shape else: - chunks_out = chunk_shape_parsed + chunks_out = chunk_shape codecs = sub_codecs result = await AsyncArray._create_v3( diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index af32a09f51..ea050e39ef 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -7,7 +7,7 @@ from abc import abstractmethod from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any import numpy as np @@ -26,8 +26,6 @@ from collections.abc import Iterator from typing import Self - import numpy.typing as npt - def _guess_chunks( shape: ShapeLike, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 6240d241eb..4cd78b14c3 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -19,7 +19,6 @@ from zarr.abc.store import Store, set_or_delete from zarr.core._info import GroupInfo from zarr.core.array import Array, AsyncArray, _build_parents, create_array -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.attributes import Attributes from zarr.core.buffer import default_buffer_prototype from zarr.core.common import ( @@ -48,6 +47,7 @@ from typing import Any from zarr.abc.codec import Codec + from 
zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_key_encodings import ChunkKeyEncoding from zarr.core.common import MemoryOrder @@ -999,7 +999,7 @@ async def create_array( self, path: str, *, - shape: ChunkCoords, + shape: ShapeLike, dtype: npt.DTypeLike, chunk_shape: ChunkCoords, shard_shape: ChunkCoords | None = None, @@ -1019,6 +1019,7 @@ async def create_array( storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, + data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Create a Zarr array within this AsyncGroup. @@ -1084,6 +1085,7 @@ async def create_array( storage_options=storage_options, overwrite=overwrite, config=config, + data=data, ) @deprecated("Use AsyncGroup.create_array instead.") @@ -2198,122 +2200,95 @@ def create(self, *args: Any, **kwargs: Any) -> Array: @_deprecate_positional_args def create_array( self, - name: str, + path: str, *, shape: ShapeLike, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + dtype: npt.DTypeLike, + chunk_shape: ChunkCoords, + shard_shape: ChunkCoords | None = None, + filters: Iterable[dict[str, JSON] | Codec] = (), + compressors: Iterable[dict[str, JSON] | Codec] = (), + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", + zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + ) = ("default", "/"), dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ShapeLike | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: 
list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, data: npt.ArrayLike | None = None, ) -> Array: - """Create a zarr array within this AsyncGroup. - - This method lightly wraps `AsyncArray.create`. + """ + Create a Zarr array within this AsyncGroup. + This method lightly wraps ``zarr.core.array.create_array``. Parameters ---------- - name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. - If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None - A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. - Default is ``("default", "/")``. - codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations of Codecs. The elements of - this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. - - If no codecs are provided, default codecs will be used: - - - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str] | None = None - The names of the dimensions of the array. V3 only. 
- chunks : ChunkCoords | None = None - The shape of the chunks of the array. - V2 only. V3 arrays should use ``chunk_shape`` instead. - If not specified, default are guessed based on the shape and dtype. - dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. - order : Literal["C", "F"] | None = None - The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). - filters : list[dict[str, JSON]] | None = None - Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) - compressor : dict[str, JSON] | None = None - The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. - - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - - For numeric arrays, the default is ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. - overwrite : bool = False - If True, a pre-existing array or group at the path of this array will - be overwritten. If False, the presence of a pre-existing array or group is - an error. - data : npt.ArrayLike | None = None - Array data to initialize the array with. + path : str + The name of the array relative to the group. If ``path`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunk_shape : ChunkCoords + Chunk shape of the array. + shard_shape : ChunkCoords, optional + Shard shape of the array. 
The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + List of filters to apply to the array. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + Memory layout of the array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional + The chunk key encoding to use. + dimension_names : Iterable[str], optional + Dimension names for the array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration for the array. Returns ------- - - Array - + AsyncArray """ + return Array( self._sync( self._async_group.create_array( - name=name, + path=path, shape=shape, dtype=dtype, + chunk_shape=chunk_shape, + shard_shape=shard_shape, fill_value=fill_value, attributes=attributes, - chunk_shape=chunk_shape, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, + compressors=compressors, dimension_names=dimension_names, - chunks=chunks, - dimension_separator=dimension_separator, order=order, + zarr_format=zarr_format, filters=filters, - compressor=compressor, overwrite=overwrite, + storage_options=storage_options, + config=config, data=data, ) ) @@ -2568,122 +2543,95 @@ def move(self, source: str, dest: str) -> None: @_deprecate_positional_args def array( self, - name: str, + path: str, *, - shape: ChunkCoords, - dtype: npt.DTypeLike = "float64", - fill_value: Any | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunk_shape: ChunkCoords, + shard_shape: ChunkCoords | None = None, + filters: 
Iterable[dict[str, JSON] | Codec] = (), + compressors: Iterable[dict[str, JSON] | Codec] = (), + fill_value: Any | None = 0, + order: MemoryOrder | None = "C", + zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - # v3 only - chunk_shape: ChunkCoords | None = None, chunk_key_encoding: ( ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None - ) = None, - codecs: Iterable[Codec | dict[str, JSON]] | None = None, + ) = ("default", "/"), dimension_names: Iterable[str] | None = None, - # v2 only - chunks: ChunkCoords | None = None, - dimension_separator: Literal[".", "/"] | None = None, - order: Literal["C", "F"] | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, - # runtime + storage_options: dict[str, Any] | None = None, overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, data: npt.ArrayLike | None = None, ) -> Array: - """Create a zarr array within this AsyncGroup. - - This method lightly wraps `AsyncArray.create`. + """ + Create a Zarr array within this AsyncGroup. + This method lightly wraps ``zarr.core.array.create_array``. Parameters ---------- - name : str - The name of the array. - shape : tuple[int, ...] - The shape of the array. - dtype : np.DtypeLike = float64 - The data type of the array. - chunk_shape : tuple[int, ...] | None = None - The shape of the chunks of the array. - V3 only. V2 arrays should use `chunks` instead. - If not specified, default are guessed based on the shape and dtype. - chunk_key_encoding : ChunkKeyEncoding | tuple[Literal["default"], Literal[".", "/"]] | tuple[Literal["v2"], Literal[".", "/"]] | None = None - A specification of how the chunk keys are represented in storage. - V3 only. V2 arrays should use `dimension_separator` instead. - Default is ``("default", "/")``. 
- codecs : Iterable[Codec | dict[str, JSON]] | None = None - An iterable of Codec or dict serializations of Codecs. The elements of - this collection specify the transformation from array values to stored bytes. - V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. - - If no codecs are provided, default codecs will be used: - - - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. - dimension_names : Iterable[str] | None = None - The names of the dimensions of the array. V3 only. - chunks : ChunkCoords | None = None - The shape of the chunks of the array. - V2 only. V3 arrays should use ``chunk_shape`` instead. - If not specified, default are guessed based on the shape and dtype. - dimension_separator : Literal[".", "/"] | None = None - The delimiter used for the chunk keys. (default: ".") - V2 only. V3 arrays should use ``chunk_key_encoding`` instead. - order : Literal["C", "F"] | None = None - The memory order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). - filters : list[dict[str, JSON]] | None = None - Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) - compressor : dict[str, JSON] | None = None - The compressor used to compress the data (default is None). - V2 only. V3 arrays should use ``codecs`` instead. - - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: - - - For numeric arrays, the default is ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. 
- - For bytes or objects, the default is ``VLenBytesCodec``. - - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. - overwrite : bool = False - If True, a pre-existing array or group at the path of this array will - be overwritten. If False, the presence of a pre-existing array or group is - an error. - data : npt.ArrayLike | None = None - Array data to initialize the array with. + path : str + The name of the array relative to the group. If ``path`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunk_shape : ChunkCoords + Chunk shape of the array. + shard_shape : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + List of filters to apply to the array. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. + fill_value : Any, optional + Fill value for the array. + order : {"C", "F"}, optional + Memory layout of the array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional + The chunk key encoding to use. + dimension_names : Iterable[str], optional + Dimension names for the array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration for the array. 
Returns ------- - - Array - + AsyncArray """ + return Array( self._sync( self._async_group.create_array( - name=name, + path=path, shape=shape, dtype=dtype, + chunk_shape=chunk_shape, + shard_shape=shard_shape, fill_value=fill_value, attributes=attributes, - chunk_shape=chunk_shape, chunk_key_encoding=chunk_key_encoding, - codecs=codecs, + compressors=compressors, dimension_names=dimension_names, - chunks=chunks, - dimension_separator=dimension_separator, order=order, + zarr_format=zarr_format, filters=filters, - compressor=compressor, overwrite=overwrite, + storage_options=storage_options, + config=config, data=data, ) ) From bc552ce0305a7837a267f4d98cf889175e1f453c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 16:44:25 +0100 Subject: [PATCH 24/85] remove readers --- src/zarr/__init__.py | 6 --- src/zarr/api/asynchronous.py | 102 ----------------------------------- src/zarr/api/synchronous.py | 32 ----------- tests/test_api.py | 86 ----------------------------- 4 files changed, 226 deletions(-) diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 3eb9170908..bcbdaf7c19 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -21,9 +21,6 @@ open_consolidated, open_group, open_like, - read, - read_array, - read_group, save, save_array, save_group, @@ -66,9 +63,6 @@ "open_consolidated", "open_group", "open_like", - "read", - "read_array", - "read_group", "save", "save_array", "save_group", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 3f460cf9f2..d7df843426 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -339,47 +339,6 @@ async def open( return await open_group(store=store_path, zarr_format=zarr_format, mode=mode, **kwargs) -async def read( - *, - store: StoreLike | None = None, - zarr_format: ZarrFormat | None = None, - path: str | None = None, - storage_options: dict[str, Any] | None = None, - **kwargs: Any, -) -> AsyncArray[ArrayV2Metadata] | 
AsyncArray[ArrayV3Metadata] | AsyncGroup: - """Convenience function to open a group or array for reading. This function - wraps :func:`zarr.api.asynchronous.open` See the documentation of that function for details. - - Parameters - ---------- - store : Store or str, optional - Store or path to directory in file system or name of zip file. - zarr_format : {2, 3, None}, optional - The zarr format to require. The default value of None will first look for Zarr v3 data, - then Zarr v2 data, then fail if neither format is found. - path : str or None, optional - The path within the store to open. - storage_options : dict, optional - If using an fsspec URL to create the store, this will be passed to - the backend implementation. Ignored otherwise. - **kwargs - Additional parameters are passed through to :func:`zarr.creation.open`. - - Returns - ------- - z : array or group - Return type depends on what exists in the given store. - """ - return await open( - store=store, - mode="r", - zarr_format=zarr_format, - path=path, - storage_options=storage_options, - **kwargs, - ) - - async def open_consolidated( *args: Any, use_consolidated: Literal[True] = True, **kwargs: Any ) -> AsyncGroup: @@ -867,67 +826,6 @@ async def open_group( raise FileNotFoundError(f"Unable to find group: {store_path}") -async def read_group( - store: StoreLike, - *, - path: str | None = None, - zarr_format: ZarrFormat | None = None, - storage_options: dict[str, Any] | None = None, - use_consolidated: bool | str | None = None, -) -> AsyncGroup: - """Open a group for reading. This function wraps :func:`zarr.api.asynchronous.open_group` See - the documentation of that function for details. - - Parameters - ---------- - store : Store, str, or mapping, optional - Store or path to directory in file system or name of zip file. - - Strings are interpreted as paths on the local file system - and used as the ``root`` argument to :class:`zarr.store.LocalStore`. 
- - Dictionaries are used as the ``store_dict`` argument in - :class:`zarr.store.MemoryStore``. - path : str, optional - Group path within store. - zarr_format : {2, 3, None}, optional - The zarr format to require. The default value of None will first look for Zarr v3 data, - then Zarr v2 data, then fail if neither format is found. - storage_options : dict - If the store is backed by an fsspec-based implementation, then this dict will be passed to - the Store constructor for that implementation. Ignored otherwise. - use_consolidated : bool or str, default None - Whether to use consolidated metadata. - - By default, consolidated metadata is used if it's present in the - store (in the ``zarr.json`` for Zarr v3 and in the ``.zmetadata`` file - for Zarr v2). - - To explicitly require consolidated metadata, set ``use_consolidated=True``, - which will raise an exception if consolidated metadata is not found. - - To explicitly *not* use consolidated metadata, set ``use_consolidated=False``, - which will fall back to using the regular, non consolidated metadata. - - Zarr v2 allowed configuring the key storing the consolidated metadata - (``.zmetadata`` by default). Specify the custom key as ``use_consolidated`` - to load consolidated metadata from a non-default key. - - Returns - ------- - g : group - The new group. 
- """ - return await open_group( - store=store, - mode="r", - path=path, - storage_options=storage_options, - zarr_format=zarr_format, - use_consolidated=use_consolidated, - ) - - async def create( shape: ChunkCoords | int, *, # Note: this is a change from v2 diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 1dc61a9bba..f15513715a 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -190,16 +190,6 @@ def open( return Group(obj) -def read( - store: StoreLike, - *, - zarr_format: ZarrFormat | None = None, - path: str | None = None, - **kwargs: Any, -) -> Array | Group: - return open(store=store, mode="r", zarr_format=zarr_format, path=path, **kwargs) - - def open_consolidated(*args: Any, use_consolidated: Literal[True] = True, **kwargs: Any) -> Group: """ Alias for :func:`open_group` with ``use_consolidated=True``. @@ -557,24 +547,6 @@ def create_group( ) -def read_group( - store: StoreLike, - *, - path: str | None = None, - storage_options: dict[str, Any] | None = None, - zarr_format: ZarrFormat | None = None, - use_consolidated: bool | str | None = None, -) -> Group: - return open_group( - store=store, - path=path, - mode="r", - zarr_format=zarr_format, - use_consolidated=use_consolidated, - storage_options=storage_options, - ) - - # TODO: add type annotations for kwargs def create( shape: ChunkCoords | int, @@ -731,10 +703,6 @@ def create_array(*args: Any, **kwargs: Any) -> Array: return Array(sync(zarr.core.array.create_array(*args, **kwargs))) -def read_array(*args: Any, **kwargs: Any) -> Array: - return Array(sync(async_api.read_array(*args, **kwargs))) - - # TODO: add type annotations for kwargs def empty(shape: ChunkCoords, **kwargs: Any) -> Array: """Create an empty array. 
diff --git a/tests/test_api.py b/tests/test_api.py index fd8267de6e..5b435b2cff 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -19,9 +19,6 @@ load, open, open_group, - read, - read_array, - read_group, save, save_array, save_group, @@ -63,32 +60,6 @@ def test_create(memory_store: Store) -> None: z = create(shape=(400, 100), chunks=(16, 16.5), store=store, overwrite=True) # type: ignore [arg-type] -@pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_read(store: Store) -> None: - """ - Test that the polymorphic read function can return an Array or a Group, depending on the path argument. - """ - # create an array and a group - _ = create_group(store=store, path="group", attributes={"node_type": "group"}) - _ = create_array( - store=store, - path="array", - shape=(10, 10), - chunk_shape=(1, 1), - dtype="uint8", - attributes={"node_type": "array"}, - ) - - group_r = read(store, path="group") - assert isinstance(group_r, Group) - assert group_r.attrs == {"node_type": "group"} - - array_r = read(store, path="array") - assert isinstance(array_r, Array) - assert array_r.attrs == {"node_type": "array"} - assert array_r.shape == (10, 10) - - # TODO: parametrize over everything this function takes @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_create_array(store: Store) -> None: @@ -105,42 +76,6 @@ def test_create_array(store: Store) -> None: assert np.array_equal(array_w[:], np.zeros(shape, dtype=array_w.dtype) + data_val) -@pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_read_array(store: Store) -> None: - shape = (10, 10) - data_val = 1 - path = "foo" - - zarr_format: ZarrFormat - - for zarr_format in (2, 3): - attrs = {"zarr_format": zarr_format} - node_w = create_array( - store, - path=path, - shape=shape, - attributes=attrs, - zarr_format=zarr_format, - chunk_shape=shape, - dtype="uint8", - ) - node_w[:] = data_val - - # check that the correct array can be read when both v2 and v3 arrays are 
present - for zarr_format in (2, 3): - node_r = read_array(store, path=path, zarr_format=zarr_format) - - assert node_r.shape == shape - assert node_r.attrs == {"zarr_format": zarr_format} - assert np.array_equal(node_r[:], np.zeros(shape, dtype=node_r.dtype) + data_val) - - # check that reading without specifying the zarr_format returns the v3 node - with pytest.warns(UserWarning): - node_r = read_array(store, path=path) - - assert node_r.metadata.zarr_format == 3 - - @pytest.mark.parametrize("write_empty_chunks", [True, False]) def test_write_empty_chunks_warns(write_empty_chunks: bool) -> None: """ @@ -230,27 +165,6 @@ async def test_open_group(memory_store: MemoryStore) -> None: assert g.read_only -@pytest.mark.parametrize("store", ["memory"], indirect=True) -def test_read_group(store: Store) -> None: - path = "foo" - - zarr_format: ZarrFormat - for zarr_format in (2, 3): - attrs = {"zarr_format": zarr_format} - _ = create_group(store, path=path, attributes=attrs, zarr_format=zarr_format) - - # check that the correct array can be read when both v2 and v3 arrays are present - for zarr_format in (2, 3): - node_r = read_group(store, path=path, zarr_format=zarr_format) - assert node_r.attrs == {"zarr_format": zarr_format} - - # check that reading without specifying the zarr_format returns the v3 node - with pytest.warns(UserWarning): - node_r = read_group(store, path=path) - - assert node_r.metadata.zarr_format == 3 - - @pytest.mark.parametrize("zarr_format", [None, 2, 3]) async def test_open_group_unspecified_version( tmpdir: pathlib.Path, zarr_format: ZarrFormat From 74f731a708a95b33fb08672e7bbbd5f3cf6b6b83 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Dec 2024 16:47:03 +0100 Subject: [PATCH 25/85] fix dodgy imports --- src/zarr/api/asynchronous.py | 7 +------ src/zarr/core/array.py | 4 ++-- src/zarr/core/common.py | 6 ++++++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py 
index d7df843426..a55d245552 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -22,7 +22,7 @@ _warn_write_empty_chunks_kwarg, parse_dtype, ) -from zarr.core.config import config as zarr_config +from zarr.core.common import _default_zarr_version from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_filters_and_compressor @@ -150,11 +150,6 @@ def _handle_zarr_version_or_format( return zarr_format -def _default_zarr_version() -> ZarrFormat: - """Return the default zarr_version""" - return cast(ZarrFormat, int(zarr_config.get("default_zarr_version", 3))) - - async def consolidate_metadata( store: StoreLike, path: str | None = None, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f668508a6e..84ac5ea25d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -15,7 +15,7 @@ from zarr._compat import _deprecate_positional_args from zarr.abc.store import Store, set_or_delete -from zarr.api.asynchronous import PathLike, _default_zarr_version +from zarr.core.common import _default_zarr_version from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config @@ -3457,7 +3457,7 @@ def _get_default_codecs( async def create_array( store: str | StoreLike, *, - path: PathLike | None = None, + path: str | None = None, shape: ShapeLike, dtype: npt.DTypeLike, chunk_shape: ChunkCoords, diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 9f51fcb7e6..874091039a 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -18,6 +18,7 @@ import numpy as np +from zarr.core.config import config as zarr_config from zarr.core.strings import _STRING_DTYPE if TYPE_CHECKING: @@ -197,3 +198,8 @@ def _warn_order_kwarg() -> None: "or change the global 'array.order' configuration 
variable." ) warnings.warn(msg, RuntimeWarning, stacklevel=2) + + +def _default_zarr_version() -> ZarrFormat: + """Return the default zarr_version""" + return cast(ZarrFormat, int(zarr_config.get("default_zarr_version", 3))) From 43877c058d2a95a63e2e7118fb6cb427b45cdd31 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 21 Dec 2024 23:16:09 +0100 Subject: [PATCH 26/85] compressors -> compression, auto chunking, auto sharding, auto compression, auto filters --- src/zarr/core/array.py | 238 ++++++++++++++++++----- src/zarr/core/chunk_grids.py | 47 ++++- src/zarr/core/chunk_key_encodings.py | 7 +- src/zarr/core/group.py | 78 +++----- src/zarr/core/metadata/v2.py | 2 + src/zarr/testing/strategies.py | 2 +- tests/test_api.py | 6 +- tests/test_array.py | 4 +- tests/test_group.py | 52 ++--- tests/test_metadata/test_consolidated.py | 39 ++-- 10 files changed, 328 insertions(+), 147 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 84ac5ea25d..57f71e1b7a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -7,16 +7,18 @@ from dataclasses import dataclass, field from itertools import starmap from logging import getLogger -from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload +from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, cast, overload from warnings import warn +import numcodecs import numpy as np import numpy.typing as npt from zarr._compat import _deprecate_positional_args +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete -from zarr.core.common import _default_zarr_version from zarr.codecs._v2 import V2Codec +from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config from zarr.core.attributes import Attributes @@ -26,9 +28,10 @@ NDBuffer, default_buffer_prototype, ) -from zarr.core.chunk_grids 
import RegularChunkGrid, normalize_chunks +from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, + ChunkKeyEncodingParams, DefaultChunkKeyEncoding, V2ChunkKeyEncoding, ) @@ -41,6 +44,7 @@ MemoryOrder, ShapeLike, ZarrFormat, + _default_zarr_version, _warn_order_kwarg, concurrent_map, parse_dtype, @@ -87,15 +91,15 @@ from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError -from zarr.registry import get_pipeline_class +from zarr.registry import get_codec_class, get_pipeline_class from zarr.storage import StoreLike, make_store_path from zarr.storage.common import StorePath, ensure_no_existing_node if TYPE_CHECKING: - from collections.abc import Iterable, Iterator, Sequence + from collections.abc import Iterator, Sequence from typing import Self - from zarr.abc.codec import Codec, CodecPipeline + from zarr.abc.codec import CodecPipeline from zarr.core.group import AsyncGroup # Array and AsyncArray are defined in the base ``zarr`` namespace @@ -3454,26 +3458,29 @@ def _get_default_codecs( return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] +FiltersParam: TypeAlias = ( + Iterable[dict[str, JSON] | Codec] | Iterable[numcodecs.abc.Codec] | Literal["auto"] +) +CompressionParam: TypeAlias = ( + Iterable[dict[str, JSON] | Codec] | Codec | numcodecs.abc.Codec | Literal["auto"] +) + + async def create_array( store: str | StoreLike, *, - path: str | None = None, + name: str | None = None, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, + chunk_shape: ChunkCoords | Literal["auto"] = "auto", shard_shape: ChunkCoords | None = None, - filters: Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), + filters: FiltersParam = "auto", + compression: CompressionParam = "auto", fill_value: Any | None = 0, order: 
MemoryOrder | None = "C", zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = ("default", "/"), + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, @@ -3486,8 +3493,8 @@ async def create_array( ---------- store : str or Store Store or path to directory in file system or name of zip file. - path : str or None, optional - The name of the array within the store. If ``path`` is ``None``, the array will be located + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located at the root of the store. shape : ChunkCoords Shape of the array. @@ -3499,7 +3506,7 @@ async def create_array( Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional List of filters to apply to the array. - compressors : Iterable[Codec], optional + compression : Iterable[Codec], optional List of compressors to apply to the array. fill_value : Any, optional Fill value for the array. @@ -3533,75 +3540,85 @@ async def create_array( zarr_format = _default_zarr_version() # TODO: figure out why putting these imports at top-level causes circular imports - from zarr.codecs.bytes import BytesCodec from zarr.codecs.sharding import ShardingCodec # TODO: fix this when modes make sense. 
It should be `w` for overwriting, `w-` otherwise mode: Literal["a"] = "a" - - store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) - sub_codecs = (*filters, BytesCodec(), *compressors) - _dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) + dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) config_parsed = parse_array_config(config) shape_parsed = parse_shapelike(shape) + chunk_key_encoding_parsed = _parse_chunk_key_encoding( + chunk_key_encoding, zarr_format=zarr_format + ) + store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) + shard_shape_parsed, chunk_shape_parsed = _auto_partition( + shape_parsed, shard_shape, chunk_shape, dtype_parsed + ) result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] + if zarr_format == 2: - if shard_shape is not None: + if shard_shape_parsed is not None: msg = ( 'Zarr v2 arrays can only be created with `shard_shape` set to `None` or `"auto"`.' f"Got `shard_shape={shard_shape}` instead." ) raise ValueError(msg) - if len(tuple(compressors)) > 1: - compressor, *rest = compressors - else: - compressor = None - rest = [] - filters = (*filters, *rest) + if filters != "auto" and not all(isinstance(f, numcodecs.abc.Codec) for f in filters): + raise TypeError( + "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs." 
+ ) + filters = cast(Iterable[numcodecs.abc.Codec] | Literal["auto"], filters) + filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( + compression=compression, filters=filters, dtype=dtype_parsed + ) if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") if order is None: order_parsed = zarr_config.get("array.order") else: order_parsed = order + result = await AsyncArray._create_v2( store_path=store_path, shape=shape_parsed, - dtype=_dtype_parsed, - chunks=chunk_shape, - dimension_separator="/", + dtype=dtype_parsed, + chunks=chunk_shape_parsed, + dimension_separator=chunk_key_encoding_parsed.separator, fill_value=fill_value, order=order_parsed, - filters=filters, - compressor=compressor, + filters=filters_parsed, + compressor=compressor_parsed, attributes=attributes, overwrite=overwrite, config=config_parsed, ) else: - if shard_shape is not None: - sharding_codec = ShardingCodec(chunk_shape=chunk_shape, codecs=sub_codecs) + array_array, array_bytes, bytes_bytes = _get_default_encoding_v3(dtype_parsed) + sub_codecs = (*array_array, array_bytes, *bytes_bytes) + codecs_out: tuple[Codec, ...] 
+ if shard_shape_parsed is not None: + sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) sharding_codec.validate( - shape=chunk_shape, - dtype=dtype, + shape=chunk_shape_parsed, + dtype=dtype_parsed, chunk_grid=RegularChunkGrid(chunk_shape=shard_shape), ) - codecs = (sharding_codec,) + codecs_out = (sharding_codec,) chunks_out = shard_shape else: - chunks_out = chunk_shape - codecs = sub_codecs + chunks_out = chunk_shape_parsed + codecs_out = sub_codecs result = await AsyncArray._create_v3( store_path=store_path, shape=shape_parsed, - dtype=_dtype_parsed, + dtype=dtype_parsed, fill_value=fill_value, attributes=attributes, chunk_shape=chunks_out, - chunk_key_encoding=chunk_key_encoding, - codecs=codecs, + chunk_key_encoding=chunk_key_encoding_parsed, + codecs=codecs_out, dimension_names=dimension_names, overwrite=overwrite, config=config_parsed, @@ -3612,3 +3629,132 @@ async def create_array( selection=slice(None), value=data, prototype=default_buffer_prototype() ) return result + + +def _parse_chunk_key_encoding( + data: ChunkKeyEncoding | ChunkKeyEncodingParams | None, zarr_format: ZarrFormat +) -> ChunkKeyEncoding: + """ + Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object. + """ + if data is None: + if zarr_format == 2: + result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "/"}) + else: + result = ChunkKeyEncoding.from_dict({"name": "default", "separator": "/"}) + elif isinstance(data, ChunkKeyEncoding): + result = data + else: + result = ChunkKeyEncoding.from_dict(data) + if zarr_format == 2 and result.name != "v2": + msg = ( + "Invalid chunk key encoding. For Zarr v2 arrays, the `name` field of the " + f"chunk key encoding must be 'v2'. Got `name` = {result.name} instead." 
+ ) + raise ValueError(msg) + return result + + +def _get_default_encoding_v3( + np_dtype: np.dtype[Any], +) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + """ + Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. + """ + default_codecs = zarr_config.get("array.v3_default_codecs") + dtype = DataType.from_numpy(np_dtype) + if dtype == DataType.string: + dtype_key = "string" + elif dtype == DataType.bytes: + dtype_key = "bytes" + else: + dtype_key = "numeric" + + codec_names = default_codecs[dtype_key] + array_bytes_cls, *rest = tuple(get_codec_class(codec_name) for codec_name in codec_names) + array_bytes: ArrayBytesCodec = cast(ArrayBytesCodec, array_bytes_cls()) + # TODO: we should compress bytes and strings by default! + # The current default codecs only lists names, and strings / bytes are not compressed at all, + # so we insert the ZstdCodec at the end of the list as a default + bytes_bytes: tuple[BytesBytesCodec, ...] + array_array: tuple[ArrayArrayCodec, ...] 
= () + if len(rest) == 0: + bytes_bytes = (ZstdCodec(),) + else: + bytes_bytes = cast(tuple[BytesBytesCodec, ...], tuple(r() for r in rest)) + + return array_array, array_bytes, bytes_bytes + + +def _get_default_chunk_encoding_v2( + dtype: np.dtype[np.generic], +) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: + """ + Get the default chunk encoding for zarr v2 arrays, given a dtype + """ + codec_id_dict = zarr_config.get("array.v2_default_compressor") + + if dtype.kind in "biufcmM": + dtype_key = "numeric" + codec_type = "compressor" + elif dtype.kind in "U": + dtype_key = "string" + codec_type = "filter" + elif dtype.kind in "OSV": + dtype_key = "bytes" + codec_type = "filter" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + codec_id = codec_id_dict[dtype_key] + codec_instance = numcodecs.get_codec({"id": codec_id}) + if codec_type == "compressor": + return (), codec_instance + elif codec_type == "filter": + return codec_instance, numcodecs.Zstd() + else: + raise ValueError(f"Unsupported codec type {codec_type}") + + +def _parse_chunk_encoding_v2( + *, + compression: numcodecs.abc.Codec | Literal["auto"], + filters: tuple[numcodecs.abc.Codec, ...] | Literal["auto"], + dtype: np.dtype[np.generic], +) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: + """ + Generate chunk encoding classes for v2 arrays with optional defaults. + """ + default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) + _filters: tuple[numcodecs.abc.Codec, ...] 
= () + if compression == "auto": + _compressor = default_compressor + else: + _compressor = compression + if filters == "auto": + _filters = default_filters + else: + _filters = filters + return _filters, _compressor + + +def _parse_chunk_encoding_v3( + *, + compression: Iterable[BytesBytesCodec] | Literal["auto"], + filters: Iterable[ArrayArrayCodec] | Literal["auto"], + dtype: np.dtype[np.generic], +) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + """ + Generate chunk encoding classes for v3 arrays with optional defaults. + """ + default_array_array, default_array_bytes, default_bytes_bytes = _get_default_encoding_v3(dtype) + + if compression == "auto": + out_bytes_bytes = default_bytes_bytes + else: + out_bytes_bytes = tuple(compression) + if filters == "auto": + out_array_array = default_array_array + else: + out_array_array = tuple(filters) + + return out_array_array, default_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index ea050e39ef..158c5918ba 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -7,7 +7,7 @@ from abc import abstractmethod from dataclasses import dataclass from functools import reduce -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal import numpy as np @@ -194,3 +194,48 @@ def get_nchunks(self, array_shape: ChunkCoords) -> int: itertools.starmap(ceildiv, zip(array_shape, self.chunk_shape, strict=True)), 1, ) + + +def _auto_partition( + array_shape: tuple[int, ...], + shard_shape: tuple[int, ...] | Literal["auto"] | None, + chunk_shape: tuple[int, ...] | Literal["auto"], + dtype: np.dtype[np.generic], +) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: + """ + Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. 
+ If `shard_shape` is `None` and the chunk_shape is "auto", the chunks will be set heuristically based + on the dtype and shape of the array. + If `shard_shape` is "auto", then the shard shape will be set heuristically from the dtype and shape + of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, + given the dtype and shard shape. Otherwise, the chunks will be returned as-is. + """ + + item_size = dtype.itemsize + if shard_shape is None: + _shards_out: None | tuple[int, ...] = None + if chunk_shape == "auto": + _chunks_out = _guess_chunks(array_shape, item_size) + else: + _chunks_out = chunk_shape + else: + if chunk_shape == "auto": + # aim for a 1MiB chunk + _chunks_out = _guess_chunks(array_shape, item_size, max_bytes=1024) + else: + _chunks_out = chunk_shape + + if shard_shape == "auto": + _shards_out = () + for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True): + # TODO: make a better heuristic than this. + # for each axis, if there are more than 16 chunks along that axis, then make put + # 2 chunks in each shard for that axis. 
+ if a_shape // c_shape > 16: + _shards_out += (c_shape * 2,) + else: + _shards_out += (1,) + else: + _shards_out = shard_shape + + return _shards_out, _chunks_out diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index ed12ee3065..33b44b3232 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -2,7 +2,7 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import Literal, cast +from typing import Literal, TypedDict, cast from zarr.abc.metadata import Metadata from zarr.core.common import ( @@ -20,6 +20,11 @@ def parse_separator(data: JSON) -> SeparatorLiteral: return cast(SeparatorLiteral, data) +class ChunkKeyEncodingParams(TypedDict): + name: Literal["v2", "default"] + separator: SeparatorLiteral + + @dataclass(frozen=True) class ChunkKeyEncoding(Metadata): name: str diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 4cd78b14c3..b8cc56c206 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -49,7 +49,7 @@ from zarr.abc.codec import Codec from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.chunk_key_encodings import ChunkKeyEncoding + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingParams from zarr.core.common import MemoryOrder logger = logging.getLogger("zarr.group") @@ -997,24 +997,18 @@ async def require_groups(self, *names: str) -> tuple[AsyncGroup, ...]: async def create_array( self, - path: str, + name: str, *, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, - shard_shape: ChunkCoords | None = None, + chunk_shape: ChunkCoords | Literal["auto"] = "auto", + shard_shape: ChunkCoords | Literal["auto"] | None = None, filters: Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), + compression: Iterable[dict[str, JSON] | Codec] = (), fill_value: Any | None 
= 0, order: MemoryOrder | None = "C", - zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = ("default", "/"), + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, @@ -1027,14 +1021,14 @@ async def create_array( Parameters ---------- - path : str + name : str The name of the array relative to the group. If ``path`` is ``None``, the array will be located at the root of the store. shape : ChunkCoords Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunk_shape : ChunkCoords + chunk_shape : ChunkCoords | Literal["auto"], default is "auto". Chunk shape of the array. shard_shape : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. @@ -1046,8 +1040,6 @@ async def create_array( Fill value for the array. order : {"C", "F"}, optional Memory layout of the array. - zarr_format : {2, 3}, optional - The zarr format to use when saving. attributes : dict, optional Attributes for the array. 
chunk_key_encoding : ChunkKeyEncoding, optional @@ -1069,16 +1061,16 @@ async def create_array( """ return await create_array( store=self.store_path, - path=path, + name=name, shape=shape, dtype=dtype, chunk_shape=chunk_shape, shard_shape=shard_shape, filters=filters, - compressors=compressors, + compression=compression, fill_value=fill_value, order=order, - zarr_format=zarr_format, + zarr_format=self.metadata.zarr_format, attributes=attributes, chunk_key_encoding=chunk_key_encoding, dimension_names=dimension_names, @@ -2200,24 +2192,18 @@ def create(self, *args: Any, **kwargs: Any) -> Array: @_deprecate_positional_args def create_array( self, - path: str, + name: str, *, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, + chunk_shape: ChunkCoords | Literal["auto"] = "auto", shard_shape: ChunkCoords | None = None, - filters: Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), + filters: Iterable[dict[str, JSON] | Codec] | Literal["auto"] = "auto", + compression: Iterable[dict[str, JSON] | Codec] | Codec | Literal["auto"] = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", - zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = ("default", "/"), + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, @@ -2237,7 +2223,7 @@ def create_array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunk_shape : ChunkCoords + chunk_shape : ChunkCoords | Literal["auto"], default is "auto" Chunk shape of the array. shard_shape : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. 
@@ -2249,8 +2235,6 @@ def create_array( Fill value for the array. order : {"C", "F"}, optional Memory layout of the array. - zarr_format : {2, 3}, optional - The zarr format to use when saving. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional @@ -2273,7 +2257,7 @@ def create_array( return Array( self._sync( self._async_group.create_array( - path=path, + name=name, shape=shape, dtype=dtype, chunk_shape=chunk_shape, @@ -2281,10 +2265,9 @@ def create_array( fill_value=fill_value, attributes=attributes, chunk_key_encoding=chunk_key_encoding, - compressors=compressors, + compression=compression, dimension_names=dimension_names, order=order, - zarr_format=zarr_format, filters=filters, overwrite=overwrite, storage_options=storage_options, @@ -2543,24 +2526,18 @@ def move(self, source: str, dest: str) -> None: @_deprecate_positional_args def array( self, - path: str, + name: str, *, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, - shard_shape: ChunkCoords | None = None, + chunk_shape: ChunkCoords | Literal["auto"] = "auto", + shard_shape: ChunkCoords | Literal["auto"] | None = None, filters: Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), + compression: Iterable[dict[str, JSON] | Codec] = (), fill_value: Any | None = 0, order: MemoryOrder | None = "C", - zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ( - ChunkKeyEncoding - | tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] - | None - ) = ("default", "/"), + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, @@ -2592,8 +2569,6 @@ def array( Fill value for the array. order : {"C", "F"}, optional Memory layout of the array. 
- zarr_format : {2, 3}, optional - The zarr format to use when saving. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional @@ -2616,7 +2591,7 @@ def array( return Array( self._sync( self._async_group.create_array( - path=path, + name=name, shape=shape, dtype=dtype, chunk_shape=chunk_shape, @@ -2624,10 +2599,9 @@ def array( fill_value=fill_value, attributes=attributes, chunk_key_encoding=chunk_key_encoding, - compressors=compressors, + compression=compression, dimension_names=dimension_names, order=order, - zarr_format=zarr_format, filters=filters, overwrite=overwrite, storage_options=storage_options, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index bf6b576edd..a92846512d 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -6,6 +6,8 @@ from functools import cached_property from typing import TYPE_CHECKING, Any, TypedDict, cast +import numcodecs.abc + from zarr.abc.metadata import Metadata if TYPE_CHECKING: diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 85a67e3e69..8a352b601c 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -143,7 +143,7 @@ def arrays( a = root.create_array( array_path, shape=nparray.shape, - chunks=chunks, + chunk_shape=chunks, dtype=nparray.dtype, attributes=attributes, # compressor=compressor, # FIXME diff --git a/tests/test_api.py b/tests/test_api.py index 5b435b2cff..bf6395edf7 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -68,7 +68,7 @@ def test_create_array(store: Store) -> None: path = "foo" data_val = 1 array_w = create_array( - store, path=path, shape=shape, attributes=attrs, chunk_shape=shape, dtype="uint8" + store, name=path, shape=shape, attributes=attrs, chunk_shape=shape, dtype="uint8" ) array_w[:] = data_val assert array_w.shape == shape @@ -1129,7 +1129,7 @@ async def test_create_array_v3(store: MemoryStore) -> None: chunk_shape=(4,), 
zarr_format=3, filters=(TransposeCodec(order=(0,)),), - compressors=(ZstdCodec(level=3),), + compression=ZstdCodec(level=3), ) @@ -1147,5 +1147,5 @@ async def test_create_array_v2(store: MemoryStore) -> None: chunk_shape=(4,), zarr_format=2, filters=(Delta(dtype=dtype),), - compressors=(Zstd(level=3),), + compression=Zstd(level=3), ) diff --git a/tests/test_array.py b/tests/test_array.py index 1899e384dc..e41f5cd548 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -135,13 +135,13 @@ def test_array_name_properties_with_group( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: root = Group.from_store(store=store, zarr_format=zarr_format) - foo = root.create_array("foo", shape=(100,), chunks=(10,), dtype="i4") + foo = root.create_array("foo", shape=(100,), chunk_shape=(10,), dtype="i4") assert foo.path == "foo" assert foo.name == "/foo" assert foo.basename == "foo" bar = root.create_group("bar") - spam = bar.create_array("spam", shape=(100,), chunks=(10,), dtype="i4") + spam = bar.create_array("spam", shape=(100,), chunk_shape=(10,), dtype="i4") assert spam.path == "bar/spam" assert spam.name == "/bar/spam" diff --git a/tests/test_group.py b/tests/test_group.py index e0bc304b9b..44a98f65b4 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -157,7 +157,6 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad members_expected["subarray"] = group.create_array( "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), overwrite=True ) - # add an extra object to the domain of the group. # the list of children should ignore this object. 
sync( @@ -313,8 +312,10 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) - subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8") + subsubarray = subgroup.create_array( + name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8" + ) if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -391,7 +392,7 @@ def test_group_delitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8") if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -472,19 +473,21 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat expected_group_values = [group.create_group(name=name) for name in expected_group_keys] expected_groups = list(zip(expected_group_keys, expected_group_values, strict=False)) + fill_value = 3 + dtype = "uint8" + expected_group_values[0].create_group("subgroup") - expected_group_values[0].create_array("subarray", shape=(1,)) + expected_group_values[0].create_array( + "subarray", shape=(1,), dtype=dtype, fill_value=fill_value + ) expected_array_keys = ["a0", "a1"] + expected_array_values = [ - group.create_array(name=name, shape=(1,)) for name in expected_array_keys + group.create_array(name=name, shape=(1,), dtype=dtype, fill_value=fill_value) + for name in expected_array_keys ] expected_arrays = 
list(zip(expected_array_keys, expected_array_values, strict=False)) - fill_value: float | None - if zarr_format == 2: - fill_value = None - else: - fill_value = np.float64(0.0) if consolidate: group = zarr.consolidate_metadata(store) @@ -492,12 +495,13 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat metadata = { "subarray": { "attributes": {}, - "dtype": "float64", + "dtype": dtype, "fill_value": fill_value, "shape": (1,), "chunks": (1,), "order": "C", - "filters": (Zstd(level=0),), + "filters": (), + "compressor": Zstd(level=0), "zarr_format": zarr_format, }, "subgroup": { @@ -527,7 +531,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": "float64", + "data_type": dtype, "fill_value": fill_value, "node_type": "array", "shape": (1,), @@ -1014,11 +1018,11 @@ async def test_group_members_async(store: Store, consolidated_metadata: bool) -> group = await AsyncGroup.from_store( store=store, ) - a0 = await group.create_array("a0", shape=(1,)) + a0 = await group.create_array("a0", shape=(1,), dtype="uint8") g0 = await group.create_group("g0") - a1 = await g0.create_array("a1", shape=(1,)) + a1 = await g0.create_array("a1", shape=(1,), dtype="uint8") g1 = await g0.create_group("g1") - a2 = await g1.create_array("a2", shape=(1,)) + a2 = await g1.create_array("a2", shape=(1,), dtype="uint8") g2 = await g1.create_group("g2") # immediate children @@ -1179,9 +1183,9 @@ async def test_require_array(store: Store, zarr_format: ZarrFormat) -> None: async def test_members_name(store: Store, consolidate: bool, zarr_format: ZarrFormat): group = Group.from_store(store=store, zarr_format=zarr_format) a = group.create_group(name="a") - a.create_array("array", shape=(1,)) + a.create_array("array", shape=(1,), dtype="uint8") b = a.create_group(name="b") - b.create_array("array", shape=(1,)) + 
b.create_array("array", shape=(1,), dtype="uint8") if consolidate: group = zarr.api.synchronous.consolidate_metadata(store) @@ -1284,12 +1288,12 @@ async def test_group_delitem_consolidated(self, store: Store) -> None: g0 = await root.create_group("g0") g1 = await g0.create_group("g1") g2 = await g1.create_group("g2") - await g2.create_array("data", shape=(1,)) + await g2.create_array("data", shape=(1,), dtype="uint8") x0 = await root.create_group("x0") x1 = await x0.create_group("x1") x2 = await x1.create_group("x2") - await x2.create_array("data", shape=(1,)) + await x2.create_array("data", shape=(1,), dtype="uint8") await zarr.api.asynchronous.consolidate_metadata(store) @@ -1360,8 +1364,8 @@ def test_info(self): A = zarr.group(store=store, path="A") B = A.create_group(name="B") - B.create_array(name="x", shape=(1,)) - B.create_array(name="y", shape=(2,)) + B.create_array(name="x", shape=(1,), dtype="uint8") + B.create_array(name="y", shape=(2,), dtype="uint8") result = A.info expected = GroupInfo( @@ -1420,7 +1424,7 @@ def test_delitem_removes_children(store: Store, zarr_format: ZarrFormat) -> None g1 = zarr.group(store=store, zarr_format=zarr_format) g1.create_group("0") g1.create_group("0/0") - arr = g1.create_array("0/0/0", shape=(1,)) + arr = g1.create_array("0/0/0", shape=(1,), dtype="uint8") arr[:] = 1 del g1["0"] with pytest.raises(KeyError): diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 7f0c49338e..ba7fe0cb08 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -31,16 +31,19 @@ @pytest.fixture async def memory_store_with_hierarchy(memory_store: Store) -> None: g = await group(store=memory_store, attributes={"foo": "bar"}) - await g.create_array(name="air", shape=(1, 2, 3)) - await g.create_array(name="lat", shape=(1,)) - await g.create_array(name="lon", shape=(2,)) - await g.create_array(name="time", shape=(3,)) + dtype = "uint8" + await 
g.create_array(name="air", shape=(1, 2, 3), dtype=dtype) + await g.create_array(name="lat", shape=(1,), dtype=dtype) + await g.create_array(name="lon", shape=(2,), dtype=dtype) + await g.create_array(name="time", shape=(3,), dtype=dtype) child = await g.create_group("child", attributes={"key": "child"}) - await child.create_array("array", shape=(4, 4), attributes={"key": "child"}) + await child.create_array("array", shape=(4, 4), attributes={"key": "child"}, dtype=dtype) grandchild = await child.create_group("grandchild", attributes={"key": "grandchild"}) - await grandchild.create_array("array", shape=(4, 4), attributes={"key": "grandchild"}) + await grandchild.create_array( + "array", shape=(4, 4), attributes={"key": "grandchild"}, dtype=dtype + ) await grandchild.create_group("empty_group", attributes={"key": "empty"}) return memory_store @@ -76,8 +79,8 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": "float64", - "fill_value": np.float64(0.0), + "data_type": "uint8", + "fill_value": 0, "node_type": "array", # "shape": (1, 2, 3), "zarr_format": 3, @@ -205,10 +208,11 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: def test_consolidated_sync(self, memory_store): g = zarr.api.synchronous.group(store=memory_store, attributes={"foo": "bar"}) - g.create_array(name="air", shape=(1, 2, 3)) - g.create_array(name="lat", shape=(1,)) - g.create_array(name="lon", shape=(2,)) - g.create_array(name="time", shape=(3,)) + dtype = "uint8" + g.create_array(name="air", shape=(1, 2, 3), dtype=dtype) + g.create_array(name="lat", shape=(1,), dtype=dtype) + g.create_array(name="lon", shape=(2,), dtype=dtype) + g.create_array(name="time", shape=(3,), dtype=dtype) zarr.api.synchronous.consolidate_metadata(memory_store) group2 = zarr.api.synchronous.Group.open(memory_store) @@ -223,8 +227,8 @@ def 
test_consolidated_sync(self, memory_store): {"configuration": {"endian": "little"}, "name": "bytes"}, {"configuration": {}, "name": "zstd"}, ), - "data_type": "float64", - "fill_value": np.float64(0.0), + "data_type": dtype, + "fill_value": 0, "node_type": "array", # "shape": (1, 2, 3), "zarr_format": 3, @@ -475,7 +479,8 @@ async def test_open_consolidated_raises_async(self, zarr_format: ZarrFormat): async def test_consolidated_metadata_v2(self): store = zarr.storage.MemoryStore() g = await AsyncGroup.from_store(store, attributes={"key": "root"}, zarr_format=2) - await g.create_array(name="a", shape=(1,), attributes={"key": "a"}) + dtype = "uint8" + await g.create_array(name="a", shape=(1,), attributes={"key": "a"}, dtype=dtype) g1 = await g.create_group(name="g1", attributes={"key": "g1"}) await g1.create_group(name="g2", attributes={"key": "g2"}) @@ -489,10 +494,10 @@ async def test_consolidated_metadata_v2(self): metadata={ "a": ArrayV2Metadata( shape=(1,), - dtype="float64", + dtype=dtype, attributes={"key": "a"}, chunks=(1,), - fill_value=None, + fill_value=0, filters=(Zstd(level=0),), order="C", ), From c693fb4b7185c9dbede0bcd2a739ccaa080b52aa Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 21 Dec 2024 23:20:59 +0100 Subject: [PATCH 27/85] use sane shard shape when there are too few chunks --- src/zarr/core/chunk_grids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 158c5918ba..1a06c10413 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -234,7 +234,7 @@ def _auto_partition( if a_shape // c_shape > 16: _shards_out += (c_shape * 2,) else: - _shards_out += (1,) + _shards_out += (c_shape,) else: _shards_out = shard_shape From dba25943c2756d8a56e956ecaef8961537c8fa99 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 22 Dec 2024 18:14:00 +0100 Subject: [PATCH 28/85] fix: allow user-specified filters and compression --- 
src/zarr/core/array.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 57f71e1b7a..4a8fbfbb4a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3594,8 +3594,7 @@ async def create_array( config=config_parsed, ) else: - array_array, array_bytes, bytes_bytes = _get_default_encoding_v3(dtype_parsed) - sub_codecs = (*array_array, array_bytes, *bytes_bytes) + sub_codecs = _parse_chunk_encoding_v3(compression=compression, filters=filters, dtype=dtype) codecs_out: tuple[Codec, ...] if shard_shape_parsed is not None: sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) From 669ad722de3dccbf79119d5fb0612e07c4d0ee56 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 22 Dec 2024 18:14:48 +0100 Subject: [PATCH 29/85] np.dtype[np.generic] -> np.dtype[Any] --- src/zarr/codecs/sharding.py | 4 +--- src/zarr/core/array.py | 6 +++--- src/zarr/core/chunk_grids.py | 2 +- src/zarr/core/common.py | 2 +- src/zarr/core/metadata/v3.py | 6 +++--- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index d646423eaf..a01145b3b2 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -396,9 +396,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return replace(self, codecs=evolved_codecs) return self - def validate( - self, *, shape: ChunkCoords, dtype: np.dtype[np.generic], chunk_grid: ChunkGrid - ) -> None: + def validate(self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid) -> None: if len(self.chunk_shape) != len(shape): raise ValueError( "The shard's `chunk_shape` and array's `shape` need to have the same number of dimensions." 
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 4a8fbfbb4a..d5e01c1acc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3686,7 +3686,7 @@ def _get_default_encoding_v3( def _get_default_chunk_encoding_v2( - dtype: np.dtype[np.generic], + dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: """ Get the default chunk encoding for zarr v2 arrays, given a dtype @@ -3718,7 +3718,7 @@ def _parse_chunk_encoding_v2( *, compression: numcodecs.abc.Codec | Literal["auto"], filters: tuple[numcodecs.abc.Codec, ...] | Literal["auto"], - dtype: np.dtype[np.generic], + dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: """ Generate chunk encoding classes for v2 arrays with optional defaults. @@ -3740,7 +3740,7 @@ def _parse_chunk_encoding_v3( *, compression: Iterable[BytesBytesCodec] | Literal["auto"], filters: Iterable[ArrayArrayCodec] | Literal["auto"], - dtype: np.dtype[np.generic], + dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 1a06c10413..394d6807d3 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -200,7 +200,7 @@ def _auto_partition( array_shape: tuple[int, ...], shard_shape: tuple[int, ...] | Literal["auto"] | None, chunk_shape: tuple[int, ...] | Literal["auto"], - dtype: np.dtype[np.generic], + dtype: np.dtype[Any], ) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: """ Automatically determine the shard shape and chunk shape for an array, given the shape and dtype of the array. 
diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 874091039a..7e7b2e73da 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -168,7 +168,7 @@ def parse_bool(data: Any) -> bool: raise ValueError(f"Expected bool, got {data} instead.") -def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[np.generic]: +def parse_dtype(dtype: Any, zarr_format: ZarrFormat) -> np.dtype[Any]: if dtype is str or dtype == "str": if zarr_format == 2: # special case as object diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 4cf5860ffd..67415f89aa 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -449,7 +449,7 @@ def parse_fill_value( return np.bytes_(fill_value) # the rest are numeric types - np_dtype = cast(np.dtype[np.generic], data_type.to_numpy()) + np_dtype = cast(np.dtype[Any], data_type.to_numpy()) if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): if data_type in (DataType.complex64, DataType.complex128): @@ -513,7 +513,7 @@ def default_fill_value(dtype: DataType) -> str | bytes | np.generic: return b"" else: np_dtype = dtype.to_numpy() - np_dtype = cast(np.dtype[np.generic], np_dtype) + np_dtype = cast(np.dtype[Any], np_dtype) return np_dtype.type(0) @@ -586,7 +586,7 @@ def to_numpy_shortname(self) -> str: } return data_type_to_numpy[self] - def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[np.generic]: + def to_numpy(self) -> np.dtypes.StringDType | np.dtypes.ObjectDType | np.dtype[Any]: # note: it is not possible to round trip DataType <-> np.dtype # due to the fact that DataType.string and DataType.bytes both # generally return np.dtype("O") from this function, even though From ae1832d623499472afb1871bf9f73e2e3cd60502 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 22 Dec 2024 18:34:30 +0100 Subject: [PATCH 30/85] handle singleton compressor / filters input --- src/zarr/core/array.py | 17 +++++++++++++---- 
src/zarr/core/group.py | 4 ++-- tests/test_store/test_zip.py | 2 +- tests/test_v2.py | 2 +- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index d5e01c1acc..fd3886a603 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable +from collections.abc import Iterable, Mapping from dataclasses import dataclass, field from itertools import starmap from logging import getLogger @@ -3594,7 +3594,10 @@ async def create_array( config=config_parsed, ) else: - sub_codecs = _parse_chunk_encoding_v3(compression=compression, filters=filters, dtype=dtype) + array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( + compression=compression, filters=filters, dtype=dtype_parsed + ) + sub_codecs = (*array_array, array_bytes, *bytes_bytes) codecs_out: tuple[Codec, ...] if shard_shape_parsed is not None: sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) @@ -3750,10 +3753,16 @@ def _parse_chunk_encoding_v3( if compression == "auto": out_bytes_bytes = default_bytes_bytes else: - out_bytes_bytes = tuple(compression) + if isinstance(compression, Mapping | Codec): + out_bytes_bytes = (compression,) + else: + out_bytes_bytes = tuple(compression) if filters == "auto": out_array_array = default_array_array else: - out_array_array = tuple(filters) + if isinstance(filters, Mapping | Codec): + out_array_array = (filters,) + else: + out_array_array = tuple(filters) return out_array_array, default_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index b8cc56c206..f3bc3f3eec 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1064,8 +1064,8 @@ async def create_array( name=name, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, - shard_shape=shard_shape, + chunks=chunk_shape, + shards=shard_shape, filters=filters, 
compression=compression, fill_value=fill_value, diff --git a/tests/test_store/test_zip.py b/tests/test_store/test_zip.py index df22b76e1e..c207adebe1 100644 --- a/tests/test_store/test_zip.py +++ b/tests/test_store/test_zip.py @@ -69,7 +69,7 @@ def test_api_integration(self, store: ZipStore) -> None: data = np.arange(10000, dtype=np.uint16).reshape(100, 100) z = root.create_array( - shape=data.shape, chunks=(10, 10), name="foo", dtype=np.uint16, fill_value=99 + shape=data.shape, chunk_shape=(10, 10), name="foo", dtype=np.uint16, fill_value=99 ) z[:] = data diff --git a/tests/test_v2.py b/tests/test_v2.py index 80897db8e5..e77edf56cc 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -88,7 +88,7 @@ async def test_v2_encode_decode(dtype): g.create_array( name="foo", shape=(3,), - chunks=(3,), + chunk_shape=(3,), dtype=dtype, fill_value=b"X", ) From 5cb6dd8f62ad6ed5391a08535dc05ef9ac50bbad Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 22 Dec 2024 20:31:44 +0100 Subject: [PATCH 31/85] default codec config now uses the full config dict --- src/zarr/api/asynchronous.py | 21 ++++---- src/zarr/core/array.py | 99 ++++++++++++++++++++---------------- src/zarr/core/config.py | 26 +++++++--- src/zarr/core/metadata/v2.py | 26 ++++++++-- tests/test_config.py | 47 ++++++++++++----- tests/test_v2.py | 33 ++++++++---- 6 files changed, 168 insertions(+), 84 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index a55d245552..c8125a9641 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -18,14 +18,14 @@ ChunkCoords, MemoryOrder, ZarrFormat, + _default_zarr_version, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, parse_dtype, ) -from zarr.core.common import _default_zarr_version from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v2 import _default_filters_and_compressor 
+from zarr.core.metadata.v2 import _default_compressor, _default_filters from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -886,8 +886,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. compressor : Codec, optional @@ -900,7 +900,8 @@ async def create( - For Unicode strings, the default is ``VLenUTF8Codec``. - For bytes or objects, the default is ``VLenBytesCodec``. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : object + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. + fill_value : object Default value to use for uninitialized portions of the array. order : {'C', 'F'}, optional Deprecated in favor of the ``config`` keyword argument. @@ -921,8 +922,8 @@ async def create( for storage of both chunks and metadata. filters : sequence of Codecs, optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. If neither ``compressor`` nor ``filters`` are provided, a default - compressor will be used. (see ``compressor`` for details). + V2 only. If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. cache_metadata : bool, optional If True, array configuration metadata will be cached for the lifetime of the object. 
If False, array metadata will be reloaded @@ -975,8 +976,10 @@ async def create( if chunks is None: chunks = shape dtype = parse_dtype(dtype, zarr_format) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr] if chunks is not None: chunk_shape = chunks diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fd3886a603..429fa4f748 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -18,7 +18,6 @@ from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec -from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config from zarr.core.attributes import Attributes @@ -87,7 +86,10 @@ ArrayV3MetadataDict, T_ArrayMetadata, ) -from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.metadata.v2 import ( + _default_compressor, + _default_filters, +) from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError @@ -438,8 +440,8 @@ async def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. 
dimension_names : Iterable[str], optional @@ -460,14 +462,14 @@ async def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional The compressor used to compress the data (default is None). V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -677,8 +679,10 @@ async def _create_v2( dimension_separator = "." dtype = parse_dtype(dtype, zarr_format=2) - if not filters and not compressor: - filters, compressor = _default_filters_and_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) if np.issubdtype(dtype, np.str_): filters = filters or [] if not any(x["id"] == "vlen-utf8" for x in filters): @@ -1572,8 +1576,8 @@ def create( If no codecs are provided, default codecs will be used: - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. - - For Unicode strings, the default is ``VLenUTF8Codec``. - - For bytes or objects, the default is ``VLenBytesCodec``. + - For Unicode strings, the default is ``VLenUTF8Codec`` and ``ZstdCodec``. + - For bytes or objects, the default is ``VLenBytesCodec`` and ``ZstdCodec``. 
These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. dimension_names : Iterable[str], optional @@ -1594,14 +1598,14 @@ def create( order for Zarr 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. filters : list[dict[str, JSON]], optional Sequence of filters to use to encode chunk data prior to compression. - V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` - nor ``filters`` are provided, a default compressor will be used. (see - ``compressor`` for details) + V2 only. V3 arrays should use ``codecs`` instead. If no ``filters`` + are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressor : dict[str, JSON], optional Primary compressor to compress chunk data. V2 only. V3 arrays should use ``codecs`` instead. - If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: + If no ``compressor`` is provided, a default compressor will be used: - For numeric arrays, the default is ``ZstdCodec``. - For Unicode strings, the default is ``VLenUTF8Codec``. @@ -3455,7 +3459,7 @@ def _get_default_codecs( else: dtype_key = "numeric" - return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]] + return default_codecs[dtype_key] FiltersParam: TypeAlias = ( @@ -3672,49 +3676,56 @@ def _get_default_encoding_v3( else: dtype_key = "numeric" - codec_names = default_codecs[dtype_key] - array_bytes_cls, *rest = tuple(get_codec_class(codec_name) for codec_name in codec_names) - array_bytes: ArrayBytesCodec = cast(ArrayBytesCodec, array_bytes_cls()) - # TODO: we should compress bytes and strings by default! - # The current default codecs only lists names, and strings / bytes are not compressed at all, - # so we insert the ZstdCodec at the end of the list as a default - bytes_bytes: tuple[BytesBytesCodec, ...] 
- array_array: tuple[ArrayArrayCodec, ...] = () - if len(rest) == 0: - bytes_bytes = (ZstdCodec(),) - else: - bytes_bytes = cast(tuple[BytesBytesCodec, ...], tuple(r() for r in rest)) + codec_dicts = default_codecs[dtype_key] + codecs = tuple(get_codec_class(c["name"]).from_dict(c) for c in codec_dicts) + array_bytes_maybe = None + array_array: list[ArrayArrayCodec] = [] + bytes_bytes: list[BytesBytesCodec] = [] + + for codec in codecs: + if isinstance(codec, ArrayBytesCodec): + if array_bytes_maybe is not None: + raise ValueError( + f"Got two instances of ArrayBytesCodec: {array_bytes_maybe} and {codec}. " + "Only one array-to-bytes codec is allowed." + ) + array_bytes_maybe = codec + elif isinstance(codec, ArrayArrayCodec): + array_array.append(codec) + elif isinstance(codec, BytesBytesCodec): + bytes_bytes.append(codec) + else: + raise TypeError(f"Unexpected codec type: {type(codec)}") - return array_array, array_bytes, bytes_bytes + if array_bytes_maybe is None: + raise ValueError("Required ArrayBytesCodec was not found.") + + return tuple(array_array), array_bytes_maybe, tuple(bytes_bytes) def _get_default_chunk_encoding_v2( dtype: np.dtype[Any], -) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: +) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec | None]: """ Get the default chunk encoding for zarr v2 arrays, given a dtype """ - codec_id_dict = zarr_config.get("array.v2_default_compressor") - if dtype.kind in "biufcmM": dtype_key = "numeric" - codec_type = "compressor" elif dtype.kind in "U": dtype_key = "string" - codec_type = "filter" elif dtype.kind in "OSV": dtype_key = "bytes" - codec_type = "filter" else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - codec_id = codec_id_dict[dtype_key] - codec_instance = numcodecs.get_codec({"id": codec_id}) - if codec_type == "compressor": - return (), codec_instance - elif codec_type == "filter": - return codec_instance, numcodecs.Zstd() - else: - raise 
ValueError(f"Unsupported codec type {codec_type}") + + compressor_dict = zarr_config.get("array.v2_default_compressor").get(dtype_key, None) + filter_dicts = zarr_config.get("array.v2_default_filters").get(dtype_key, []) + + compressor = None + if compressor_dict is not None: + compressor = numcodecs.get_codec(compressor_dict) + filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + return filters, compressor def _parse_chunk_encoding_v2( diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index a14305aef8..739529a3f9 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -67,14 +67,28 @@ def reset(self) -> None: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": True}, + "string": {"id": "zstd", "level": 0, "checksum": True}, + "bytes": {"id": "zstd", "level": 0, "checksum": True}, + }, + "v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index b19683981f..ddfc85a617 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -331,9 +331,9 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any: return dtype.type(0) -def _default_filters_and_compressor( +def _default_compressor( dtype: np.dtype[Any], -) -> 
tuple[list[dict[str, JSON]], dict[str, JSON] | None]: +) -> dict[str, JSON] | None: """Get the default filters and compressor for a dtype. https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html @@ -348,4 +348,24 @@ def _default_filters_and_compressor( else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - return [{"id": default_compressor[dtype_key]}], None + return default_compressor.get(dtype_key, None) + + +def _default_filters( + dtype: np.dtype[Any], +) -> list[dict[str, JSON]]: + """Get the default filters and compressor for a dtype. + + https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html + """ + default_filters = config.get("array.v2_default_filters") + if dtype.kind in "biufcmM": + dtype_key = "numeric" + elif dtype.kind in "U": + dtype_key = "string" + elif dtype.kind in "OSV": + dtype_key = "bytes" + else: + raise ValueError(f"Unsupported dtype kind {dtype.kind}") + + return default_filters.get(dtype_key, []) diff --git a/tests/test_config.py b/tests/test_config.py index ea8e70a994..d5a364dd15 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -54,14 +54,28 @@ def test_config_defaults_set() -> None: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": 0, "checksum": True}, + "string": {"id": "zstd", "level": 0, "checksum": True}, + "bytes": {"id": "zstd", "level": 0, "checksum": True}, + }, + "v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { - "bytes": ["vlen-bytes"], - "numeric": ["bytes", "zstd"], - "string": ["vlen-utf8"], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], + "string": [ 
+ {"name": "vlen-utf8"}, + {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + ], }, }, "async": {"concurrency": 10, "timeout": None}, @@ -291,17 +305,26 @@ class NewCodec2(BytesCodec): ("dtype", "expected_codecs"), [ ("int", [BytesCodec(), GzipCodec()]), - ("bytes", [VLenBytesCodec()]), - ("str", [VLenUTF8Codec()]), + ("bytes", [VLenBytesCodec(), GzipCodec()]), + ("str", [VLenUTF8Codec(), GzipCodec()]), ], ) async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: with config.set( { - "array.v3_default_codecs": { - "numeric": ["bytes", "gzip"], # test setting non-standard codecs - "string": ["vlen-utf8"], - "bytes": ["vlen-bytes"], + "array.v3_default_codecs": { # test setting non-standard codecs + "numeric": [ + {"name": "bytes", "configuration": {"endian": "little"}}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "string": [ + {"name": "vlen-utf8"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], + "bytes": [ + {"name": "vlen-bytes"}, + {"name": "gzip", "configuration": {"level": 5}}, + ], } } ): diff --git a/tests/test_v2.py b/tests/test_v2.py index e77edf56cc..3cf4fecc72 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -82,7 +82,7 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - with config.set({"array.v2_default_compressor.bytes": "vlen-bytes"}): + with config.set({"array.v2_default_compressor.bytes": {"id": "vlen-bytes"}}): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( @@ -120,9 +120,9 @@ def test_v2_encode_decode_with_data(dtype_value): dtype, value = dtype_value with config.set( { - "array.v2_default_compressor": { - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "array.v2_default_filters": { + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): @@ -210,18 +210,31 @@ def test_default_compressor_deprecation_warning(): 
@pytest.mark.parametrize( "dtype_expected", - [["b", "zstd"], ["i", "zstd"], ["f", "zstd"], ["|S1", "vlen-bytes"], ["|U1", "vlen-utf8"]], + [ + ["b", "zstd", None], + ["i", "zstd", None], + ["f", "zstd", None], + ["|S1", "zstd", "vlen-bytes"], + ["|U1", "zstd", "vlen-utf8"], + ], ) def test_default_filters_and_compressor(dtype_expected: Any) -> None: with config.set( { "array.v2_default_compressor": { - "numeric": "zstd", - "string": "vlen-utf8", - "bytes": "vlen-bytes", + "numeric": {"id": "zstd", "level": "0"}, + "string": {"id": "zstd", "level": "0"}, + "bytes": {"id": "zstd", "level": "0"}, + }, + "array.v2_default_filters": { + "numeric": [], + "string": [{"id": "vlen-utf8"}], + "bytes": [{"id": "vlen-bytes"}], }, } ): - dtype, expected = dtype_expected + dtype, expected_compressor, expected_filter = dtype_expected arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype) - assert arr.metadata.filters[0].codec_id == expected + assert arr.metadata.compressor.codec_id == expected_compressor + if expected_filter is not None: + assert arr.metadata.filters[0].codec_id == expected_filter From 5dcd80bf765a4426f931078c1843bf5128d54462 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 23 Dec 2024 22:46:21 +0100 Subject: [PATCH 32/85] test for auto sharding --- src/zarr/core/array.py | 19 ++++++++------- src/zarr/core/chunk_grids.py | 8 +++--- tests/test_array.py | 47 ++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index fd3886a603..1d5c0cc100 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3472,8 +3472,8 @@ async def create_array( name: str | None = None, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords | Literal["auto"] = "auto", - shard_shape: ChunkCoords | None = None, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ChunkCoords | Literal["auto"] | None = None, filters: FiltersParam = 
"auto", compression: CompressionParam = "auto", fill_value: Any | None = 0, @@ -3500,9 +3500,9 @@ async def create_array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunk_shape : ChunkCoords + chunks : ChunkCoords Chunk shape of the array. - shard_shape : ChunkCoords, optional + shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional List of filters to apply to the array. @@ -3552,15 +3552,16 @@ async def create_array( ) store_path = await make_store_path(store, path=name, mode=mode, storage_options=storage_options) shard_shape_parsed, chunk_shape_parsed = _auto_partition( - shape_parsed, shard_shape, chunk_shape, dtype_parsed + array_shape=shape_parsed, shard_shape=shards, chunk_shape=chunks, dtype=dtype_parsed ) + chunks_out: tuple[int, ...] result: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] if zarr_format == 2: if shard_shape_parsed is not None: msg = ( - 'Zarr v2 arrays can only be created with `shard_shape` set to `None` or `"auto"`.' - f"Got `shard_shape={shard_shape}` instead." + "Zarr v2 arrays can only be created with `shard_shape` set to `None`." + f"Got `shard_shape={shards}` instead." 
) raise ValueError(msg) @@ -3604,10 +3605,10 @@ async def create_array( sharding_codec.validate( shape=chunk_shape_parsed, dtype=dtype_parsed, - chunk_grid=RegularChunkGrid(chunk_shape=shard_shape), + chunk_grid=RegularChunkGrid(chunk_shape=shard_shape_parsed), ) codecs_out = (sharding_codec,) - chunks_out = shard_shape + chunks_out = shard_shape_parsed else: chunks_out = chunk_shape_parsed codecs_out = sub_codecs diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 394d6807d3..3e5bab632d 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -197,9 +197,10 @@ def get_nchunks(self, array_shape: ChunkCoords) -> int: def _auto_partition( + *, array_shape: tuple[int, ...], - shard_shape: tuple[int, ...] | Literal["auto"] | None, chunk_shape: tuple[int, ...] | Literal["auto"], + shard_shape: tuple[int, ...] | Literal["auto"] | None, dtype: np.dtype[Any], ) -> tuple[tuple[int, ...] | None, tuple[int, ...]]: """ @@ -210,7 +211,6 @@ def _auto_partition( of the array; if the `chunk_shape` is also "auto", then the chunks will be set heuristically as well, given the dtype and shard shape. Otherwise, the chunks will be returned as-is. """ - item_size = dtype.itemsize if shard_shape is None: _shards_out: None | tuple[int, ...] = None @@ -229,9 +229,9 @@ def _auto_partition( _shards_out = () for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True): # TODO: make a better heuristic than this. - # for each axis, if there are more than 16 chunks along that axis, then make put + # for each axis, if there are more than 8 chunks along that axis, then put # 2 chunks in each shard for that axis. 
- if a_shape // c_shape > 16: + if a_shape // c_shape > 8: _shards_out += (c_shape * 2,) else: _shards_out += (c_shape,) diff --git a/tests/test_array.py b/tests/test_array.py index e41f5cd548..eec731c45d 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -13,10 +13,13 @@ import zarr.api.asynchronous from zarr import Array, AsyncArray, Group from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec +from zarr.codecs.sharding import ShardingCodec from zarr.core._info import ArrayInfo from zarr.core.array import chunks_initialized from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer +from zarr.core.chunk_grids import _auto_partition +from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import ceildiv @@ -881,3 +884,47 @@ async def test_nbytes( assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize else: assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize + + +def _get_partitioning(data: AsyncArray) -> tuple[tuple[int, ...], tuple[int, ...] | None]: + """ + Get the shard shape and chunk shape of an array. If the array is not sharded, the shard shape + will be None. + """ + + shard_shape: tuple[int, ...] | None + chunk_shape: tuple[int, ...] + codecs = data.codec_pipeline + if isinstance(codecs, BatchedCodecPipeline): + if isinstance(codecs.array_bytes_codec, ShardingCodec): + chunk_shape = codecs.array_bytes_codec.chunk_shape + shard_shape = data.chunks + else: + chunk_shape = data.chunks + shard_shape = None + return chunk_shape, shard_shape + + +@pytest.mark.parametrize( + ("array_shape", "chunk_shape"), + [((256,), (2,))], +) +def test_auto_partition_auto_shards( + array_shape: tuple[int, ...], chunk_shape: tuple[int, ...] 
+) -> None: + """ + Test that automatically picking a shard size returns a tuple of 2 * the chunk shape for any axis + where there are 8 or more chunks. + """ + dtype = np.dtype("uint8") + expected_shards: tuple[int, ...] = () + for cs, a_len in zip(chunk_shape, array_shape, strict=False): + if a_len // cs >= 8: + expected_shards += (2 * cs,) + else: + expected_shards += (cs,) + + auto_shards, _ = _auto_partition( + array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype + ) + assert auto_shards == expected_shards From eab46a2e1e7a67e355a54829412db6c09a7bb42e Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 25 Dec 2024 21:09:32 +0100 Subject: [PATCH 33/85] test --- tests/test_v2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_v2.py b/tests/test_v2.py index 3cf4fecc72..1bbdf858b5 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -82,7 +82,12 @@ def test_codec_pipeline() -> None: @pytest.mark.parametrize("dtype", ["|S", "|V"]) async def test_v2_encode_decode(dtype): - with config.set({"array.v2_default_compressor.bytes": {"id": "vlen-bytes"}}): + with config.set( + { + "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}], + "array.v2_default_compressor.bytes": None, + } + ): store = zarr.storage.MemoryStore() g = zarr.group(store=store, zarr_format=2) g.create_array( From bcdc4cc65565193ada6d54a557d98db3f3fee6a5 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 25 Dec 2024 21:14:51 +0100 Subject: [PATCH 34/85] adds a shards property --- src/zarr/core/array.py | 42 ++++++++++++++++++++++++++++++------ src/zarr/core/metadata/v2.py | 6 +++++- src/zarr/core/metadata/v3.py | 34 +++++++++++++++++++++++++++++ tests/test_array.py | 27 +++++++++++++++++++++++ 4 files changed, 101 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2e55da4c58..3588030e90 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -798,6 +798,7 @@ def 
shape(self) -> ChunkCoords: @property def chunks(self) -> ChunkCoords: """Returns the chunk shape of the Array. + If sharding is used the inner chunk shape is returned. Only defined for arrays using using `RegularChunkGrid`. If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. @@ -807,14 +808,22 @@ def chunks(self) -> ChunkCoords: ChunkCoords: The chunk shape of the Array. """ - if isinstance(self.metadata.chunk_grid, RegularChunkGrid): - return self.metadata.chunk_grid.chunk_shape + return self.metadata.chunks - msg = ( - f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`." - f"This array has a {self.metadata.chunk_grid} instead." - ) - raise NotImplementedError(msg) + @property + def shards(self) -> ChunkCoords | None: + """Returns the shard shape of the Array. + Returns None if sharding is not used. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. + + Returns + ------- + ChunkCoords: + The shard shape of the Array. + """ + return self.metadata.shards @property def size(self) -> int: @@ -1728,6 +1737,10 @@ def shape(self, value: ChunkCoords) -> None: @property def chunks(self) -> ChunkCoords: """Returns a tuple of integers describing the length of each dimension of a chunk of the array. + If sharding is used the inner chunk shape is returned. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. Returns ------- @@ -1736,6 +1749,21 @@ def chunks(self) -> ChunkCoords: """ return self._async_array.chunks + @property + def shards(self) -> ChunkCoords | None: + """Returns a tuple of integers describing the length of each dimension of a shard of the array. + Returns None if sharding is not used. + + Only defined for arrays using using `RegularChunkGrid`. + If array doesn't use `RegularChunkGrid`, `NotImplementedError` is raised. 
+ + Returns + ------- + tuple | None + A tuple of integers representing the length of each dimension of a shard or None if sharding is not used. + """ + return self._async_array.shards + @property def size(self) -> int: """Returns the total number of elements in the array. diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ddfc85a617..955e822783 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -44,7 +44,7 @@ class ArrayV2MetadataDict(TypedDict): @dataclass(frozen=True, kw_only=True) class ArrayV2Metadata(Metadata): shape: ChunkCoords - chunks: tuple[int, ...] + chunks: ChunkCoords dtype: np.dtype[Any] fill_value: int | float | str | bytes | None = 0 order: MemoryOrder = "C" @@ -102,6 +102,10 @@ def ndim(self) -> int: def chunk_grid(self) -> RegularChunkGrid: return RegularChunkGrid(chunk_shape=self.chunks) + @property + def shards(self) -> ChunkCoords | None: + return None + def to_buffer_dict(self, prototype: BufferPrototype) -> dict[str, Buffer]: def _json_convert( o: Any, diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 67415f89aa..dfc6c97882 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -296,6 +296,40 @@ def dtype(self) -> np.dtype[Any]: def ndim(self) -> int: return len(self.shape) + @property + def chunks(self) -> ChunkCoords: + if isinstance(self.chunk_grid, RegularChunkGrid): + from zarr.codecs.sharding import ShardingCodec + + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + sharding_codec = self.codecs[0] + assert isinstance(sharding_codec, ShardingCodec) # for mypy + return sharding_codec.chunk_shape + else: + return self.chunk_grid.chunk_shape + + msg = ( + f"The `chunks` attribute is only defined for arrays using `RegularChunkGrid`." + f"This array has a {self.chunk_grid} instead." 
+ ) + raise NotImplementedError(msg) + + @property + def shards(self) -> ChunkCoords | None: + if isinstance(self.chunk_grid, RegularChunkGrid): + from zarr.codecs.sharding import ShardingCodec + + if len(self.codecs) == 1 and isinstance(self.codecs[0], ShardingCodec): + return self.chunk_grid.chunk_shape + else: + return None + + msg = ( + f"The `shards` attribute is only defined for arrays using `RegularChunkGrid`." + f"This array has a {self.chunk_grid} instead." + ) + raise NotImplementedError(msg) + def get_chunk_spec( self, _chunk_coords: ChunkCoords, array_config: ArrayConfig, prototype: BufferPrototype ) -> ArraySpec: diff --git a/tests/test_array.py b/tests/test_array.py index eec731c45d..647f50d669 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -928,3 +928,30 @@ def test_auto_partition_auto_shards( array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype ) assert auto_shards == expected_shards + + +def test_chunks_and_shards() -> None: + store = StorePath(MemoryStore()) + shape = (100, 100) + chunks = (5, 5) + shards = (10, 10) + + arr_v3 = zarr.create_array(store=store / "v3", shape=shape, chunks=chunks, dtype="i4") + assert arr_v3.chunks == chunks + assert arr_v3.shards is None + + arr_v3_sharding = zarr.create_array( + store=store / "v3_sharding", + shape=shape, + chunks=chunks, + shards=shards, + dtype="i4", + ) + assert arr_v3_sharding.chunks == chunks + assert arr_v3_sharding.shards == shards + + arr_v2 = zarr.create_array( + store=store / "v2", shape=shape, chunks=chunks, zarr_format=2, dtype="i4" + ) + assert arr_v2.chunks == chunks + assert arr_v2.shards is None From 4e978f90a0cc689e1798b51f93155fd0a42c6c26 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 26 Dec 2024 23:20:53 +0100 Subject: [PATCH 35/85] add (typed) functions for resolving codecs --- src/zarr/registry.py | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/src/zarr/registry.py 
b/src/zarr/registry.py index 9055bb1447..5316b3d200 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -5,6 +5,8 @@ from importlib.metadata import entry_points as get_entry_points from typing import TYPE_CHECKING, Any, Generic, TypeVar +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.core.common import JSON from zarr.core.config import BadConfigError, config if TYPE_CHECKING: @@ -151,6 +153,62 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: raise KeyError(key) +def _resolve_codec(data: dict[str, JSON]) -> Codec: + """ + Get a codec instance from a dict representation of that codec. + """ + # TODO: narrow the type of the input to only those dicts that map on to codec class instances. + return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] + + +def _parse_bytes_bytes_codec(data: dict[str, JSON] | BytesBytesCodec) -> BytesBytesCodec: + """ + Normalize the input to a ``BytesBytesCodec`` instance. + If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it + is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. + """ + if isinstance(data, dict): + result = _resolve_codec(data) + if not isinstance(result, BytesBytesCodec): + msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." + raise ValueError(msg) + else: + result = data + return result + + +def _parse_array_bytes_codec(data: dict[str, JSON] | ArrayBytesCodec) -> ArrayBytesCodec: + """ + Normalize the input to a ``ArrayBytesCodec`` instance. + If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it + is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. 
+ """ + if isinstance(data, dict): + result = _resolve_codec(data) + if not isinstance(result, ArrayBytesCodec): + msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." + raise ValueError(msg) + else: + result = data + return result + + +def _parse_array_array_codec(data: dict[str, JSON] | ArrayArrayCodec) -> ArrayArrayCodec: + """ + Normalize the input to a ``ArrayArrayCodec`` instance. + If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it + is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. + """ + if isinstance(data, dict): + result = _resolve_codec(data) + if not isinstance(result, ArrayArrayCodec): + msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." + raise ValueError(msg) + else: + result = data + return result + + def get_pipeline_class(reload_config: bool = False) -> type[CodecPipeline]: if reload_config: _reload_config() From a9850bff0abacb3e87b010d87b6ee972a22ec9ec Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 26 Dec 2024 23:21:47 +0100 Subject: [PATCH 36/85] better codec parsing --- src/zarr/core/array.py | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2e55da4c58..292869ad09 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable, Mapping +from collections.abc import Iterable from dataclasses import dataclass, field from itertools import starmap from logging import getLogger @@ -93,7 +93,12 @@ from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync from zarr.errors import MetadataValidationError -from zarr.registry import get_codec_class, get_pipeline_class +from 
zarr.registry import ( + _parse_array_array_codec, + _parse_bytes_bytes_codec, + _resolve_codec, + get_pipeline_class, +) from zarr.storage import StoreLike, make_store_path from zarr.storage.common import StorePath, ensure_no_existing_node @@ -3546,7 +3551,6 @@ async def create_array( # TODO: figure out why putting these imports at top-level causes circular imports from zarr.codecs.sharding import ShardingCodec - # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise mode: Literal["a"] = "a" dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) config_parsed = parse_array_config(config) @@ -3678,7 +3682,7 @@ def _get_default_encoding_v3( dtype_key = "numeric" codec_dicts = default_codecs[dtype_key] - codecs = tuple(get_codec_class(c["name"]).from_dict(c) for c in codec_dicts) + codecs = tuple(_resolve_codec(c) for c in codec_dicts) array_bytes_maybe = None array_array: list[ArrayArrayCodec] = [] bytes_bytes: list[BytesBytesCodec] = [] @@ -3710,21 +3714,11 @@ def _get_default_chunk_encoding_v2( """ Get the default chunk encoding for zarr v2 arrays, given a dtype """ - if dtype.kind in "biufcmM": - dtype_key = "numeric" - elif dtype.kind in "U": - dtype_key = "string" - elif dtype.kind in "OSV": - dtype_key = "bytes" - else: - raise ValueError(f"Unsupported dtype kind {dtype.kind}") - compressor_dict = zarr_config.get("array.v2_default_compressor").get(dtype_key, None) - filter_dicts = zarr_config.get("array.v2_default_filters").get(dtype_key, []) + compressor_dict = _default_compressor(dtype) + filter_dicts = _default_filters(dtype) - compressor = None - if compressor_dict is not None: - compressor = numcodecs.get_codec(compressor_dict) + compressor = numcodecs.get_codec(compressor_dict) filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) return filters, compressor @@ -3753,28 +3747,34 @@ def _parse_chunk_encoding_v2( def _parse_chunk_encoding_v3( *, - compression: Iterable[BytesBytesCodec] | Literal["auto"], - 
filters: Iterable[ArrayArrayCodec] | Literal["auto"], + compression: Iterable[BytesBytesCodec | dict[str, JSON]] | Literal["auto"], + filters: Iterable[ArrayArrayCodec | dict[str, JSON]] | Literal["auto"], dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. """ default_array_array, default_array_bytes, default_bytes_bytes = _get_default_encoding_v3(dtype) + maybe_bytes_bytes: Iterable[BytesBytesCodec | dict[str, JSON]] + maybe_array_array: Iterable[ArrayArrayCodec | dict[str, JSON]] if compression == "auto": out_bytes_bytes = default_bytes_bytes else: - if isinstance(compression, Mapping | Codec): - out_bytes_bytes = (compression,) + if isinstance(compression, dict | Codec): + maybe_bytes_bytes = (compression,) else: - out_bytes_bytes = tuple(compression) + maybe_bytes_bytes = compression + + out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + if filters == "auto": out_array_array = default_array_array else: - if isinstance(filters, Mapping | Codec): - out_array_array = (filters,) + if isinstance(filters, dict | Codec): + maybe_array_array = (filters,) else: - out_array_array = tuple(filters) + maybe_array_array = filters + out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) return out_array_array, default_array_bytes, out_bytes_bytes From 2747d6904a31e2328fe262ccb0bf8af403937d3a Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 26 Dec 2024 23:22:38 +0100 Subject: [PATCH 37/85] add warning if auto sharding is used --- src/zarr/core/chunk_grids.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 3e5bab632d..96c73ed2ff 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -4,6 +4,7 @@ import math import numbers import operator +import warnings from abc import abstractmethod 
from dataclasses import dataclass from functools import reduce @@ -226,6 +227,11 @@ def _auto_partition( _chunks_out = chunk_shape if shard_shape == "auto": + warnings.warn( + "Automatic shard shape inference is experimental and may change without notice.", + UserWarning, + stacklevel=2, + ) _shards_out = () for a_shape, c_shape in zip(array_shape, _chunks_out, strict=True): # TODO: make a better heuristic than this. From 023c16b97784bf40f6e14ce1648c61d8d10420bb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 26 Dec 2024 23:24:30 +0100 Subject: [PATCH 38/85] remove read_array --- src/zarr/api/asynchronous.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c8125a9641..4c97862979 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -1061,41 +1061,6 @@ async def create( ) -async def read_array( - store: StoreLike, - *, - path: str | None = None, - zarr_format: ZarrFormat | None = None, - storage_options: dict[str, Any] | None = None, -) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: - """Create an array for reading. Wraps `:func:zarr.api.asynchronous.create`. - See the documentation of that function for details. - - Parameters - ---------- - store : Store or str - Store or path to directory in file system or name of zip file. - path : str, optional - Path under which the array is stored. - zarr_format : {2, 3, None}, optional - The zarr format to require. The default value of ``None`` will first look for Zarr v3 data, - then Zarr v2 data, then fail if neither format is found. - storage_options : dict - If using an fsspec URL to create the store, these will be passed to - the backend implementation. Ignored otherwise. - - Returns - ------- - z : array - The array. 
- """ - store_path = await make_store_path(store, path=path, mode="r", storage_options=storage_options) - return await AsyncArray.open( - store=store_path, - zarr_format=zarr_format, - ) - - async def empty( shape: ChunkCoords, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: From de2c36e72e29f26f02d2ffcff7606184711b6e12 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 26 Dec 2024 23:32:17 +0100 Subject: [PATCH 39/85] rename compression to compressors, and make the docstring for create_array more clear on what filters and compressors mean --- src/zarr/core/array.py | 22 ++++++++++++++++------ src/zarr/core/group.py | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 292869ad09..c815de7e60 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3484,7 +3484,7 @@ async def create_array( chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, filters: FiltersParam = "auto", - compression: CompressionParam = "auto", + compressors: CompressionParam = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", zarr_format: ZarrFormat | None = 3, @@ -3514,9 +3514,19 @@ async def create_array( shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional - List of filters to apply to the array. - compression : Iterable[Codec], optional - List of compressors to apply to the array. + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + For Zarr v3, a "filter" is a transformation that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. 
+ For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the + the order if your filters is consistent with the behavior of each filter. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified). + For Zarr v3, a "compressor" is a transformation that takes a string of bytes and + returns another string of bytes. + For Zarr v2, a "compressor" can be any numcodecs codec. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional @@ -3579,7 +3589,7 @@ async def create_array( ) filters = cast(Iterable[numcodecs.abc.Codec] | Literal["auto"], filters) filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compression=compression, filters=filters, dtype=dtype_parsed + compression=compressors, filters=filters, dtype=dtype_parsed ) if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") @@ -3604,7 +3614,7 @@ async def create_array( ) else: array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( - compression=compression, filters=filters, dtype=dtype_parsed + compression=compressors, filters=filters, dtype=dtype_parsed ) sub_codecs = (*array_array, array_bytes, *bytes_bytes) codecs_out: tuple[Codec, ...] 
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index f3bc3f3eec..72ac94ff89 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1067,7 +1067,7 @@ async def create_array( chunks=chunk_shape, shards=shard_shape, filters=filters, - compression=compression, + compressors=compression, fill_value=fill_value, order=order, zarr_format=self.metadata.zarr_format, From 74d31efc717c6e7d035bf83f01a26503c140b4fe Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 26 Dec 2024 23:38:22 +0100 Subject: [PATCH 40/85] compression -> compressors, shard_shape -> shards, chunk_shape -> chunks --- src/zarr/core/group.py | 76 +++++++++++++++++++++--------------------- tests/test_api.py | 12 +++---- tests/test_array.py | 2 +- tests/test_group.py | 10 +++--- 4 files changed, 49 insertions(+), 51 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 72ac94ff89..e328abb82b 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1001,10 +1001,10 @@ async def create_array( *, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords | Literal["auto"] = "auto", - shard_shape: ChunkCoords | Literal["auto"] | None = None, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ChunkCoords | Literal["auto"] | None = None, filters: Iterable[dict[str, JSON] | Codec] = (), - compression: Iterable[dict[str, JSON] | Codec] = (), + compressors: Iterable[dict[str, JSON] | Codec] = (), fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, @@ -1028,9 +1028,9 @@ async def create_array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunk_shape : ChunkCoords | Literal["auto"], default is "auto". + chunks : ChunkCoords | Literal["auto"], default is "auto". Chunk shape of the array. - shard_shape : ChunkCoords, optional + shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. 
filters : Iterable[Codec], optional List of filters to apply to the array. @@ -1064,10 +1064,10 @@ async def create_array( name=name, shape=shape, dtype=dtype, - chunks=chunk_shape, - shards=shard_shape, + chunks=chunks, + shards=shards, filters=filters, - compressors=compression, + compressors=compressors, fill_value=fill_value, order=order, zarr_format=self.metadata.zarr_format, @@ -1725,8 +1725,8 @@ def __getitem__(self, path: str) -> Array | Group: -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) - >>> group.create_group(name="subgroup").create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) + >>> group.create_group(name="subgroup").create_array(name="subarray", shape=(10,), chunks=(10,)) >>> group["subarray"] >>> group["subgroup"] @@ -1760,7 +1760,7 @@ def get(self, path: str, default: DefaultT | None = None) -> Array | Group | Def -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) >>> group.create_group(name="subgroup") >>> group.get("subarray") @@ -1786,7 +1786,7 @@ def __delitem__(self, key: str) -> None: -------- >>> import zarr >>> group = Group.from_store(zarr.storage.MemoryStore() - >>> group.create_array(name="subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array(name="subarray", shape=(10,), chunks=(10,)) >>> del group["subarray"] >>> "subarray" in group False @@ -1801,8 +1801,8 @@ def __iter__(self) -> Iterator[str]: >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') >>> g3 = g1.create_group('bar') - >>> d1 = g1.create_array('baz', shape=(10,), chunk_shape=(10,)) - >>> d2 = g1.create_array('quux', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('baz', shape=(10,), 
chunks=(10,)) + >>> d2 = g1.create_array('quux', shape=(10,), chunks=(10,)) >>> for name in g1: ... print(name) baz @@ -1993,8 +1993,8 @@ def keys(self) -> Generator[str, None]: >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') >>> g3 = g1.create_group('bar') - >>> d1 = g1.create_array('baz', shape=(10,), chunk_shape=(10,)) - >>> d2 = g1.create_array('quux', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('baz', shape=(10,), chunks=(10,)) + >>> d2 = g1.create_array('quux', shape=(10,), chunks=(10,)) >>> for name in g1.keys(): ... print(name) baz @@ -2012,7 +2012,7 @@ def __contains__(self, member: str) -> bool: >>> import zarr >>> g1 = zarr.group() >>> g2 = g1.create_group('foo') - >>> d1 = g1.create_array('bar', shape=(10,), chunk_shape=(10,)) + >>> d1 = g1.create_array('bar', shape=(10,), chunks=(10,)) >>> 'foo' in g1 True >>> 'bar' in g1 @@ -2075,7 +2075,7 @@ def arrays(self) -> Generator[tuple[str, Array], None]: -------- >>> import zarr >>> group = zarr.group() - >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array("subarray", shape=(10,), chunks=(10,)) >>> for name, subarray in group.arrays(): ... print(name, subarray) subarray @@ -2090,7 +2090,7 @@ def array_keys(self) -> Generator[str, None]: -------- >>> import zarr >>> group = zarr.group() - >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array("subarray", shape=(10,), chunks=(10,)) >>> for name in group.array_keys(): ... print(name) subarray @@ -2106,7 +2106,7 @@ def array_values(self) -> Generator[Array, None]: -------- >>> import zarr >>> group = zarr.group() - >>> group.create_array("subarray", shape=(10,), chunk_shape=(10,)) + >>> group.create_array("subarray", shape=(10,), chunks=(10,)) >>> for subarray in group.array_values(): ... 
print(subarray) @@ -2196,10 +2196,10 @@ def create_array( *, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords | Literal["auto"] = "auto", - shard_shape: ChunkCoords | None = None, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ChunkCoords | None = None, filters: Iterable[dict[str, JSON] | Codec] | Literal["auto"] = "auto", - compression: Iterable[dict[str, JSON] | Codec] | Codec | Literal["auto"] = "auto", + compressors: Iterable[dict[str, JSON] | Codec] | Codec | Literal["auto"] = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, @@ -2216,16 +2216,16 @@ def create_array( Parameters ---------- - path : str + name : str The name of the array relative to the group. If ``path`` is ``None``, the array will be located at the root of the store. shape : ChunkCoords Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunk_shape : ChunkCoords | Literal["auto"], default is "auto" + chunks : ChunkCoords | Literal["auto"], default is "auto" Chunk shape of the array. - shard_shape : ChunkCoords, optional + shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional List of filters to apply to the array. 
@@ -2260,12 +2260,12 @@ def create_array( name=name, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, - shard_shape=shard_shape, + chunks=chunks, + shards=shards, fill_value=fill_value, attributes=attributes, chunk_key_encoding=chunk_key_encoding, - compression=compression, + compressors=compressors, dimension_names=dimension_names, order=order, filters=filters, @@ -2530,10 +2530,10 @@ def array( *, shape: ShapeLike, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords | Literal["auto"] = "auto", - shard_shape: ChunkCoords | Literal["auto"] | None = None, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ChunkCoords | Literal["auto"] | None = None, filters: Iterable[dict[str, JSON] | Codec] = (), - compression: Iterable[dict[str, JSON] | Codec] = (), + compressors: Iterable[dict[str, JSON] | Codec] = (), fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, @@ -2550,16 +2550,16 @@ def array( Parameters ---------- - path : str + name : str The name of the array relative to the group. If ``path`` is ``None``, the array will be located at the root of the store. shape : ChunkCoords Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunk_shape : ChunkCoords + chunks : ChunkCoords Chunk shape of the array. - shard_shape : ChunkCoords, optional + shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional List of filters to apply to the array. 
@@ -2594,12 +2594,12 @@ def array( name=name, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, - shard_shape=shard_shape, + chunks=chunks, + shards=shards, fill_value=fill_value, attributes=attributes, chunk_key_encoding=chunk_key_encoding, - compression=compression, + compressors=compressors, dimension_names=dimension_names, order=order, filters=filters, diff --git a/tests/test_api.py b/tests/test_api.py index bf6395edf7..d593fbe66e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1125,11 +1125,11 @@ async def test_create_array_v3(store: MemoryStore) -> None: store=store, dtype="uint8", shape=(10,), - shard_shape=(4,), - chunk_shape=(4,), + shards=(4,), + chunks=(4,), zarr_format=3, filters=(TransposeCodec(order=(0,)),), - compression=ZstdCodec(level=3), + compressors=ZstdCodec(level=3), ) @@ -1143,9 +1143,9 @@ async def test_create_array_v2(store: MemoryStore) -> None: store=store, dtype=dtype, shape=(10,), - shard_shape=None, - chunk_shape=(4,), + shards=None, + chunks=(4,), zarr_format=2, filters=(Delta(dtype=dtype),), - compression=Zstd(level=3), + compressors=Zstd(level=3), ) diff --git a/tests/test_array.py b/tests/test_array.py index eec731c45d..117e32ba35 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -925,6 +925,6 @@ def test_auto_partition_auto_shards( expected_shards += (cs,) auto_shards, _ = _auto_partition( - array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype + array_shape=array_shape, chunks=chunk_shape, shards="auto", dtype=dtype ) assert auto_shards == expected_shards diff --git a/tests/test_group.py b/tests/test_group.py index 44a98f65b4..aa4070c25c 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -874,9 +874,7 @@ async def test_asyncgroup_getitem(store: Store, zarr_format: ZarrFormat) -> None agroup = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) array_name = "sub_array" - sub_array = await agroup.create_array( - name=array_name, shape=(10,), 
dtype="uint8", chunk_shape=(2,) - ) + sub_array = await agroup.create_array(name=array_name, shape=(10,), dtype="uint8", chunks=(2,)) assert await agroup.getitem(array_name) == sub_array sub_group_path = "sub_group" @@ -898,7 +896,7 @@ async def test_asyncgroup_delitem(store: Store, zarr_format: ZarrFormat) -> None name=array_name, shape=(10,), dtype="uint8", - chunk_shape=(2,), + chunks=(2,), attributes={"foo": 100}, ) await agroup.delitem(array_name) @@ -964,7 +962,7 @@ async def test_asyncgroup_create_array( name=sub_node_path, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, + chunks=chunk_shape, attributes=attributes, ) assert isinstance(subnode, AsyncArray) @@ -1105,7 +1103,7 @@ async def test_require_group(store: LocalStore | MemoryStore, zarr_format: ZarrF assert foo_group.attrs == {} _ = await foo_group.create_array( - "bar", shape=(10,), dtype="uint8", chunk_shape=(2,), attributes={"foo": 100} + "bar", shape=(10,), dtype="uint8", chunks=(2,), attributes={"foo": 100} ) # test that overwriting a group w/ children fails From 470b60f1cee1a0f476994de6769017e1716e5232 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 27 Dec 2024 12:32:13 +0100 Subject: [PATCH 41/85] use typerror instead of valuerror; docstring --- src/zarr/core/group.py | 2 +- src/zarr/registry.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index e328abb82b..11826d63f6 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1028,7 +1028,7 @@ async def create_array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunks : ChunkCoords | Literal["auto"], default is "auto". + chunks : ChunkCoords | Literal["auto"], default is "auto" Chunk shape of the array. shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. 
diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 5316b3d200..af6cd3fa62 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Generic, TypeVar from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec -from zarr.core.common import JSON from zarr.core.config import BadConfigError, config if TYPE_CHECKING: @@ -14,6 +13,7 @@ from zarr.abc.codec import Codec, CodecPipeline from zarr.core.buffer import Buffer, NDBuffer + from zarr.core.common import JSON __all__ = [ "Registry", @@ -171,7 +171,7 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | BytesBytesCodec) -> BytesBy result = _resolve_codec(data) if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." - raise ValueError(msg) + raise TypeError(msg) else: result = data return result @@ -187,7 +187,7 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | ArrayBytesCodec) -> ArrayBy result = _resolve_codec(data) if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." - raise ValueError(msg) + raise TypeError(msg) else: result = data return result @@ -203,7 +203,7 @@ def _parse_array_array_codec(data: dict[str, JSON] | ArrayArrayCodec) -> ArrayAr result = _resolve_codec(data) if not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." 
- raise ValueError(msg) + raise TypeError(msg) else: result = data return result From e8b1ad189a7058a8ef6a92e1f9382d5a353b82fb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 27 Dec 2024 12:35:13 +0100 Subject: [PATCH 42/85] default order is None --- src/zarr/core/array.py | 2 +- src/zarr/core/group.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index c815de7e60..1d6a807091 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3486,7 +3486,7 @@ async def create_array( filters: FiltersParam = "auto", compressors: CompressionParam = "auto", fill_value: Any | None = 0, - order: MemoryOrder | None = "C", + order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 11826d63f6..a08a2c0a30 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1003,10 +1003,10 @@ async def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), + filters: Iterable[dict[str, JSON] | Codec] | Literal["auto"] = "auto", + compressors: Iterable[dict[str, JSON] | Codec] | Literal['auto'] = "auto", fill_value: Any | None = 0, - order: MemoryOrder | None = "C", + order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, dimension_names: Iterable[str] | None = None, From 6fcd976a5700590305c71c3a93da7b1dd422dab6 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 27 Dec 2024 13:09:19 +0100 Subject: [PATCH 43/85] fix circular dep --- src/zarr/core/buffer/core.py | 9 +++++---- 1 file changed, 5 
insertions(+), 4 deletions(-) diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 7ddedfe064..571d7af881 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -16,10 +16,6 @@ import numpy as np import numpy.typing as npt -from zarr.registry import ( - get_buffer_class, - get_ndbuffer_class, -) if TYPE_CHECKING: from collections.abc import Iterable, Sequence @@ -507,4 +503,9 @@ class BufferPrototype(NamedTuple): # The default buffer prototype used throughout the Zarr codebase. def default_buffer_prototype() -> BufferPrototype: + from zarr.registry import ( + get_buffer_class, + get_ndbuffer_class, + ) + return BufferPrototype(buffer=get_buffer_class(), nd_buffer=get_ndbuffer_class()) From d9c30a3c52b23c410c6d4d3f9aa60352008cc874 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 27 Dec 2024 13:11:30 +0100 Subject: [PATCH 44/85] format --- src/zarr/core/buffer/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zarr/core/buffer/core.py b/src/zarr/core/buffer/core.py index 571d7af881..85a7351fc7 100644 --- a/src/zarr/core/buffer/core.py +++ b/src/zarr/core/buffer/core.py @@ -16,7 +16,6 @@ import numpy as np import numpy.typing as npt - if TYPE_CHECKING: from collections.abc import Iterable, Sequence from typing import Self From 0bf4dd0bc7a559db1203e313d3abf0cea4e5af50 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 27 Dec 2024 14:02:08 +0100 Subject: [PATCH 45/85] fix some tests --- src/zarr/core/array.py | 14 ++++++++++---- src/zarr/core/config.py | 14 +++++++------- src/zarr/core/metadata/v2.py | 4 ++-- src/zarr/testing/strategies.py | 2 +- tests/test_api.py | 2 +- tests/test_array.py | 11 +++++------ tests/test_config.py | 14 +++++++------- tests/test_group.py | 16 ++++++---------- tests/test_metadata/test_consolidated.py | 6 +++--- tests/test_store/test_zip.py | 2 +- tests/test_v2.py | 2 +- 11 files changed, 44 insertions(+), 43 deletions(-) diff --git a/src/zarr/core/array.py 
b/src/zarr/core/array.py index 9c887aef78..2f735c00bc 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3748,7 +3748,7 @@ def _get_default_encoding_v3( def _get_default_chunk_encoding_v2( dtype: np.dtype[Any], -) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec | None]: +) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Get the default chunk encoding for zarr v2 arrays, given a dtype """ @@ -3756,8 +3756,14 @@ def _get_default_chunk_encoding_v2( compressor_dict = _default_compressor(dtype) filter_dicts = _default_filters(dtype) - compressor = numcodecs.get_codec(compressor_dict) - filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + compressor = None + if compressor_dict is not None: + compressor = numcodecs.get_codec(compressor_dict) + + filters = None + if filter_dicts is not None: + filters = tuple(numcodecs.get_codec(f) for f in filter_dicts) + return filters, compressor @@ -3766,7 +3772,7 @@ def _parse_chunk_encoding_v2( compression: numcodecs.abc.Codec | Literal["auto"], filters: tuple[numcodecs.abc.Codec, ...] | Literal["auto"], dtype: np.dtype[Any], -) -> tuple[tuple[numcodecs.abc.Codec, ...], numcodecs.abc.Codec]: +) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for v2 arrays with optional defaults. 
""" diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 739529a3f9..0f261f10b7 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -67,27 +67,27 @@ def reset(self) -> None: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": {"id": "zstd", "level": 0, "checksum": True}, - "string": {"id": "zstd", "level": 0, "checksum": True}, - "bytes": {"id": "zstd", "level": 0, "checksum": True}, + "numeric": {"id": "zstd", "level": 0, "checksum": False}, + "string": {"id": "zstd", "level": 0, "checksum": False}, + "bytes": {"id": "zstd", "level": 0, "checksum": False}, }, "v2_default_filters": { - "numeric": [], + "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { "numeric": [ {"name": "bytes", "configuration": {"endian": "little"}}, - {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], "string": [ {"name": "vlen-utf8"}, - {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], "bytes": [ {"name": "vlen-bytes"}, - {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], }, }, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 955e822783..5d49e933fd 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -357,7 +357,7 @@ def _default_compressor( def _default_filters( dtype: np.dtype[Any], -) -> list[dict[str, JSON]]: +) -> list[dict[str, JSON]] | None: """Get the default filters and compressor for a dtype. 
https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html @@ -372,4 +372,4 @@ def _default_filters( else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - return default_filters.get(dtype_key, []) + return default_filters.get(dtype_key, None) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 8a352b601c..85a67e3e69 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -143,7 +143,7 @@ def arrays( a = root.create_array( array_path, shape=nparray.shape, - chunk_shape=chunks, + chunks=chunks, dtype=nparray.dtype, attributes=attributes, # compressor=compressor, # FIXME diff --git a/tests/test_api.py b/tests/test_api.py index d593fbe66e..78616b5caf 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -68,7 +68,7 @@ def test_create_array(store: Store) -> None: path = "foo" data_val = 1 array_w = create_array( - store, name=path, shape=shape, attributes=attrs, chunk_shape=shape, dtype="uint8" + store, name=path, shape=shape, attributes=attrs, chunks=shape, dtype="uint8" ) array_w[:] = data_val assert array_w.shape == shape diff --git a/tests/test_array.py b/tests/test_array.py index 1140065774..5e4ac8e7c1 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -8,7 +8,6 @@ import numcodecs import numpy as np import pytest -from numcodecs import Zstd import zarr.api.asynchronous from zarr import Array, AsyncArray, Group @@ -138,13 +137,13 @@ def test_array_name_properties_with_group( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: root = Group.from_store(store=store, zarr_format=zarr_format) - foo = root.create_array("foo", shape=(100,), chunk_shape=(10,), dtype="i4") + foo = root.create_array("foo", shape=(100,), chunks=(10,), dtype="i4") assert foo.path == "foo" assert foo.name == "/foo" assert foo.basename == "foo" bar = root.create_group("bar") - spam = bar.create_array("spam", shape=(100,), chunk_shape=(10,), dtype="i4") + spam = bar.create_array("spam", 
shape=(100,), chunks=(10,), dtype="i4") assert spam.path == "bar/spam" assert spam.name == "/bar/spam" @@ -463,7 +462,7 @@ def test_info_v2(self) -> None: _read_only=False, _store_type="MemoryStore", _count_bytes=128, - _filters=(numcodecs.Zstd(),), + _compressor=numcodecs.Zstd(), ) assert result == expected @@ -519,8 +518,8 @@ async def test_info_v2_async(self) -> None: _order="C", _read_only=False, _store_type="MemoryStore", - _filters=(Zstd(level=0),), _count_bytes=128, + _compressor=numcodecs.Zstd(), ) assert result == expected @@ -925,7 +924,7 @@ def test_auto_partition_auto_shards( expected_shards += (cs,) auto_shards, _ = _auto_partition( - array_shape=array_shape, chunks=chunk_shape, shards="auto", dtype=dtype + array_shape=array_shape, chunk_shape=chunk_shape, shard_shape="auto", dtype=dtype ) assert auto_shards == expected_shards diff --git a/tests/test_config.py b/tests/test_config.py index d5a364dd15..7ac416b393 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -54,27 +54,27 @@ def test_config_defaults_set() -> None: "order": "C", "write_empty_chunks": False, "v2_default_compressor": { - "numeric": {"id": "zstd", "level": 0, "checksum": True}, - "string": {"id": "zstd", "level": 0, "checksum": True}, - "bytes": {"id": "zstd", "level": 0, "checksum": True}, + "numeric": {"id": "zstd", "level": 0, "checksum": False}, + "string": {"id": "zstd", "level": 0, "checksum": False}, + "bytes": {"id": "zstd", "level": 0, "checksum": False}, }, "v2_default_filters": { - "numeric": [], + "numeric": None, "string": [{"id": "vlen-utf8"}], "bytes": [{"id": "vlen-bytes"}], }, "v3_default_codecs": { "bytes": [ {"name": "vlen-bytes"}, - {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], "numeric": [ {"name": "bytes", "configuration": {"endian": "little"}}, - {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + {"name": "zstd", "configuration": {"level": 
0, "checksum": False}}, ], "string": [ {"name": "vlen-utf8"}, - {"name": "zstd", "configuration": {"level": 0, "checksum": True}}, + {"name": "zstd", "configuration": {"level": 0, "checksum": False}}, ], }, }, diff --git a/tests/test_group.py b/tests/test_group.py index aa4070c25c..99943df9e7 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -155,7 +155,7 @@ def test_group_members(store: Store, zarr_format: ZarrFormat, consolidated_metad subsubsubgroup = subsubgroup.create_group("subsubsubgroup") members_expected["subarray"] = group.create_array( - "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), overwrite=True + "subarray", shape=(100,), dtype="uint8", chunks=(10,), overwrite=True ) # add an extra object to the domain of the group. # the list of children should ignore this object. @@ -226,9 +226,7 @@ def test_group(store: Store, zarr_format: ZarrFormat) -> None: # create an array from the "bar" group data = np.arange(0, 4 * 4, dtype="uint16").reshape((4, 4)) - arr = bar.create_array( - "baz", shape=data.shape, dtype=data.dtype, chunk_shape=(2, 2), overwrite=True - ) + arr = bar.create_array("baz", shape=data.shape, dtype=data.dtype, chunks=(2, 2), overwrite=True) arr[:] = data # check the array @@ -312,10 +310,8 @@ def test_group_getitem(store: Store, zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8") - subsubarray = subgroup.create_array( - name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8" - ) + subarray = group.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") + subsubarray = subgroup.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -392,7 +388,7 @@ def test_group_delitem(store: Store, 
zarr_format: ZarrFormat, consolidated: bool group = Group.from_store(store, zarr_format=zarr_format) subgroup = group.create_group(name="subgroup") - subarray = group.create_array(name="subarray", shape=(10,), chunk_shape=(10,), dtype="uint8") + subarray = group.create_array(name="subarray", shape=(10,), chunks=(10,), dtype="uint8") if consolidated: group = zarr.api.synchronous.consolidate_metadata(store=store, zarr_format=zarr_format) @@ -500,7 +496,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "shape": (1,), "chunks": (1,), "order": "C", - "filters": (), + "filters": None, "compressor": Zstd(level=0), "zarr_format": zarr_format, }, diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index ba7fe0cb08..a40b8a5c0a 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -77,7 +77,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: }, "codecs": ( {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + {"configuration": {"level": 0, "checksum": False}, "name": "zstd"}, ), "data_type": "uint8", "fill_value": 0, @@ -225,7 +225,7 @@ def test_consolidated_sync(self, memory_store): }, "codecs": ( {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + {"configuration": {"level": 0, "checksum": False}, "name": "zstd"}, ), "data_type": dtype, "fill_value": 0, @@ -498,7 +498,7 @@ async def test_consolidated_metadata_v2(self): attributes={"key": "a"}, chunks=(1,), fill_value=0, - filters=(Zstd(level=0),), + compressor=Zstd(level=0), order="C", ), "g1": GroupMetadata( diff --git a/tests/test_store/test_zip.py b/tests/test_store/test_zip.py index c207adebe1..df22b76e1e 100644 --- a/tests/test_store/test_zip.py +++ b/tests/test_store/test_zip.py @@ -69,7 +69,7 @@ def test_api_integration(self, store: ZipStore) -> None: data = 
np.arange(10000, dtype=np.uint16).reshape(100, 100) z = root.create_array( - shape=data.shape, chunk_shape=(10, 10), name="foo", dtype=np.uint16, fill_value=99 + shape=data.shape, chunks=(10, 10), name="foo", dtype=np.uint16, fill_value=99 ) z[:] = data diff --git a/tests/test_v2.py b/tests/test_v2.py index 1bbdf858b5..74b8a654fb 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -93,7 +93,7 @@ async def test_v2_encode_decode(dtype): g.create_array( name="foo", shape=(3,), - chunk_shape=(3,), + chunks=(3,), dtype=dtype, fill_value=b"X", ) From ea3ed0e499d9ed6826c2f61d97e86c60084d285d Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 27 Dec 2024 14:11:18 +0100 Subject: [PATCH 46/85] use filters=auto and compressors=auto in Group.create_array --- src/zarr/core/group.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 11826d63f6..113eee58a1 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -18,7 +18,14 @@ from zarr.abc.metadata import Metadata from zarr.abc.store import Store, set_or_delete from zarr.core._info import GroupInfo -from zarr.core.array import Array, AsyncArray, _build_parents, create_array +from zarr.core.array import ( + Array, + AsyncArray, + CompressionParam, + FiltersParam, + _build_parents, + create_array, +) from zarr.core.attributes import Attributes from zarr.core.buffer import default_buffer_prototype from zarr.core.common import ( @@ -1003,8 +1010,8 @@ async def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), + filters: FiltersParam = "auto", + compressors: CompressionParam = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, From 54fd9203440d4ca5363c56c166251c6c8c778a97 Mon Sep 
17 00:00:00 2001 From: Norman Rzepka Date: Fri, 27 Dec 2024 14:14:35 +0100 Subject: [PATCH 47/85] compression -> compressors --- src/zarr/core/array.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 2f735c00bc..e3112f00ab 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3617,7 +3617,7 @@ async def create_array( ) filters = cast(Iterable[numcodecs.abc.Codec] | Literal["auto"], filters) filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compression=compressors, filters=filters, dtype=dtype_parsed + compressor=compressors, filters=filters, dtype=dtype_parsed ) if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") @@ -3642,7 +3642,7 @@ async def create_array( ) else: array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( - compression=compressors, filters=filters, dtype=dtype_parsed + compressors=compressors, filters=filters, dtype=dtype_parsed ) sub_codecs = (*array_array, array_bytes, *bytes_bytes) codecs_out: tuple[Codec, ...] @@ -3769,7 +3769,7 @@ def _get_default_chunk_encoding_v2( def _parse_chunk_encoding_v2( *, - compression: numcodecs.abc.Codec | Literal["auto"], + compressor: numcodecs.abc.Codec | Literal["auto"], filters: tuple[numcodecs.abc.Codec, ...] | Literal["auto"], dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: @@ -3778,10 +3778,10 @@ def _parse_chunk_encoding_v2( """ default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) _filters: tuple[numcodecs.abc.Codec, ...] 
= () - if compression == "auto": + if compressor == "auto": _compressor = default_compressor else: - _compressor = compression + _compressor = compressor if filters == "auto": _filters = default_filters else: @@ -3791,7 +3791,7 @@ def _parse_chunk_encoding_v2( def _parse_chunk_encoding_v3( *, - compression: Iterable[BytesBytesCodec | dict[str, JSON]] | Literal["auto"], + compressors: Iterable[BytesBytesCodec | dict[str, JSON]] | Literal["auto"], filters: Iterable[ArrayArrayCodec | dict[str, JSON]] | Literal["auto"], dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: @@ -3802,13 +3802,13 @@ def _parse_chunk_encoding_v3( maybe_bytes_bytes: Iterable[BytesBytesCodec | dict[str, JSON]] maybe_array_array: Iterable[ArrayArrayCodec | dict[str, JSON]] - if compression == "auto": + if compressors == "auto": out_bytes_bytes = default_bytes_bytes else: - if isinstance(compression, dict | Codec): - maybe_bytes_bytes = (compression,) + if isinstance(compressors, dict | Codec): + maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = compression + maybe_bytes_bytes = compressors out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) From a4ba7db013aeb9b8a26bae7246974555d389df14 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sat, 28 Dec 2024 12:44:57 +0100 Subject: [PATCH 48/85] Update src/zarr/core/group.py Co-authored-by: Norman Rzepka --- src/zarr/core/group.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 113eee58a1..44f41f91ad 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -2539,8 +2539,8 @@ def array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: Iterable[dict[str, JSON] | Codec] = (), - compressors: Iterable[dict[str, JSON] | Codec] = (), + filters: Iterable[dict[str, JSON] | Codec] = "auto", + 
compressors: Iterable[dict[str, JSON] | Codec] = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, From fb286a7283dce77c88e9f7c048882987f91177a1 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sat, 28 Dec 2024 13:34:31 +0100 Subject: [PATCH 49/85] fix mypy --- src/zarr/core/array.py | 61 ++++++++++++++++------------ src/zarr/core/chunk_key_encodings.py | 8 +++- src/zarr/core/group.py | 10 ++--- src/zarr/core/metadata/v2.py | 8 ++-- src/zarr/core/metadata/v3.py | 2 +- src/zarr/registry.py | 8 +++- tests/test_array.py | 7 +++- 7 files changed, 62 insertions(+), 42 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index e3112f00ab..a85a376e54 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -89,6 +89,8 @@ from zarr.core.metadata.v2 import ( _default_compressor, _default_filters, + parse_compressor, + parse_filters, ) from zarr.core.metadata.v3 import DataType, parse_node_type_array from zarr.core.sync import sync @@ -164,7 +166,7 @@ async def get_array_metadata( ) if zarr_json_bytes is not None and zarray_bytes is not None: # warn and favor v3 - msg = f"Both zarr.json (zarr v3) and .zarray (zarr v2) metadata objects exist at {store_path}." + msg = f"Both zarr.json (Zarr v3) and .zarray (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." 
warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zarray_bytes is None: raise FileNotFoundError(store_path) @@ -667,8 +669,8 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: float | None = None, - filters: list[dict[str, JSON]] | None = None, - compressor: dict[str, JSON] | None = None, + filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + compressor: dict[str, JSON] | numcodecs.abc.Codec | None = None, attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -3492,13 +3494,13 @@ def _get_default_codecs( else: dtype_key = "numeric" - return default_codecs[dtype_key] + return cast(list[dict[str, JSON]], default_codecs[dtype_key]) FiltersParam: TypeAlias = ( Iterable[dict[str, JSON] | Codec] | Iterable[numcodecs.abc.Codec] | Literal["auto"] ) -CompressionParam: TypeAlias = ( +CompressorsParam: TypeAlias = ( Iterable[dict[str, JSON] | Codec] | Codec | numcodecs.abc.Codec | Literal["auto"] ) @@ -3512,7 +3514,7 @@ async def create_array( chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, filters: FiltersParam = "auto", - compressors: CompressionParam = "auto", + compressors: CompressorsParam = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", zarr_format: ZarrFormat | None = 3, @@ -3544,7 +3546,7 @@ async def create_array( filters : Iterable[Codec], optional Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. - For Zarr v3, a "filter" is a transformation that takes an array and returns an array, + For Zarr v3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. 
For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the @@ -3552,8 +3554,8 @@ async def create_array( compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). - For Zarr v3, a "compressor" is a transformation that takes a string of bytes and - returns another string of bytes. + For Zarr v3, a "compressor" is a codec that takes a bytestrea, and + returns another bytestream. For Zarr v2, a "compressor" can be any numcodecs codec. fill_value : Any, optional Fill value for the array. @@ -3611,11 +3613,6 @@ async def create_array( ) raise ValueError(msg) - if filters != "auto" and not all(isinstance(f, numcodecs.abc.Codec) for f in filters): - raise TypeError( - "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs." - ) - filters = cast(Iterable[numcodecs.abc.Codec] | Literal["auto"], filters) filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, dtype=dtype_parsed ) @@ -3644,7 +3641,7 @@ async def create_array( array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( compressors=compressors, filters=filters, dtype=dtype_parsed ) - sub_codecs = (*array_array, array_bytes, *bytes_bytes) + sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes)) codecs_out: tuple[Codec, ...] 
if shard_shape_parsed is not None: sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) @@ -3688,7 +3685,7 @@ def _parse_chunk_key_encoding( """ if data is None: if zarr_format == 2: - result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "/"}) + result = ChunkKeyEncoding.from_dict({"name": "v2", "separator": "."}) else: result = ChunkKeyEncoding.from_dict({"name": "default", "separator": "/"}) elif isinstance(data, ChunkKeyEncoding): @@ -3769,38 +3766,48 @@ def _get_default_chunk_encoding_v2( def _parse_chunk_encoding_v2( *, - compressor: numcodecs.abc.Codec | Literal["auto"], - filters: tuple[numcodecs.abc.Codec, ...] | Literal["auto"], + compressor: CompressorsParam, + filters: FiltersParam, dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Generate chunk encoding classes for v2 arrays with optional defaults. """ default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) - _filters: tuple[numcodecs.abc.Codec, ...] = () + + _filters: tuple[numcodecs.abc.Codec, ...] | None = None + _compressor: numcodecs.abc.Codec | None = None + if compressor == "auto": _compressor = default_compressor else: - _compressor = compressor + if isinstance(compressor, Iterable): + raise TypeError("For Zarr v2 arrays, the `compressor` must be a single codec.") + _compressor = parse_compressor(compressor) if filters == "auto": _filters = default_filters else: - _filters = filters + if not all(isinstance(f, numcodecs.abc.Codec) for f in filters): + raise TypeError( + "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs." 
+ ) + _filters = parse_filters(filters) + return _filters, _compressor def _parse_chunk_encoding_v3( *, - compressors: Iterable[BytesBytesCodec | dict[str, JSON]] | Literal["auto"], - filters: Iterable[ArrayArrayCodec | dict[str, JSON]] | Literal["auto"], + compressors: CompressorsParam, + filters: FiltersParam, dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ Generate chunk encoding classes for v3 arrays with optional defaults. """ default_array_array, default_array_bytes, default_bytes_bytes = _get_default_encoding_v3(dtype) - maybe_bytes_bytes: Iterable[BytesBytesCodec | dict[str, JSON]] - maybe_array_array: Iterable[ArrayArrayCodec | dict[str, JSON]] + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] + maybe_array_array: Iterable[Codec | dict[str, JSON]] if compressors == "auto": out_bytes_bytes = default_bytes_bytes @@ -3808,7 +3815,7 @@ def _parse_chunk_encoding_v3( if isinstance(compressors, dict | Codec): maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = compressors + maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) @@ -3818,7 +3825,7 @@ def _parse_chunk_encoding_v3( if isinstance(filters, dict | Codec): maybe_array_array = (filters,) else: - maybe_array_array = filters + maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters) out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) return out_array_array, default_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 33b44b3232..06d387afea 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -36,10 +36,16 @@ def __init__(self, *, separator: SeparatorLiteral) -> None: object.__setattr__(self, "separator", separator_parsed) @classmethod - def from_dict(cls, data: dict[str, JSON] | 
ChunkKeyEncoding) -> ChunkKeyEncoding: + def from_dict( + cls, data: dict[str, JSON] | ChunkKeyEncoding | ChunkKeyEncodingParams + ) -> ChunkKeyEncoding: if isinstance(data, ChunkKeyEncoding): return data + # handle ChunkKeyEncodingParams + if "name" in data and "separator" in data: + data = {"name": data["name"], "configuration": {"separator": data["separator"]}} + # configuration is optional for chunk key encodings name_parsed, config_parsed = parse_named_configuration(data, require_configuration=False) if name_parsed == "default": diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 44f41f91ad..272560611a 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -21,7 +21,7 @@ from zarr.core.array import ( Array, AsyncArray, - CompressionParam, + CompressorsParam, FiltersParam, _build_parents, create_array, @@ -511,7 +511,7 @@ async def open( ) if zarr_json_bytes is not None and zgroup_bytes is not None: # warn and favor v3 - msg = f"Both zarr.json (zarr v3) and .zgroup (zarr v2) metadata objects exist at {store_path}." + msg = f"Both zarr.json (Zarr v3) and .zgroup (Zarr v2) metadata objects exist at {store_path}. Zarr v3 will be used." 
warnings.warn(msg, stacklevel=1) if zarr_json_bytes is None and zgroup_bytes is None: raise FileNotFoundError( @@ -1011,7 +1011,7 @@ async def create_array( chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, filters: FiltersParam = "auto", - compressors: CompressionParam = "auto", + compressors: CompressorsParam = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, @@ -2539,8 +2539,8 @@ def array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: Iterable[dict[str, JSON] | Codec] = "auto", - compressors: Iterable[dict[str, JSON] | Codec] = "auto", + filters: FiltersParam = "auto", + compressors: CompressorsParam = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 5d49e933fd..0292d9551b 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -16,7 +16,7 @@ import numpy.typing as npt from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.common import JSON, ChunkCoords + from zarr.core.common import ChunkCoords import json from dataclasses import dataclass, field, fields, replace @@ -27,7 +27,7 @@ from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import parse_separator -from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike +from zarr.core.common import JSON, ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike from zarr.core.config import config, parse_indexing_order from zarr.core.metadata.common import parse_attributes @@ -352,7 +352,7 @@ def _default_compressor( else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - return default_compressor.get(dtype_key, None) + 
return cast(dict[str, JSON] | None, default_compressor.get(dtype_key, None)) def _default_filters( @@ -372,4 +372,4 @@ def _default_filters( else: raise ValueError(f"Unsupported dtype kind {dtype.kind}") - return default_filters.get(dtype_key, None) + return cast(list[dict[str, JSON]] | None, default_filters.get(dtype_key, None)) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index dfc6c97882..0821dd9bc9 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -548,7 +548,7 @@ def default_fill_value(dtype: DataType) -> str | bytes | np.generic: else: np_dtype = dtype.to_numpy() np_dtype = cast(np.dtype[Any], np_dtype) - return np_dtype.type(0) + return np_dtype.type(0) # type: ignore[misc] # For type checking diff --git a/src/zarr/registry.py b/src/zarr/registry.py index af6cd3fa62..28595ff534 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -161,7 +161,7 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | BytesBytesCodec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it @@ -173,6 +173,8 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | BytesBytesCodec) -> BytesBy msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) else: + if not isinstance(data, BytesBytesCodec): + raise TypeError(f"Expected a BytesBytesCodec. 
Got {type(data)} instead.") result = data return result @@ -193,7 +195,7 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | ArrayBytesCodec) -> ArrayBy return result -def _parse_array_array_codec(data: dict[str, JSON] | ArrayArrayCodec) -> ArrayArrayCodec: +def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it @@ -205,6 +207,8 @@ def _parse_array_array_codec(data: dict[str, JSON] | ArrayArrayCodec) -> ArrayAr msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) else: + if not isinstance(data, ArrayArrayCodec): + raise TypeError(f"Expected a ArrayArrayCodec. Got {type(data)} instead.") result = data return result diff --git a/tests/test_array.py b/tests/test_array.py index 5e4ac8e7c1..2979623093 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -22,7 +22,8 @@ from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import ceildiv -from zarr.core.metadata.v3 import DataType +from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.core.metadata.v3 import ArrayV3Metadata, DataType from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore @@ -885,7 +886,9 @@ async def test_nbytes( assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize -def _get_partitioning(data: AsyncArray) -> tuple[tuple[int, ...], tuple[int, ...] | None]: +def _get_partitioning( + data: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], +) -> tuple[tuple[int, ...], tuple[int, ...] | None]: """ Get the shard shape and chunk shape of an array. If the array is not sharded, the shard shape will be None. 
From df35d13f0a68316d1e09ad72a271da0f308a4770 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 28 Dec 2024 13:52:42 +0100 Subject: [PATCH 50/85] narrow type of filters param and compression param --- src/zarr/core/array.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1d6a807091..45d8aeefa4 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3468,10 +3468,16 @@ def _get_default_codecs( FiltersParam: TypeAlias = ( - Iterable[dict[str, JSON] | Codec] | Iterable[numcodecs.abc.Codec] | Literal["auto"] + Iterable[dict[str, JSON] | ArrayArrayCodec] + | ArrayArrayCodec + | Iterable[numcodecs.abc.Codec] + | Literal["auto"] ) CompressionParam: TypeAlias = ( - Iterable[dict[str, JSON] | Codec] | Codec | numcodecs.abc.Codec | Literal["auto"] + Iterable[dict[str, JSON] | BytesBytesCodec] + | BytesBytesCodec + | numcodecs.abc.Codec + | Literal["auto"] ) @@ -3614,7 +3620,7 @@ async def create_array( ) else: array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( - compression=compressors, filters=filters, dtype=dtype_parsed + compressors=compressors, filters=filters, dtype=dtype_parsed ) sub_codecs = (*array_array, array_bytes, *bytes_bytes) codecs_out: tuple[Codec, ...] 
@@ -3757,7 +3763,7 @@ def _parse_chunk_encoding_v2( def _parse_chunk_encoding_v3( *, - compression: Iterable[BytesBytesCodec | dict[str, JSON]] | Literal["auto"], + compressors: Iterable[BytesBytesCodec | dict[str, JSON]] | Literal["auto"], filters: Iterable[ArrayArrayCodec | dict[str, JSON]] | Literal["auto"], dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: @@ -3768,13 +3774,13 @@ def _parse_chunk_encoding_v3( maybe_bytes_bytes: Iterable[BytesBytesCodec | dict[str, JSON]] maybe_array_array: Iterable[ArrayArrayCodec | dict[str, JSON]] - if compression == "auto": + if compressors == "auto": out_bytes_bytes = default_bytes_bytes else: - if isinstance(compression, dict | Codec): - maybe_bytes_bytes = (compression,) + if isinstance(compressors, dict | Codec): + maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = compression + maybe_bytes_bytes = compressors out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) From 77f40a5de0422ff2f14ae55d54e8b94637f870a5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 28 Dec 2024 14:05:30 +0100 Subject: [PATCH 51/85] remove data kwarg to create_array --- src/zarr/core/array.py | 7 ------- src/zarr/core/group.py | 5 ----- 2 files changed, 12 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 6f090113f1..ce110079bf 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3530,7 +3530,6 @@ async def create_array( storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, - data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. @@ -3582,8 +3581,6 @@ async def create_array( Whether to overwrite an array with the same name in the store, if one exists. config : ArrayConfig or ArrayConfigParams, optional Runtime configuration for the array. 
- data : np.ndarray, optional - Initial data for the array. Returns ------- @@ -3676,10 +3673,6 @@ async def create_array( config=config_parsed, ) - if data is not None: - await result.setitem( - selection=slice(None), value=data, prototype=default_buffer_prototype() - ) return result diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 8e46562e41..60f8b6a1e4 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1020,7 +1020,6 @@ async def create_array( storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, - data: npt.ArrayLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Create a Zarr array within this AsyncGroup. @@ -1084,7 +1083,6 @@ async def create_array( storage_options=storage_options, overwrite=overwrite, config=config, - data=data, ) @deprecated("Use AsyncGroup.create_array instead.") @@ -2215,7 +2213,6 @@ def create_array( storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, - data: npt.ArrayLike | None = None, ) -> Array: """ Create a Zarr array within this AsyncGroup. 
@@ -2279,7 +2276,6 @@ def create_array( overwrite=overwrite, storage_options=storage_options, config=config, - data=data, ) ) ) @@ -2613,7 +2609,6 @@ def array( overwrite=overwrite, storage_options=storage_options, config=config, - data=data, ) ) ) From 235e246a8498c6b77c52080b95c45c805df5280f Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sat, 28 Dec 2024 14:18:03 +0100 Subject: [PATCH 52/85] mypy fixes --- src/zarr/core/array.py | 4 +++- src/zarr/testing/strategies.py | 3 ++- tests/test_group.py | 12 ++++++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ce110079bf..d05720afc6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3786,7 +3786,9 @@ def _parse_chunk_encoding_v2( if filters == "auto": _filters = default_filters else: - if not all(isinstance(f, numcodecs.abc.Codec) for f in filters): + if isinstance(filters, Iterable) and not all( + isinstance(f, numcodecs.abc.Codec) for f in filters + ): raise TypeError( "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs." 
) diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 85a67e3e69..c447596f06 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -10,6 +10,7 @@ from zarr.core.array import Array from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike +from zarr.storage.common import _dereference_path # Copied from Xarray _attr_keys = st.text(st.characters(), min_size=1) @@ -137,7 +138,7 @@ def arrays( expected_attrs = {} if attributes is None else attributes - array_path = path + ("/" if not path.endswith("/") else "") + name + array_path = _dereference_path(path, name) root = zarr.open_group(store, mode="w", zarr_format=zarr_format) a = root.create_array( diff --git a/tests/test_group.py b/tests/test_group.py index 99943df9e7..67232fd948 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -614,20 +614,24 @@ def test_group_create_array( data = np.arange(np.prod(shape)).reshape(shape).astype(dtype) if method == "create_array": - array = group.create_array(name="array", shape=shape, dtype=dtype, data=data) + array = group.create_array(name="array", shape=shape, dtype=dtype) + array[:] = data elif method == "array": with pytest.warns(DeprecationWarning): - array = group.array(name="array", shape=shape, dtype=dtype, data=data) + array = group.array(name="array", shape=shape, dtype=dtype) + array[:] = data else: raise AssertionError if not overwrite: if method == "create_array": with pytest.raises(ContainsArrayError): - group.create_array(name="array", shape=shape, dtype=dtype, data=data) + a = group.create_array(name="array", shape=shape, dtype=dtype) + a[:] = data elif method == "array": with pytest.raises(ContainsArrayError), pytest.warns(DeprecationWarning): - group.array(name="array", shape=shape, dtype=dtype, data=data) + a = group.array(name="array", shape=shape, dtype=dtype) + a[:] = data assert array.shape == shape assert array.dtype == np.dtype(dtype) assert 
np.array_equal(array[:], data) From 95348d6021ae4e9b2e63d43d1debc41828f06a8d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 28 Dec 2024 14:20:15 +0100 Subject: [PATCH 53/85] ensure that we accept dict form of compressor in _parse_chunk_encoding_v2 --- src/zarr/core/array.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ce110079bf..1d75aa1fcf 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3780,8 +3780,9 @@ def _parse_chunk_encoding_v2( if compressor == "auto": _compressor = default_compressor else: - if isinstance(compressor, Iterable): - raise TypeError("For Zarr v2 arrays, the `compressor` must be a single codec.") + if isinstance(compressor, Iterable) and not isinstance(compressor, dict): + msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." + raise TypeError(msg) _compressor = parse_compressor(compressor) if filters == "auto": _filters = default_filters From 665037e05621964d3c7d746bd951a2c86b97f8cc Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sat, 28 Dec 2024 14:22:13 +0100 Subject: [PATCH 54/85] fix properties test --- src/zarr/core/array.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index d05720afc6..5bf1ea5893 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -690,10 +690,14 @@ async def _create_v2( filters = _default_filters(dtype) if not compressor: compressor = _default_compressor(dtype) + + # inject VLenUTF8 for str dtype if not already present if np.issubdtype(dtype, np.str_): filters = filters or [] - if not any(x["id"] == "vlen-utf8" for x in filters): - filters = list(filters) + [{"id": "vlen-utf8"}] + from numcodecs.vlen import VLenUTF8 + + if not any(isinstance(x, VLenUTF8) or x["id"] == "vlen-utf8" for x in filters): + filters = list(filters) + [VLenUTF8()] 
metadata = ArrayV2Metadata( shape=shape, From 0a983e69e18dcc3b48bb69a6cd5a8675a48b284d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 28 Dec 2024 15:30:31 +0100 Subject: [PATCH 55/85] add tests for compressors and filters kwargs to create_array --- tests/test_api.py | 35 -------------- tests/test_array.py | 112 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 36 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 78616b5caf..5f8c84c4a6 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -23,8 +23,6 @@ save_array, save_group, ) -from zarr.codecs.transpose import TransposeCodec -from zarr.codecs.zstd import ZstdCodec from zarr.core.common import MemoryOrder, ZarrFormat from zarr.errors import MetadataValidationError from zarr.storage._utils import normalize_path @@ -1116,36 +1114,3 @@ def test_open_array_with_mode_r_plus(store: Store) -> None: assert isinstance(z2, Array) assert (z2[:] == 1).all() z2[:] = 3 - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -async def test_create_array_v3(store: MemoryStore) -> None: - # TODO: fill in - _ = zarr.create_array( - store=store, - dtype="uint8", - shape=(10,), - shards=(4,), - chunks=(4,), - zarr_format=3, - filters=(TransposeCodec(order=(0,)),), - compressors=ZstdCodec(level=3), - ) - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) -async def test_create_array_v2(store: MemoryStore) -> None: - from numcodecs import Delta, Zstd - - # TODO: fill in - dtype = "uint8" - _ = zarr.create_array( - store=store, - dtype=dtype, - shape=(10,), - shards=None, - chunks=(4,), - zarr_format=2, - filters=(Delta(dtype=dtype),), - compressors=Zstd(level=3), - ) diff --git a/tests/test_array.py b/tests/test_array.py index 2979623093..bd7b39ff4e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -12,9 +12,16 @@ import zarr.api.asynchronous from zarr import Array, AsyncArray, Group from zarr.codecs import BytesCodec, 
VLenBytesCodec, ZstdCodec +from zarr.codecs.gzip import GzipCodec from zarr.codecs.sharding import ShardingCodec +from zarr.codecs.transpose import TransposeCodec from zarr.core._info import ArrayInfo -from zarr.core.array import chunks_initialized +from zarr.core.array import ( + CompressorsParam, + FiltersParam, + _parse_chunk_encoding_v3, + chunks_initialized, +) from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer from zarr.core.chunk_grids import _auto_partition @@ -957,3 +964,106 @@ def test_chunks_and_shards() -> None: ) assert arr_v2.chunks == chunks assert arr_v2.shards is None + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize( + "compressors", + [ + "auto", + (ZstdCodec(level=3),), + (ZstdCodec(level=3), GzipCodec(level=0)), + ZstdCodec(level=3), + {"name": "zstd", "configuration": {"level": 3}}, + ({"name": "zstd", "configuration": {"level": 3}},), + ], +) +async def test_create_array_v3_compressors( + store: MemoryStore, compressors: CompressorsParam +) -> None: + """ + Test various possibilities for the compressors parameter to create_array + """ + dtype = "uint8" + arr = zarr.create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=3, + compressors=compressors, + ) + _, _, bb_codecs_expected = _parse_chunk_encoding_v3( + filters=(), compressors=compressors, dtype=np.dtype(dtype) + ) + # TODO: find a better way to get the compressors from the array. 
+ assert tuple(arr._async_array.metadata.codecs[-len(bb_codecs_expected) :]) == bb_codecs_expected # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize( + "filters", + [ + "auto", + ( + TransposeCodec( + order=[ + 0, + ] + ), + ), + ( + TransposeCodec( + order=[ + 0, + ] + ), + TransposeCodec( + order=[ + 0, + ] + ), + ), + TransposeCodec( + order=[ + 0, + ] + ), + {"name": "transpose", "configuration": {"order": [0]}}, + ({"name": "transpose", "configuration": {"order": [0]}},), + ], +) +async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam) -> None: + """ + Test various possibilities for the filters parameter to create_array + """ + dtype = "uint8" + arr = zarr.create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=3, + filters=filters, + ) + aa_codecs_expected, _, _ = _parse_chunk_encoding_v3( + filters=filters, compressors=(), dtype=np.dtype(dtype) + ) + # TODO: find a better way to get the filters from the array. 
+ assert tuple(arr._async_array.metadata.codecs[: len(aa_codecs_expected)]) == aa_codecs_expected # type: ignore[union-attr] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_create_array_v2(store: MemoryStore) -> None: + from numcodecs import Delta, Zstd + + # TODO: fill in + dtype = "uint8" + _ = zarr.create_array( + store=store, + dtype=dtype, + shape=(10,), + shards=None, + chunks=(4,), + zarr_format=2, + filters=(Delta(dtype=dtype),), + compressors=Zstd(level=3), + ) From 2182793f025750f2ae3505f04ffd2ec62b61e510 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 28 Dec 2024 16:30:49 +0100 Subject: [PATCH 56/85] add tests for codec inference --- src/zarr/core/array.py | 12 ++++++---- tests/test_array.py | 52 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index dbb3fae7c7..8dcf42100e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3704,7 +3704,7 @@ def _parse_chunk_key_encoding( return result -def _get_default_encoding_v3( +def _get_default_chunk_encoding_v3( np_dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ @@ -3747,14 +3747,14 @@ def _get_default_encoding_v3( def _get_default_chunk_encoding_v2( - dtype: np.dtype[Any], + np_dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ Get the default chunk encoding for zarr v2 arrays, given a dtype """ - compressor_dict = _default_compressor(dtype) - filter_dicts = _default_filters(dtype) + compressor_dict = _default_compressor(np_dtype) + filter_dicts = _default_filters(np_dtype) compressor = None if compressor_dict is not None: @@ -3811,7 +3811,9 @@ def _parse_chunk_encoding_v3( """ Generate chunk encoding classes for v3 arrays with optional defaults. 
""" - default_array_array, default_array_bytes, default_bytes_bytes = _get_default_encoding_v3(dtype) + default_array_array, default_array_bytes, default_bytes_bytes = _get_default_chunk_encoding_v3( + dtype + ) maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] maybe_array_array: Iterable[Codec | dict[str, JSON]] diff --git a/tests/test_array.py b/tests/test_array.py index bd7b39ff4e..7b77aaabd9 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -19,8 +19,11 @@ from zarr.core.array import ( CompressorsParam, FiltersParam, + _get_default_chunk_encoding_v2, + _get_default_chunk_encoding_v3, _parse_chunk_encoding_v3, chunks_initialized, + create_array, ) from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer @@ -985,7 +988,7 @@ async def test_create_array_v3_compressors( Test various possibilities for the compressors parameter to create_array """ dtype = "uint8" - arr = zarr.create_array( + arr = await create_array( store=store, dtype=dtype, shape=(10,), @@ -996,7 +999,7 @@ async def test_create_array_v3_compressors( filters=(), compressors=compressors, dtype=np.dtype(dtype) ) # TODO: find a better way to get the compressors from the array. 
- assert tuple(arr._async_array.metadata.codecs[-len(bb_codecs_expected) :]) == bb_codecs_expected # type: ignore[union-attr] + assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[union-attr, attr-defined] @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1037,7 +1040,7 @@ async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam Test various possibilities for the filters parameter to create_array """ dtype = "uint8" - arr = zarr.create_array( + arr = await create_array( store=store, dtype=dtype, shape=(10,), @@ -1048,7 +1051,48 @@ async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam filters=filters, compressors=(), dtype=np.dtype(dtype) ) # TODO: find a better way to get the filters from the array. - assert tuple(arr._async_array.metadata.codecs[: len(aa_codecs_expected)]) == aa_codecs_expected # type: ignore[union-attr] + assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[union-attr, attr-defined] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +async def test_create_array_v3_default_filters_compressors(store: MemoryStore, dtype: str) -> None: + """ + Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with + ``zarr_format`` = 3 and ``filters`` and ``compressors`` are not specified. 
+ """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=3, + ) + expected_aa, expected_ab, expected_bb = _get_default_chunk_encoding_v3(np_dtype=np.dtype(dtype)) + # TODO: define the codec pipeline class such that these fields are required, which will obviate the + # type ignore statements + assert arr.codec_pipeline.array_array_codecs == expected_aa # type: ignore[attr-defined] + assert arr.codec_pipeline.bytes_bytes_codecs == expected_bb # type: ignore[attr-defined] + assert arr.codec_pipeline.array_bytes_codec == expected_ab # type: ignore[attr-defined] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +async def test_create_array_v2_default_filters_compressors(store: MemoryStore, dtype: str) -> None: + """ + Test that the default ``filters`` and ``compressors`` are used when ``create_array`` is invoked with + ``zarr_format`` = 2 and ``filters`` and ``compressors`` are not specified. 
+ """ + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + ) + expected_filters, expected_compressors = _get_default_chunk_encoding_v2( + np_dtype=np.dtype(dtype) + ) + assert arr.metadata.filters == expected_filters # type: ignore[union-attr] + assert arr.metadata.compressor == expected_compressors # type: ignore[union-attr] @pytest.mark.parametrize("store", ["memory"], indirect=True) From c04d7cf03da2cdb6ed38175f57da046e41c6a6a5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 28 Dec 2024 16:39:52 +0100 Subject: [PATCH 57/85] add test for illegal shards kwarg for v2 arrays --- src/zarr/core/array.py | 2 +- tests/test_array.py | 29 +++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8dcf42100e..22fac94a74 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3615,7 +3615,7 @@ async def create_array( if zarr_format == 2: if shard_shape_parsed is not None: msg = ( - "Zarr v2 arrays can only be created with `shard_shape` set to `None`." + "Zarr v2 arrays can only be created with `shard_shape` set to `None`. " f"Got `shard_shape={shards}` instead." 
) diff --git a/tests/test_array.py b/tests/test_array.py index 7b77aaabd9..93ed9defcb 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -2,6 +2,7 @@ import json import math import pickle +import re from itertools import accumulate from typing import Any, Literal @@ -1096,18 +1097,18 @@ async def test_create_array_v2_default_filters_compressors(store: MemoryStore, d @pytest.mark.parametrize("store", ["memory"], indirect=True) -async def test_create_array_v2(store: MemoryStore) -> None: - from numcodecs import Delta, Zstd - - # TODO: fill in - dtype = "uint8" - _ = zarr.create_array( - store=store, - dtype=dtype, - shape=(10,), - shards=None, - chunks=(4,), - zarr_format=2, - filters=(Delta(dtype=dtype),), - compressors=Zstd(level=3), +async def test_create_array_v2_no_shards(store: MemoryStore) -> None: + """ + Test that creating a Zarr v2 array with ``shard_shape`` set to a non-None value raises an error. + """ + msg = re.escape( + "Zarr v2 arrays can only be created with `shard_shape` set to `None`. Got `shard_shape=(5,)` instead." 
) + with pytest.raises(ValueError, match=msg): + _ = await create_array( + store=store, + dtype="uint8", + shape=(10,), + shards=(5,), + zarr_format=2, + ) From 144b2b7704afd1f2ed91b75a60c4c073962be88b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 28 Dec 2024 16:41:38 +0100 Subject: [PATCH 58/85] remove redundant test function --- tests/test_array.py | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 93ed9defcb..f21e69cc27 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -14,7 +14,6 @@ from zarr import Array, AsyncArray, Group from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec from zarr.codecs.gzip import GzipCodec -from zarr.codecs.sharding import ShardingCodec from zarr.codecs.transpose import TransposeCodec from zarr.core._info import ArrayInfo from zarr.core.array import ( @@ -29,12 +28,10 @@ from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer from zarr.core.chunk_grids import _auto_partition -from zarr.core.codec_pipeline import BatchedCodecPipeline from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.core.group import AsyncGroup from zarr.core.indexing import ceildiv -from zarr.core.metadata.v2 import ArrayV2Metadata -from zarr.core.metadata.v3 import ArrayV3Metadata, DataType +from zarr.core.metadata.v3 import DataType from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.storage import LocalStore, MemoryStore @@ -897,27 +894,6 @@ async def test_nbytes( assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize -def _get_partitioning( - data: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], -) -> tuple[tuple[int, ...], tuple[int, ...] | None]: - """ - Get the shard shape and chunk shape of an array. If the array is not sharded, the shard shape - will be None. - """ - - shard_shape: tuple[int, ...] 
| None - chunk_shape: tuple[int, ...] - codecs = data.codec_pipeline - if isinstance(codecs, BatchedCodecPipeline): - if isinstance(codecs.array_bytes_codec, ShardingCodec): - chunk_shape = codecs.array_bytes_codec.chunk_shape - shard_shape = data.chunks - else: - chunk_shape = data.chunks - shard_shape = None - return chunk_shape, shard_shape - - @pytest.mark.parametrize( ("array_shape", "chunk_shape"), [((256,), (2,))], From d407e5d25f582a448da162919282bc6c9b776350 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 29 Dec 2024 20:38:44 +0100 Subject: [PATCH 59/85] tests and types --- src/zarr/core/array.py | 4 +++- tests/test_array.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 22fac94a74..76e865eb7c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3621,8 +3621,9 @@ async def create_array( raise ValueError(msg) filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( - compressor=compressors, filters=filters, dtype=dtype_parsed + compressor=compressors, filters=filters, dtype=np.dtype(dtype) ) + print(dtype_parsed) if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") if order is None: @@ -3788,6 +3789,7 @@ def _parse_chunk_encoding_v2( msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." raise TypeError(msg) _compressor = parse_compressor(compressor) + if filters == "auto": _filters = default_filters else: diff --git a/tests/test_array.py b/tests/test_array.py index f21e69cc27..e7b870bf89 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -976,7 +976,7 @@ async def test_create_array_v3_compressors( filters=(), compressors=compressors, dtype=np.dtype(dtype) ) # TODO: find a better way to get the compressors from the array. 
- assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[union-attr, attr-defined] + assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] @pytest.mark.parametrize("store", ["memory"], indirect=True) @@ -1028,7 +1028,7 @@ async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam filters=filters, compressors=(), dtype=np.dtype(dtype) ) # TODO: find a better way to get the filters from the array. - assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[union-attr, attr-defined] + assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] @pytest.mark.parametrize("store", ["memory"], indirect=True) From 1301c5f71c31e5952f6c1505f824319206b96908 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 29 Dec 2024 20:44:04 +0100 Subject: [PATCH 60/85] rm print --- src/zarr/core/array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 76e865eb7c..a1bd65cf07 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3623,7 +3623,6 @@ async def create_array( filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, dtype=np.dtype(dtype) ) - print(dtype_parsed) if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") if order is None: From 31b3ad46fdd963e171b955884aa5f0f8de71ec74 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 29 Dec 2024 20:55:31 +0100 Subject: [PATCH 61/85] types --- src/zarr/core/group.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 60f8b6a1e4..c7a3d333ba 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -53,7 +53,6 @@ from collections.abc import AsyncGenerator, Generator, Iterable, Iterator from typing import Any - from zarr.abc.codec import Codec 
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingParams @@ -2203,8 +2202,8 @@ def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | None = None, - filters: Iterable[dict[str, JSON] | Codec] | Literal["auto"] = "auto", - compressors: Iterable[dict[str, JSON] | Codec] | Codec | Literal["auto"] = "auto", + filters: FiltersParam = "auto", + compressors: CompressorsParam = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, From 43b6774c59506080639845caac9440b1776ce60d Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 29 Dec 2024 21:06:41 +0100 Subject: [PATCH 62/85] resolve cyclic import --- src/zarr/registry.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 28595ff534..4775799807 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -5,13 +5,18 @@ from importlib.metadata import entry_points as get_entry_points from typing import TYPE_CHECKING, Any, Generic, TypeVar -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec from zarr.core.config import BadConfigError, config if TYPE_CHECKING: from importlib.metadata import EntryPoint - from zarr.abc.codec import Codec, CodecPipeline + from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BytesBytesCodec, + Codec, + CodecPipeline, + ) from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON @@ -167,6 +172,8 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. 
""" + from zarr.abc.codec import BytesBytesCodec + if isinstance(data, dict): result = _resolve_codec(data) if not isinstance(result, BytesBytesCodec): @@ -185,6 +192,8 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | ArrayBytesCodec) -> ArrayBy If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. """ + from zarr.abc.codec import ArrayBytesCodec + if isinstance(data, dict): result = _resolve_codec(data) if not isinstance(result, ArrayBytesCodec): @@ -201,6 +210,8 @@ def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. """ + from zarr.abc.codec import ArrayArrayCodec + if isinstance(data, dict): result = _resolve_codec(data) if not isinstance(result, ArrayArrayCodec): From e55023afe67f8539aa04e73e20457b2ea5c48110 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 30 Dec 2024 13:49:13 +0100 Subject: [PATCH 63/85] add create_array to async and sync API --- src/zarr/api/asynchronous.py | 3 ++- src/zarr/api/synchronous.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index aaaef235c0..769c66d4cb 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,7 +9,7 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.core.array import Array, AsyncArray, get_array_metadata +from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import NDArrayLike from zarr.core.common import ( @@ -49,6 +49,7 @@ "copy_all", "copy_store", "create", + "create_array", "empty", "empty_like", "full", diff --git 
a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index f15513715a..461d64765f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -31,6 +31,7 @@ "copy_all", "copy_store", "create", + "create_array", "empty", "empty_like", "full", From e24bdeb18039bc912b487c2081195165d14b36fc Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 30 Dec 2024 14:03:00 +0100 Subject: [PATCH 64/85] docs for create_array --- src/zarr/core/array.py | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a1bd65cf07..8490a1bce6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -487,6 +487,8 @@ async def create( Whether to raise an error if the store already exists (default is False). data : npt.ArrayLike, optional The data to be inserted into the array (default is None). + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration for the array. Returns ------- @@ -3548,36 +3550,60 @@ async def create_array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunks : ChunkCoords + chunks : ChunkCoords, optional Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. + For Zarr v3, a "filter" is a codec that takes an array and returns an array, and these values must be instances of ``ArrayArrayCodec``, or dict representations of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. 
+ For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the the order if your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). + For Zarr v3, a "compressor" is a codec that takes a bytestrea, and - returns another bytestream. - For Zarr v2, a "compressor" can be any numcodecs codec. + returns another bytestream. Multiple compressors my be provided for Zarr v3. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. + + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. + If no ``compressors`` are provided, a default compressor will be used. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional - Memory layout of the array. + The memory of the array (default is "C"). + For Zarr v2, this parameter sets the memory order of the array. + For Zarr v3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory + order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'config': 'C'}``. + If no ``order`` is provided, a default order will be used. + This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. zarr_format : {2, 3}, optional The zarr format to use when saving. 
attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding to use. + A specification of how the chunk keys are represented in storage. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. dimension_names : Iterable[str], optional - Dimension names for the array. + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. From b564ae68c7e9258d77718181b663e8c492e2c5cc Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 31 Dec 2024 13:13:32 +0100 Subject: [PATCH 65/85] rename (Async)Array.create to _create --- src/zarr/api/asynchronous.py | 4 +- src/zarr/core/array.py | 99 +++++++++++-------- src/zarr/core/chunk_grids.py | 6 +- tests/test_array.py | 70 +++++++------- tests/test_buffer.py | 39 +++----- tests/test_codecs/test_blosc.py | 17 ++-- tests/test_codecs/test_codecs.py | 70 ++++++-------- tests/test_codecs/test_endian.py | 4 +- tests/test_codecs/test_gzip.py | 10 +- tests/test_codecs/test_sharding.py | 145 ++++++++++------------------ tests/test_codecs/test_transpose.py | 43 +++------ tests/test_codecs/test_vlen.py | 23 ++--- tests/test_codecs/test_zstd.py | 10 +- tests/test_config.py | 15 +-- tests/test_group.py | 2 +- tests/test_indexing.py | 8 +- tests/test_v2.py | 8 +- 17 files changed, 258 insertions(+), 315 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 769c66d4cb..53cf1db11d 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -425,7 +425,7 @@ async def save_array( shape = arr.shape chunks = getattr(arr, "chunks", None) # for array-likes with chunks attribute overwrite = kwargs.pop("overwrite", None) or _infer_overwrite(mode) - 
new = await AsyncArray.create( + new = await AsyncArray._create( store_path, zarr_format=zarr_format, shape=shape, @@ -1041,7 +1041,7 @@ async def create( config_parsed = ArrayConfig.from_dict(config_dict) - return await AsyncArray.create( + return await AsyncArray._create( store_path, shape=shape, chunks=chunks, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8490a1bce6..721c5c2dba 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -7,7 +7,7 @@ from dataclasses import dataclass, field from itertools import starmap from logging import getLogger -from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, cast, overload +from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypedDict, cast, overload from warnings import warn import numcodecs @@ -109,8 +109,10 @@ from typing import Self from zarr.abc.codec import CodecPipeline + from zarr.codecs.sharding import ShardingCodecIndexLocation from zarr.core.group import AsyncGroup + # Array and AsyncArray are defined in the base ``zarr`` namespace __all__ = ["create_codec_pipeline", "parse_array_metadata"] @@ -271,7 +273,7 @@ def __init__( # this overload defines the function signature when zarr_format is 2 @overload @classmethod - async def create( + async def _create( cls, store: StoreLike, *, @@ -295,7 +297,7 @@ async def create( # this overload defines the function signature when zarr_format is 3 @overload @classmethod - async def create( + async def _create( cls, store: StoreLike, *, @@ -323,7 +325,7 @@ async def create( @overload @classmethod - async def create( + async def _create( cls, store: StoreLike, *, @@ -350,7 +352,7 @@ async def create( ) -> AsyncArray[ArrayV3Metadata]: ... @overload @classmethod - async def create( + async def _create( cls, store: StoreLike, *, @@ -383,7 +385,7 @@ async def create( ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... 
@classmethod - async def create( + async def _create( cls, store: StoreLike, *, @@ -494,19 +496,6 @@ async def create( ------- AsyncArray The created asynchronous array instance. - - Examples - -------- - >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') - >>> async_arr = await zarr.core.array.AsyncArray.create( - >>> store=store, - >>> shape=(100,100), - >>> chunks=(10,10), - >>> dtype='i4', - >>> fill_value=0) - - """ store_path = await make_store_path(store) @@ -1148,7 +1137,7 @@ async def getitem( -------- >>> import zarr >>> store = zarr.storage.MemoryStore(mode='w') - >>> async_arr = await zarr.core.array.AsyncArray.create( + >>> async_arr = await zarr.api.asynchronous.create_array( ... store=store, ... shape=(100,100), ... chunks=(10,10), @@ -1542,7 +1531,7 @@ class Array: @classmethod @_deprecate_positional_args - def create( + def _create( cls, store: StoreLike, *, @@ -1643,7 +1632,7 @@ def create( Array created from the store. """ async_array = sync( - AsyncArray.create( + AsyncArray._create( store=store, shape=shape, dtype=dtype, @@ -2025,10 +2014,10 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: >>> import zarr >>> import numpy as np >>> data = np.arange(100, dtype="uint16") - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(10,), + >>> chunks=(10,), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2059,10 +2048,10 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: Setup a 2-dimensional array:: >>> data = np.arange(100, dtype="uint16").reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(10, 10), + >>> chunks=(10, 10), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2290,10 +2279,10 @@ def get_basic_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100, dtype="uint16") - >>> z = Array.create( + >>> z = 
zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3,), + >>> chunks=(3,), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2319,10 +2308,10 @@ def get_basic_selection( Setup a 3-dimensional array:: >>> data = np.arange(1000).reshape(10, 10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(5, 5, 5), + >>> chunks=(5, 5, 5), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2514,10 +2503,10 @@ def get_orthogonal_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100).reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=data.shape, + >>> chunks=data.shape, >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2748,10 +2737,10 @@ def get_mask_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(100).reshape(10, 10) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=data.shape, + >>> chunks=data.shape, >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -2908,10 +2897,10 @@ def get_coordinate_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3, 3), + >>> chunks=(3, 3), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -3096,10 +3085,10 @@ def get_block_selection( >>> import zarr >>> import numpy as np >>> data = np.arange(0, 100, dtype="uint16").reshape((10, 10)) - >>> z = Array.create( + >>> z = zarr.create_array( >>> StorePath(MemoryStore(mode="w")), >>> shape=data.shape, - >>> chunk_shape=(3, 3), + >>> chunks=(3, 3), >>> dtype=data.dtype, >>> ) >>> z[:] = data @@ -3517,6 +3506,14 @@ def _get_default_codecs( ) +class ShardsConfigParam(TypedDict): + 
shape: ChunkCoords + index_location: ShardingCodecIndexLocation | None + + +ShardsParam: TypeAlias = ChunkCoords | ShardsConfigParam | Literal["auto"] + + async def create_array( store: str | StoreLike, *, @@ -3524,7 +3521,7 @@ async def create_array( shape: ShapeLike, dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", - shards: ChunkCoords | Literal["auto"] | None = None, + shards: ShardsParam | None = None, filters: FiltersParam = "auto", compressors: CompressorsParam = "auto", fill_value: Any | None = 0, @@ -3616,13 +3613,24 @@ async def create_array( ------- z : array The array. + + Examples + -------- + >>> import zarr + >>> store = zarr.storage.MemoryStore(mode='w') + >>> async_arr = await zarr.api.asynchronous.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='i4', + >>> fill_value=0) + """ if zarr_format is None: zarr_format = _default_zarr_version() - # TODO: figure out why putting these imports at top-level causes circular imports - from zarr.codecs.sharding import ShardingCodec + from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation mode: Literal["a"] = "a" dtype_parsed = parse_dtype(dtype, zarr_format=zarr_format) @@ -3677,7 +3685,14 @@ async def create_array( sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes)) codecs_out: tuple[Codec, ...] 
if shard_shape_parsed is not None: - sharding_codec = ShardingCodec(chunk_shape=chunk_shape_parsed, codecs=sub_codecs) + index_location = None + if isinstance(shards, dict): + index_location = ShardingCodecIndexLocation(shards.get("index_location", None)) + if index_location is None: + index_location = ShardingCodecIndexLocation.end + sharding_codec = ShardingCodec( + chunk_shape=chunk_shape_parsed, codecs=sub_codecs, index_location=index_location + ) sharding_codec.validate( shape=chunk_shape_parsed, dtype=dtype_parsed, diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 96c73ed2ff..f9fbf8d4c8 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -27,6 +27,8 @@ from collections.abc import Iterator from typing import Self + from zarr.core.array import ShardsParam + def _guess_chunks( shape: ShapeLike, @@ -201,7 +203,7 @@ def _auto_partition( *, array_shape: tuple[int, ...], chunk_shape: tuple[int, ...] | Literal["auto"], - shard_shape: tuple[int, ...] | Literal["auto"] | None, + shard_shape: ShardsParam | None, dtype: np.dtype[Any], ) -> tuple[tuple[int, ...] 
| None, tuple[int, ...]]: """ @@ -241,6 +243,8 @@ def _auto_partition( _shards_out += (c_shape * 2,) else: _shards_out += (c_shape,) + elif isinstance(shard_shape, dict): + _shards_out = tuple(shard_shape["shape"]) else: _shards_out = shard_shape diff --git a/tests/test_array.py b/tests/test_array.py index e6a9b7adf0..6c44ead91c 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -69,7 +69,7 @@ def test_array_creation_existing_node( if overwrite: if not store.supports_deletes: pytest.skip("store does not support deletes") - arr_new = Array.create( + arr_new = zarr.create_array( spath / "extant", shape=new_shape, dtype=new_dtype, @@ -80,7 +80,7 @@ def test_array_creation_existing_node( assert arr_new.dtype == new_dtype else: with pytest.raises(expected_exception): - arr_new = Array.create( + arr_new = zarr.create_array( spath / "extant", shape=new_shape, dtype=new_dtype, @@ -134,7 +134,9 @@ async def test_create_creates_parents( def test_array_name_properties_no_group( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: - arr = Array.create(store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4") + arr = zarr.create_array( + store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" + ) assert arr.path == "" assert arr.name == "/" assert arr.basename == "" @@ -172,17 +174,17 @@ def test_array_v3_fill_value_default( shape = (10,) default_fill_value = 0 if specifiy_fill_value: - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=dtype_str, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=None, ) else: - arr = Array.create( - store=store, shape=shape, dtype=dtype_str, zarr_format=3, chunk_shape=shape + arr = zarr.create_array( + store=store, shape=shape, dtype=dtype_str, zarr_format=3, chunks=shape ) assert arr.fill_value == np.dtype(dtype_str).type(default_fill_value) @@ -196,12 +198,12 @@ def test_array_v3_fill_value_default( ) def 
test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str) -> None: shape = (10,) - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=dtype_str, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=fill_value, ) @@ -212,12 +214,12 @@ def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str def test_create_positional_args_deprecated() -> None: store = MemoryStore() with pytest.warns(FutureWarning, match="Pass"): - Array.create(store, (2, 2), dtype="f8") + zarr.create_array(store, (2, 2), dtype="f8") def test_selection_positional_args_deprecated() -> None: store = MemoryStore() - arr = Array.create(store, shape=(2, 2), dtype="f8") + arr = zarr.create_array(store, shape=(2, 2), dtype="f8") with pytest.warns(FutureWarning, match="Pass out"): arr.get_basic_selection(..., NDBuffer(array=np.empty((2, 2)))) @@ -253,12 +255,12 @@ def test_selection_positional_args_deprecated() -> None: @pytest.mark.parametrize("store", ["memory"], indirect=True) async def test_array_v3_nan_fill_value(store: MemoryStore) -> None: shape = (10,) - arr = Array.create( + arr = zarr.create_array( store=store, shape=shape, dtype=np.float64, zarr_format=3, - chunk_shape=shape, + chunks=shape, fill_value=np.nan, ) arr[:] = np.nan @@ -274,7 +276,7 @@ async def test_array_v3_nan_fill_value(store: MemoryStore) -> None: async def test_serializable_async_array( store: LocalStore | MemoryStore, zarr_format: ZarrFormat ) -> None: - expected = await AsyncArray.create( + expected = await zarr.api.asynchronous.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" ) # await expected.setitems(list(range(100))) @@ -290,7 +292,7 @@ async def test_serializable_async_array( @pytest.mark.parametrize("store", ["local"], indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) def test_serializable_sync_array(store: LocalStore, zarr_format: ZarrFormat) -> None: - expected = Array.create( + 
expected = zarr.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" ) expected[:] = list(range(100)) @@ -331,7 +333,7 @@ def test_nchunks(test_cls: type[Array] | type[AsyncArray[Any]], nchunks: int) -> """ store = MemoryStore() shape = 100 - arr = Array.create(store, shape=(shape,), chunks=(ceildiv(shape, nchunks),), dtype="i4") + arr = zarr.create_array(store, shape=(shape,), chunks=(ceildiv(shape, nchunks),), dtype="i4") expected = nchunks if test_cls == Array: observed = arr.nchunks @@ -346,7 +348,7 @@ async def test_nchunks_initialized(test_cls: type[Array] | type[AsyncArray[Any]] Test that nchunks_initialized accurately returns the number of stored chunks. """ store = MemoryStore() - arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") # write chunks one at a time for idx, region in enumerate(arr._iter_chunk_regions()): @@ -374,7 +376,7 @@ async def test_chunks_initialized() -> None: Test that chunks_initialized accurately returns the keys of stored chunks. 
""" store = MemoryStore() - arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + arr = zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4") chunks_accumulated = tuple( accumulate(tuple(tuple(v.split(" ")) for v in arr._iter_chunk_keys())) @@ -413,34 +415,34 @@ async def test_nbytes_stored_async() -> None: def test_default_fill_values() -> None: - a = Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): - Array.create(MemoryStore(), shape=5, chunk_shape=5, dtype=" None: def test_update_attrs(zarr_format: int) -> None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 store = MemoryStore() - arr = Array.create(store=store, shape=5, chunk_shape=5, dtype="f8", zarr_format=zarr_format) + arr = zarr.create_array(store=store, shape=5, chunks=5, dtype="f8", zarr_format=zarr_format) arr.attrs["foo"] = "bar" assert arr.attrs["foo"] == "bar" @@ -768,7 +770,7 @@ def test_array_create_metadata_order_v2( keyword argument to ``Array.create``. When ``order`` is ``None``, the value of the ``array.order`` config is used. 
""" - arr = Array.create(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") + arr = zarr.create_array(store=store, shape=(2, 2), order=order, zarr_format=2, dtype="i4") expected = order or zarr.config.get("array.order") assert arr.metadata.order == expected # type: ignore[union-attr] @@ -792,7 +794,7 @@ def test_array_create_order( config = {"order": order_config} expected = order_config - arr = Array.create( + arr = zarr.create_array( store=store, shape=(2, 2), zarr_format=zarr_format, dtype="i4", config=config ) @@ -812,7 +814,7 @@ def test_write_empty_chunks_config(write_empty_chunks: bool) -> None: explicitly """ with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): - arr = Array.create({}, shape=(2, 2), dtype="i4") + arr = zarr.create_array({}, shape=(2, 2), dtype="i4") assert arr._async_array._config.write_empty_chunks == write_empty_chunks @@ -832,13 +834,13 @@ def test_write_empty_chunks_behavior( already present. """ - arr = Array.create( + arr = zarr.create_array( store=store, shape=(2,), zarr_format=zarr_format, dtype="i4", fill_value=fill_value, - chunk_shape=(1,), + chunks=(1,), config={"write_empty_chunks": write_empty_chunks}, ) @@ -869,7 +871,7 @@ def test_write_empty_chunks_behavior( ) async def test_special_complex_fill_values_roundtrip(fill_value: Any, expected: list[Any]) -> None: store = MemoryStore() - Array.create(store=store, shape=(1,), dtype=np.complex64, fill_value=fill_value) + zarr.create_array(store=store, shape=(1,), dtype=np.complex64, fill_value=fill_value) content = await store.get("zarr.json", prototype=default_buffer_prototype()) assert content is not None actual = json.loads(content.to_bytes()) @@ -887,7 +889,7 @@ async def test_nbytes( the chunks of that array. 
""" store = MemoryStore() - arr = Array.create(store=store, shape=shape, dtype=dtype, fill_value=0) + arr = zarr.create_array(store=store, shape=shape, dtype=dtype, fill_value=0) if array_type == "async": assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize else: diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 7a275516c6..e3cab0f214 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -5,9 +5,8 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.codecs.blosc import BloscCodec -from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.transpose import TransposeCodec @@ -47,10 +46,10 @@ async def test_async_array_prototype() -> None: """Test the use of a custom buffer prototype""" expect = np.zeros((9, 9), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(StoreExpectingTestBuffer()) / "test_async_array_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, ) @@ -76,10 +75,10 @@ async def test_async_array_gpu_prototype() -> None: """Test the use of the GPU buffer prototype""" expect = cp.zeros((9, 9), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(MemoryStore()) / "test_async_array_gpu_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, ) @@ -98,20 +97,14 @@ async def test_async_array_gpu_prototype() -> None: @pytest.mark.asyncio async def test_codecs_use_of_prototype() -> None: expect = np.zeros((10, 10), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(StoreExpectingTestBuffer()) / "test_codecs_use_of_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, 
- codecs=[ - TransposeCodec(order=(1, 0)), - BytesCodec(), - BloscCodec(), - Crc32cCodec(), - GzipCodec(), - ZstdCodec(), - ], + compressors=[BloscCodec(), Crc32cCodec(), GzipCodec(), ZstdCodec()], + filters=[TransposeCodec(order=(1, 0))], ) expect[:] = np.arange(100).reshape(10, 10) @@ -133,20 +126,14 @@ async def test_codecs_use_of_prototype() -> None: @pytest.mark.asyncio async def test_codecs_use_of_gpu_prototype() -> None: expect = cp.zeros((10, 10), dtype="uint16", order="F") - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( StorePath(MemoryStore()) / "test_codecs_use_of_gpu_prototype", shape=expect.shape, - chunk_shape=(5, 5), + chunks=(5, 5), dtype=expect.dtype, fill_value=0, - codecs=[ - TransposeCodec(order=(1, 0)), - BytesCodec(), - BloscCodec(), - Crc32cCodec(), - GzipCodec(), - ZstdCodec(), - ], + compressors=[BloscCodec(), Crc32cCodec(), GzipCodec(), ZstdCodec()], + filters=[TransposeCodec(order=(1, 0))], ) expect[:] = cp.arange(100).reshape(10, 10) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 416a2f784e..34044d7d62 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -3,9 +3,9 @@ import numpy as np import pytest -from zarr import AsyncArray +import zarr from zarr.abc.store import Store -from zarr.codecs import BloscCodec, BytesCodec, ShardingCodec +from zarr.codecs import BloscCodec from zarr.core.buffer import default_buffer_prototype from zarr.storage.common import StorePath @@ -16,13 +16,13 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: typesize = np.dtype(dtype).itemsize path = "blosc_evolve" spath = StorePath(store, path) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), dtype=dtype, fill_value=0, - codecs=[BytesCodec(), BloscCodec()], + compressors=BloscCodec(), ) buf = await store.get(f"{path}/zarr.json", 
prototype=default_buffer_prototype()) assert buf is not None @@ -36,13 +36,14 @@ async def test_blosc_evolve(store: Store, dtype: str) -> None: path2 = "blosc_evolve_sharding" spath2 = StorePath(store, path2) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath2, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), + shards=(16, 16), dtype=dtype, fill_value=0, - codecs=[ShardingCodec(chunk_shape=(16, 16), codecs=[BytesCodec(), BloscCodec()])], + compressors=BloscCodec(), ) buf = await store.get(f"{path2}/zarr.json", prototype=default_buffer_prototype()) assert buf is not None diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index 2025e72937..fe771579ff 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -7,6 +7,9 @@ import numpy as np import pytest +import zarr +import zarr.api +import zarr.api.asynchronous from zarr import Array, AsyncArray, config from zarr.codecs import ( BytesCodec, @@ -19,7 +22,6 @@ from zarr.storage import StorePath if TYPE_CHECKING: - from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.core.buffer.core import NDArrayLike from zarr.core.common import MemoryOrder @@ -75,26 +77,17 @@ async def test_order( data = np.arange(0, 256, dtype="uint16").reshape((32, 8), order=input_order) path = "order" spath = StorePath(store, path=path) - codecs_: list[Codec] = ( - [ - ShardingCodec( - chunk_shape=(16, 8), - codecs=[TransposeCodec(order=order_from_dim(store_order, data.ndim)), BytesCodec()], - ) - ] - if with_sharding - else [TransposeCodec(order=order_from_dim(store_order, data.ndim)), BytesCodec()] - ) with config.set({"array.order": runtime_write_order}): - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(32, 8), + chunks=(16, 8) if with_sharding else (32, 8), + shards=(32, 8) if with_sharding else None, dtype=data.dtype, fill_value=0, - 
chunk_key_encoding=("v2", "."), - codecs=codecs_, + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=order_from_dim(store_order, data.ndim))], ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -131,16 +124,15 @@ def test_order_implicit( data = np.arange(0, 256, dtype="uint16").reshape((16, 16), order=input_order) path = "order_implicit" spath = StorePath(store, path) - codecs_: list[Codec] | None = [ShardingCodec(chunk_shape=(8, 8))] if with_sharding else None with config.set({"array.order": runtime_write_order}): - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(8, 8) if with_sharding else (16, 16), + shards=(16, 16) if with_sharding else None, dtype=data.dtype, fill_value=0, - codecs=codecs_, ) a[:, :] = data @@ -161,10 +153,10 @@ def test_order_implicit( @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_open(store: Store) -> None: spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(16, 16), dtype="int32", fill_value=0, ) @@ -228,10 +220,10 @@ def test_morton2(shape) -> None: def test_write_partial_chunks(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=(20, 20), + chunks=(20, 20), dtype=data.dtype, fill_value=1, ) @@ -244,10 +236,10 @@ async def test_delete_empty_chunks(store: Store) -> None: data = np.ones((16, 16)) path = "delete_empty_chunks" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(32, 32), + chunks=(32, 32), dtype=data.dtype, fill_value=1, ) @@ -262,25 +254,25 @@ async def test_dimension_names(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) path = "dimension_names" spath = 
StorePath(store, path) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, dimension_names=("x", "y"), ) - assert (await AsyncArray.open(spath)).metadata.dimension_names == ( + assert (await zarr.api.asynchronous.open_array(store=spath)).metadata.dimension_names == ( "x", "y", ) path2 = "dimension_names2" spath2 = StorePath(store, path2) - await AsyncArray.create( + await zarr.api.asynchronous.create_array( spath2, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, ) @@ -295,7 +287,7 @@ async def test_dimension_names(store: Store) -> None: def test_invalid_metadata(store: Store) -> None: spath2 = StorePath(store, "invalid_endian") with pytest.raises(TypeError): - Array.create( + Array._create( spath2, shape=(16, 16), chunk_shape=(16, 16), @@ -308,7 +300,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath3 = StorePath(store, "invalid_order") with pytest.raises(TypeError): - Array.create( + Array._create( spath3, shape=(16, 16), chunk_shape=(16, 16), @@ -321,7 +313,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath4 = StorePath(store, "invalid_missing_bytes_codec") with pytest.raises(ValueError): - Array.create( + Array._create( spath4, shape=(16, 16), chunk_shape=(16, 16), @@ -333,7 +325,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath5 = StorePath(store, "invalid_inner_chunk_shape") with pytest.raises(ValueError): - Array.create( + Array._create( spath5, shape=(16, 16), chunk_shape=(16, 16), @@ -345,7 +337,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath6 = StorePath(store, "invalid_inner_chunk_shape") with pytest.raises(ValueError): - Array.create( + Array._create( spath6, shape=(16, 16), chunk_shape=(16, 16), @@ -357,7 +349,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath7 = StorePath(store, "warning_inefficient_codecs") with 
pytest.warns(UserWarning): - Array.create( + Array._create( spath7, shape=(16, 16), chunk_shape=(16, 16), @@ -375,12 +367,12 @@ async def test_resize(store: Store) -> None: data = np.zeros((16, 18), dtype="uint16") path = "resize" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(10, 10), + chunks=(10, 10), dtype=data.dtype, - chunk_key_encoding=("v2", "."), + chunk_key_encoding={"name": "v2", "separator": "."}, fill_value=1, ) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index db4e77451c..97b20cc50b 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -17,7 +17,7 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await AsyncArray._create( spath, shape=data.shape, chunk_shape=(16, 16), @@ -43,7 +43,7 @@ async def test_endian_write( data = np.arange(0, 256, dtype=dtype_input_endian).reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await AsyncArray._create( spath, shape=data.shape, chunk_shape=(16, 16), diff --git a/tests/test_codecs/test_gzip.py b/tests/test_codecs/test_gzip.py index 7b4d231813..f47f9710b1 100644 --- a/tests/test_codecs/test_gzip.py +++ b/tests/test_codecs/test_gzip.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from zarr import Array +import zarr from zarr.abc.store import Store -from zarr.codecs import BytesCodec, GzipCodec +from zarr.codecs import GzipCodec from zarr.storage.common import StorePath @@ -11,13 +11,13 @@ def test_gzip(store: Store) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( StorePath(store), shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), 
dtype=data.dtype, fill_value=0, - codecs=[BytesCodec(), GzipCodec()], + compressors=GzipCodec(), ) a[:, :] = data diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 51c82067f3..0edf625433 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -5,11 +5,13 @@ import numpy.typing as npt import pytest -from zarr import Array, AsyncArray +import zarr +import zarr.api +import zarr.api.asynchronous +from zarr import Array from zarr.abc.store import Store from zarr.codecs import ( BloscCodec, - BytesCodec, ShardingCodec, ShardingCodecIndexLocation, TransposeCodec, @@ -45,23 +47,16 @@ def test_sharding( """ data = array_fixture spath = StorePath(store) - arr = Array.create( + + arr = zarr.create_array( spath, shape=tuple(s + offset for s in data.shape), - chunk_shape=(64,) * data.ndim, + chunks=(32,) * data.ndim, + shards={"shape": (64,) * data.ndim, "index_location": index_location}, dtype=data.dtype, fill_value=6, - codecs=[ - ShardingCodec( - chunk_shape=(32,) * data.ndim, - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], + compressors=BloscCodec(cname="lz4"), ) write_region = tuple(slice(offset, None) for dim in range(data.ndim)) arr[write_region] = data @@ -89,23 +84,15 @@ def test_sharding_partial( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - 
BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) a[10:, 10:, 10:] = data @@ -132,19 +119,15 @@ def test_sharding_partial_readwrite( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=data.shape, + chunks=(1, data.shape[1], data.shape[2]), + shards={"shape": data.shape, "index_location": index_location}, dtype=data.dtype, fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(1, data.shape[1], data.shape[2]), - codecs=[BytesCodec()], - index_location=index_location, - ) - ], + filters=None, + compressors=None, ) a[:] = data @@ -168,23 +151,15 @@ def test_sharding_partial_read( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) read_data = a[0:10, 0:10, 0:10] @@ -205,23 +180,15 @@ def test_sharding_partial_overwrite( ) -> None: data = array_fixture[:10, :10, :10] spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=tuple(a + 10 for a in data.shape), - chunk_shape=(64, 64, 64), + chunks=(32, 32, 32), + shards={"shape": (64, 64, 64), "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + filters=[TransposeCodec(order=order_from_dim("F", data.ndim))], dtype=data.dtype, fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(32, 32, 32), - codecs=[ - TransposeCodec(order=order_from_dim("F", data.ndim)), - BytesCodec(), - 
BloscCodec(cname="lz4"), - ], - index_location=index_location, - ) - ], ) a[:10, :10, :10] = data @@ -259,7 +226,7 @@ def test_nested_sharding( ) -> None: data = array_fixture spath = StorePath(store) - a = Array.create( + a = Array._create( spath, shape=data.shape, chunk_shape=(64, 64, 64), @@ -287,22 +254,15 @@ def test_nested_sharding( def test_open_sharding(store: Store) -> None: path = "open_sharding" spath = StorePath(store, path) - a = Array.create( + a = zarr.create_array( spath, shape=(16, 16), - chunk_shape=(16, 16), + chunks=(8, 8), + shards=(16, 16), + filters=[TransposeCodec(order=order_from_dim("F", 2))], + compressors=BloscCodec(), dtype="int32", fill_value=0, - codecs=[ - ShardingCodec( - chunk_shape=(8, 8), - codecs=[ - TransposeCodec(order=order_from_dim("F", 2)), - BytesCodec(), - BloscCodec(), - ], - ) - ], ) b = Array.open(spath) assert a.metadata == b.metadata @@ -312,21 +272,14 @@ def test_open_sharding(store: Store) -> None: def test_write_partial_sharded_chunks(store: Store) -> None: data = np.arange(0, 16 * 16, dtype="uint16").reshape((16, 16)) spath = StorePath(store) - a = Array.create( + a = zarr.create_array( spath, shape=(40, 40), - chunk_shape=(20, 20), + chunks=(10, 10), + shards=(20, 20), dtype=data.dtype, + compressors=BloscCodec(), fill_value=1, - codecs=[ - ShardingCodec( - chunk_shape=(10, 10), - codecs=[ - BytesCodec(), - BloscCodec(), - ], - ) - ], ) a[0:16, 0:16] = data assert np.array_equal(a[0:16, 0:16], data) @@ -338,13 +291,13 @@ async def test_delete_empty_shards(store: Store) -> None: pytest.skip("store does not support deletes") path = "delete_empty_shards" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(8, 16), + chunks=(8, 8), + shards=(16, 8), dtype="uint16", fill_value=1, - codecs=[ShardingCodec(chunk_shape=(8, 8))], ) await _AsyncArrayProxy(a)[:, :].set(np.zeros((16, 16))) await _AsyncArrayProxy(a)[8:, 
:].set(np.ones((8, 16))) @@ -380,13 +333,13 @@ async def test_sharding_with_empty_inner_chunk( path = f"sharding_with_empty_inner_chunk_{index_location}" spath = StorePath(store, path) - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=(16, 16), - chunk_shape=(8, 8), + chunks=(4, 4), + shards={"shape": (8, 8), "index_location": index_location}, dtype="uint32", fill_value=fill_value, - codecs=[ShardingCodec(chunk_shape=(4, 4), index_location=index_location)], ) data[:4, :4] = fill_value await a.setitem(..., data) @@ -411,13 +364,13 @@ async def test_sharding_with_chunks_per_shard( path = f"test_sharding_with_chunks_per_shard_{index_location}" spath = StorePath(store, path) - a = Array.create( + a = zarr.create_array( spath, shape=shape, - chunk_shape=shape, + chunks=chunk_shape, + shards={"shape": shape, "index_location": index_location}, dtype="int32", fill_value=fill_value, - codecs=[ShardingCodec(chunk_shape=chunk_shape, index_location=index_location)], ) a[...] = data data_read = a[...] 
diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 2b3914150e..06ed07f6c1 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -1,19 +1,15 @@ -from typing import TYPE_CHECKING - import numpy as np import pytest -from zarr import Array, AsyncArray, config +import zarr +from zarr import AsyncArray, config from zarr.abc.store import Store -from zarr.codecs import BytesCodec, ShardingCodec, TransposeCodec +from zarr.codecs import TransposeCodec from zarr.core.common import MemoryOrder from zarr.storage.common import StorePath from .test_codecs import _AsyncArrayProxy -if TYPE_CHECKING: - from zarr.abc.codec import Codec - @pytest.mark.parametrize("input_order", ["F", "C"]) @pytest.mark.parametrize("runtime_write_order", ["F", "C"]) @@ -29,25 +25,16 @@ async def test_transpose( ) -> None: data = np.arange(0, 256, dtype="uint16").reshape((1, 32, 8), order=input_order) spath = StorePath(store, path="transpose") - codecs_: list[Codec] = ( - [ - ShardingCodec( - chunk_shape=(1, 16, 8), - codecs=[TransposeCodec(order=(2, 1, 0)), BytesCodec()], - ) - ] - if with_sharding - else [TransposeCodec(order=(2, 1, 0)), BytesCodec()] - ) with config.set({"array.order": runtime_write_order}): - a = await AsyncArray.create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(1, 32, 8), + chunks=(1, 16, 8) if with_sharding else (1, 32, 8), + shards=(1, 32, 8) if with_sharding else None, dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=codecs_, + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=(2, 1, 0))], ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -75,13 +62,13 @@ def test_transpose_non_self_inverse(store: Store, order: list[int]) -> None: shape = [i + 3 for i in range(len(order))] data = np.arange(0, np.prod(shape), dtype="uint16").reshape(shape) spath = StorePath(store, "transpose_non_self_inverse") - a 
= Array.create( + a = zarr.create_array( spath, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value=0, - codecs=[TransposeCodec(order=order), BytesCodec()], + filters=[TransposeCodec(order=order)], ) a[:, :] = data read_data = a[:, :] @@ -96,12 +83,12 @@ def test_transpose_invalid( spath = StorePath(store, "transpose_invalid") for order in [(1, 0), (3, 2, 1), (3, 3, 1)]: with pytest.raises(ValueError): - Array.create( + zarr.create_array( spath, shape=data.shape, - chunk_shape=(1, 32, 8), + chunks=(1, 32, 8), dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[TransposeCodec(order=order), BytesCodec()], + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=order)], ) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index 05b2e25267..f4ee135601 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -3,10 +3,11 @@ import numpy as np import pytest +import zarr from zarr import Array from zarr.abc.codec import Codec from zarr.abc.store import Store -from zarr.codecs import VLenBytesCodec, VLenUTF8Codec, ZstdCodec +from zarr.codecs import ZstdCodec from zarr.core.metadata.v3 import ArrayV3Metadata, DataType from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING from zarr.storage.common import StorePath @@ -23,21 +24,21 @@ @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("dtype", numpy_str_dtypes) @pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("codecs", [None, [VLenUTF8Codec()], [VLenUTF8Codec(), ZstdCodec()]]) +@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) def test_vlen_string( - store: Store, dtype: np.dtype[Any] | None, as_object_array: bool, codecs: list[Codec] | None + store: Store, dtype: np.dtype[Any] | None, as_object_array: bool, compressor: Codec | None ) -> None: strings = ["hello", "world", "this", 
"is", "a", "test"] data = np.array(strings, dtype=dtype).reshape((2, 3)) sp = StorePath(store, path="string") - a = Array.create( + a = zarr.create_array( sp, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value="", - codecs=codecs, + compressors=compressor, ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy @@ -61,20 +62,20 @@ def test_vlen_string( @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("codecs", [None, [VLenBytesCodec()], [VLenBytesCodec(), ZstdCodec()]]) -def test_vlen_bytes(store: Store, as_object_array: bool, codecs: list[Codec] | None) -> None: +@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) +def test_vlen_bytes(store: Store, as_object_array: bool, compressor: Codec | None) -> None: bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] data = np.array(bstrings).reshape((2, 3)) assert data.dtype == "|S5" sp = StorePath(store, path="string") - a = Array.create( + a = zarr.create_array( sp, shape=data.shape, - chunk_shape=data.shape, + chunks=data.shape, dtype=data.dtype, fill_value=b"", - codecs=codecs, + compressors=compressor, ) assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 29efc29466..a57476fb61 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -1,9 +1,9 @@ import numpy as np import pytest -from zarr import Array +import zarr from zarr.abc.store import Store -from zarr.codecs import BytesCodec, ZstdCodec +from zarr.codecs import ZstdCodec from zarr.storage.common import StorePath @@ -12,13 +12,13 @@ def test_zstd(store: Store, checksum: bool) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( StorePath(store, path="zstd"), shape=data.shape, - chunk_shape=(16, 16), + 
chunks=(16, 16), dtype=data.dtype, fill_value=0, - codecs=[BytesCodec(), ZstdCodec(level=0, checksum=checksum)], + compressors=ZstdCodec(level=0, checksum=checksum), ) a[:, :] = data diff --git a/tests/test_config.py b/tests/test_config.py index 7ac416b393..4befb2dae2 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,7 +8,8 @@ import pytest import zarr -from zarr import Array, AsyncArray, zeros +import zarr.api +from zarr import zeros from zarr.abc.codec import Codec, CodecInput, CodecOutput, CodecPipeline from zarr.abc.store import ByteSetter, Store from zarr.codecs import ( @@ -153,7 +154,7 @@ async def write( assert get_pipeline_class() == MockCodecPipeline # test if codec is used - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), @@ -198,13 +199,13 @@ async def _encode_single( assert get_codec_class("blosc") == MockBloscCodec # test if codec is used - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), zarr_format=3, dtype="i4", - codecs=[BytesCodec(), {"name": "blosc", "configuration": {}}], + compressors=[{"name": "blosc", "configuration": {}}], ) arr[:] = range(100) _mock.call.assert_called() @@ -227,7 +228,7 @@ def test_config_ndbuffer_implementation(store: Store) -> None: register_ndbuffer(NDBufferUsingTestNDArrayLike) with config.set({"ndbuffer": fully_qualified_name(NDBufferUsingTestNDArrayLike)}): assert get_ndbuffer_class() == NDBufferUsingTestNDArrayLike - arr = Array.create( + arr = zarr.create_array( store=store, shape=(100,), chunks=(10,), @@ -328,9 +329,9 @@ async def test_default_codecs(dtype: str, expected_codecs: list[Codec]) -> None: } } ): - arr = await AsyncArray.create( + arr = await zarr.api.asynchronous.create_array( shape=(100,), - chunk_shape=(100,), + chunks=(100,), dtype=np.dtype(dtype), zarr_format=3, store=MemoryStore(), diff --git a/tests/test_group.py b/tests/test_group.py index 67232fd948..e9fea3ebed 100644 --- a/tests/test_group.py +++ 
b/tests/test_group.py @@ -784,7 +784,7 @@ async def test_asyncgroup_create( ) # create an array at our target path collision_name = "foo" - _ = await AsyncArray.create( + _ = await zarr.api.asynchronous.create_array( spath / collision_name, shape=(10,), dtype="uint8", zarr_format=zarr_format ) with pytest.raises(ContainsArrayError): diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 04eb53e364..fc83af695b 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -47,12 +47,12 @@ def zarr_array_from_numpy_array( a: npt.NDArray[Any], chunk_shape: ChunkCoords | None = None, ) -> zarr.Array: - z = zarr.Array.create( + z = zarr.create_array( store=store / str(uuid4()), shape=a.shape, dtype=a.dtype, - chunk_shape=chunk_shape or a.shape, - chunk_key_encoding=("v2", "."), + chunks=chunk_shape or a.shape, + chunk_key_encoding={"name": "v2", "separator": "."}, ) z[()] = a return z @@ -1933,7 +1933,7 @@ def test_indexing_with_zarr_array(store: StorePath) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("shape", [(0, 2, 3), (0), (3, 0)]) def test_zero_sized_chunks(store: StorePath, shape: list[int]) -> None: - z = Array.create(store=store, shape=shape, chunk_shape=shape, zarr_format=3, dtype="f8") + z = zarr.create_array(store=store, shape=shape, chunks=shape, zarr_format=3, dtype="f8") z[...] 
= 42 assert_array_equal(z[...], np.zeros(shape, dtype="f8")) diff --git a/tests/test_v2.py b/tests/test_v2.py index 74b8a654fb..72127f4ede 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -11,7 +11,7 @@ import zarr import zarr.core.buffer import zarr.storage -from zarr import Array, config +from zarr import config from zarr.storage import MemoryStore, StorePath @@ -23,7 +23,7 @@ async def store() -> Iterator[StorePath]: def test_simple(store: StorePath) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - a = Array.create( + a = zarr.create_array( store / "simple_v2", zarr_format=2, shape=data.shape, @@ -167,7 +167,7 @@ def test_v2_filters_codecs(filters: Any, order: Literal["C", "F"]) -> None: @pytest.mark.parametrize("array_order", ["C", "F"]) @pytest.mark.parametrize("data_order", ["C", "F"]) def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal["C", "F"]) -> None: - arr = zarr.Array.create( + arr = zarr.create_array( MemoryStore({}), shape=(10, 8), chunks=(3, 3), @@ -187,7 +187,7 @@ def test_v2_non_contiguous(array_order: Literal["C", "F"], data_order: Literal[" arr[slice(6, 9, None), slice(3, 6, None)], a[slice(6, 9, None), slice(3, 6, None)] ) - arr = zarr.Array.create( + arr = zarr.create_array( MemoryStore({}), shape=(10, 8), chunks=(3, 3), From 75b2197b2b7939a5966dcf3c3f403f949b7c76b2 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 31 Dec 2024 14:20:41 +0100 Subject: [PATCH 66/85] adds array_bytes_codec kwarg --- src/zarr/core/array.py | 48 +++++++++++++++++++++++------- src/zarr/core/metadata/v2.py | 7 +++++ src/zarr/registry.py | 2 +- tests/test_array.py | 48 ++++++++++++++++++++++++++---- tests/test_codecs/test_endian.py | 17 ++++++----- tests/test_codecs/test_sharding.py | 2 +- 6 files changed, 99 insertions(+), 25 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 721c5c2dba..f9bd6fdbcf 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -97,6 +97,7 
@@ from zarr.errors import MetadataValidationError from zarr.registry import ( _parse_array_array_codec, + _parse_array_bytes_codec, _parse_bytes_bytes_codec, _resolve_codec, get_pipeline_class, @@ -385,6 +386,7 @@ async def _create( ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @classmethod + # @deprecated("Use `zarr.api.asynchronous.create_array` instead.") async def _create( cls, store: StoreLike, @@ -417,6 +419,7 @@ async def _create( config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ + Deprecated in favor of `zarr.api.asynchronous.create_array`. Method to create a new asynchronous array instance. Parameters @@ -677,10 +680,10 @@ async def _create_v2( dimension_separator = "." dtype = parse_dtype(dtype, zarr_format=2) - if not filters: - filters = _default_filters(dtype) - if not compressor: - compressor = _default_compressor(dtype) + # if not filters: + # filters = _default_filters(dtype) + # if not compressor: + # compressor = _default_compressor(dtype) # inject VLenUTF8 for str dtype if not already present if np.issubdtype(dtype, np.str_): @@ -1530,6 +1533,7 @@ class Array: _async_array: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @classmethod + # @deprecated("Use `zarr.create_array` instead.") @_deprecate_positional_args def _create( cls, @@ -1561,7 +1565,8 @@ def _create( overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, ) -> Array: - """Creates a new Array instance from an initialized store. + """Deprecated in favor of `zarr.create_array`. + Creates a new Array instance from an initialized store. 
Parameters ---------- @@ -3504,6 +3509,7 @@ def _get_default_codecs( | numcodecs.abc.Codec | Literal["auto"] ) +ArrayBytesCodecParam: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] class ShardsConfigParam(TypedDict): @@ -3524,6 +3530,7 @@ async def create_array( shards: ShardsParam | None = None, filters: FiltersParam = "auto", compressors: CompressorsParam = "auto", + array_bytes_codec: ArrayBytesCodecParam | None = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, @@ -3580,6 +3587,10 @@ async def create_array( For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. If no ``compressors`` are provided, a default compressor will be used. These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. + array_bytes_codec : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + If no ``array_bytes_codec`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional @@ -3680,7 +3691,10 @@ async def create_array( ) else: array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( - compressors=compressors, filters=filters, dtype=dtype_parsed + compressors=compressors, + filters=filters, + array_bytes_codec=array_bytes_codec, + dtype=dtype_parsed, ) sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes)) codecs_out: tuple[Codec, ...] 
@@ -3825,7 +3839,11 @@ def _parse_chunk_encoding_v2( if compressor == "auto": _compressor = default_compressor else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): + if ( + isinstance(compressor, Iterable) + and not isinstance(compressor, dict) + and len(compressor) > 1 + ): msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." raise TypeError(msg) _compressor = parse_compressor(compressor) @@ -3846,8 +3864,9 @@ def _parse_chunk_encoding_v2( def _parse_chunk_encoding_v3( *, - compressors: CompressorsParam, - filters: FiltersParam, + compressors: CompressorsParam | None, + filters: FiltersParam | None, + array_bytes_codec: ArrayBytesCodecParam | None, dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ @@ -3864,6 +3883,8 @@ def _parse_chunk_encoding_v3( else: if isinstance(compressors, dict | Codec): maybe_bytes_bytes = (compressors,) + elif compressors is None: + maybe_bytes_bytes = () else: maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) @@ -3874,8 +3895,15 @@ def _parse_chunk_encoding_v3( else: if isinstance(filters, dict | Codec): maybe_array_array = (filters,) + elif filters is None: + maybe_array_array = () else: maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters) out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) - return out_array_array, default_array_bytes, out_bytes_bytes + if array_bytes_codec == "auto": + out_array_bytes = default_array_bytes + else: + out_array_bytes = _parse_array_bytes_codec(array_bytes_codec) + + return out_array_array, out_array_bytes, out_bytes_bytes diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 0292d9551b..386222a8e7 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -232,6 +232,8 @@ def parse_filters(data: object) -> 
tuple[numcodecs.abc.Codec, ...] | None: if data is None: return data if isinstance(data, Iterable): + if len(data) == 0: + return None for idx, val in enumerate(data): if isinstance(val, numcodecs.abc.Codec): out.append(val) @@ -249,6 +251,11 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None: """ Parse a potential compressor. """ + if isinstance(data, Iterable) and not isinstance(data, dict): + if len(data) == 0: + data = None + else: + data = data[0] if data is None or isinstance(data, numcodecs.abc.Codec): return data if isinstance(data, dict): diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 4775799807..16baa31c4c 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -186,7 +186,7 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: return result -def _parse_array_bytes_codec(data: dict[str, JSON] | ArrayBytesCodec) -> ArrayBytesCodec: +def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. 
If the input is a dict, it diff --git a/tests/test_array.py b/tests/test_array.py index 6c44ead91c..d0f061e379 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -12,9 +12,14 @@ import zarr.api.asynchronous from zarr import Array, AsyncArray, Group -from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec -from zarr.codecs.gzip import GzipCodec -from zarr.codecs.transpose import TransposeCodec +from zarr.codecs import ( + BytesCodec, + GzipCodec, + TransposeCodec, + VLenBytesCodec, + VLenUTF8Codec, + ZstdCodec, +) from zarr.core._info import ArrayInfo from zarr.core.array import ( CompressorsParam, @@ -975,12 +980,45 @@ async def test_create_array_v3_compressors( compressors=compressors, ) _, _, bb_codecs_expected = _parse_chunk_encoding_v3( - filters=(), compressors=compressors, dtype=np.dtype(dtype) + filters=(), compressors=compressors, array_bytes_codec="auto", dtype=np.dtype(dtype) ) # TODO: find a better way to get the compressors from the array. assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: str) -> None: + """ + Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked. 
+ """ + + # v2 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=(), + filters=(), + ) + assert arr.metadata.filters == None # type: ignore[union-attr] + assert arr.metadata.compressor == None # type: ignore[union-attr] + + # v3 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + compressors=(), + filters=(), + ) + if dtype == "str": + assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr] + else: + assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr] + + @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize( "filters", @@ -1027,7 +1065,7 @@ async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam filters=filters, ) aa_codecs_expected, _, _ = _parse_chunk_encoding_v3( - filters=filters, compressors=(), dtype=np.dtype(dtype) + filters=filters, compressors=(), array_bytes_codec="auto", dtype=np.dtype(dtype) ) # TODO: find a better way to get the filters from the array. 
assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index 97b20cc50b..8b3645095a 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -4,6 +4,7 @@ import pytest from zarr import AsyncArray +import zarr from zarr.abc.store import Store from zarr.codecs import BytesCodec from zarr.storage.common import StorePath @@ -17,14 +18,14 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray._create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype=data.dtype, fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[BytesCodec(endian=endian)], + chunk_key_encoding={"name": "v2", "separator": "."}, + array_bytes_codec=BytesCodec(endian=endian), ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -43,14 +44,14 @@ async def test_endian_write( data = np.arange(0, 256, dtype=dtype_input_endian).reshape((16, 16)) path = "endian" spath = StorePath(store, path) - a = await AsyncArray._create( + a = await zarr.api.asynchronous.create_array( spath, shape=data.shape, - chunk_shape=(16, 16), + chunks=(16, 16), dtype="uint16", fill_value=0, - chunk_key_encoding=("v2", "."), - codecs=[BytesCodec(endian=dtype_store_endian)], + chunk_key_encoding={"name": "v2", "separator": "."}, + array_bytes_codec=BytesCodec(endian=dtype_store_endian), ) await _AsyncArrayProxy(a)[:, :].set(data) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 0edf625433..c7691d6921 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -295,7 +295,7 @@ async def test_delete_empty_shards(store: Store) -> None: spath, shape=(16, 16), chunks=(8, 8), - shards=(16, 8), + 
shards=(8, 16), dtype="uint16", fill_value=1, ) From 2f6f8a0569ff45f678cf722f2a4c390256573152 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 31 Dec 2024 14:37:34 +0100 Subject: [PATCH 67/85] tests --- src/zarr/core/array.py | 27 +++++++++++++-------------- src/zarr/core/metadata/v2.py | 7 ------- src/zarr/registry.py | 2 ++ tests/test_array.py | 17 ++++++++++++++--- tests/test_codecs/test_endian.py | 1 - tests/test_codecs/test_sharding.py | 2 ++ 6 files changed, 31 insertions(+), 25 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f9bd6fdbcf..25f128601f 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -680,10 +680,10 @@ async def _create_v2( dimension_separator = "." dtype = parse_dtype(dtype, zarr_format=2) - # if not filters: - # filters = _default_filters(dtype) - # if not compressor: - # compressor = _default_compressor(dtype) + if not filters: + filters = _default_filters(dtype) + if not compressor: + compressor = _default_compressor(dtype) # inject VLenUTF8 for str dtype if not already present if np.issubdtype(dtype, np.str_): @@ -3528,10 +3528,10 @@ async def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ShardsParam | None = None, - filters: FiltersParam = "auto", + filters: FiltersParam | None = "auto", compressors: CompressorsParam = "auto", - array_bytes_codec: ArrayBytesCodecParam | None = "auto", - fill_value: Any | None = 0, + array_bytes_codec: ArrayBytesCodecParam = "auto", + fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, @@ -3665,6 +3665,9 @@ async def create_array( ) raise ValueError(msg) + if array_bytes_codec != "auto": + raise ValueError("Zarr v2 arrays do not support `array_bytes_codec`.") + filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, dtype=np.dtype(dtype) ) @@ -3825,7 +3828,7 @@ def 
_get_default_chunk_encoding_v2( def _parse_chunk_encoding_v2( *, compressor: CompressorsParam, - filters: FiltersParam, + filters: FiltersParam | None, dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ @@ -3839,11 +3842,7 @@ def _parse_chunk_encoding_v2( if compressor == "auto": _compressor = default_compressor else: - if ( - isinstance(compressor, Iterable) - and not isinstance(compressor, dict) - and len(compressor) > 1 - ): + if isinstance(compressor, Iterable) and not isinstance(compressor, dict): msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." raise TypeError(msg) _compressor = parse_compressor(compressor) @@ -3866,7 +3865,7 @@ def _parse_chunk_encoding_v3( *, compressors: CompressorsParam | None, filters: FiltersParam | None, - array_bytes_codec: ArrayBytesCodecParam | None, + array_bytes_codec: ArrayBytesCodecParam, dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 386222a8e7..0292d9551b 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -232,8 +232,6 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: if data is None: return data if isinstance(data, Iterable): - if len(data) == 0: - return None for idx, val in enumerate(data): if isinstance(val, numcodecs.abc.Codec): out.append(val) @@ -251,11 +249,6 @@ def parse_compressor(data: object) -> numcodecs.abc.Codec | None: """ Parse a potential compressor. 
""" - if isinstance(data, Iterable) and not isinstance(data, dict): - if len(data) == 0: - data = None - else: - data = data[0] if data is None or isinstance(data, numcodecs.abc.Codec): return data if isinstance(data, dict): diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 16baa31c4c..704db3f704 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -200,6 +200,8 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) else: + if not isinstance(data, ArrayBytesCodec): + raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") result = data return result diff --git a/tests/test_array.py b/tests/test_array.py index d0f061e379..5546bb8ae6 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -219,7 +219,7 @@ def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str def test_create_positional_args_deprecated() -> None: store = MemoryStore() with pytest.warns(FutureWarning, match="Pass"): - zarr.create_array(store, (2, 2), dtype="f8") + zarr.Array._create(store, (2, 2), dtype="f8") def test_selection_positional_args_deprecated() -> None: @@ -994,6 +994,17 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st """ # v2 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=None, + filters=None, + ) + assert arr.metadata.filters is None # type: ignore[union-attr] + assert arr.metadata.compressor is None # type: ignore[union-attr] + arr = await create_array( store=store, dtype=dtype, @@ -1002,8 +1013,8 @@ async def test_create_array_no_filters_compressors(store: MemoryStore, dtype: st compressors=(), filters=(), ) - assert arr.metadata.filters == None # type: ignore[union-attr] - assert arr.metadata.compressor == None # type: ignore[union-attr] + assert 
arr.metadata.filters == () # type: ignore[union-attr] + assert arr.metadata.compressor is None # type: ignore[union-attr] # v3 arr = await create_array( diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index 8b3645095a..f4d85683b2 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -3,7 +3,6 @@ import numpy as np import pytest -from zarr import AsyncArray import zarr from zarr.abc.store import Store from zarr.codecs import BytesCodec diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index c7691d6921..344d6a4b65 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -297,8 +297,10 @@ async def test_delete_empty_shards(store: Store) -> None: chunks=(8, 8), shards=(8, 16), dtype="uint16", + compressors=None, fill_value=1, ) + print(a.metadata.to_dict()) await _AsyncArrayProxy(a)[:, :].set(np.zeros((16, 16))) await _AsyncArrayProxy(a)[8:, :].set(np.ones((8, 16))) await _AsyncArrayProxy(a)[:, 8:].set(np.ones((16, 8))) From c4330ef3778270e7384d0e14e5402af21426f203 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 31 Dec 2024 14:47:18 +0100 Subject: [PATCH 68/85] tests for no filters+compressors --- src/zarr/core/array.py | 2 +- tests/test_array.py | 61 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8490a1bce6..1522aa87e8 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3527,7 +3527,7 @@ async def create_array( shards: ChunkCoords | Literal["auto"] | None = None, filters: FiltersParam = "auto", compressors: CompressorsParam = "auto", - fill_value: Any | None = 0, + fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, diff --git a/tests/test_array.py b/tests/test_array.py index e6a9b7adf0..19ef9362b7 100644 
--- a/tests/test_array.py +++ b/tests/test_array.py @@ -12,9 +12,14 @@ import zarr.api.asynchronous from zarr import Array, AsyncArray, Group -from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec -from zarr.codecs.gzip import GzipCodec -from zarr.codecs.transpose import TransposeCodec +from zarr.codecs import ( + BytesCodec, + GzipCodec, + TransposeCodec, + VLenBytesCodec, + VLenUTF8Codec, + ZstdCodec, +) from zarr.core._info import ArrayInfo from zarr.core.array import ( CompressorsParam, @@ -946,6 +951,56 @@ def test_chunks_and_shards() -> None: assert arr_v2.shards is None +def test_create_array_default_fill_values() -> None: + a = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype=" None: + """ + Test that the default ``filters`` and ``compressors`` are removed when ``create_array`` is invoked. + """ + + # v2 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=empty_value, + filters=empty_value, + ) + assert arr.metadata.filters == empty_value # type: ignore[union-attr] + assert arr.metadata.compressor is None # type: ignore[union-attr] + + # v3 + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + compressors=empty_value, + filters=empty_value, + ) + if dtype == "str": + assert arr.metadata.codecs == [VLenUTF8Codec()] # type: ignore[union-attr] + else: + assert arr.metadata.codecs == [BytesCodec()] # type: ignore[union-attr] + + @pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize( "compressors", From 95ffadd6e7420e7403b2de1f4c44343f50901ee5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 1 Jan 2025 14:24:16 +0100 Subject: [PATCH 69/85] widen type of FiltersParam to include single numcodecs codec instances --- src/zarr/core/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1522aa87e8..a7af56c83a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ 
-3507,6 +3507,7 @@ def _get_default_codecs( Iterable[dict[str, JSON] | ArrayArrayCodec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] + | numcodecs.abc.Codec | Literal["auto"] ) CompressorsParam: TypeAlias = ( From bbe3a94c4295e4c77f81e0152708dbd4d9592c09 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 1 Jan 2025 20:39:07 +0100 Subject: [PATCH 70/85] don't alias None to default codecs in _create_v2 --- src/zarr/core/array.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a7af56c83a..ba08bc9ee0 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -688,10 +688,6 @@ async def _create_v2( dimension_separator = "." dtype = parse_dtype(dtype, zarr_format=2) - if not filters: - filters = _default_filters(dtype) - if not compressor: - compressor = _default_compressor(dtype) # inject VLenUTF8 for str dtype if not already present if np.issubdtype(dtype, np.str_): From 856b40f921b67af93903e7d0aaeea00652d36fbc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 1 Jan 2025 20:41:58 +0100 Subject: [PATCH 71/85] allow single codec instances for filters, and None for filters / compressor, and condense some tests --- src/zarr/core/array.py | 41 +++++++++++++------- src/zarr/core/metadata/v2.py | 3 ++ tests/test_array.py | 75 ++++++++++++++++++++++-------------- 3 files changed, 78 insertions(+), 41 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ba08bc9ee0..6e29a9d142 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3505,12 +3505,14 @@ def _get_default_codecs( | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec | Literal["auto"] + | None ) CompressorsParam: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] + | None ) @@ -3646,6 +3648,7 @@ async def create_array( filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, 
dtype=np.dtype(dtype) ) + if dimension_names is not None: raise ValueError("Zarr v2 arrays do not support dimension names.") if order is None: @@ -3801,10 +3804,12 @@ def _parse_chunk_encoding_v2( """ default_filters, default_compressor = _get_default_chunk_encoding_v2(dtype) - _filters: tuple[numcodecs.abc.Codec, ...] | None = None - _compressor: numcodecs.abc.Codec | None = None + _filters: tuple[numcodecs.abc.Codec, ...] | None + _compressor: numcodecs.abc.Codec | None - if compressor == "auto": + if compressor is None: + _compressor = None + elif compressor == "auto": _compressor = default_compressor else: if isinstance(compressor, Iterable) and not isinstance(compressor, dict): @@ -3812,15 +3817,19 @@ def _parse_chunk_encoding_v2( raise TypeError(msg) _compressor = parse_compressor(compressor) - if filters == "auto": + if filters is None: + _filters = None + elif filters == "auto": _filters = default_filters else: - if isinstance(filters, Iterable) and not all( - isinstance(f, numcodecs.abc.Codec) for f in filters - ): - raise TypeError( - "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs." - ) + if isinstance(filters, Iterable): + for idx, f in enumerate(filters): + if not isinstance(f, numcodecs.abc.Codec): + msg = ( + "For Zarr v2 arrays, all elements of `filters` must be numcodecs codecs. " + f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." + ) + raise TypeError(msg) _filters = parse_filters(filters) return _filters, _compressor @@ -3840,9 +3849,13 @@ def _parse_chunk_encoding_v3( ) maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] maybe_array_array: Iterable[Codec | dict[str, JSON]] + out_bytes_bytes: tuple[BytesBytesCodec, ...] 
+ if compressors is None: + out_bytes_bytes = () - if compressors == "auto": + elif compressors == "auto": out_bytes_bytes = default_bytes_bytes + else: if isinstance(compressors, dict | Codec): maybe_bytes_bytes = (compressors,) @@ -3850,8 +3863,10 @@ def _parse_chunk_encoding_v3( maybe_bytes_bytes = cast(Iterable[Codec | dict[str, JSON]], compressors) out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) - - if filters == "auto": + out_array_array: tuple[ArrayArrayCodec, ...] + if filters is None: + out_array_array = () + elif filters == "auto": out_array_array = default_array_array else: if isinstance(filters, dict | Codec): diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 0292d9551b..bc7fd32cbf 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -241,6 +241,9 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) return tuple(out) + # take a single codec instance and wrap it in a tuple + if isinstance(data, numcodecs.abc.Codec): + return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." 
raise TypeError(msg) diff --git a/tests/test_array.py b/tests/test_array.py index 19ef9362b7..b8987603aa 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -26,6 +26,7 @@ FiltersParam, _get_default_chunk_encoding_v2, _get_default_chunk_encoding_v3, + _parse_chunk_encoding_v2, _parse_chunk_encoding_v3, chunks_initialized, create_array, @@ -1002,10 +1003,13 @@ async def test_create_array_no_filters_compressors( @pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) @pytest.mark.parametrize( "compressors", [ "auto", + None, + (), (ZstdCodec(level=3),), (ZstdCodec(level=3), GzipCodec(level=0)), ZstdCodec(level=3), @@ -1013,32 +1017,12 @@ async def test_create_array_no_filters_compressors( ({"name": "zstd", "configuration": {"level": 3}},), ], ) -async def test_create_array_v3_compressors( - store: MemoryStore, compressors: CompressorsParam -) -> None: - """ - Test various possibilities for the compressors parameter to create_array - """ - dtype = "uint8" - arr = await create_array( - store=store, - dtype=dtype, - shape=(10,), - zarr_format=3, - compressors=compressors, - ) - _, _, bb_codecs_expected = _parse_chunk_encoding_v3( - filters=(), compressors=compressors, dtype=np.dtype(dtype) - ) - # TODO: find a better way to get the compressors from the array. 
- assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] - - -@pytest.mark.parametrize("store", ["memory"], indirect=True) @pytest.mark.parametrize( "filters", [ "auto", + None, + (), ( TransposeCodec( order=[ @@ -1067,23 +1051,58 @@ async def test_create_array_v3_compressors( ({"name": "transpose", "configuration": {"order": [0]}},), ], ) -async def test_create_array_v3_filters(store: MemoryStore, filters: FiltersParam) -> None: +async def test_create_array_v3_chunk_encoding( + store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str +) -> None: """ - Test various possibilities for the filters parameter to create_array + Test various possibilities for the compressors and filters parameter to create_array """ - dtype = "uint8" arr = await create_array( store=store, dtype=dtype, shape=(10,), zarr_format=3, filters=filters, + compressors=compressors, ) - aa_codecs_expected, _, _ = _parse_chunk_encoding_v3( - filters=filters, compressors=(), dtype=np.dtype(dtype) + aa_codecs_expected, _, bb_codecs_expected = _parse_chunk_encoding_v3( + filters=filters, compressors=compressors, dtype=np.dtype(dtype) ) - # TODO: find a better way to get the filters from the array. + # TODO: find a better way to get the filters / compressors from the array. 
assert arr.codec_pipeline.array_array_codecs == aa_codecs_expected # type: ignore[attr-defined] + assert arr.codec_pipeline.bytes_bytes_codecs == bb_codecs_expected # type: ignore[attr-defined] + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("dtype", ["uint8", "float32", "str"]) +@pytest.mark.parametrize( + "compressors", + [ + "auto", + None, + numcodecs.Zstd(level=3), + ], +) +@pytest.mark.parametrize( + "filters", ["auto", None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] +) +async def test_create_array_v2_chunk_encoding( + store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str +) -> None: + arr = await create_array( + store=store, + dtype=dtype, + shape=(10,), + zarr_format=2, + compressors=compressors, + filters=filters, + ) + filters_expected, compressor_expected = _parse_chunk_encoding_v2( + filters=filters, compressor=compressors, dtype=np.dtype(dtype) + ) + # TODO: find a better way to get the filters/compressor from the array. + assert arr.metadata.compressor == compressor_expected # type: ignore[union-attr] + assert arr.metadata.filters == filters_expected # type: ignore[union-attr] @pytest.mark.parametrize("store", ["memory"], indirect=True) From 2aa3acc7417658af64c9fd2b8b0c02926e61b0da Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 10:18:09 +0100 Subject: [PATCH 72/85] add docstring for None --- src/zarr/core/array.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 6e29a9d142..7bd1927d77 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3563,12 +3563,16 @@ async def create_array( of ``ArrayArrayCodec``. If ``filters`` and ``compressors`` are not specified, then the default codecs for Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. 
+ These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, a default set of filters will be used. - These defaults can be changed by modifying the value of ``array.v2_default_filters`` in :mod:`zarr.core.config`. + These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. compressors : Iterable[Codec], optional List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified). @@ -3577,11 +3581,16 @@ async def create_array( returns another bytestream. Multiple compressors my be provided for Zarr v3. If ``filters`` and ``compressors`` are not specified, then the default codecs for Zarr v3 will be used. - These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. - For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr v2. + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr v2. If no ``compressors`` are provided, a default compressor will be used. - These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. + Use ``None`` to omit the default compressor. fill_value : Any, optional Fill value for the array. 
order : {"C", "F"}, optional From 9fb8a33211d3cb88b84892c5add6ad2ec08ca310 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 10:55:48 +0100 Subject: [PATCH 73/85] single-item tuple for compressors in v2 --- src/zarr/core/array.py | 4 +++- tests/test_array.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7bd1927d77..b5f880b77e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3816,10 +3816,12 @@ def _parse_chunk_encoding_v2( _filters: tuple[numcodecs.abc.Codec, ...] | None _compressor: numcodecs.abc.Codec | None - if compressor is None: + if compressor is None or compressor == (): _compressor = None elif compressor == "auto": _compressor = default_compressor + elif isinstance(compressor, tuple | list) and len(compressor) == 1: + _compressor = parse_compressor(compressor[0]) else: if isinstance(compressor, Iterable) and not isinstance(compressor, dict): msg = f"For Zarr v2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." 
diff --git a/tests/test_array.py b/tests/test_array.py index b8987603aa..c7b4e9a0ff 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -985,7 +985,9 @@ async def test_create_array_no_filters_compressors( compressors=empty_value, filters=empty_value, ) + # The v2 metadata stores None and () separately assert arr.metadata.filters == empty_value # type: ignore[union-attr] + # The v2 metadata does not allow tuple for compressor, therefore it is turned into None assert arr.metadata.compressor is None # type: ignore[union-attr] # v3 @@ -1081,6 +1083,8 @@ async def test_create_array_v3_chunk_encoding( "auto", None, numcodecs.Zstd(level=3), + (), + (numcodecs.Zstd(level=3),), ], ) @pytest.mark.parametrize( From 99faa8e83ed646e0e364015ec478a28f3851f1c5 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 10:56:14 +0100 Subject: [PATCH 74/85] Update src/zarr/core/array.py --- src/zarr/core/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b5f880b77e..96ffd5c363 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3500,7 +3500,7 @@ def _get_default_codecs( FiltersParam: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec] + Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] | numcodecs.abc.Codec From 947f20ef07ba395cbfd266539651f965e433187a Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 12:39:23 +0100 Subject: [PATCH 75/85] tweaks --- src/zarr/api/asynchronous.py | 2 +- src/zarr/core/array.py | 163 ++++++++++++++++++++++++++--- tests/test_array.py | 16 ++- tests/test_codecs/test_codecs.py | 67 ++++++++++-- tests/test_codecs/test_sharding.py | 49 ++++++++- 5 files changed, 268 insertions(+), 29 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 53cf1db11d..0908344a30 100644 --- a/src/zarr/api/asynchronous.py +++ 
b/src/zarr/api/asynchronous.py @@ -1220,7 +1220,7 @@ async def open_array( If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. **kwargs - Any keyword arguments to pass to ``create``. + Any keyword arguments to pass to :func:`create`. Returns ------- diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9a89902f2a..c0e71a6fd6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -7,12 +7,22 @@ from dataclasses import dataclass, field from itertools import starmap from logging import getLogger -from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypedDict, cast, overload +from typing import ( + TYPE_CHECKING, + Any, + Generic, + Literal, + TypeAlias, + TypedDict, + cast, + overload, +) from warnings import warn import numcodecs import numpy as np import numpy.typing as npt +from typing_extensions import deprecated from zarr._compat import _deprecate_positional_args from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec @@ -274,7 +284,7 @@ def __init__( # this overload defines the function signature when zarr_format is 2 @overload @classmethod - async def _create( + async def create( cls, store: StoreLike, *, @@ -298,7 +308,7 @@ async def _create( # this overload defines the function signature when zarr_format is 3 @overload @classmethod - async def _create( + async def create( cls, store: StoreLike, *, @@ -326,7 +336,7 @@ async def _create( @overload @classmethod - async def _create( + async def create( cls, store: StoreLike, *, @@ -351,9 +361,10 @@ async def _create( data: npt.ArrayLike | None = None, config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... + @overload @classmethod - async def _create( + async def create( cls, store: StoreLike, *, @@ -386,8 +397,9 @@ async def _create( ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... 
@classmethod - # @deprecated("Use `zarr.api.asynchronous.create_array` instead.") - async def _create( + @deprecated("Use zarr.api.asynchronous.create_array instead.") + @_deprecate_positional_args + async def create( cls, store: StoreLike, *, @@ -418,9 +430,7 @@ async def _create( data: npt.ArrayLike | None = None, config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """ - Deprecated in favor of `zarr.api.asynchronous.create_array`. - Method to create a new asynchronous array instance. + """Method to create a new asynchronous array instance. Parameters ---------- @@ -499,6 +509,70 @@ async def _create( ------- AsyncArray The created asynchronous array instance. + + .. deprecated:: 3.0.0 + Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. + """ + return await cls._create( + store, + # v2 and v3 + shape=shape, + dtype=dtype, + zarr_format=zarr_format, + fill_value=fill_value, + attributes=attributes, + # v3 only + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + # v2 only + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, + # runtime + overwrite=overwrite, + data=data, + config=config, + ) + + @classmethod + async def _create( + cls, + store: StoreLike, + *, + # v2 and v3 + shape: ShapeLike, + dtype: npt.DTypeLike, + zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ShapeLike | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ShapeLike | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: 
MemoryOrder | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime + overwrite: bool = False, + data: npt.ArrayLike | None = None, + config: ArrayConfig | ArrayConfigParams | None = None, + ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Method to create a new asynchronous array instance. + See :func:`AsyncArray.create` for more details. + Deprecated in favor of :func:`zarr.api.asynchronous.create_array`. """ store_path = await make_store_path(store) @@ -1529,9 +1603,9 @@ class Array: _async_array: AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata] @classmethod - # @deprecated("Use `zarr.create_array` instead.") + @deprecated("Use zarr.create_array instead.") @_deprecate_positional_args - def _create( + def create( cls, store: StoreLike, *, @@ -1561,8 +1635,7 @@ def _create( overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, ) -> Array: - """Deprecated in favor of `zarr.create_array`. - Creates a new Array instance from an initialized store. + """Creates a new Array instance from an initialized store. Parameters ---------- @@ -1631,6 +1704,68 @@ def _create( ------- Array Array created from the store. + + .. deprecated:: 3.0.0 + Deprecated in favor of :func:`zarr.create_array`. 
+ """ + return cls._create( + store, + # v2 and v3 + shape=shape, + dtype=dtype, + zarr_format=zarr_format, + attributes=attributes, + fill_value=fill_value, + # v3 only + chunk_shape=chunk_shape, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + # v2 only + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, + # runtime + overwrite=overwrite, + config=config, + ) + + @classmethod + def _create( + cls, + store: StoreLike, + *, + # v2 and v3 + shape: ChunkCoords, + dtype: npt.DTypeLike, + zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ChunkCoords | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: MemoryOrder | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime + overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, + ) -> Array: + """Creates a new Array instance from an initialized store. + See :func:`Array.create` for more details. + Deprecated in favor of :func:`zarr.create_array`. 
""" async_array = sync( AsyncArray._create( diff --git a/tests/test_array.py b/tests/test_array.py index ab48e108bc..9c159f2acf 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -220,7 +220,7 @@ def test_array_v3_fill_value(store: MemoryStore, fill_value: int, dtype_str: str def test_create_positional_args_deprecated() -> None: store = MemoryStore() with pytest.warns(FutureWarning, match="Pass"): - zarr.Array._create(store, (2, 2), dtype="f8") + zarr.Array.create(store, (2, 2), dtype="f8") def test_selection_positional_args_deprecated() -> None: @@ -436,16 +436,16 @@ def test_default_fill_values() -> None: def test_vlen_errors() -> None: with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): - Array._create(MemoryStore(), shape=5, chunks=5, dtype=" None: codecs=[BytesCodec(), VLenBytesCodec()], ) + with pytest.raises( + ValueError, + match="For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `BytesCodec`.", + ): + zarr.create_array( + MemoryStore(), shape=5, chunks=5, dtype=" None: diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index fe771579ff..4e99d4b8ea 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -285,35 +285,35 @@ async def test_dimension_names(store: Store) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_invalid_metadata(store: Store) -> None: - spath2 = StorePath(store, "invalid_endian") + spath2 = StorePath(store, "invalid_codec_order") with pytest.raises(TypeError): - Array._create( + Array.create( spath2, shape=(16, 16), chunk_shape=(16, 16), dtype=np.dtype("uint8"), fill_value=0, codecs=[ - BytesCodec(endian="big"), + BytesCodec(), TransposeCodec(order=order_from_dim("F", 2)), ], ) spath3 = StorePath(store, "invalid_order") with pytest.raises(TypeError): - Array._create( + Array.create( spath3, shape=(16, 16), chunk_shape=(16, 16), dtype=np.dtype("uint8"), fill_value=0, codecs=[ - 
BytesCodec(), TransposeCodec(order="F"), # type: ignore[arg-type] + BytesCodec(), ], ) spath4 = StorePath(store, "invalid_missing_bytes_codec") with pytest.raises(ValueError): - Array._create( + Array.create( spath4, shape=(16, 16), chunk_shape=(16, 16), @@ -325,7 +325,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath5 = StorePath(store, "invalid_inner_chunk_shape") with pytest.raises(ValueError): - Array._create( + Array.create( spath5, shape=(16, 16), chunk_shape=(16, 16), @@ -337,7 +337,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath6 = StorePath(store, "invalid_inner_chunk_shape") with pytest.raises(ValueError): - Array._create( + Array.create( spath6, shape=(16, 16), chunk_shape=(16, 16), @@ -349,7 +349,7 @@ def test_invalid_metadata(store: Store) -> None: ) spath7 = StorePath(store, "warning_inefficient_codecs") with pytest.warns(UserWarning): - Array._create( + Array.create( spath7, shape=(16, 16), chunk_shape=(16, 16), @@ -362,6 +362,55 @@ def test_invalid_metadata(store: Store) -> None: ) +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_invalid_metadata_create_array(store: Store) -> None: + spath3 = StorePath(store, "invalid_order") + with pytest.raises(TypeError): + zarr.create_array( + spath3, + shape=(16, 16), + chunks=(16, 16), + dtype=np.dtype("uint8"), + fill_value=0, + filters=[ + TransposeCodec(order="F"), # type: ignore[arg-type] + ], + ) + spath5 = StorePath(store, "invalid_inner_chunk_shape") + with pytest.raises(ValueError): + zarr.create_array( + spath5, + shape=(16, 16), + shards=(16, 16), + chunks=(8,), + dtype=np.dtype("uint8"), + fill_value=0, + ) + spath6 = StorePath(store, "invalid_inner_chunk_shape") + with pytest.raises(ValueError): + zarr.create_array( + spath6, + shape=(16, 16), + shards=(16, 16), + chunks=(8, 7), + dtype=np.dtype("uint8"), + fill_value=0, + ) + spath7 = StorePath(store, "warning_inefficient_codecs") + with pytest.warns(UserWarning): + zarr.create_array( 
+ spath7, + shape=(16, 16), + chunks=(16, 16), + dtype=np.dtype("uint8"), + fill_value=0, + array_bytes_codec=ShardingCodec(chunk_shape=(8, 8)), + compressors=[ + GzipCodec(), + ], + ) + + @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) async def test_resize(store: Store) -> None: data = np.zeros((16, 18), dtype="uint16") diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 344d6a4b65..a9abf8ede1 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -226,7 +226,7 @@ def test_nested_sharding( ) -> None: data = array_fixture spath = StorePath(store) - a = Array._create( + a = Array.create( spath, shape=data.shape, chunk_shape=(64, 64, 64), @@ -250,6 +250,53 @@ def test_nested_sharding( assert np.array_equal(data, read_data) +@pytest.mark.parametrize( + "array_fixture", + [ + ArrayRequest(shape=(128,) * 3, dtype="uint16", order="F"), + ], + indirect=["array_fixture"], +) +@pytest.mark.parametrize( + "outer_index_location", + ["start", "end"], +) +@pytest.mark.parametrize( + "inner_index_location", + ["start", "end"], +) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_nested_sharding_create_array( + store: Store, + array_fixture: npt.NDArray[Any], + outer_index_location: ShardingCodecIndexLocation, + inner_index_location: ShardingCodecIndexLocation, +) -> None: + data = array_fixture + spath = StorePath(store) + a = zarr.create_array( + spath, + shape=data.shape, + chunks=(32, 32, 32), + dtype=data.dtype, + fill_value=0, + array_bytes_codec=ShardingCodec( + chunk_shape=(32, 32, 32), + codecs=[ShardingCodec(chunk_shape=(16, 16, 16), index_location=inner_index_location)], + index_location=outer_index_location, + ), + filters=None, + compressors=None, + ) + print(a.metadata.to_dict()) + + a[:, :, :] = data + + read_data = a[0 : data.shape[0], 0 : data.shape[1], 0 : data.shape[2]] + assert data.shape == read_data.shape + assert 
np.array_equal(data, read_data) + + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) def test_open_sharding(store: Store) -> None: path = "open_sharding" From 14c45cd534ac805bda1e44f3145a4e7f9f81617e Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 13:42:28 +0100 Subject: [PATCH 76/85] pr feedback 1 --- src/zarr/api/asynchronous.py | 22 +-- src/zarr/api/synchronous.py | 194 ++++++++++++++++++++++- src/zarr/core/array.py | 6 +- src/zarr/core/common.py | 4 +- src/zarr/core/config.py | 2 +- tests/test_config.py | 2 +- tests/test_metadata/test_consolidated.py | 2 +- 7 files changed, 210 insertions(+), 22 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 0908344a30..6bc7323ecb 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -18,7 +18,7 @@ ChunkCoords, MemoryOrder, ZarrFormat, - _default_zarr_version, + _default_zarr_format, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, parse_dtype, @@ -413,7 +413,7 @@ async def save_array( """ zarr_format = ( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) - or _default_zarr_version() + or _default_zarr_format() ) if not isinstance(arr, NDArrayLike): raise TypeError("arr argument must be numpy or other NDArrayLike array") @@ -473,7 +473,7 @@ async def save_group( zarr_version=zarr_version, zarr_format=zarr_format, ) - or _default_zarr_version() + or _default_zarr_format() ) for arg in args: @@ -653,7 +653,7 @@ async def group( try: return await AsyncGroup.open(store=store_path, zarr_format=zarr_format) except (KeyError, FileNotFoundError): - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await AsyncGroup.from_store( store=store_path, zarr_format=_zarr_format, @@ -684,20 +684,22 @@ async def create_group( creating the group. zarr_format : {2, 3, None}, optional The zarr format to use when saving. 
+ If no ``zarr_format`` is provided, the default format will be used. + This default can be changed by modifying the value of ``default_zarr_format`` + in :mod:`zarr.core.config`. storage_options : dict If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. Returns ------- - g : group + AsyncGroup The new group. """ if zarr_format is None: - zarr_format = _default_zarr_version() + zarr_format = _default_zarr_format() - # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise mode: Literal["a"] = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) @@ -812,7 +814,7 @@ async def open_group( pass if mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await AsyncGroup.from_store( store_path, zarr_format=_zarr_format, @@ -970,7 +972,7 @@ async def create( """ zarr_format = ( _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format) - or _default_zarr_version() + or _default_zarr_format() ) if zarr_format == 2: @@ -1243,7 +1245,7 @@ async def open_array( except FileNotFoundError: if not store_path.read_only and mode in _CREATE_MODES: overwrite = _infer_overwrite(mode) - _zarr_format = zarr_format or _default_zarr_version() + _zarr_format = zarr_format or _default_zarr_format() return await create( store=store_path, zarr_format=_zarr_format, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 461d64765f..427eff004f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -18,10 +18,23 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike + from zarr.core.array import ( + ArrayBytesCodecParam, + CompressorsParam, + FiltersParam, + ShardsParam, + ) from zarr.core.array_spec import ArrayConfig, 
ArrayConfigParams from zarr.core.buffer import NDArrayLike - from zarr.core.chunk_key_encodings import ChunkKeyEncoding - from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingParams + from zarr.core.common import ( + JSON, + AccessModeLiteral, + ChunkCoords, + MemoryOrder, + ShapeLike, + ZarrFormat, + ) from zarr.storage import StoreLike __all__ = [ @@ -534,6 +547,31 @@ def create_group( attributes: dict[str, Any] | None = None, storage_options: dict[str, Any] | None = None, ) -> Group: + """Create a group. + + Parameters + ---------- + store : Store or str + Store or path to directory in file system. + path : str, optional + Group path within store. + overwrite : bool, optional + If True, pre-existing data at ``path`` will be deleted before + creating the group. + zarr_format : {2, 3, None}, optional + The zarr format to use when saving. + If no ``zarr_format`` is provided, the default format will be used. + This default can be changed by modifying the value of ``default_zarr_format`` + in :mod:`zarr.core.config`. + storage_options : dict + If using an fsspec URL to create the store, these will be passed to + the backend implementation. Ignored otherwise. + + Returns + ------- + Group + The new group. 
+ """ return Group( sync( async_api.create_group( @@ -700,8 +738,156 @@ def create( ) -def create_array(*args: Any, **kwargs: Any) -> Array: - return Array(sync(zarr.core.array.create_array(*args, **kwargs))) +def create_array( + store: str | StoreLike, + *, + name: str | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsParam | None = None, + filters: FiltersParam | None = "auto", + compressors: CompressorsParam = "auto", + array_bytes_codec: ArrayBytesCodecParam = "auto", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, +) -> Array: + """Create an array. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. 
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+
+        For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+        order of your filters is consistent with the behavior of each filter.
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+    compressors : Iterable[Codec], optional
+        List of compressors to apply to the array. Compressors are applied in order, and after any
+        filters are applied (if any are specified).
+
+        For Zarr v3, a "compressor" is a codec that takes a bytestream, and
+        returns another bytestream. Multiple compressors may be provided for Zarr v3.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default compressors.
+
+        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr v2.
+        If no ``compressors`` are provided, a default compressor will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit the default compressor.
+    array_bytes_codec : dict[str, JSON] | ArrayBytesCodec, optional
+        Array-to-bytes codec to use for encoding the array data.
+        Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+        If no ``array_bytes_codec`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
+    fill_value : Any, optional
+        Fill value for the array.
+    order : {"C", "F"}, optional
+        The memory order of the array (default is "C").
+        For Zarr v2, this parameter sets the memory order of the array.
+        For Zarr v3, this parameter is deprecated, because memory order
+        is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+        order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+        If no ``order`` is provided, a default order will be used.
+        This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
+    zarr_format : {2, 3}, optional
+        The zarr format to use when saving.
+    attributes : dict, optional
+        Attributes for the array.
+    chunk_key_encoding : ChunkKeyEncoding, optional
+        A specification of how the chunk keys are represented in storage.
+        For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+        For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
+    dimension_names : Iterable[str], optional
+        The names of the dimensions (default is None).
+        Zarr v3 only. Zarr v2 arrays should not use this parameter.
+    storage_options : dict, optional
+        If using an fsspec URL to create the store, these will be passed to the backend implementation.
+        Ignored otherwise.
+    overwrite : bool, default False
+        Whether to overwrite an array with the same name in the store, if one exists.
+    config : ArrayConfig or ArrayConfigParams, optional
+        Runtime configuration for the array.
+
+    Returns
+    -------
+    Array
+        The array.
+
+    Examples
+    --------
+    >>> import zarr
+    >>> store = zarr.storage.MemoryStore(mode='w')
+    >>> arr = zarr.create_array(
+    >>>     store=store,
+    >>>     shape=(100,100),
+    >>>     chunks=(10,10),
+    >>>     dtype='i4',
+    >>>     fill_value=0)
+
+    """
+    return Array(
+        sync(
+            zarr.core.array.create_array(
+                store=store,
+                name=name,
+                shape=shape,
+                dtype=dtype,
+                chunks=chunks,
+                shards=shards,
+                filters=filters,
+                compressors=compressors,
+                array_bytes_codec=array_bytes_codec,
+                fill_value=fill_value,
+                order=order,
+                zarr_format=zarr_format,
+                attributes=attributes,
+                chunk_key_encoding=chunk_key_encoding,
+                dimension_names=dimension_names,
+                storage_options=storage_options,
+                overwrite=overwrite,
+                config=config,
+            )
+        )
+    )
 
 
 # TODO: add type annotations for kwargs
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index c0e71a6fd6..ee14f102fa 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -53,7 +53,7 @@
     MemoryOrder,
     ShapeLike,
     ZarrFormat,
-    _default_zarr_version,
+    _default_zarr_format,
     _warn_order_kwarg,
     concurrent_map,
     parse_dtype,
@@ -3765,7 +3765,7 @@ async def create_array(
 
     Returns
     -------
-    z : array
+    AsyncArray
         The array.
Examples @@ -3782,7 +3782,7 @@ async def create_array( """ if zarr_format is None: - zarr_format = _default_zarr_version() + zarr_format = _default_zarr_format() from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 7e7b2e73da..d53f3847a5 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -200,6 +200,6 @@ def _warn_order_kwarg() -> None: warnings.warn(msg, RuntimeWarning, stacklevel=2) -def _default_zarr_version() -> ZarrFormat: +def _default_zarr_format() -> ZarrFormat: """Return the default zarr_version""" - return cast(ZarrFormat, int(zarr_config.get("default_zarr_version", 3))) + return cast(ZarrFormat, int(zarr_config.get("default_zarr_format", 3))) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 0f261f10b7..421a100f1b 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -62,7 +62,7 @@ def reset(self) -> None: "zarr", defaults=[ { - "default_zarr_version": 3, + "default_zarr_format": 3, "array": { "order": "C", "write_empty_chunks": False, diff --git a/tests/test_config.py b/tests/test_config.py index 4befb2dae2..20e3c6044f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -50,7 +50,7 @@ def test_config_defaults_set() -> None: # regression test for available defaults assert config.defaults == [ { - "default_zarr_version": 3, + "default_zarr_format": 3, "array": { "order": "C", "write_empty_chunks": False, diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index a40b8a5c0a..aaace6f5cd 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -523,7 +523,7 @@ async def test_consolidated_metadata_v2(self): async def test_use_consolidated_false( self, memory_store: zarr.storage.MemoryStore, zarr_format: ZarrFormat ) -> None: - with zarr.config.set(default_zarr_version=zarr_format): + with 
zarr.config.set(default_zarr_format=zarr_format): g = await group(store=memory_store, attributes={"foo": "bar"}) await g.create_group(name="a") From 2afe940b8abca27d1b277e5d1d03235040a26bd9 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 13:47:34 +0100 Subject: [PATCH 77/85] tests --- tests/test_codecs/test_codecs.py | 58 +++++++---------------------- tests/test_codecs/test_sharding.py | 24 ++++++++++++ tests/test_codecs/test_transpose.py | 4 +- 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index 4e99d4b8ea..a9bb78e3fd 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -78,17 +78,17 @@ async def test_order( path = "order" spath = StorePath(store, path=path) - with config.set({"array.order": runtime_write_order}): - a = await zarr.api.asynchronous.create_array( - spath, - shape=data.shape, - chunks=(16, 8) if with_sharding else (32, 8), - shards=(32, 8) if with_sharding else None, - dtype=data.dtype, - fill_value=0, - chunk_key_encoding={"name": "v2", "separator": "."}, - filters=[TransposeCodec(order=order_from_dim(store_order, data.ndim))], - ) + a = await zarr.api.asynchronous.create_array( + spath, + shape=data.shape, + chunks=(16, 8) if with_sharding else (32, 8), + shards=(32, 8) if with_sharding else None, + dtype=data.dtype, + fill_value=0, + chunk_key_encoding={"name": "v2", "separator": "."}, + filters=[TransposeCodec(order=order_from_dim(store_order, data.ndim))], + config={"order": runtime_write_order}, + ) await _AsyncArrayProxy(a)[:, :].set(data) read_data = await _AsyncArrayProxy(a)[:, :].get() @@ -364,42 +364,10 @@ def test_invalid_metadata(store: Store) -> None: @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) def test_invalid_metadata_create_array(store: Store) -> None: - spath3 = StorePath(store, "invalid_order") - with pytest.raises(TypeError): - zarr.create_array( - spath3, - 
shape=(16, 16), - chunks=(16, 16), - dtype=np.dtype("uint8"), - fill_value=0, - filters=[ - TransposeCodec(order="F"), # type: ignore[arg-type] - ], - ) - spath5 = StorePath(store, "invalid_inner_chunk_shape") - with pytest.raises(ValueError): - zarr.create_array( - spath5, - shape=(16, 16), - shards=(16, 16), - chunks=(8,), - dtype=np.dtype("uint8"), - fill_value=0, - ) - spath6 = StorePath(store, "invalid_inner_chunk_shape") - with pytest.raises(ValueError): - zarr.create_array( - spath6, - shape=(16, 16), - shards=(16, 16), - chunks=(8, 7), - dtype=np.dtype("uint8"), - fill_value=0, - ) - spath7 = StorePath(store, "warning_inefficient_codecs") + spath = StorePath(store, "warning_inefficient_codecs") with pytest.warns(UserWarning): zarr.create_array( - spath7, + spath, shape=(16, 16), chunks=(16, 16), dtype=np.dtype("uint8"), diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index a9abf8ede1..f77b871a1c 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -424,3 +424,27 @@ async def test_sharding_with_chunks_per_shard( a[...] = data data_read = a[...] 
assert np.array_equal(data_read, data) + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_invalid_metadata(store: Store) -> None: + spath1 = StorePath(store, "invalid_inner_chunk_shape") + with pytest.raises(ValueError): + zarr.create_array( + spath1, + shape=(16, 16), + shards=(16, 16), + chunks=(8,), + dtype=np.dtype("uint8"), + fill_value=0, + ) + spath2 = StorePath(store, "invalid_inner_chunk_shape") + with pytest.raises(ValueError): + zarr.create_array( + spath2, + shape=(16, 16), + shards=(16, 16), + chunks=(8, 7), + dtype=np.dtype("uint8"), + fill_value=0, + ) diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 06ed07f6c1..65159f174b 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -81,8 +81,8 @@ def test_transpose_invalid( ) -> None: data = np.arange(0, 256, dtype="uint16").reshape((1, 32, 8)) spath = StorePath(store, "transpose_invalid") - for order in [(1, 0), (3, 2, 1), (3, 3, 1)]: - with pytest.raises(ValueError): + for order in [(1, 0), (3, 2, 1), (3, 3, 1), "F", "C"]: + with pytest.raises((ValueError, TypeError)): zarr.create_array( spath, shape=data.shape, From e3f1f3311abe6d014fca0c7a012f79b0bf9cf178 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 13:56:15 +0100 Subject: [PATCH 78/85] mypy --- tests/test_api.py | 4 ++-- tests/test_array.py | 32 ++++++++++++++++++------------ tests/test_codecs/test_sharding.py | 2 +- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index 5f8c84c4a6..80e8555e11 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -23,7 +23,7 @@ save_array, save_group, ) -from zarr.core.common import MemoryOrder, ZarrFormat +from zarr.core.common import JSON, MemoryOrder, ZarrFormat from zarr.errors import MetadataValidationError from zarr.storage._utils import normalize_path from zarr.storage.memory import MemoryStore @@ -61,7 +61,7 @@ 
def test_create(memory_store: Store) -> None: # TODO: parametrize over everything this function takes @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_create_array(store: Store) -> None: - attrs = {"foo": 100} + attrs: dict[str, JSON] = {"foo": 100} # explicit type annotation to avoid mypy error shape = (10, 10) path = "foo" data_val = 1 diff --git a/tests/test_array.py b/tests/test_array.py index 9c159f2acf..87525a126d 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -4,7 +4,7 @@ import pickle import re from itertools import accumulate -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal import numcodecs import numpy as np @@ -43,6 +43,9 @@ from zarr.storage import LocalStore, MemoryStore from zarr.storage.common import StorePath +if TYPE_CHECKING: + from zarr.core.array_spec import ArrayConfigParams + @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @pytest.mark.parametrize("zarr_format", [2, 3]) @@ -421,16 +424,16 @@ async def test_nbytes_stored_async() -> None: def test_default_fill_values() -> None: - a = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype=" None: match="For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `BytesCodec`.", ): zarr.create_array( - MemoryStore(), shape=5, chunks=5, dtype=" None: +def test_update_attrs(zarr_format: Literal[2, 3]) -> None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 store = MemoryStore() - arr = zarr.create_array(store=store, shape=5, chunks=5, dtype="f8", zarr_format=zarr_format) + arr = zarr.create_array( + store=store, shape=(5,), chunks=(5,), dtype="f8", zarr_format=zarr_format + ) arr.attrs["foo"] = "bar" assert arr.attrs["foo"] == "bar" @@ -794,13 +799,14 @@ def test_array_create_metadata_order_v2( @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_array_create_order( order_config: MemoryOrder | None, - zarr_format: int, + zarr_format: 
Literal[2, 3], store: MemoryStore, ) -> None: """ Test that the arrays generated by array indexing have a memory order defined by the config order value """ + config: ArrayConfigParams = {} if order_config is None: config = {} expected = zarr.config.get("array.order") @@ -963,16 +969,16 @@ def test_chunks_and_shards() -> None: def test_create_array_default_fill_values() -> None: - a = zarr.create_array(MemoryStore(), shape=5, chunks=5, dtype=" None: chunk_shape = (2, 1) - shape = [x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)] + shape = tuple(x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)) data = np.ones(np.prod(shape), dtype="int32").reshape(shape) fill_value = 42 From 16439831f11f0439ee0d18fc6dcd410c2eece1db Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 15:00:22 +0100 Subject: [PATCH 79/85] rename array_bytes_codec to serializer --- src/zarr/api/synchronous.py | 10 +++++----- src/zarr/core/array.py | 20 ++++++++++---------- tests/test_array.py | 4 ++-- tests/test_codecs/test_codecs.py | 2 +- tests/test_codecs/test_endian.py | 4 ++-- tests/test_codecs/test_sharding.py | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 427eff004f..576d760ca5 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -19,9 +19,9 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike from zarr.core.array import ( - ArrayBytesCodecParam, CompressorsParam, FiltersParam, + SerializerParam, ShardsParam, ) from zarr.core.array_spec import ArrayConfig, ArrayConfigParams @@ -748,7 +748,7 @@ def create_array( shards: ShardsParam | None = None, filters: FiltersParam | None = "auto", compressors: CompressorsParam = "auto", - array_bytes_codec: ArrayBytesCodecParam = "auto", + serializer: SerializerParam = "auto", fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: 
ZarrFormat | None = 3, @@ -814,10 +814,10 @@ def create_array( These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. - array_bytes_codec : dict[str, JSON] | ArrayBytesCodec, optional + serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. - If no ``array_bytes_codec`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional @@ -875,7 +875,7 @@ def create_array( shards=shards, filters=filters, compressors=compressors, - array_bytes_codec=array_bytes_codec, + serializer=serializer, fill_value=fill_value, order=order, zarr_format=zarr_format, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ee14f102fa..0f6847ee16 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3643,7 +3643,7 @@ def _get_default_codecs( | Literal["auto"] | None ) -ArrayBytesCodecParam: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] +SerializerParam: TypeAlias = dict[str, JSON] | ArrayBytesCodec | Literal["auto"] class ShardsConfigParam(TypedDict): @@ -3664,7 +3664,7 @@ async def create_array( shards: ShardsParam | None = None, filters: FiltersParam | None = "auto", compressors: CompressorsParam = "auto", - array_bytes_codec: ArrayBytesCodecParam = "auto", + serializer: SerializerParam = "auto", fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, @@ -3730,10 +3730,10 @@ async def create_array( These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. Use ``None`` to omit the default compressor. 
- array_bytes_codec : dict[str, JSON] | ArrayBytesCodec, optional + serializer : dict[str, JSON] | ArrayBytesCodec, optional Array-to-bytes codec to use for encoding the array data. Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. - If no ``array_bytes_codec`` is provided, the `zarr.codecs.BytesCodec` codec will be used. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional @@ -3808,8 +3808,8 @@ async def create_array( ) raise ValueError(msg) - if array_bytes_codec != "auto": - raise ValueError("Zarr v2 arrays do not support `array_bytes_codec`.") + if serializer != "auto": + raise ValueError("Zarr v2 arrays do not support `serializer`.") filters_parsed, compressor_parsed = _parse_chunk_encoding_v2( compressor=compressors, filters=filters, dtype=np.dtype(dtype) @@ -3840,7 +3840,7 @@ async def create_array( array_array, array_bytes, bytes_bytes = _parse_chunk_encoding_v3( compressors=compressors, filters=filters, - array_bytes_codec=array_bytes_codec, + serializer=serializer, dtype=dtype_parsed, ) sub_codecs = cast(tuple[Codec, ...], (*array_array, array_bytes, *bytes_bytes)) @@ -4017,7 +4017,7 @@ def _parse_chunk_encoding_v3( *, compressors: CompressorsParam | None, filters: FiltersParam | None, - array_bytes_codec: ArrayBytesCodecParam, + serializer: SerializerParam, dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ @@ -4058,9 +4058,9 @@ def _parse_chunk_encoding_v3( maybe_array_array = cast(Iterable[Codec | dict[str, JSON]], filters) out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) - if array_bytes_codec == "auto": + if serializer == "auto": out_array_bytes = default_array_bytes else: - out_array_bytes = _parse_array_bytes_codec(array_bytes_codec) + out_array_bytes = _parse_array_bytes_codec(serializer) return out_array_array, 
out_array_bytes, out_bytes_bytes diff --git a/tests/test_array.py b/tests/test_array.py index 87525a126d..bb462ab014 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -461,7 +461,7 @@ def test_vlen_errors() -> None: match="For string dtype, ArrayBytesCodec must be `VLenUTF8Codec`, got `BytesCodec`.", ): zarr.create_array( - MemoryStore(), shape=(5,), chunks=(5,), dtype=" None: chunks=(16, 16), dtype=np.dtype("uint8"), fill_value=0, - array_bytes_codec=ShardingCodec(chunk_shape=(8, 8)), + serializer=ShardingCodec(chunk_shape=(8, 8)), compressors=[ GzipCodec(), ], diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index f4d85683b2..ae9d1f6f1f 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -24,7 +24,7 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: dtype=data.dtype, fill_value=0, chunk_key_encoding={"name": "v2", "separator": "."}, - array_bytes_codec=BytesCodec(endian=endian), + serializer=BytesCodec(endian=endian), ) await _AsyncArrayProxy(a)[:, :].set(data) @@ -50,7 +50,7 @@ async def test_endian_write( dtype="uint16", fill_value=0, chunk_key_encoding={"name": "v2", "separator": "."}, - array_bytes_codec=BytesCodec(endian=dtype_store_endian), + serializer=BytesCodec(endian=dtype_store_endian), ) await _AsyncArrayProxy(a)[:, :].set(data) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index b7ad60e189..3f14007351 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -280,7 +280,7 @@ def test_nested_sharding_create_array( chunks=(32, 32, 32), dtype=data.dtype, fill_value=0, - array_bytes_codec=ShardingCodec( + serializer=ShardingCodec( chunk_shape=(32, 32, 32), codecs=[ShardingCodec(chunk_shape=(16, 16, 16), index_location=inner_index_location)], index_location=outer_index_location, From aad8e9d84672f35bf64cab2742f58b9fdf056aec Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: 
Thu, 2 Jan 2025 17:20:45 +0100 Subject: [PATCH 80/85] Update src/zarr/api/asynchronous.py Co-authored-by: Joe Hamman --- src/zarr/api/asynchronous.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 769c66d4cb..835f242c9c 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -697,7 +697,6 @@ async def create_group( if zarr_format is None: zarr_format = _default_zarr_version() - # TODO: fix this when modes make sense. It should be `w` for overwriting, `w-` otherwise mode: Literal["a"] = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) From f29b2d974c8712f70f883c3f2bd6abbf3cef2769 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 2 Jan 2025 17:53:33 +0100 Subject: [PATCH 81/85] docstrings --- src/zarr/api/synchronous.py | 146 +++++++++++++++++++++++++++++++++++- src/zarr/core/array.py | 4 +- 2 files changed, 144 insertions(+), 6 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 461d64765f..58ff418928 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -18,10 +18,18 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike + from zarr.core.array import CompressorsParam, FiltersParam from zarr.core.array_spec import ArrayConfig, ArrayConfigParams from zarr.core.buffer import NDArrayLike - from zarr.core.chunk_key_encodings import ChunkKeyEncoding - from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingParams + from zarr.core.common import ( + JSON, + AccessModeLiteral, + ChunkCoords, + MemoryOrder, + ShapeLike, + ZarrFormat, + ) from zarr.storage import StoreLike __all__ = [ @@ -700,8 +708,138 @@ def create( ) -def create_array(*args: Any, **kwargs: Any) -> Array: - return 
Array(sync(zarr.core.array.create_array(*args, **kwargs))) +def create_array( + store: str | StoreLike, + *, + name: str | None = None, + shape: ShapeLike, + dtype: npt.DTypeLike, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ChunkCoords | Literal["auto"] | None = None, + filters: FiltersParam = "auto", + compressors: CompressorsParam = "auto", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigParams | None = None, +) -> Array: + """Create an ``Array``. This function wraps :mod:`zarr.core.array.create_array`. + + Parameters + ---------- + store : str or Store + Store or path to directory in file system or name of zip file. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + shape : ChunkCoords + Shape of the array. + dtype : npt.DTypeLike + Data type of the array. + chunks : ChunkCoords, optional + Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. 
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+
+        For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+        order of your filters is consistent with the behavior of each filter.
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+    compressors : Iterable[Codec], optional
+        List of compressors to apply to the array. Compressors are applied in order, and after any
+        filters are applied (if any are specified).
+
+        For Zarr v3, a "compressor" is a codec that takes a bytestream, and
+        returns another bytestream. Multiple compressors may be provided for Zarr v3.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default compressors.
+
+        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr v2.
+        If no ``compressors`` are provided, a default compressor will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit the default compressor.
+    fill_value : Any, optional
+        Fill value for the array.
+    order : {"C", "F"}, optional
+        The memory order of the array (default is "C").
+        For Zarr v2, this parameter sets the memory order of the array.
+        For Zarr v3, this parameter is deprecated, because memory order
+        is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+        order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+        If no ``order`` is provided, a default order will be used.
+ This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + attributes : dict, optional + Attributes for the array. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}}``. + For Zarr v2, the default is ``{"name": "v2", "separator": "."}}``. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigParams, optional + Runtime configuration for the array. + + Returns + ------- + z : Array + The array. + """ + return Array( + sync( + zarr.core.array.create_array( + store, + name=name, + shape=shape, + dtype=dtype, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + storage_options=storage_options, + overwrite=overwrite, + config=config, + ) + ) + ) # TODO: add type annotations for kwargs diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 96ffd5c363..5292d02eb1 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3536,7 +3536,7 @@ async def create_array( overwrite: bool = False, config: ArrayConfig | ArrayConfigParams | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an array. + """Create an ``AsyncArray``. 
Parameters ---------- @@ -3622,7 +3622,7 @@ async def create_array( Returns ------- - z : array + z : AsyncArray The array. """ From 4654cbdc14d35c6d6b602cb77e58b13881def8e1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 2 Jan 2025 17:54:41 +0100 Subject: [PATCH 82/85] *params -> *like --- src/zarr/api/asynchronous.py | 6 ++--- src/zarr/api/synchronous.py | 16 ++++++------ src/zarr/core/array.py | 38 ++++++++++++++-------------- src/zarr/core/array_spec.py | 8 +++--- src/zarr/core/chunk_key_encodings.py | 4 +-- src/zarr/core/group.py | 32 +++++++++++------------ 6 files changed, 52 insertions(+), 52 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 835f242c9c..0c1c69dbbc 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -10,7 +10,7 @@ from typing_extensions import deprecated from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike from zarr.core.common import ( JSON, @@ -856,7 +856,7 @@ async def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, **kwargs: Any, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array. 
@@ -1017,7 +1017,7 @@ async def create( mode = "a" store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options) - config_dict: ArrayConfigParams = {} + config_dict: ArrayConfigLike = {} if write_empty_chunks is not None: if config is not None: diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 58ff418928..3852b5b81f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -18,10 +18,10 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.core.array import CompressorsParam, FiltersParam - from zarr.core.array_spec import ArrayConfig, ArrayConfigParams + from zarr.core.array import CompressorsLike, FiltersLike + from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike - from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingParams + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import ( JSON, AccessModeLiteral, @@ -592,7 +592,7 @@ def create( codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, **kwargs: Any, ) -> Array: """Create an array. 
@@ -716,17 +716,17 @@ def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: FiltersParam = "auto", - compressors: CompressorsParam = "auto", + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: """Create an ``Array``. This function wraps :mod:`zarr.core.array.create_array`. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 5292d02eb1..c379feaaf1 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -19,7 +19,7 @@ from zarr.abc.store import Store, set_or_delete from zarr.codecs._v2 import V2Codec from zarr.core._info import ArrayInfo -from zarr.core.array_spec import ArrayConfig, ArrayConfigParams, parse_array_config +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -30,7 +30,7 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition, normalize_chunks from zarr.core.chunk_key_encodings import ( ChunkKeyEncoding, - ChunkKeyEncodingParams, + ChunkKeyEncodingLike, DefaultChunkKeyEncoding, V2ChunkKeyEncoding, ) @@ -289,7 +289,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> 
AsyncArray[ArrayV2Metadata]: ... # this overload defines the function signature when zarr_format is 3 @@ -318,7 +318,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @overload @@ -346,7 +346,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata]: ... @overload @classmethod @@ -379,7 +379,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV3Metadata] | AsyncArray[ArrayV2Metadata]: ... @classmethod @@ -412,7 +412,7 @@ async def create( # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Method to create a new asynchronous array instance. @@ -1566,7 +1566,7 @@ def create( compressor: dict[str, JSON] | None = None, # runtime overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: """Creates a new Array instance from an initialized store. 
@@ -3499,7 +3499,7 @@ def _get_default_codecs( return cast(list[dict[str, JSON]], default_codecs[dtype_key]) -FiltersParam: TypeAlias = ( +FiltersLike: TypeAlias = ( Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] | ArrayArrayCodec | Iterable[numcodecs.abc.Codec] @@ -3507,7 +3507,7 @@ def _get_default_codecs( | Literal["auto"] | None ) -CompressorsParam: TypeAlias = ( +CompressorsLike: TypeAlias = ( Iterable[dict[str, JSON] | BytesBytesCodec] | BytesBytesCodec | numcodecs.abc.Codec @@ -3524,17 +3524,17 @@ async def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: FiltersParam = "auto", - compressors: CompressorsParam = "auto", + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an ``AsyncArray``. @@ -3716,7 +3716,7 @@ async def create_array( def _parse_chunk_key_encoding( - data: ChunkKeyEncoding | ChunkKeyEncodingParams | None, zarr_format: ZarrFormat + data: ChunkKeyEncoding | ChunkKeyEncodingLike | None, zarr_format: ZarrFormat ) -> ChunkKeyEncoding: """ Take an implicit specification of a chunk key encoding and parse it into a ChunkKeyEncoding object. 
@@ -3804,8 +3804,8 @@ def _get_default_chunk_encoding_v2( def _parse_chunk_encoding_v2( *, - compressor: CompressorsParam, - filters: FiltersParam, + compressor: CompressorsLike, + filters: FiltersLike, dtype: np.dtype[Any], ) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: """ @@ -3848,8 +3848,8 @@ def _parse_chunk_encoding_v2( def _parse_chunk_encoding_v3( *, - compressors: CompressorsParam, - filters: FiltersParam, + compressors: CompressorsLike, + filters: FiltersLike, dtype: np.dtype[Any], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: """ diff --git a/src/zarr/core/array_spec.py b/src/zarr/core/array_spec.py index ed5adf5526..b1a6a3cad0 100644 --- a/src/zarr/core/array_spec.py +++ b/src/zarr/core/array_spec.py @@ -21,7 +21,7 @@ from zarr.core.common import ChunkCoords -class ArrayConfigParams(TypedDict): +class ArrayConfigLike(TypedDict): """ A TypedDict model of the attributes of an ArrayConfig class, but with no required fields. This allows for partial construction of an ArrayConfig, with the assumption that the unset @@ -56,13 +56,13 @@ def __init__(self, order: MemoryOrder, write_empty_chunks: bool) -> None: object.__setattr__(self, "write_empty_chunks", write_empty_chunks_parsed) @classmethod - def from_dict(cls, data: ArrayConfigParams) -> Self: + def from_dict(cls, data: ArrayConfigLike) -> Self: """ Create an ArrayConfig from a dict. The keys of that dict are a subset of the attributes of the ArrayConfig class. Any keys missing from that dict will be set to the the values in the ``array`` namespace of ``zarr.config``. 
""" - kwargs_out: ArrayConfigParams = {} + kwargs_out: ArrayConfigLike = {} for f in fields(ArrayConfig): field_name = cast(Literal["order", "write_empty_chunks"], f.name) if field_name not in data: @@ -72,7 +72,7 @@ def from_dict(cls, data: ArrayConfigParams) -> Self: return cls(**kwargs_out) -def parse_array_config(data: ArrayConfig | ArrayConfigParams | None) -> ArrayConfig: +def parse_array_config(data: ArrayConfig | ArrayConfigLike | None) -> ArrayConfig: """ Convert various types of data to an ArrayConfig. """ diff --git a/src/zarr/core/chunk_key_encodings.py b/src/zarr/core/chunk_key_encodings.py index 06d387afea..95ce9108f3 100644 --- a/src/zarr/core/chunk_key_encodings.py +++ b/src/zarr/core/chunk_key_encodings.py @@ -20,7 +20,7 @@ def parse_separator(data: JSON) -> SeparatorLiteral: return cast(SeparatorLiteral, data) -class ChunkKeyEncodingParams(TypedDict): +class ChunkKeyEncodingLike(TypedDict): name: Literal["v2", "default"] separator: SeparatorLiteral @@ -37,7 +37,7 @@ def __init__(self, *, separator: SeparatorLiteral) -> None: @classmethod def from_dict( - cls, data: dict[str, JSON] | ChunkKeyEncoding | ChunkKeyEncodingParams + cls, data: dict[str, JSON] | ChunkKeyEncoding | ChunkKeyEncodingLike ) -> ChunkKeyEncoding: if isinstance(data, ChunkKeyEncoding): return data diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index c7a3d333ba..982bd19fcb 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -21,8 +21,8 @@ from zarr.core.array import ( Array, AsyncArray, - CompressorsParam, - FiltersParam, + CompressorsLike, + FiltersLike, _build_parents, create_array, ) @@ -53,9 +53,9 @@ from collections.abc import AsyncGenerator, Generator, Iterable, Iterator from typing import Any - from zarr.core.array_spec import ArrayConfig, ArrayConfigParams + from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import Buffer, BufferPrototype - from zarr.core.chunk_key_encodings import ChunkKeyEncoding, 
ChunkKeyEncodingParams + from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike from zarr.core.common import MemoryOrder logger = logging.getLogger("zarr.group") @@ -1009,16 +1009,16 @@ async def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: FiltersParam = "auto", - compressors: CompressorsParam = "auto", + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """ Create a Zarr array within this AsyncGroup. @@ -2202,16 +2202,16 @@ def create_array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | None = None, - filters: FiltersParam = "auto", - compressors: CompressorsParam = "auto", + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: """ Create a Zarr array within this AsyncGroup. 
@@ -2534,16 +2534,16 @@ def array( dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", shards: ChunkCoords | Literal["auto"] | None = None, - filters: FiltersParam = "auto", - compressors: CompressorsParam = "auto", + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, - chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingParams | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, dimension_names: Iterable[str] | None = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, - config: ArrayConfig | ArrayConfigParams | None = None, + config: ArrayConfig | ArrayConfigLike | None = None, data: npt.ArrayLike | None = None, ) -> Array: """ From 5cdb515620ebc313b6d57ee2fc541ff33ac1837f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 2 Jan 2025 17:56:28 +0100 Subject: [PATCH 83/85] *params -> *like, in tests --- tests/test_array.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index c7b4e9a0ff..a7357d89b3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -22,8 +22,8 @@ ) from zarr.core._info import ArrayInfo from zarr.core.array import ( - CompressorsParam, - FiltersParam, + CompressorsLike, + FiltersLike, _get_default_chunk_encoding_v2, _get_default_chunk_encoding_v3, _parse_chunk_encoding_v2, @@ -1054,7 +1054,7 @@ async def test_create_array_no_filters_compressors( ], ) async def test_create_array_v3_chunk_encoding( - store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str + store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str ) -> None: """ Test various possibilities for the compressors and filters parameter to create_array @@ -1091,7 +1091,7 @@ async def test_create_array_v3_chunk_encoding( "filters", ["auto", 
None, numcodecs.GZip(level=1), (numcodecs.GZip(level=1),)] ) async def test_create_array_v2_chunk_encoding( - store: MemoryStore, compressors: CompressorsParam, filters: FiltersParam, dtype: str + store: MemoryStore, compressors: CompressorsLike, filters: FiltersLike, dtype: str ) -> None: arr = await create_array( store=store, From be60d73ce00f0ed9a23cf8bd4e3e80fd96f58056 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 19:10:08 +0100 Subject: [PATCH 84/85] adds deprecated compressor arg to Group.create_array --- src/zarr/api/synchronous.py | 7 +- src/zarr/core/_info.py | 5 +- src/zarr/core/array.py | 19 ++- src/zarr/core/group.py | 212 +++++++++++++++++++++++++++++---- src/zarr/testing/strategies.py | 7 +- tests/test_array.py | 4 +- tests/test_group.py | 10 ++ 7 files changed, 230 insertions(+), 34 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 82fc6e4d27..bb030cf97f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -18,7 +18,12 @@ from zarr.abc.codec import Codec from zarr.api.asynchronous import ArrayLike, PathLike - from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike, ShardsLike + from zarr.core.array import ( + CompressorsLike, + FiltersLike, + SerializerLike, + ShardsLike, + ) from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike from zarr.core.chunk_key_encodings import ChunkKeyEncoding, ChunkKeyEncodingLike diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py index 4708967390..12bcc02e96 100644 --- a/src/zarr/core/_info.py +++ b/src/zarr/core/_info.py @@ -6,6 +6,7 @@ import numpy as np from zarr.abc.codec import Codec +from zarr.core.common import ZarrFormat from zarr.core.metadata.v3 import DataType @@ -20,7 +21,7 @@ class GroupInfo: _name: str _type: Literal["Group"] = "Group" - _zarr_format: Literal[2, 3] + _zarr_format: ZarrFormat _read_only: bool _store_type: str _count_members: int | 
None = None @@ -76,7 +77,7 @@ class ArrayInfo: """ _type: Literal["Array"] = "Array" - _zarr_format: Literal[2, 3] + _zarr_format: ZarrFormat _data_type: np.dtype[Any] | DataType _shape: tuple[int, ...] _chunk_shape: tuple[int, ...] | None = None diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 9d148283c0..99e9d97c67 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3636,8 +3636,10 @@ def _get_default_codecs( | Literal["auto"] | None ) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | None CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec] + Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + | dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] @@ -4064,3 +4066,18 @@ def _parse_chunk_encoding_v3( out_array_bytes = _parse_array_bytes_codec(serializer) return out_array_array, out_array_bytes, out_bytes_bytes + + +def _parse_deprecated_compressor( + compressor: CompressorLike | None, compressors: CompressorsLike +) -> CompressorsLike | None: + if compressor: + if compressors != "auto": + raise ValueError("Cannot specify both `compressor` and `compressors`.") + warn( + "The `compressor` argument is deprecated. 
Use `compressors` instead.", + category=UserWarning, + stacklevel=2, + ) + compressors = (compressor,) + return compressors diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 1890c007f8..ff21cd3fae 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -21,9 +21,13 @@ from zarr.core.array import ( Array, AsyncArray, + CompressorLike, CompressorsLike, FiltersLike, + SerializerLike, + ShardsLike, _build_parents, + _parse_deprecated_compressor, create_array, ) from zarr.core.attributes import Attributes @@ -66,7 +70,7 @@ def parse_zarr_format(data: Any) -> ZarrFormat: """Parse the zarr_format field from metadata.""" if data in (2, 3): - return cast(Literal[2, 3], data) + return cast(ZarrFormat, data) msg = f"Invalid zarr_format. Expected one of 2 or 3. Got {data}." raise ValueError(msg) @@ -442,7 +446,7 @@ async def from_store( async def open( cls, store: StoreLike, - zarr_format: Literal[2, 3] | None = 3, + zarr_format: ZarrFormat | None = 3, use_consolidated: bool | str | None = None, ) -> AsyncGroup: """Open a new AsyncGroup @@ -1008,9 +1012,11 @@ async def create_array( shape: ShapeLike, dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", - shards: ChunkCoords | Literal["auto"] | None = None, + shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", + compressor: CompressorLike = None, + serializer: SerializerLike = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, @@ -1033,24 +1039,73 @@ async def create_array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunks : ChunkCoords | Literal["auto"], default is "auto" + chunks : ChunkCoords, optional Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. 
filters : Iterable[Codec], optional
-        List of filters to apply to the array.
+        Iterable of filters to apply to each chunk of the array, in order, before serializing that
+        chunk to bytes.
+
+        For Zarr v3, a "filter" is a codec that takes an array and returns an array,
+        and these values must be instances of ``ArrayArrayCodec``, or dict representations
+        of ``ArrayArrayCodec``.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+
+        For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+        order of your filters is consistent with the behavior of each filter.
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
         compressors : Iterable[Codec], optional
-        List of compressors to apply to the array.
+        List of compressors to apply to the array. Compressors are applied in order, and after any
+        filters are applied (if any are specified).
+
+        For Zarr v3, a "compressor" is a codec that takes a bytestream, and
+        returns another bytestream. Multiple compressors may be provided for Zarr v3.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default compressors.
+
+        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr v2.
+        If no ``compressors`` are provided, a default compressor will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit the default compressor.
+        compressor : Codec, optional
+        Deprecated in favor of ``compressors``.
+        serializer : dict[str, JSON] | ArrayBytesCodec, optional
+        Array-to-bytes codec to use for encoding the array data.
+        Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
         fill_value : Any, optional
         Fill value for the array.
         order : {"C", "F"}, optional
-        Memory layout of the array.
+        The memory order of the array (default is "C").
+        For Zarr v2, this parameter sets the memory order of the array.
+        For Zarr v3, this parameter is deprecated, because memory order
+        is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+        order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+        If no ``order`` is provided, a default order will be used.
+        This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
         attributes : dict, optional
         Attributes for the array.
         chunk_key_encoding : ChunkKeyEncoding, optional
-        The chunk key encoding to use.
+        A specification of how the chunk keys are represented in storage.
+        For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+        For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
         dimension_names : Iterable[str], optional
-        Dimension names for the array.
+        The names of the dimensions (default is None).
+        Zarr v3 only. Zarr v2 arrays should not use this parameter.
         storage_options : dict, optional
         If using an fsspec URL to create the store, these will be passed to
         the backend implementation. Ignored otherwise.
@@ -1064,6 +1119,8 @@ async def create_array( AsyncArray """ + + compressors = _parse_deprecated_compressor(compressor, compressors) return await create_array( store=self.store_path, name=name, @@ -1073,6 +1130,7 @@ async def create_array( shards=shards, filters=filters, compressors=compressors, + serializer=serializer, fill_value=fill_value, order=order, zarr_format=self.metadata.zarr_format, @@ -1693,7 +1751,7 @@ def from_store( def open( cls, store: StoreLike, - zarr_format: Literal[2, 3] | None = 3, + zarr_format: ZarrFormat | None = 3, ) -> Group: """Open a group from an initialized store. @@ -2201,9 +2259,11 @@ def create_array( shape: ShapeLike, dtype: npt.DTypeLike, chunks: ChunkCoords | Literal["auto"] = "auto", - shards: ChunkCoords | None = None, + shards: ShardsLike | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", + compressor: CompressorLike = None, + serializer: SerializerLike = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, @@ -2226,24 +2286,73 @@ def create_array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunks : ChunkCoords | Literal["auto"], default is "auto" + chunks : ChunkCoords, optional Chunk shape of the array. + If not specified, default are guessed based on the shape and dtype. shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional - List of filters to apply to the array. + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. 
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
+
+        For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that the
+        order of your filters is consistent with the behavior of each filter.
+        If no ``filters`` are provided, a default set of filters will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_filters``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default filters.
         compressors : Iterable[Codec], optional
-        List of compressors to apply to the array.
+        List of compressors to apply to the array. Compressors are applied in order, and after any
+        filters are applied (if any are specified).
+
+        For Zarr v3, a "compressor" is a codec that takes a bytestream, and
+        returns another bytestream. Multiple compressors may be provided for Zarr v3.
+        If ``filters`` and ``compressors`` are not specified, then the default codecs for
+        Zarr v3 will be used.
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit default compressors.
+
+        For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may
+        be provided for Zarr v2.
+        If no ``compressors`` are provided, a default compressor will be used.
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor``
+        in :mod:`zarr.core.config`.
+        Use ``None`` to omit the default compressor.
+        compressor : Codec, optional
+        Deprecated in favor of ``compressors``.
+        serializer : dict[str, JSON] | ArrayBytesCodec, optional
+        Array-to-bytes codec to use for encoding the array data.
+        Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion.
+        If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used.
         fill_value : Any, optional
         Fill value for the array.
         order : {"C", "F"}, optional
-        Memory layout of the array.
+        The memory order of the array (default is "C").
+        For Zarr v2, this parameter sets the memory order of the array.
+        For Zarr v3, this parameter is deprecated, because memory order
+        is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory
+        order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+        If no ``order`` is provided, a default order will be used.
+        This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`.
         attributes : dict, optional
         Attributes for the array.
         chunk_key_encoding : ChunkKeyEncoding, optional
-        The chunk key encoding to use.
+        A specification of how the chunk keys are represented in storage.
+        For Zarr v3, the default is ``{"name": "default", "separator": "/"}``.
+        For Zarr v2, the default is ``{"name": "v2", "separator": "."}``.
         dimension_names : Iterable[str], optional
-        Dimension names for the array.
+        The names of the dimensions (default is None).
+        Zarr v3 only. Zarr v2 arrays should not use this parameter.
         storage_options : dict, optional
         If using an fsspec URL to create the store, these will be passed to
         the backend implementation. Ignored otherwise.
@@ -2256,7 +2365,7 @@ def create_array( ------- AsyncArray """ - + compressors = _parse_deprecated_compressor(compressor, compressors) return Array( self._sync( self._async_group.create_array( @@ -2269,6 +2378,7 @@ def create_array( attributes=attributes, chunk_key_encoding=chunk_key_encoding, compressors=compressors, + serializer=serializer, dimension_names=dimension_names, order=order, filters=filters, @@ -2536,6 +2646,8 @@ def array( shards: ChunkCoords | Literal["auto"] | None = None, filters: FiltersLike = "auto", compressors: CompressorsLike = "auto", + compressor: CompressorLike = None, + serializer: SerializerLike = "auto", fill_value: Any | None = 0, order: MemoryOrder | None = "C", attributes: dict[str, JSON] | None = None, @@ -2559,24 +2671,73 @@ def array( Shape of the array. dtype : npt.DTypeLike Data type of the array. - chunks : ChunkCoords + chunks : ChunkCoords, optional Chunk shape of the array. + If not specified, defaults are guessed based on the shape and dtype. shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec], optional - List of filters to apply to the array. + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr v3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. + + For Zarr v2, a "filter" can be any numcodecs codec; you should ensure that + the order of your filters is consistent with the behavior of each filter. + If no ``filters`` are provided, a default set of filters will be used.
+ These defaults can be changed by modifying the value of ``array.v2_default_filters`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default filters. compressors : Iterable[Codec], optional - List of compressors to apply to the array. + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified). + + For Zarr v3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr v3. + If ``filters`` and ``compressors`` are not specified, then the default codecs for + Zarr v3 will be used. + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` + in :mod:`zarr.core.config`. + Use ``None`` to omit default compressors. + + For Zarr v2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr v2. + If no ``compressors`` are provided, a default compressor will be used. + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` + in :mod:`zarr.core.config`. + Use ``None`` to omit the default compressor. + compressor : Codec, optional + Deprecated in favor of ``compressors``. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr v3 only. Zarr v2 arrays use implicit array-to-bytes conversion. + If no ``serializer`` is provided, the `zarr.codecs.BytesCodec` codec will be used. fill_value : Any, optional Fill value for the array. order : {"C", "F"}, optional - Memory layout of the array. + The memory layout of the array (default is "C"). + For Zarr v2, this parameter sets the memory order of the array. + For Zarr v3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr v3 arrays. The recommended way to specify the memory + order for Zarr v3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``.
+ If no ``order`` is provided, a default order will be used. + This default can be changed by modifying the value of ``array.order`` in :mod:`zarr.core.config`. attributes : dict, optional Attributes for the array. chunk_key_encoding : ChunkKeyEncoding, optional - The chunk key encoding to use. + A specification of how the chunk keys are represented in storage. + For Zarr v3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr v2, the default is ``{"name": "v2", "separator": "."}``. dimension_names : Iterable[str], optional - Dimension names for the array. + The names of the dimensions (default is None). + Zarr v3 only. Zarr v2 arrays should not use this parameter. storage_options : dict, optional If using an fsspec URL to create the store, these will be passed to the backend implementation. Ignored otherwise. @@ -2589,7 +2750,7 @@ def array( ------- AsyncArray """ - + compressors = _parse_deprecated_compressor(compressor, compressors) return Array( self._sync( self._async_group.create_array( @@ -2602,6 +2763,7 @@ def array( attributes=attributes, chunk_key_encoding=chunk_key_encoding, compressors=compressors, + serializer=serializer, dimension_names=dimension_names, order=order, filters=filters, diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index c447596f06..ae0487e447 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -1,4 +1,4 @@ -from typing import Any, Literal +from typing import Any import hypothesis.extra.numpy as npst import hypothesis.strategies as st @@ -8,6 +8,7 @@ import zarr from zarr.core.array import Array +from zarr.core.common import ZarrFormat from zarr.core.sync import sync from zarr.storage import MemoryStore, StoreLike from zarr.storage.common import _dereference_path @@ -69,7 +70,7 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]: # So we map a clear to reset the store.
stores = st.builds(MemoryStore, st.just({})).map(lambda x: sync(x.clear())) compressors = st.sampled_from([None, "default"]) -zarr_formats: st.SearchStrategy[Literal[2, 3]] = st.sampled_from([2, 3]) +zarr_formats: st.SearchStrategy[ZarrFormat] = st.sampled_from([2, 3]) array_shapes = npst.array_shapes(max_dims=4, min_side=0) @@ -78,7 +79,7 @@ def numpy_arrays( draw: st.DrawFn, *, shapes: st.SearchStrategy[tuple[int, ...]] = array_shapes, - zarr_formats: st.SearchStrategy[Literal[2, 3]] = zarr_formats, + zarr_formats: st.SearchStrategy[ZarrFormat] = zarr_formats, ) -> Any: """ Generate numpy arrays that can be saved in the provided Zarr format. diff --git a/tests/test_array.py b/tests/test_array.py index c49d089bf7..72ff68d954 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -466,7 +466,7 @@ def test_vlen_errors() -> None: @pytest.mark.parametrize("zarr_format", [2, 3]) -def test_update_attrs(zarr_format: Literal[2, 3]) -> None: +def test_update_attrs(zarr_format: ZarrFormat) -> None: # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 store = MemoryStore() arr = zarr.create_array( @@ -799,7 +799,7 @@ def test_array_create_metadata_order_v2( @pytest.mark.parametrize("store", ["memory"], indirect=True) def test_array_create_order( order_config: MemoryOrder | None, - zarr_format: Literal[2, 3], + zarr_format: ZarrFormat, store: MemoryStore, ) -> None: """ diff --git a/tests/test_group.py b/tests/test_group.py index e9fea3ebed..6b3c40412e 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -1427,3 +1427,13 @@ def test_delitem_removes_children(store: Store, zarr_format: ZarrFormat) -> None del g1["0"] with pytest.raises(KeyError): g1["0/0"] + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +def test_deprecated_compressor(store: Store) -> None: + g = zarr.group(store=store, zarr_format=2) + with pytest.warns(UserWarning, match="The `compressor` argument is deprecated.*"): + a = 
g.create_array( + "foo", shape=(100,), chunks=(10,), dtype="i4", compressor={"id": "blosc"} + ) + assert a.metadata.compressor.codec_id == "blosc" From 0a8b91cd438c82791519f7e3f3e1fd6a66b50e6d Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 2 Jan 2025 19:22:49 +0100 Subject: [PATCH 85/85] docs --- src/zarr/api/synchronous.py | 4 +++- src/zarr/core/array.py | 2 +- src/zarr/core/group.py | 18 +++++++++--------- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index bb030cf97f..52815748ad 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -759,7 +759,9 @@ def create_array( overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: - """Create an ``Array``. This function wraps :mod:`zarr.core.array.create_array`. + """Create an array. + + This function wraps :func:`zarr.core.array.create_array`. Parameters ---------- diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 99e9d97c67..0a5b5f085a 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3677,7 +3677,7 @@ async def create_array( overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an ``AsyncArray``. + """Create an array. Parameters ---------- diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index ff21cd3fae..29b25689c4 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -1026,9 +1026,9 @@ async def create_array( overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """ - Create a Zarr array within this AsyncGroup. - This method lightly wraps ``zarr.core.array.create_array``. + """Create an array within this group. + + This method lightly wraps :func:`zarr.core.array.create_array`. 
Parameters ---------- @@ -2273,9 +2273,9 @@ def create_array( overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: - """ - Create a Zarr array within this AsyncGroup. - This method lightly wraps ``zarr.core.array.create_array``. + """Create an array within this group. + + This method lightly wraps :func:`zarr.core.array.create_array`. Parameters ---------- @@ -2658,9 +2658,9 @@ def array( config: ArrayConfig | ArrayConfigLike | None = None, data: npt.ArrayLike | None = None, ) -> Array: - """ - Create a Zarr array within this AsyncGroup. - This method lightly wraps ``zarr.core.array.create_array``. + """Create an array within this group. + + This method lightly wraps :func:`zarr.core.array.create_array`. Parameters ----------