Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: store learns to delete prefixes when overwriting/creating hierarchy nodes #2430

Merged
merged 19 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions src/zarr/abc/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,36 @@ def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
"""
...

async def delete_dir(self, prefix: str, recursive: bool = True) -> None:
"""
Remove all keys and prefixes in the store that begin with a given prefix.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would make life much easier for Store implementers if we declare that prefix will always be the path of a group or a array. Otherwise, all implementations need to deal with unnecessary edge cases like delete_dir("path/to/array/c/0") or worse.

Icechunk is an example of a store that could very cleanly and efficiently implement the case for a group/array prefix, but it would struggle to handle other prefixes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the use case for recursive = False ? If zarr itself doesn't have one, I'd remove the argument.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can see a use case for delete_dir("path/to/array/c/") (remove all chunks from an array) so I think we should allow that -- even if it makes life a little harder in icechunk.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's totally fine. Let's document the three possible prefix cases, and say that stores are allowed to do undefined behavior for other prefixes

"""
if not self.supports_deletes:
raise NotImplementedError
if not self.supports_listing:
raise NotImplementedError
self._check_writable()
if recursive:
if not prefix.endswith("/"):
prefix += "/"
async for key in self.list_prefix(prefix):
await self.delete(f"{key}")
jhamman marked this conversation as resolved.
Show resolved Hide resolved
else:
async for key in self.list_dir(prefix):
await self.delete(f"{prefix}/{key}")

async def delete_prefix(self, prefix: str) -> None:
jhamman marked this conversation as resolved.
Show resolved Hide resolved
"""
Remove all keys in the store that begin with a given prefix.
"""
if not self.supports_deletes:
raise NotImplementedError
if not self.supports_listing:
raise NotImplementedError
self._check_writable()
async for key in self.list_prefix(prefix):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we document what's the expected return value of list_prefix? Absolute vs. relative, final "/" or not, etc.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is done in the list_prefix docstring

await self.delete(f"{key}")

def close(self) -> None:
"""Close the store."""
self._is_open = False
Expand Down
10 changes: 3 additions & 7 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,7 @@ async def create(
dtype: npt.DTypeLike | None = None,
compressor: dict[str, JSON] | None = None, # TODO: default and type change
fill_value: Any | None = 0, # TODO: need type
order: MemoryOrder | None = None, # TODO: default change
order: MemoryOrder | None = None,
d-v-b marked this conversation as resolved.
Show resolved Hide resolved
store: str | StoreLike | None = None,
synchronizer: Any | None = None,
overwrite: bool = False,
Expand Down Expand Up @@ -761,6 +761,7 @@ async def create(
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Memory layout to be used within each chunk.
Default is set in Zarr's config (`array.order`).
store : Store or str
Store or path to directory in file system or name of zip file.
synchronizer : object, optional
Expand Down Expand Up @@ -834,12 +835,6 @@ async def create(
else:
chunk_shape = shape

if order is not None:
warnings.warn(
"order is deprecated, use config `array.order` instead",
DeprecationWarning,
stacklevel=2,
)
if synchronizer is not None:
warnings.warn("synchronizer is not yet implemented", RuntimeWarning, stacklevel=2)
if chunk_store is not None:
Expand Down Expand Up @@ -889,6 +884,7 @@ async def create(
codecs=codecs,
dimension_names=dimension_names,
attributes=attributes,
order=order,
**kwargs,
)

Expand Down
49 changes: 29 additions & 20 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
ZARRAY_JSON,
ZATTRS_JSON,
ChunkCoords,
MemoryOrder,
ShapeLike,
ZarrFormat,
concurrent_map,
Expand Down Expand Up @@ -203,29 +204,29 @@ class AsyncArray(Generic[T_ArrayMetadata]):
metadata: T_ArrayMetadata
store_path: StorePath
codec_pipeline: CodecPipeline = field(init=False)
order: Literal["C", "F"]
order: MemoryOrder

@overload
def __init__(
self: AsyncArray[ArrayV2Metadata],
metadata: ArrayV2Metadata | ArrayV2MetadataDict,
store_path: StorePath,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
) -> None: ...

@overload
def __init__(
self: AsyncArray[ArrayV3Metadata],
metadata: ArrayV3Metadata | ArrayV3MetadataDict,
store_path: StorePath,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
) -> None: ...

def __init__(
self,
metadata: ArrayMetadata | ArrayMetadataDict,
store_path: StorePath,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
) -> None:
if isinstance(metadata, dict):
zarr_format = metadata["zarr_format"]
Expand Down Expand Up @@ -261,7 +262,7 @@ async def create(
attributes: dict[str, JSON] | None = None,
chunks: ShapeLike | None = None,
dimension_separator: Literal[".", "/"] | None = None,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
filters: list[dict[str, JSON]] | None = None,
compressor: dict[str, JSON] | None = None,
# runtime
Expand Down Expand Up @@ -350,7 +351,7 @@ async def create(
# v2 only
chunks: ShapeLike | None = None,
dimension_separator: Literal[".", "/"] | None = None,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
filters: list[dict[str, JSON]] | None = None,
compressor: dict[str, JSON] | None = None,
# runtime
Expand Down Expand Up @@ -382,7 +383,7 @@ async def create(
# v2 only
chunks: ShapeLike | None = None,
dimension_separator: Literal[".", "/"] | None = None,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
filters: list[dict[str, JSON]] | None = None,
compressor: dict[str, JSON] | None = None,
# runtime
Expand Down Expand Up @@ -422,7 +423,6 @@ async def create(
V2 only. V3 arrays cannot have a dimension separator.
order : Literal["C", "F"], optional
The order of the array (default is None).
V2 only. V3 arrays should not have 'order' parameter.
filters : list[dict[str, JSON]], optional
The filters used to compress the data (default is None).
V2 only. V3 arrays should not have 'filters' parameter.
Expand Down Expand Up @@ -471,10 +471,6 @@ async def create(
raise ValueError(
"dimension_separator cannot be used for arrays with version 3. Use chunk_key_encoding instead."
)
if order is not None:
raise ValueError(
"order cannot be used for arrays with version 3. Use a transpose codec instead."
)
if filters is not None:
raise ValueError(
"filters cannot be used for arrays with version 3. Use array-to-array codecs instead."
Expand All @@ -494,6 +490,7 @@ async def create(
dimension_names=dimension_names,
attributes=attributes,
exists_ok=exists_ok,
order=order,
)
elif zarr_format == 2:
if dtype is str or dtype == "str":
Expand Down Expand Up @@ -545,6 +542,7 @@ async def _create_v3(
dtype: npt.DTypeLike,
chunk_shape: ChunkCoords,
fill_value: Any | None = None,
order: MemoryOrder | None = None,
chunk_key_encoding: (
ChunkKeyEncoding
| tuple[Literal["default"], Literal[".", "/"]]
Expand All @@ -556,7 +554,12 @@ async def _create_v3(
attributes: dict[str, JSON] | None = None,
exists_ok: bool = False,
) -> AsyncArray[ArrayV3Metadata]:
if not exists_ok:
if exists_ok:
if store_path.store.supports_deletes:
await store_path.delete_dir(recursive=True)
else:
await ensure_no_existing_node(store_path, zarr_format=3)
else:
jhamman marked this conversation as resolved.
Show resolved Hide resolved
await ensure_no_existing_node(store_path, zarr_format=3)

shape = parse_shapelike(shape)
Expand Down Expand Up @@ -588,7 +591,7 @@ async def _create_v3(
attributes=attributes or {},
)

array = cls(metadata=metadata, store_path=store_path)
array = cls(metadata=metadata, store_path=store_path, order=order)
await array._save_metadata(metadata, ensure_parents=True)
return array

Expand All @@ -602,16 +605,22 @@ async def _create_v2(
chunks: ChunkCoords,
dimension_separator: Literal[".", "/"] | None = None,
fill_value: None | float = None,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
filters: list[dict[str, JSON]] | None = None,
compressor: dict[str, JSON] | None = None,
attributes: dict[str, JSON] | None = None,
exists_ok: bool = False,
) -> AsyncArray[ArrayV2Metadata]:
if not exists_ok:
if exists_ok:
if store_path.store.supports_deletes:
await store_path.delete_dir(recursive=True)
else:
await ensure_no_existing_node(store_path, zarr_format=2)
else:
await ensure_no_existing_node(store_path, zarr_format=2)

if order is None:
order = "C"
order = parse_indexing_order(config.get("array.order"))

if dimension_separator is None:
dimension_separator = "."
Expand All @@ -627,7 +636,7 @@ async def _create_v2(
filters=filters,
attributes=attributes,
)
array = cls(metadata=metadata, store_path=store_path)
array = cls(metadata=metadata, store_path=store_path, order=order)
await array._save_metadata(metadata, ensure_parents=True)
return array

Expand Down Expand Up @@ -1179,7 +1188,7 @@ def create(
# v2 only
chunks: ChunkCoords | None = None,
dimension_separator: Literal[".", "/"] | None = None,
order: Literal["C", "F"] | None = None,
order: MemoryOrder | None = None,
filters: list[dict[str, JSON]] | None = None,
compressor: dict[str, JSON] | None = None,
# runtime
Expand Down Expand Up @@ -1370,7 +1379,7 @@ def store_path(self) -> StorePath:
return self._async_array.store_path

@property
def order(self) -> Literal["C", "F"]:
def order(self) -> MemoryOrder:
return self._async_array.order

@property
Expand Down
8 changes: 4 additions & 4 deletions src/zarr/core/array_spec.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Literal
from typing import TYPE_CHECKING, Any

import numpy as np

from zarr.core.common import parse_fill_value, parse_order, parse_shapelike
from zarr.core.common import MemoryOrder, parse_fill_value, parse_order, parse_shapelike

if TYPE_CHECKING:
from zarr.core.buffer import BufferPrototype
Expand All @@ -17,15 +17,15 @@ class ArraySpec:
shape: ChunkCoords
dtype: np.dtype[Any]
fill_value: Any
order: Literal["C", "F"]
order: MemoryOrder
prototype: BufferPrototype

def __init__(
self,
shape: ChunkCoords,
dtype: np.dtype[Any],
fill_value: Any,
order: Literal["C", "F"],
order: MemoryOrder,
prototype: BufferPrototype,
) -> None:
shape_parsed = parse_shapelike(shape)
Expand Down
21 changes: 8 additions & 13 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,13 @@ async def from_store(
zarr_format: ZarrFormat = 3,
) -> AsyncGroup:
store_path = await make_store_path(store)
if not exists_ok:

if exists_ok:
if store_path.store.supports_deletes:
await store_path.delete_dir(recursive=True)
else:
await ensure_no_existing_node(store_path, zarr_format=zarr_format)
else:
await ensure_no_existing_node(store_path, zarr_format=zarr_format)
attributes = attributes or {}
group = cls(
Expand Down Expand Up @@ -710,19 +716,8 @@ def _getitem_consolidated(

async def delitem(self, key: str) -> None:
store_path = self.store_path / key
if self.metadata.zarr_format == 3:
await (store_path / ZARR_JSON).delete()

elif self.metadata.zarr_format == 2:
await asyncio.gather(
(store_path / ZGROUP_JSON).delete(), # TODO: missing_ok=False
(store_path / ZARRAY_JSON).delete(), # TODO: missing_ok=False
(store_path / ZATTRS_JSON).delete(), # TODO: missing_ok=True
)

else:
raise ValueError(f"unexpected zarr_format: {self.metadata.zarr_format}")

await store_path.delete_dir(recursive=True)
if self.metadata.consolidated_metadata:
self.metadata.consolidated_metadata.metadata.pop(key, None)
await self._save_metadata()
Expand Down
8 changes: 4 additions & 4 deletions src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from zarr.core.array_spec import ArraySpec
from zarr.core.chunk_grids import RegularChunkGrid
from zarr.core.chunk_key_encodings import parse_separator
from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, parse_shapelike
from zarr.core.common import ZARRAY_JSON, ZATTRS_JSON, MemoryOrder, parse_shapelike
from zarr.core.config import config, parse_indexing_order
from zarr.core.metadata.common import parse_attributes

Expand All @@ -45,7 +45,7 @@ class ArrayV2Metadata(Metadata):
chunks: tuple[int, ...]
dtype: np.dtype[Any]
fill_value: None | int | float | str | bytes = 0
order: Literal["C", "F"] = "C"
order: MemoryOrder = "C"
filters: tuple[numcodecs.abc.Codec, ...] | None = None
dimension_separator: Literal[".", "/"] = "."
compressor: numcodecs.abc.Codec | None = None
Expand All @@ -59,7 +59,7 @@ def __init__(
dtype: npt.DTypeLike,
chunks: ChunkCoords,
fill_value: Any,
order: Literal["C", "F"],
order: MemoryOrder,
dimension_separator: Literal[".", "/"] = ".",
compressor: numcodecs.abc.Codec | dict[str, JSON] | None = None,
filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None,
Expand Down Expand Up @@ -185,7 +185,7 @@ def to_dict(self) -> dict[str, JSON]:
return zarray_dict

def get_chunk_spec(
self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
self, _chunk_coords: ChunkCoords, order: MemoryOrder, prototype: BufferPrototype
) -> ArraySpec:
return ArraySpec(
shape=self.chunks,
Expand Down
3 changes: 2 additions & 1 deletion src/zarr/core/metadata/v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
JSON,
ZARR_JSON,
ChunkCoords,
MemoryOrder,
parse_named_configuration,
parse_shapelike,
)
Expand Down Expand Up @@ -289,7 +290,7 @@ def ndim(self) -> int:
return len(self.shape)

def get_chunk_spec(
self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
self, _chunk_coords: ChunkCoords, order: MemoryOrder, prototype: BufferPrototype
) -> ArraySpec:
assert isinstance(
self.chunk_grid, RegularChunkGrid
Expand Down
12 changes: 12 additions & 0 deletions src/zarr/storage/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,18 @@ async def delete(self) -> None:
"""
await self.store.delete(self.path)

    async def delete_dir(self, recursive: bool = False) -> None:
        """
        Delete the hierarchy node at this path by removing the keys under
        this path's prefix, delegating to ``Store.delete_dir``.

        Parameters
        ----------
        recursive : bool, default False
            Forwarded to ``Store.delete_dir``.
        """
        await self.store.delete_dir(self.path, recursive=recursive)

    async def delete_prefix(self) -> None:
        """
        Delete all keys that begin with this path's prefix, delegating to
        ``Store.delete_prefix``.
        """
        await self.store.delete_prefix(self.path)

async def set_if_not_exists(self, default: Buffer) -> None:
"""
Store a key to ``value`` if the key is not already present.
Expand Down
Loading