Skip to content

Commit c66f32b

Browse files
LDeakin and d-v-b authored
fix: implicit fill value initialisation (#2799)
* fix: implicit fill value initialisation - initialise empty chunks to the default fill value during writing - add default fill value for datetime, timedelta, structured data types * fmt * add "other" dtype test * changelog --------- Co-authored-by: Davis Bennett <davis.v.bennett@gmail.com>
1 parent 2f8b88a commit c66f32b

File tree

4 files changed

+45
-17
lines changed

4 files changed

+45
-17
lines changed

changes/2799.bugfix.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Initialise empty chunks to the default fill value during writing and add default fill values for datetime, timedelta, structured, and other (void* fixed size) data types

src/zarr/core/codec_pipeline.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,19 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[
5656
return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs]
5757

5858

59+
def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
60+
fill_value = chunk_spec.fill_value
61+
if fill_value is None:
62+
# Zarr V2 allowed `fill_value` to be null in the metadata.
63+
# Zarr V3 requires it to be set. This has already been
64+
# validated when decoding the metadata, but we support reading
65+
# Zarr V2 data and need to support the case where fill_value
66+
# is None.
67+
return _default_fill_value(dtype=chunk_spec.dtype)
68+
else:
69+
return fill_value
70+
71+
5972
@dataclass(frozen=True)
6073
class BatchedCodecPipeline(CodecPipeline):
6174
"""Default codec pipeline.
@@ -247,17 +260,7 @@ async def read_batch(
247260
if chunk_array is not None:
248261
out[out_selection] = chunk_array
249262
else:
250-
fill_value = chunk_spec.fill_value
251-
252-
if fill_value is None:
253-
# Zarr V2 allowed `fill_value` to be null in the metadata.
254-
# Zarr V3 requires it to be set. This has already been
255-
# validated when decoding the metadata, but we support reading
256-
# Zarr V2 data and need to support the case where fill_value
257-
# is None.
258-
fill_value = _default_fill_value(dtype=chunk_spec.dtype)
259-
260-
out[out_selection] = fill_value
263+
out[out_selection] = fill_value_or_default(chunk_spec)
261264
else:
262265
chunk_bytes_batch = await concurrent_map(
263266
[
@@ -284,10 +287,7 @@ async def read_batch(
284287
tmp = tmp.squeeze(axis=drop_axes)
285288
out[out_selection] = tmp
286289
else:
287-
fill_value = chunk_spec.fill_value
288-
if fill_value is None:
289-
fill_value = _default_fill_value(dtype=chunk_spec.dtype)
290-
out[out_selection] = fill_value
290+
out[out_selection] = fill_value_or_default(chunk_spec)
291291

292292
def _merge_chunk_array(
293293
self,
@@ -305,7 +305,7 @@ def _merge_chunk_array(
305305
shape=chunk_spec.shape,
306306
dtype=chunk_spec.dtype,
307307
order=chunk_spec.order,
308-
fill_value=chunk_spec.fill_value,
308+
fill_value=fill_value_or_default(chunk_spec),
309309
)
310310
else:
311311
chunk_array = existing_chunk_array.copy() # make a writable copy
@@ -394,7 +394,7 @@ async def _read_key(
394394
chunk_array_batch.append(None) # type: ignore[unreachable]
395395
else:
396396
if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
397-
chunk_spec.fill_value
397+
fill_value_or_default(chunk_spec)
398398
):
399399
chunk_array_batch.append(None)
400400
else:

src/zarr/core/metadata/v2.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,14 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
349349
return b""
350350
elif dtype.kind in "UO":
351351
return ""
352+
elif dtype.kind in "Mm":
353+
return dtype.type("nat")
354+
elif dtype.kind == "V":
355+
if dtype.fields is not None:
356+
default = tuple([_default_fill_value(field[0]) for field in dtype.fields.values()])
357+
return np.array([default], dtype=dtype)
358+
else:
359+
return np.zeros(1, dtype=dtype)
352360
else:
353361
return dtype.type(0)
354362

tests/test_v2.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,3 +313,22 @@ def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
313313
za[...] = a
314314
za = zarr.open_array(store=array_path)
315315
assert (a == za[:]).all()
316+
317+
318+
@pytest.mark.parametrize("fill_value", [None, b"x"], ids=["no_fill", "fill"])
319+
def test_other_dtype_roundtrip(fill_value, tmp_path) -> None:
320+
a = np.array([b"a\0\0", b"bb", b"ccc"], dtype="V7")
321+
array_path = tmp_path / "data.zarr"
322+
za = zarr.create(
323+
shape=(3,),
324+
store=array_path,
325+
chunks=(2,),
326+
fill_value=fill_value,
327+
zarr_format=2,
328+
dtype=a.dtype,
329+
)
330+
if fill_value is not None:
331+
assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all()
332+
za[...] = a
333+
za = zarr.open_array(store=array_path)
334+
assert (a == za[:]).all()

0 commit comments

Comments
 (0)