diff --git a/docs/conf.py b/docs/conf.py index 17770124be..dfd1ae07bb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,8 +49,6 @@ "sphinx_copybutton", "sphinx_design", 'sphinx_reredirects', - "IPython.sphinxext.ipython_directive", - "IPython.sphinxext.ipython_console_highlighting", ] issues_github_path = "zarr-developers/zarr-python" diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index 4d1ad12abd..5b2a937972 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -3,30 +3,22 @@ Working with arrays =================== -.. ipython:: python - :suppress: - - rm -r data/ + >>> import shutil + >>> + >>> shutil.rmtree("./data") Creating an array ----------------- -Zarr has several functions for creating arrays. For example: - -.. ipython:: python +Zarr has several functions for creating arrays. For example:: - import zarr - - store = {} - # TODO: replace with `create_array` after #2463 - z = zarr.create( - store=store, - mode="w", - shape=(10000, 10000), - chunks=(1000, 1000), - dtype="i4" - ) - z + >>> import zarr + >>> + >>> store = {} + >>> # TODO: replace with `create_array` after #2463 + >>> z = zarr.create(store=store, mode="w", shape=(10000, 10000), chunks=(1000, 1000), dtype="i4") + >>> z + The code above creates a 2-dimensional array of 32-bit integers with 10000 rows and 10000 columns, divided into chunks where each chunk has 1000 rows and 1000 @@ -44,31 +36,39 @@ Reading and writing data Zarr arrays support a similar interface to `NumPy `_ arrays for reading and writing data. For example, the entire array can be filled -with a scalar value: - -.. ipython:: python +with a scalar value:: - z[:] = 42 + >>> z[:] = 42 -Regions of the array can also be written to, e.g.: +Regions of the array can also be written to, e.g.:: -.. 
ipython:: python - - import numpy as np - - z[0, :] = np.arange(10000) - z[:, 0] = np.arange(10000) + >>> import numpy as np + >>> + >>> z[0, :] = np.arange(10000) + >>> z[:, 0] = np.arange(10000) The contents of the array can be retrieved by slicing, which will load the -requested region into memory as a NumPy array, e.g.: - -.. ipython:: python - - z[0, 0] - z[-1, -1] - z[0, :] - z[:, 0] - z[:] +requested region into memory as a NumPy array, e.g.:: + + >>> z[0, 0] + array(0, dtype=int32) + >>> z[-1, -1] + array(42, dtype=int32) + >>> z[0, :] + array([ 0, 1, 2, ..., 9997, 9998, 9999], + shape=(10000,), dtype=int32) + >>> z[:, 0] + array([ 0, 1, 2, ..., 9997, 9998, 9999], + shape=(10000,), dtype=int32) + >>> z[:] + array([[ 0, 1, 2, ..., 9997, 9998, 9999], + [ 1, 42, 42, ..., 42, 42, 42], + [ 2, 42, 42, ..., 42, 42, 42], + ..., + [9997, 42, 42, ..., 42, 42, 42], + [9998, 42, 42, ..., 42, 42, 42], + [9999, 42, 42, ..., 42, 42, 42]], + shape=(10000, 10000), dtype=int32) Read more about NumPy-style indexing can be found in the `NumPy documentation `_. @@ -81,18 +81,10 @@ Persistent arrays In the examples above, compressed data for each chunk of the array was stored in main memory. Zarr arrays can also be stored on a file system, enabling persistence of data between sessions. To do this, we can change the store -argument to point to a filesystem path: +argument to point to a filesystem path:: -.. 
ipython:: python - - # TODO: replace with `open_array` after #2463 - z1 = zarr.open( - store='data/example-2.zarr', - mode='w', - shape=(10000, 10000), - chunks=(1000, 1000), - dtype='i4' - ) + >>> # TODO: replace with `open_array` after #2463 + >>> z1 = zarr.open(store='data/example-2.zarr', mode='w', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') The array above will store its configuration metadata and all compressed chunk data in a directory called ``'data/example-2.zarr'`` relative to the current working @@ -103,32 +95,28 @@ close an array: data are automatically flushed to disk, and files are automatically closed whenever an array is modified. Persistent arrays support the same interface for reading and writing data, -e.g.: - -.. ipython:: python +e.g.:: - z1[:] = 42 - z1[0, :] = np.arange(10000) - z1[:, 0] = np.arange(10000) + >>> z1[:] = 42 + >>> z1[0, :] = np.arange(10000) + >>> z1[:, 0] = np.arange(10000) -Check that the data have been written and can be read again: +Check that the data have been written and can be read again:: -.. ipython:: python - - # TODO: replace with `open_array` after #2463 - z2 = zarr.open('data/example-2.zarr', mode='r') - np.all(z1[:] == z2[:]) + >>> # TODO: replace with `open_array` after #2463 + >>> z2 = zarr.open('data/example-2.zarr', mode='r') + >>> np.all(z1[:] == z2[:]) + np.True_ If you are just looking for a fast and convenient way to save NumPy arrays to disk then load back into memory later, the functions :func:`zarr.save` and :func:`zarr.load` may be -useful. E.g.: - -.. ipython:: python +useful. E.g.:: - a = np.arange(10) - zarr.save('data/example-3.zarr', a) - zarr.load('data/example-3.zarr') + >>> a = np.arange(10) + >>> zarr.save('data/example-3.zarr', a) + >>> zarr.load('data/example-3.zarr') + array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) Please note that there are a number of other options for persistent array storage, see the :ref:`Storage Guide ` guide for more details. 
@@ -139,36 +127,34 @@ Resizing and appending ---------------------- A Zarr array can be resized, which means that any of its dimensions can be -increased or decreased in length. For example: +increased or decreased in length. For example:: -.. ipython:: python - - z = zarr.zeros( - store="data/example-4.zarr", - shape=(10000, 10000), - chunks=(1000, 1000) - ) - z[:] = 42 - z.shape - z.resize((20000, 10000)) - z.shape + >>> z = zarr.zeros(store="data/example-4.zarr", shape=(10000, 10000), chunks=(1000, 1000)) + >>> z[:] = 42 + >>> z.shape + (10000, 10000) + >>> z.resize((20000, 10000)) + >>> z.shape + (20000, 10000) Note that when an array is resized, the underlying data are not rearranged in any way. If one or more dimensions are shrunk, any chunks falling outside the new array shape will be deleted from the underlying store. :func:`zarr.Array.append` is provided as a convenience function, which can be -used to append data to any axis. E.g.: - -.. ipython:: python - - a = np.arange(10000000, dtype='i4').reshape(10000, 1000) - # TODO: replace with create_array after #2463 - z = zarr.array(store="data/example-5", data=a, chunks=(1000, 100)) - z.shape - z.append(a) - z.append(np.vstack([a, a]), axis=1) - z.shape +used to append data to any axis. E.g.:: + + >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(store="data/example-5", data=a, chunks=(1000, 100)) + >>> z.shape + (10000, 1000) + >>> z.append(a) + (20000, 1000) + >>> z.append(np.vstack([a, a]), axis=1) + (20000, 2000) + >>> z.shape + (20000, 2000) .. _user-guide-compress: @@ -179,17 +165,15 @@ A number of different compressors can be used with Zarr. A separate package called NumCodecs_ is available which provides a common interface to various compressor libraries including Blosc, Zstandard, LZ4, Zlib, BZ2 and LZMA. 
Different compressors can be provided via the ``compressor`` keyword -argument accepted by all array creation functions. For example: - -.. ipython:: python +argument accepted by all array creation functions. For example:: - from numcodecs import Blosc - - compressor = None # TODO: Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE) - data = np.arange(100000000, dtype='i4').reshape(10000, 10000) - # TODO: remove zarr_format and replace with create_array after #2463 - z = zarr.array(store="data/example-6.zarr", data=data, chunks=(1000, 1000), compressor=compressor, zarr_format=2) - None # TODO: z.compressor + >>> from numcodecs import Blosc + >>> + >>> compressor = None # TODO: Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE) + >>> data = np.arange(100000000, dtype='i4').reshape(10000, 10000) + >>> # TODO: remove zarr_format and replace with create_array after #2463 + >>> z = zarr.array(store="data/example-6.zarr", data=data, chunks=(1000, 1000), compressor=compressor, zarr_format=2) + >>> None # TODO: z.compressor This array above will use Blosc as the primary compressor, using the Zstandard algorithm (compression level 3) internally within Blosc, and with the @@ -197,18 +181,37 @@ bit-shuffle filter applied. When using a compressor, it can be useful to get some diagnostics on the compression ratio. Zarr arrays provide the :attr:`zarr.Array.info` property -which can be used to print useful diagnostics, e.g.: - -.. ipython:: python - - z.info +which can be used to print useful diagnostics, e.g.:: + + >>> z.info + Type : Array + Zarr format : 2 + Data type : int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : LocalStore + Filters : (Zstd(level=0),) + No. bytes : 400000000 (381.5M) The :func:`zarr.Array.info_complete` method inspects the underlying store and -prints additional diagnostics, e.g.: - -.. 
ipython:: python - - z.info_complete() +prints additional diagnostics, e.g.:: + + >>> z.info_complete() + Type : Array + Zarr format : 2 + Data type : int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : LocalStore + Filters : (Zstd(level=0),) + No. bytes : 400000000 (381.5M) + No. bytes stored : 299348462 + Storage ratio : 1.3 + Chunks Initialized : 100 .. note:: :func:`zarr.Array.info_complete` will inspect the underlying store and may @@ -222,67 +225,48 @@ fact a "meta-compressor", which means that it can use a number of different compression algorithms internally to compress the data. Blosc also provides highly optimized implementations of byte- and bit-shuffle filters, which can improve compression ratios for some data. A list of the internal compression -libraries available within Blosc can be obtained via: - -.. ipython:: python +libraries available within Blosc can be obtained via:: - from numcodecs import blosc - - blosc.list_compressors() + >>> from numcodecs import blosc + >>> + >>> blosc.list_compressors() + ['blosclz', 'lz4', 'lz4hc', 'zlib', 'zstd'] In addition to Blosc, other compression libraries can also be used. For example, -here is an array using Zstandard compression, level 1: - -.. 
ipython:: python +here is an array using Zstandard compression, level 1:: - from numcodecs import Zstd - # TODO: remove zarr_format and replace with create_array after #2463 - z = zarr.array( - store="data/example-7.zarr", - data=np.arange(100000000, dtype='i4').reshape(10000, 10000), - chunks=(1000, 1000), - compressor=Zstd(level=1), - zarr_format=2 # TODO: remove zarr_format - ) - None # TODO: z.compressor + >>> from numcodecs import Zstd + >>> # TODO: remove zarr_format and replace with create_array after #2463 + >>> z = zarr.array(store="data/example-7.zarr", data=np.arange(100000000, dtype='i4').reshape(10000, 10000), chunks=(1000, 1000), compressor=Zstd(level=1), zarr_format=2) + >>> None # TODO: z.compressor Here is an example using LZMA with a custom filter pipeline including LZMA's -built-in delta filter: +built-in delta filter:: -.. ipython:: python - - import lzma - from numcodecs import LZMA - - lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), dict(id=lzma.FILTER_LZMA2, preset=1)] - compressor = LZMA(filters=lzma_filters) - # TODO: remove zarr_format and replace with create_array after #2463 - z = zarr.array( - np.arange(100000000, dtype='i4').reshape(10000, 10000), - chunks=(1000, 1000), - compressor=compressor, - zarr_format=2 - ) - None # TODO: z.compressor + >>> import lzma + >>> from numcodecs import LZMA + >>> + >>> lzma_filters = [dict(id=lzma.FILTER_DELTA, dist=4), dict(id=lzma.FILTER_LZMA2, preset=1)] + >>> compressor = LZMA(filters=lzma_filters) + >>> # TODO: remove zarr_format and replace with create_array after #2463 + >>> z = zarr.array(np.arange(100000000, dtype='i4').reshape(10000, 10000), chunks=(1000, 1000), compressor=compressor, zarr_format=2) + >>> None # TODO: z.compressor The default compressor can be changed by setting the value of the using Zarr's -:ref:`user-guide-config`, e.g.: - -.. 
ipython:: python - - with zarr.config.set({'array.v2_default_compressor.numeric': 'blosc'}): - z = zarr.zeros(100000000, chunks=1000000, zarr_format=2) - z.metadata.filters - z.metadata.compressor +:ref:`user-guide-config`, e.g.:: + >>> with zarr.config.set({'array.v2_default_compressor.numeric': 'blosc'}): + ... z = zarr.zeros(100000000, chunks=1000000, zarr_format=2) + >>> z.metadata.filters + (Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),) + >>> z.metadata.compressor + >>> -To disable compression, set ``compressor=None`` when creating an array, e.g.: +To disable compression, set ``compressor=None`` when creating an array, e.g.:: -.. ipython:: python + >>> # TODO: remove zarr_format + >>> z = zarr.zeros(100000000, chunks=1000000, compressor=None, zarr_format=2) - # TODO: remove zarr_format - z = zarr.zeros(100000000, chunks=1000000, compressor=None, zarr_format=2) - z .. _user-guide-filters: Filters @@ -299,18 +283,27 @@ filter. However, to provide additional flexibility for implementing and using filters in combination with different compressors, Zarr also provides a mechanism for configuring filters outside of the primary compressor. -Here is an example using a delta filter with the Blosc compressor: - -.. 
ipython:: python - - from numcodecs import Blosc, Delta - - filters = [Delta(dtype='i4')] - compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE) - data = np.arange(100000000, dtype='i4').reshape(10000, 10000) - # TODO: remove zarr_format and replace with create_array after #2463 - z = zarr.array(data, chunks=(1000, 1000), filters=filters, compressor=compressor, zarr_format=2) - z.info +Here is an example using a delta filter with the Blosc compressor:: + + >>> from numcodecs import Blosc, Delta + >>> + >>> filters = [Delta(dtype='i4')] + >>> compressor = Blosc(cname='zstd', clevel=1, shuffle=Blosc.SHUFFLE) + >>> data = np.arange(100000000, dtype='i4').reshape(10000, 10000) + >>> # TODO: remove zarr_format and replace with create_array after #2463 + >>> z = zarr.array(data, chunks=(1000, 1000), filters=filters, compressor=compressor, zarr_format=2) + >>> z.info + Type : Array + Zarr format : 2 + Data type : int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : MemoryStore + Compressor : Blosc(cname='zstd', clevel=1, shuffle=SHUFFLE, blocksize=0) + Filters : (Delta(dtype='`_ documentation. @@ -334,91 +327,106 @@ Indexing with coordinate arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Items from a Zarr array can be extracted by providing an integer array of -coordinates. E.g.: - -.. ipython:: python +coordinates. E.g.:: - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(10) ** 2) - z[:] - z.get_coordinate_selection([2, 5]) + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(10) ** 2) + >>> z[:] + array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) + >>> z.get_coordinate_selection([2, 5]) + array([ 4, 25]) -Coordinate arrays can also be used to update data, e.g.: +Coordinate arrays can also be used to update data, e.g.:: -.. 
ipython:: python - - z.set_coordinate_selection([2, 5], [-1, -2]) - z[:] + >>> z.set_coordinate_selection([2, 5], [-1, -2]) + >>> z[:] + array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) For multidimensional arrays, coordinates must be provided for each dimension, -e.g.: - -.. ipython:: python - - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(15).reshape(3, 5)) - z[:] - z.get_coordinate_selection(([0, 2], [1, 3])) - z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) - z[:] +e.g.:: + + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z[:] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> z.get_coordinate_selection(([0, 2], [1, 3])) + array([ 1, 13]) + >>> z.set_coordinate_selection(([0, 2], [1, 3]), [-1, -2]) + >>> z[:] + array([[ 0, -1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -2, 14]]) For convenience, coordinate indexing is also available via the ``vindex`` -property, as well as the square bracket operator, e.g.: - -.. ipython:: python - - z.vindex[[0, 2], [1, 3]] - z.vindex[[0, 2], [1, 3]] = [-3, -4] - z[:] - z[[0, 2], [1, 3]] +property, as well as the square bracket operator, e.g.:: + + >>> z.vindex[[0, 2], [1, 3]] + array([-1, -2]) + >>> z.vindex[[0, 2], [1, 3]] = [-3, -4] + >>> z[:] + array([[ 0, -3, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -4, 14]]) + >>> z[[0, 2], [1, 3]] + array([-3, -4]) When the indexing arrays have different shapes, they are broadcast together. -That is, the following two calls are equivalent: - -.. ipython:: python +That is, the following two calls are equivalent:: - z[1, [1, 3]] - z[[1, 1], [1, 3]] + >>> z[1, [1, 3]] + array([6, 8]) + >>> z[[1, 1], [1, 3]] + array([6, 8]) Indexing with a mask array ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Items can also be extracted by providing a Boolean mask. E.g.: - -.. 
ipython:: python - - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(10) ** 2) - z[:] - sel = np.zeros_like(z, dtype=bool) - sel[2] = True - sel[5] = True - z.get_mask_selection(sel) - z.set_mask_selection(sel, [-1, -2]) - z[:] - -Here's a multidimensional example: - -.. ipython:: python - - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(15).reshape(3, 5)) - z[:] - sel = np.zeros_like(z, dtype=bool) - sel[0, 1] = True - sel[2, 3] = True - z.get_mask_selection(sel) - z.set_mask_selection(sel, [-1, -2]) - z[:] +Items can also be extracted by providing a Boolean mask. E.g.:: + + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(10) ** 2) + >>> z[:] + array([ 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]) + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[2] = True + >>> sel[5] = True + >>> z.get_mask_selection(sel) + array([ 4, 25]) + >>> z.set_mask_selection(sel, [-1, -2]) + >>> z[:] + array([ 0, 1, -1, 9, 16, -2, 36, 49, 64, 81]) + +Here's a multidimensional example:: + + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z[:] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> sel = np.zeros_like(z, dtype=bool) + >>> sel[0, 1] = True + >>> sel[2, 3] = True + >>> z.get_mask_selection(sel) + array([ 1, 13]) + >>> z.set_mask_selection(sel, [-1, -2]) + >>> z[:] + array([[ 0, -1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -2, 14]]) For convenience, mask indexing is also available via the ``vindex`` property, -e.g.: - -.. ipython:: python +e.g.:: - z.vindex[sel] - z.vindex[sel] = [-3, -4] - z[:] + >>> z.vindex[sel] + array([-1, -2]) + >>> z.vindex[sel] = [-3, -4] + >>> z[:] + array([[ 0, -3, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, -4, 14]]) Mask indexing is conceptually the same as coordinate indexing, and is implemented internally via the same machinery. 
Both styles of indexing allow @@ -430,120 +438,144 @@ Orthogonal indexing Zarr arrays also support methods for orthogonal indexing, which allows selections to be made along each dimension of an array independently. For example, this allows selecting a subset of rows and/or columns from a -2-dimensional array. E.g.: - -.. ipython:: python - - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(15).reshape(3, 5)) - z[:] - z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows - z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns - z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 4] - -Data can also be modified, e.g.: - -.. ipython:: python +2-dimensional array. E.g.:: + + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z[:] + array([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14]]) + >>> z.get_orthogonal_selection(([0, 2], slice(None))) # select first and third rows + array([[ 0, 1, 2, 3, 4], + [10, 11, 12, 13, 14]]) + >>> z.get_orthogonal_selection((slice(None), [1, 3])) # select second and fourth columns + array([[ 1, 3], + [ 6, 8], + [11, 13]]) + >>> z.get_orthogonal_selection(([0, 2], [1, 3])) # select rows [0, 2] and columns [1, 3] + array([[ 1, 3], + [11, 13]]) + +Data can also be modified, e.g.:: + + >>> z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) - z.set_orthogonal_selection(([0, 2], [1, 3]), [[-1, -2], [-3, -4]]) - z[:] For convenience, the orthogonal indexing functionality is also available via the -``oindex`` property, e.g.: - -.. 
ipython:: python - - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(15).reshape(3, 5)) - z.oindex[[0, 2], :] # select first and third rows - z.oindex[:, [1, 3]] # select second and fourth columns - z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 4] - z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] - z[:] +``oindex`` property, e.g.:: + + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> z.oindex[[0, 2], :] # select first and third rows + array([[ 0, 1, 2, 3, 4], + [10, 11, 12, 13, 14]]) + >>> z.oindex[:, [1, 3]] # select second and fourth columns + array([[ 1, 3], + [ 6, 8], + [11, 13]]) + >>> z.oindex[[0, 2], [1, 3]] # select rows [0, 2] and columns [1, 3] + array([[ 1, 3], + [11, 13]]) + >>> z.oindex[[0, 2], [1, 3]] = [[-1, -2], [-3, -4]] + >>> z[:] + array([[ 0, -1, 2, -2, 4], + [ 5, 6, 7, 8, 9], + [10, -3, 12, -4, 14]]) Any combination of integer, slice, 1D integer array and/or 1D Boolean array can be used for orthogonal indexing. If the index contains at most one iterable, and otherwise contains only slices and integers, -orthogonal indexing is also available directly on the array: +orthogonal indexing is also available directly on the array:: -.. ipython:: python - - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(15).reshape(3, 5)) - np.all(z.oindex[[0, 2], :] == z[[0, 2], :]) + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(15).reshape(3, 5)) + >>> np.all(z.oindex[[0, 2], :] == z[[0, 2], :]) + np.True_ Block Indexing ~~~~~~~~~~~~~~ Zarr also support block indexing, which allows selections of whole chunks based on their logical indices along each dimension of an array. For example, this allows selecting -a subset of chunk aligned rows and/or columns from a 2-dimensional array. 
E.g.:: - # TODO: replace with create_array after #2463 - z = zarr.array(np.arange(100).reshape(10, 10), chunks=(3, 3)) + >>> # TODO: replace with create_array after #2463 + >>> z = zarr.array(np.arange(100).reshape(10, 10), chunks=(3, 3)) -Retrieve items by specifying their block coordinates: +Retrieve items by specifying their block coordinates:: -.. ipython:: python + >>> z.get_block_selection(1) + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) - z.get_block_selection(1) +Equivalent slicing:: -Equivalent slicing: - -.. ipython:: python - - z[3:6] + >>> z[3:6] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) For convenience, the block selection functionality is also available via the -`blocks` property, e.g.: +`blocks` property, e.g.:: -.. ipython:: python - - z.blocks[1] + >>> z.blocks[1] + array([[30, 31, 32, 33, 34, 35, 36, 37, 38, 39], + [40, 41, 42, 43, 44, 45, 46, 47, 48, 49], + [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]]) Block index arrays may be multidimensional to index multidimensional arrays. -For example: - -.. ipython:: python +For example:: - z.blocks[0, 1:3] + >>> z.blocks[0, 1:3] + array([[ 3, 4, 5, 6, 7, 8], + [13, 14, 15, 16, 17, 18], + [23, 24, 25, 26, 27, 28]]) -Data can also be modified. Let's start by a simple 2D array: +Data can also be modified. Let's start by a simple 2D array:: -.. ipython:: python + >>> z = zarr.zeros((6, 6), dtype=int, chunks=2) - z = zarr.zeros((6, 6), dtype=int, chunks=2) +Set data for a selection of items:: -Set data for a selection of items: - -.. ipython:: python - - z.set_block_selection((1, 0), 1) - z[...] + >>> z.set_block_selection((1, 0), 1) + >>> z[...] 
+ array([[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]]) For convenience, this functionality is also available via the ``blocks`` property. -E.g.: - -.. ipython:: python - - z.blocks[:, 2] = 7 - z[...] - -Any combination of integer and slice can be used for block indexing: - -.. ipython:: python - - z.blocks[2, 1:3] - - # TODO: replace with create_group after #2463 - root = zarr.group('data/example-12.zarr') - foo = root.create_array(name='foo', shape=(1000, 100), chunks=(10, 10), dtype='f4') - bar = root.create_array(name='foo/bar', shape=(100,), dtype='i4') - foo[:, :] = np.random.random((1000, 100)) - bar[:] = np.arange(100) - root.tree() +E.g.:: + + >>> z.blocks[:, 2] = 7 + >>> z[...] + array([[0, 0, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [1, 1, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7], + [0, 0, 0, 0, 7, 7]]) + +Any combination of integer and slice can be used for block indexing:: + + >>> z.blocks[2, 1:3] + array([[0, 0, 7, 7], + [0, 0, 7, 7]]) + >>> + >>> # TODO: replace with create_group after #2463 + >>> root = zarr.group('data/example-12.zarr') + >>> foo = root.create_array(name='foo', shape=(1000, 100), chunks=(10, 10), dtype='f4') + >>> bar = root.create_array(name='foo/bar', shape=(100,), dtype='i4') + >>> foo[:, :] = np.random.random((1000, 100)) + >>> bar[:] = np.arange(100) + >>> root.tree() + / + └── foo (1000, 100) float32 + .. _user-guide-sharding: diff --git a/docs/user-guide/attributes.rst b/docs/user-guide/attributes.rst index 71f454f2b2..96b533b1c4 100644 --- a/docs/user-guide/attributes.rst +++ b/docs/user-guide/attributes.rst @@ -4,22 +4,27 @@ Working with attributes ======================= Zarr arrays and groups support custom key/value attributes, which can be useful for -storing application-specific metadata. For example: +storing application-specific metadata. For example:: -.. 
ipython:: python - - # TODO: replace with create_group after #2463 - root = zarr.group() - root.attrs['foo'] = 'bar' - z = root.zeros(name='zzz', shape=(10000, 10000)) - z.attrs['baz'] = 42 - z.attrs['qux'] = [1, 4, 7, 12] - sorted(root.attrs) - 'foo' in root.attrs - root.attrs['foo'] - sorted(z.attrs) - z.attrs['baz'] - z.attrs['qux'] + >>> import zarr + >>> # TODO: replace with create_group after #2463 + >>> root = zarr.group() + >>> root.attrs['foo'] = 'bar' + >>> z = root.zeros(name='zzz', shape=(10000, 10000)) + >>> z.attrs['baz'] = 42 + >>> z.attrs['qux'] = [1, 4, 7, 12] + >>> sorted(root.attrs) + ['foo'] + >>> 'foo' in root.attrs + True + >>> root.attrs['foo'] + 'bar' + >>> sorted(z.attrs) + ['baz', 'qux'] + >>> z.attrs['baz'] + 42 + >>> z.attrs['qux'] + [1, 4, 7, 12] Internally Zarr uses JSON to store array attributes, so attribute values must be JSON serializable. diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 4f4d3b17b3..6a72a1ea92 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -6,18 +6,16 @@ Runtime configuration The :mod:`zarr.core.config` module is responsible for managing the configuration of zarr and is based on the `donfig `_ Python library. -Configuration values can be set using code like the following: +Configuration values can be set using code like the following:: -.. ipython:: python - - import zarr - zarr.config.set({"array.order": "F"}) - -.. ipython:: python - :suppress: - - # revert this change so it doesn't impact the rest of the docs - zarr.config.set({"array.order": "C"}) + >>> import zarr + >>> + >>> zarr.config.set({"array.order": "F"}) + + >>> + >>> # revert this change so it doesn't impact the rest of the docs + >>> zarr.config.set({"array.order": "C"}) + Alternatively, configuration values can be set using environment variables, e.g. ``ZARR_ARRAY__ORDER=F``. 
@@ -40,10 +38,32 @@ first register the implementations in the registry and then select them in the c For example, an implementation of the bytes codec in a class "custompackage.NewBytesCodec", requires the value of ``codecs.bytes.name`` to be "custompackage.NewBytesCodec". -This is the current default configuration: - -.. ipython:: python - - import zarr - - zarr.config.pprint() +This is the current default configuration:: + + >>> zarr.config.pprint() + {'array': {'order': 'C', + 'v2_default_compressor': {'bytes': 'vlen-bytes', + 'numeric': 'zstd', + 'string': 'vlen-utf8'}, + 'v3_default_codecs': {'bytes': ['vlen-bytes'], + 'numeric': ['bytes', 'zstd'], + 'string': ['vlen-utf8']}, + 'write_empty_chunks': False}, + 'async': {'concurrency': 10, 'timeout': None}, + 'buffer': 'zarr.core.buffer.cpu.Buffer', + 'codec_pipeline': {'batch_size': 1, + 'path': 'zarr.core.codec_pipeline.BatchedCodecPipeline'}, + 'codecs': {'blosc': 'zarr.codecs.blosc.BloscCodec', + 'bytes': 'zarr.codecs.bytes.BytesCodec', + 'crc32c': 'zarr.codecs.crc32c_.Crc32cCodec', + 'endian': 'zarr.codecs.bytes.BytesCodec', + 'gzip': 'zarr.codecs.gzip.GzipCodec', + 'sharding_indexed': 'zarr.codecs.sharding.ShardingCodec', + 'transpose': 'zarr.codecs.transpose.TransposeCodec', + 'vlen-bytes': 'zarr.codecs.vlen_utf8.VLenBytesCodec', + 'vlen-utf8': 'zarr.codecs.vlen_utf8.VLenUTF8Codec', + 'zstd': 'zarr.codecs.zstd.ZstdCodec'}, + 'default_zarr_version': 3, + 'json_indent': 2, + 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', + 'threading': {'max_workers': None}} diff --git a/docs/user-guide/consolidated_metadata.rst b/docs/user-guide/consolidated_metadata.rst index 703f89bee4..33b9c3e40d 100644 --- a/docs/user-guide/consolidated_metadata.rst +++ b/docs/user-guide/consolidated_metadata.rst @@ -26,44 +26,41 @@ metadata reads get child Group or Array nodes will *not* require reads from the In Python, the consolidated metadata is available on the ``.consolidated_metadata`` attribute of the ``GroupMetadata`` object. 
-.. TODO: remove :okwarning: after warnings are removed - -.. ipython:: python - :okwarning: - - import zarr - store = zarr.storage.MemoryStore() - # TODO: replace with create_group after #2463 - group = zarr.open_group(store=store) - group.create_array(shape=(1,), name="a") - group.create_array(shape=(2, 2), name="b") - group.create_array(shape=(3, 3, 3), name="c") - zarr.consolidate_metadata(store) + >>> import zarr + >>> + >>> store = zarr.storage.MemoryStore() + >>> # TODO: replace with create_group after #2463 + >>> group = zarr.open_group(store=store) + >>> group.create_array(shape=(1,), name="a") + + >>> group.create_array(shape=(2, 2), name="b") + + >>> group.create_array(shape=(3, 3, 3), name="c") + + >>> zarr.consolidate_metadata(store) + If we open that group, the Group's metadata has a :class:`zarr.core.group.ConsolidatedMetadata` -that can be used. - -.. ipython:: python - - consolidated = zarr.open_group(store=store) - consolidated.metadata.consolidated_metadata.metadata - -Operations on the group to get children automatically use the consolidated metadata. - -.. 
ipython:: python +that can be used:: - consolidated["a"] # no read / HTTP request to the Store is required + >>> consolidated = zarr.open_group(store=store) + >>> consolidated_metadata = consolidated.metadata.consolidated_metadata.metadata + >>> dict(sorted(consolidated_metadata.items())) + {'a': ArrayV3Metadata(shape=(1,), data_type=, chunk_grid=RegularChunkGrid(chunk_shape=(1,)), chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), codecs=[BytesCodec(endian=), ZstdCodec(level=0, checksum=False)], attributes={}, dimension_names=None, zarr_format=3, node_type='array', storage_transformers=()), 'b': ArrayV3Metadata(shape=(2, 2), data_type=, chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)), chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), codecs=[BytesCodec(endian=), ZstdCodec(level=0, checksum=False)], attributes={}, dimension_names=None, zarr_format=3, node_type='array', storage_transformers=()), 'c': ArrayV3Metadata(shape=(3, 3, 3), data_type=, chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)), chunk_key_encoding=DefaultChunkKeyEncoding(name='default', separator='/'), fill_value=np.float64(0.0), codecs=[BytesCodec(endian=), ZstdCodec(level=0, checksum=False)], attributes={}, dimension_names=None, zarr_format=3, node_type='array', storage_transformers=())} -With nested groups, the consolidated metadata is available on the children, recursively. +Operations on the group to get children automatically use the consolidated metadata:: -.. 
ipython:: python - :okwarning: + >>> consolidated["a"] # no read / HTTP request to the Store is required + - child = group.create_group("child", attributes={"kind": "child"}) - grandchild = child.create_group("child", attributes={"kind": "grandchild"}) - consolidated = zarr.consolidate_metadata(store) +With nested groups, the consolidated metadata is available on the children, recursively:: - consolidated["child"].metadata.consolidated_metadata + >>> child = group.create_group("child", attributes={"kind": "child"}) + >>> grandchild = child.create_group("child", attributes={"kind": "grandchild"}) + >>> consolidated = zarr.consolidate_metadata(store) + >>> + >>> consolidated["child"].metadata.consolidated_metadata + ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata(metadata={}, kind='inline', must_understand=False), node_type='group')}, kind='inline', must_understand=False) Synchronization and Concurrency ------------------------------- diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index 858fc6cf6d..4807e28cc4 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -3,72 +3,68 @@ Working with groups =================== -.. ipython:: python - :suppress: - - rm -r data/ + >>> import shutil + >>> shutil.rmtree("./data") Zarr supports hierarchical organization of arrays via groups. As with arrays, groups can be stored in memory, on disk, or via other storage systems that support a similar interface. -To create a group, use the :func:`zarr.group` function: - -.. ipython:: python +To create a group, use the :func:`zarr.group` function:: - import zarr - - # TODO: replace with create_group after #2463 - root = zarr.group() - root + >>> import zarr + >>> + >>> # TODO: replace with create_group after #2463 + >>> root = zarr.group() + >>> root + Groups have a similar API to the Group class from `h5py -`_. 
For example, groups can contain other groups: - -.. ipython:: python +`_. For example, groups can contain other groups:: - foo = root.create_group('foo') - bar = foo.create_group('bar') + >>> foo = root.create_group('foo') + >>> bar = foo.create_group('bar') -Groups can also contain arrays, e.g.: +Groups can also contain arrays, e.g.:: -.. ipython:: python + >>> z1 = bar.zeros(name='baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') + >>> z1 + - z1 = bar.zeros(name='baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - z1 +Members of a group can be accessed via the suffix notation, e.g.:: -Members of a group can be accessed via the suffix notation, e.g.: - -.. ipython:: python - - root['foo'] + >>> root['foo'] + The '/' character can be used to access multiple levels of the hierarchy in one -call, e.g.: +call, e.g.:: -.. ipython:: python - - root['foo/bar'] - root['foo/bar/baz'] + >>> root['foo/bar'] + + >>> root['foo/bar/baz'] + The :func:`zarr.Group.tree` method can be used to print a tree -representation of the hierarchy, e.g.: - -.. ipython:: python +representation of the hierarchy, e.g.:: - root.tree() + >>> root.tree() + / + └── foo + └── bar + └── baz (10000, 10000) int32 + The :func:`zarr.open_group` function provides a convenient way to create or re-open a group stored in a directory on the file-system, with sub-groups stored in -sub-directories, e.g.: +sub-directories, e.g.:: -.. ipython:: python - - root = zarr.open_group('data/group.zarr', mode='w') - root - - z = root.zeros(name='foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') - z + >>> root = zarr.open_group('data/group.zarr', mode='w') + >>> root + + >>> + >>> z = root.zeros(name='foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4') + >>> z + .. TODO: uncomment after __enter__ and __exit__ are implemented .. Groups can be used as context managers (in a ``with`` statement). 
@@ -82,27 +78,61 @@ Array and group diagnostics --------------------------- Diagnostic information about arrays and groups is available via the ``info`` -property. E.g.: - -.. ipython:: python - - # TODO: replace with create_group after #2463 - root = zarr.group() - foo = root.create_group('foo') - bar = foo.zeros(name='bar', shape=1000000, chunks=100000, dtype='i8') - bar[:] = 42 - baz = foo.zeros(name='baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4') - baz[:] = 4.2 - root.info - foo.info - bar.info_complete() - baz.info - -Groups also have the :func:`zarr.Group.tree` method, e.g.: - -.. ipython:: python - - root.tree() +property. E.g.:: + + >>> # TODO: replace with create_group after #2463 + >>> root = zarr.group() + >>> foo = root.create_group('foo') + >>> bar = foo.zeros(name='bar', shape=1000000, chunks=100000, dtype='i8') + >>> bar[:] = 42 + >>> baz = foo.zeros(name='baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4') + >>> baz[:] = 4.2 + >>> root.info + Name : + Type : Group + Zarr format : 3 + Read-only : False + Store type : MemoryStore + >>> foo.info + Name : foo + Type : Group + Zarr format : 3 + Read-only : False + Store type : MemoryStore + >>> bar.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.int64 + Shape : (1000000,) + Chunk shape : (100000,) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 8000000 (7.6M) + No. bytes stored : 1432 + Storage ratio : 5586.6 + Chunks Initialized : 0 + >>> baz.info + Type : Array + Zarr format : 3 + Data type : DataType.float32 + Shape : (1000, 1000) + Chunk shape : (100, 100) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 4000000 (3.8M) + +Groups also have the :func:`zarr.Group.tree` method, e.g.:: + + >>> root.tree() + / + └── foo + ├── bar (1000000,) int64 + └── baz (1000, 1000) float32 + .. 
note:: diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index b36a4c04b7..54408526f6 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -1,12 +1,10 @@ user-guide-performance Optimizing performance -====================== +====================== -.. ipython:: python - :suppress: - - rm -r data/ + >>> import shutil + >>> shutil.rmtree("./data", ignore_errors=True) .. _user-guide-chunks: @@ -25,48 +23,43 @@ The optimal chunk shape will depend on how you want to access the data. E.g., for a 2-dimensional array, if you only ever take slices along the first dimension, then chunk across the second dimension. If you know you want to chunk across an entire dimension you can use ``None`` or ``-1`` within the ``chunks`` -argument, e.g.: - -.. ipython:: python - - import zarr +argument, e.g.:: - z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4') - z1.chunks + >>> import zarr + >>> + >>> z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4') + >>> z1.chunks + (100, 10000) Alternatively, if you only ever take slices along the second dimension, then -chunk across the first dimension, e.g.: +chunk across the first dimension, e.g.:: -.. ipython:: python - - z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4') - z2.chunks + >>> z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4') + >>> z2.chunks + (10000, 100) If you require reasonable performance for both access patterns then you need to -find a compromise, e.g.: - -.. ipython:: python +find a compromise, e.g.:: - z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') - z3.chunks + >>> z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') + >>> z3.chunks + (1000, 1000) If you are feeling lazy, you can let Zarr guess a chunk shape for your data by providing ``chunks=True``, although please note that the algorithm for guessing -a chunk shape is based on simple heuristics and may be far from optimal. 
E.g.: - -.. ipython:: python +a chunk shape is based on simple heuristics and may be far from optimal. E.g.:: - z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4') - z4.chunks + >>> z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4') + >>> z4.chunks + (625, 625) If you know you are always going to be loading the entire array into memory, you can turn off chunks by providing ``chunks=False``, in which case there will be -one single chunk for the array: +one single chunk for the array:: -.. ipython:: python - - z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4') - z5.chunks + >>> z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4') + >>> z5.chunks + (10000, 10000) .. _user-guide-chunks-order: @@ -76,17 +69,43 @@ Chunk memory layout The order of bytes **within each chunk** of an array can be changed via the ``order`` config option, to use either C or Fortran layout. For multi-dimensional arrays, these two layouts may provide different compression -ratios, depending on the correlation structure within the data. E.g.: - -.. ipython:: python - - a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T - # TODO: replace with create_array after #2463 - c = zarr.array(a, chunks=(1000, 1000)) - c.info_complete() - with zarr.config.set({'array.order': 'F'}): - f = zarr.array(a, chunks=(1000, 1000)) - f.info_complete() +ratios, depending on the correlation structure within the data. E.g.:: + + >>> import numpy as np + >>> + >>> a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T + >>> # TODO: replace with create_array after #2463 + >>> c = zarr.array(a, chunks=(1000, 1000)) + >>> c.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : C + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 400000000 (381.5M) + No. 
bytes stored : 342588717 + Storage ratio : 1.2 + Chunks Initialized : 100 + >>> with zarr.config.set({'array.order': 'F'}): + ... f = zarr.array(a, chunks=(1000, 1000)) + >>> f.info_complete() + Type : Array + Zarr format : 3 + Data type : DataType.int32 + Shape : (10000, 10000) + Chunk shape : (1000, 1000) + Order : F + Read-only : False + Store type : MemoryStore + Codecs : [{'endian': }, {'level': 0, 'checksum': False}] + No. bytes : 400000000 (381.5M) + No. bytes stored : 342588717 + Storage ratio : 1.2 + Chunks Initialized : 100 In the above example, Fortran order gives a better compression ratio. This is an artificial example but illustrates the general point that changing the order of @@ -112,45 +131,51 @@ If you know that your data will form chunks that are almost always non-empty, th In this case, creating an array with ``write_empty_chunks=True`` (the default) will instruct Zarr to write every chunk without checking for emptiness. The following example illustrates the effect of the ``write_empty_chunks`` flag on -the time required to write an array with different values.: - -.. ipython:: python - - import zarr - import numpy as np - import time - - def timed_write(write_empty_chunks): - """ - Measure the time required and number of objects created when writing - to a Zarr array with random ints or fill value. 
- """ - chunks = (8192,) - shape = (chunks[0] * 1024,) - data = np.random.randint(0, 255, shape) - dtype = 'uint8' - with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): - arr = zarr.open( - f"data/example-{write_empty_chunks}.zarr", - shape=shape, - chunks=chunks, - dtype=dtype, - fill_value=0, - mode='w' - ) - # initialize all chunks - arr[:] = 100 - result = [] - for value in (data, arr.fill_value): - start = time.time() - arr[:] = value - elapsed = time.time() - start - result.append((elapsed, arr.nchunks_initialized)) - return result - # log results - for write_empty_chunks in (True, False): - full, empty = timed_write(write_empty_chunks) - print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') +the time required to write an array with different values:: + + >>> import zarr + >>> import numpy as np + >>> import time + >>> + >>> def timed_write(write_empty_chunks): + ... """ + ... Measure the time required and number of objects created when writing + ... to a Zarr array with random ints or fill value. + ... """ + ... chunks = (8192,) + ... shape = (chunks[0] * 1024,) + ... data = np.random.randint(0, 255, shape) + ... dtype = 'uint8' + ... with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}): + ... arr = zarr.open( + ... f"data/example-{write_empty_chunks}.zarr", + ... shape=shape, + ... chunks=chunks, + ... dtype=dtype, + ... fill_value=0, + ... mode='w' + ... ) + ... # initialize all chunks + ... arr[:] = 100 + ... result = [] + ... for value in (data, arr.fill_value): + ... start = time.time() + ... arr[:] = value + ... elapsed = time.time() - start + ... result.append((elapsed, arr.nchunks_initialized)) + ... return result + ... # log results + >>> for write_empty_chunks in (True, False): + ... full, empty = timed_write(write_empty_chunks) + ... 
print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') + write_empty_chunks=True: + Random Data: ...s, 1024 objects stored + Empty Data: ...s, 1024 objects stored + + write_empty_chunks=False: + Random Data: ...s, 1024 objects stored + Empty Data: ...s, 0 objects stored + In this example, writing random data is slightly slower with ``write_empty_chunks=True``, but writing empty data is substantially faster and generates far fewer objects in storage. @@ -183,18 +208,18 @@ If an array or group is backed by a persistent store such as the a :class:`zarr. **are not** pickled. The only thing that is pickled is the necessary parameters to allow the store to re-open any underlying files or databases upon being unpickled. -E.g., pickle/unpickle an local store array: - -.. ipython:: python - - import pickle - - # TODO: replace with create_array after #2463 - z1 = zarr.array(store="data/example-2", data=np.arange(100000)) - s = pickle.dumps(z1) - z2 = pickle.loads(s) - z1 == z2 - np.all(z1[:] == z2[:]) +E.g., pickle/unpickle a local store array:: + + >>> import pickle + >>> + >>> # TODO: replace with create_array after #2463 + >>> z1 = zarr.array(store="data/example-2", data=np.arange(100000)) + >>> s = pickle.dumps(z1) + >>> z2 = pickle.loads(s) + >>> z1 == z2 + True + >>> np.all(z1[:] == z2[:]) + np.True_ .. _user-guide-tips-blosc: diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst index 17a04c4fae..741c8ccf83 100644 --- a/docs/user-guide/storage.rst +++ b/docs/user-guide/storage.rst @@ -16,25 +16,26 @@ Implicit Store Creation ----------------------- In most cases, it is not required to create a ``Store`` object explicitly. Passing a string -to Zarr's top level API will result in the store being created automatically. - -.. 
ipython:: python - - import zarr - - # Implicitly create a writable LocalStore - zarr.open_group("data/foo/bar", mode="w") - - # Implicitly create a read-only FsspecStore - zarr.open_group( - "s3://noaa-nwm-retro-v2-zarr-pds", - mode="r", - storage_options={"anon": True} - ) - - # Implicitly creates a MemoryStore - data = {} - zarr.open_group(data, mode="w") +to Zarr's top level API will result in the store being created automatically:: + + >>> import zarr + >>> + >>> # Implicitly create a writable LocalStore + >>> zarr.open_group("data/foo/bar", mode="w") + + >>> + >>> # Implicitly create a read-only FsspecStore + >>> zarr.open_group( + ... "s3://noaa-nwm-retro-v2-zarr-pds", + ... mode="r", + ... storage_options={"anon": True} + ... ) + > + >>> + >>> # Implicitly creates a MemoryStore + >>> data = {} + >>> zarr.open_group(data, mode="w") + Explicit Store Creation ----------------------- @@ -47,25 +48,23 @@ Local Store ~~~~~~~~~~~ The :class:`zarr.storage.LocalStore` stores data in a nested set of directories on a local -filesystem. +filesystem:: -.. ipython:: python - - store = zarr.storage.LocalStore("data/foo/bar", read_only=True) - # TODO: replace with create_group after #2463 - zarr.open(store=store, mode='r') + >>> store = zarr.storage.LocalStore("data/foo/bar", read_only=True) + >>> # TODO: replace with create_group after #2463 + >>> zarr.open(store=store, mode='r') + Zip Store ~~~~~~~~~ The :class:`zarr.storage.ZipStore` stores the contents of a Zarr hierarchy in a single -Zip file. The `Zip Store specification`_ is currently in draft form. - -.. ipython:: python +Zip file. 
The `Zip Store specification`_ is currently in draft form:: - store = zarr.storage.ZipStore("data.zip", mode="w") - # TODO: replace with create_array after #2463 - zarr.open(store=store, shape=(2,)) + >>> store = zarr.storage.ZipStore("data.zip", mode="w") + >>> # TODO: replace with create_array after #2463 + >>> zarr.open(store=store, shape=(2,)) + Remote Store ~~~~~~~~~~~~ @@ -75,29 +74,27 @@ logical layout as the ``LocalStore``, except the store is assumed to be on a rem such as cloud object storage (e.g. AWS S3, Google Cloud Storage, Azure Blob Store). The :class:`zarr.storage.FsspecStore` is backed by `fsspec`_ and can support any backend that implements the `AbstractFileSystem `_ -API. ``storage_options`` can be used to configure the fsspec backend. +API. ``storage_options`` can be used to configure the fsspec backend:: -.. ipython:: python - - store = zarr.storage.FsspecStore.from_url( - "s3://noaa-nwm-retro-v2-zarr-pds", - read_only=True, - storage_options={"anon": True} - ) - zarr.open_group(store=store, mode='r') + >>> store = zarr.storage.FsspecStore.from_url( + ... "s3://noaa-nwm-retro-v2-zarr-pds", + ... read_only=True, + ... storage_options={"anon": True} + ... ) + >>> zarr.open_group(store=store, mode='r') + > Memory Store ~~~~~~~~~~~~ The :class:`zarr.storage.MemoryStore` a in-memory store that allows for serialization of -Zarr data (metadata and chunks) to a dictionary. - -.. 
ipython:: python +Zarr data (metadata and chunks) to a dictionary:: - data = {} - store = zarr.storage.MemoryStore(data) - # TODO: replace with create_array after #2463 - zarr.open(store=store, shape=(2, )) + >>> data = {} + >>> store = zarr.storage.MemoryStore(data) + >>> # TODO: replace with create_array after #2463 + >>> zarr.open(store=store, shape=(2, )) + Developing custom stores ------------------------ diff --git a/pyproject.toml b/pyproject.toml index 36842ba927..f6875b7c63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,6 @@ docs = [ 'pydata-sphinx-theme', 'numpydoc', 'numcodecs[msgpack]', - 'ipython', 'rich', 's3fs', ] @@ -139,7 +138,7 @@ numpy = ["1.25", "2.1"] features = ["gpu"] [tool.hatch.envs.test.scripts] -run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=src" +run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=src --doctest-glob='*.rst'" run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=src" run = "run-coverage --no-cov" run-verbose = "run-coverage --verbose" @@ -352,7 +351,7 @@ ignore_errors = true [tool.pytest.ini_options] minversion = "7" -testpaths = ["tests"] +testpaths = ["tests", "docs"] log_cli_level = "INFO" xfail_strict = true asyncio_mode = "auto"