diff --git a/.travis.yml b/.travis.yml
index 5ecf462419..2c73212d0e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,6 +12,7 @@ addons:
     - libdb-dev
 
 services:
+  - docker
   - redis-server
   - mongodb
 
@@ -24,6 +25,10 @@ matrix:
       dist: xenial
       sudo: true
 
+before_install:
+  - docker pull arafato/azurite
+  - mkdir ~/blob_emulator
+  - docker run -e executable=blob -d -t -p 10000:10000 -v ~/blob_emulator:/opt/azurite/folder arafato/azurite
 
 before_script:
   - mongo mydb_test --eval 'db.createUser({user:"travis",pwd:"test",roles:["readWrite"]});'
diff --git a/appveyor.yml b/appveyor.yml
index 67058550dc..d04417d671 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -2,6 +2,10 @@ branches:
   only:
     - master
 
+# The VS C++ compiler path does not seem to exist in the PATH environment variable of
+# the Visual Studio 2017 build VM, which causes the pyosreplace package to fail to build.
+image: Visual Studio 2015
+
 environment:
 
   global:
@@ -9,6 +13,7 @@ environment:
     # /E:ON and /V:ON options are not enabled in the batch script intepreter
     # See: http://stackoverflow.com/a/13751649/163740
     CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\build.cmd"
+    EMULATOR_LOC: C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe
 
   matrix:
 
@@ -36,5 +41,11 @@ install:
 
 build: off
 
+before_test:
+  - '"%EMULATOR_LOC%" start'
+
 test_script:
   - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr"
+
+after_test:
+  - '"%EMULATOR_LOC%" stop'
diff --git a/docs/api/storage.rst b/docs/api/storage.rst
index 9abe240379..85d85f40aa 100644
--- a/docs/api/storage.rst
+++ b/docs/api/storage.rst
@@ -33,6 +33,8 @@ Storage (``zarr.storage``)
     .. automethod:: invalidate_values
     .. automethod:: invalidate_keys
 
+.. autoclass:: ABSStore
+
 .. autoclass:: ConsolidatedMetadataStore
 
 .. autofunction:: init_array
diff --git a/docs/release.rst b/docs/release.rst
index 65bd94c45f..23cc70c267 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -9,6 +9,9 @@ Release notes
 Enhancements
 ~~~~~~~~~~~~
 
+* New storage backend, backed by Azure Blob Storage, class :class:`zarr.storage.ABSStore`.
+  All data is stored as block blobs. By :user:`Shikhar Goenka ` and :user:`Tim Crone `, :issue:`345`.
+
 * Add "consolidated" metadata as an experimental feature: use
   :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various
   metadata keys within a dataset hierarchy under a single key, and
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 3e8e9bac66..8eae734b59 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -787,6 +787,21 @@ Here is an example using S3Map to read an array created previously::
     >>> z[:].tostring()
     b'Hello from the cloud!'
 
+Zarr now also has a built-in storage backend for Azure Blob Storage.
+The class is :class:`zarr.storage.ABSStore` (requires
+`azure-storage-blob `_
+to be installed)::
+
+    >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', blob_service_kwargs={'is_emulated': True})
+    >>> root = zarr.group(store=store, overwrite=True)
+    >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')
+    >>> z[:] = 42
+
+When using an actual storage account, provide ``account_name`` and
+``account_key`` arguments to :class:`zarr.storage.ABSStore`; the
+example above runs against the local storage emulator. Please also
+note that this is an experimental feature.
+
 Note that retrieving data from a remote service via the network can be significantly
 slower than retrieving data from a local file system, and will depend on network latency
 and bandwidth between the client and server systems. If you are experiencing poor
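For a real (non-emulated) storage account the pattern is the same as the emulator
example in the tutorial hunk above; the following is only an illustrative sketch,
and the account, key and container names are placeholders rather than anything
defined in this changeset::

    import zarr

    # placeholder credentials; substitute your own storage account details
    store = zarr.ABSStore(container='my-container', prefix='zarr-data',
                          account_name='my-account', account_key='...')
    root = zarr.group(store=store, overwrite=True)
    z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')
    z[:] = 42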
diff --git a/requirements_test.txt b/requirements_test.txt
index a668f130cc..1492a673c1 100644
--- a/requirements_test.txt
+++ b/requirements_test.txt
@@ -9,3 +9,4 @@ pytest-cov
 s3fs
 setuptools-scm
 tox
+azure-storage-blob
diff --git a/zarr/__init__.py b/zarr/__init__.py
index e208b8ae82..178e857983 100644
--- a/zarr/__init__.py
+++ b/zarr/__init__.py
@@ -8,7 +8,7 @@
                          ones_like, full_like, open_array, open_like, create)
 from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore,
                           NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore,
-                          LRUStoreCache, RedisStore, MongoDBStore)
+                          LRUStoreCache, ABSStore, RedisStore, MongoDBStore)
 from zarr.hierarchy import group, open_group, Group
 from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
 from zarr.codecs import *
diff --git a/zarr/storage.py b/zarr/storage.py
index d71ee3a18a..e6fd98705a 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -1879,6 +1879,153 @@ def __delitem__(self, key):
         self._invalidate_value(key)
 
 
+class ABSStore(MutableMapping):
+    """Storage class using Azure Blob Storage (ABS).
+
+    Parameters
+    ----------
+    container : string
+        The name of the ABS container to use.
+    prefix : string
+        Location of the "directory" to use as the root of the storage hierarchy
+        within the container.
+    account_name : string
+        The Azure blob storage account name.
+    account_key : string
+        The Azure blob storage account access key.
+    blob_service_kwargs : dictionary
+        Extra arguments to be passed into the azure blob client; for example, when
+        using the emulator, pass in blob_service_kwargs={'is_emulated': True}.
+
+    Notes
+    -----
+    In order to use this store, you must install the Microsoft Azure Storage SDK for Python.
+    """
+
+    def __init__(self, container, prefix, account_name=None, account_key=None,
+                 blob_service_kwargs=None):
+        from azure.storage.blob import BlockBlobService
+        self.container = container
+        self.prefix = normalize_storage_path(prefix)
+        self.account_name = account_name
+        self.account_key = account_key
+        if blob_service_kwargs is not None:
+            self.blob_service_kwargs = blob_service_kwargs
+        else:  # pragma: no cover
+            self.blob_service_kwargs = dict()
+        self.client = BlockBlobService(self.account_name, self.account_key,
+                                       **self.blob_service_kwargs)
+
+    # needed for pickling
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state['client']
+        return state
+
+    def __setstate__(self, state):
+        from azure.storage.blob import BlockBlobService
+        self.__dict__.update(state)
+        self.client = BlockBlobService(self.account_name, self.account_key,
+                                       **self.blob_service_kwargs)
+
+    @staticmethod
+    def _append_path_to_prefix(path, prefix):
+        return '/'.join([normalize_storage_path(prefix),
+                         normalize_storage_path(path)])
+
+    @staticmethod
+    def _strip_prefix_from_path(path, prefix):
+        # normalized things will not have any leading or trailing slashes
+        path_norm = normalize_storage_path(path)
+        prefix_norm = normalize_storage_path(prefix)
+        return path_norm[(len(prefix_norm)+1):]
+
+    def __getitem__(self, key):
+        from azure.common import AzureMissingResourceHttpError
+        blob_name = '/'.join([self.prefix, key])
+        try:
+            blob = self.client.get_blob_to_bytes(self.container, blob_name)
+            return blob.content
+        except AzureMissingResourceHttpError:
+            raise KeyError('Blob %s not found' % blob_name)
+
+    def __setitem__(self, key, value):
+        value = ensure_bytes(value)
+        blob_name = '/'.join([self.prefix, key])
+        self.client.create_blob_from_bytes(self.container, blob_name, value)
+
+    def __delitem__(self, key):
+        from azure.common import AzureMissingResourceHttpError
+        try:
+            self.client.delete_blob(self.container, '/'.join([self.prefix, key]))
+        except AzureMissingResourceHttpError:
+            raise KeyError('Blob %s not found' % key)
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, ABSStore) and
+            self.container == other.container and
+            self.prefix == other.prefix
+        )
+
+    def keys(self):
+        return list(self.__iter__())
+
+    def __iter__(self):
+        for blob in self.client.list_blobs(self.container, self.prefix + '/'):
+            yield self._strip_prefix_from_path(blob.name, self.prefix)
+
+    def __len__(self):
+        return len(self.keys())
+
+    def __contains__(self, key):
+        blob_name = '/'.join([self.prefix, key])
+        if self.client.exists(self.container, blob_name):
+            return True
+        else:
+            return False
+
+    def listdir(self, path=None):
+        store_path = normalize_storage_path(path)
+        # prefix is normalized to not have a trailing slash
+        dir_path = self.prefix
+        if store_path:
+            dir_path = dir_path + '/' + store_path
+        dir_path += '/'
+        items = list()
+        for blob in self.client.list_blobs(self.container, prefix=dir_path, delimiter='/'):
+            if '/' in blob.name[len(dir_path):]:
+                items.append(self._strip_prefix_from_path(
+                    blob.name[:blob.name.find('/', len(dir_path))], dir_path))
+            else:
+                items.append(self._strip_prefix_from_path(blob.name, dir_path))
+        return items
+
+    def rmdir(self, path=None):
+        dir_path = normalize_storage_path(self._append_path_to_prefix(path, self.prefix)) + '/'
+        for blob in self.client.list_blobs(self.container, prefix=dir_path):
+            self.client.delete_blob(self.container, blob.name)
+
+    def getsize(self, path=None):
+        store_path = normalize_storage_path(path)
+        fs_path = self.prefix
+        if store_path:
+            fs_path = self._append_path_to_prefix(store_path, self.prefix)
+        if self.client.exists(self.container, fs_path):
+            return self.client.get_blob_properties(self.container,
+                                                   fs_path).properties.content_length
+        else:
+            size = 0
+            for blob in self.client.list_blobs(self.container, prefix=fs_path + '/',
+                                               delimiter='/'):
+                if '/' not in blob.name[len(fs_path + '/'):]:
+                    size += blob.properties.content_length
+            return size
+
+    def clear(self):
+        self.rmdir()
+
+
 class SQLiteStore(MutableMapping):
     """Storage class using SQLite.
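Two details of the class above are worth illustrating: keys are mapped to block
blob names by joining the store prefix and the key with '/', and the
BlockBlobService client is dropped on pickling and rebuilt from the stored
account details. A minimal sketch, assuming a local storage emulator is running
and using illustrative container/prefix names::

    import pickle

    from azure.storage.blob import BlockBlobService
    from zarr.storage import ABSStore

    # the container must already exist (the tests create it the same way)
    BlockBlobService(is_emulated=True).create_container('test')

    store = ABSStore(container='test', prefix='zarr-testing',
                     blob_service_kwargs={'is_emulated': True})

    # __setitem__ joins prefix and key, so this writes a block blob named
    # 'zarr-testing/foo/bar/0.0' in the 'test' container
    store['foo/bar/0.0'] = b'\x00' * 100
    assert 'foo/bar/0.0' in store
    assert store.listdir('foo') == ['bar']

    # __getstate__ removes the client before pickling; __setstate__ rebuilds it,
    # so a store can be shipped to worker processes
    store2 = pickle.loads(pickle.dumps(store))
    assert store2 == store                  # __eq__ compares container and prefix
    assert store2['foo/bar/0.0'] == store['foo/bar/0.0']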
diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
index 1c7d526c0c..d8f566fe3c 100644
--- a/zarr/tests/test_core.py
+++ b/zarr/tests/test_core.py
@@ -12,11 +12,12 @@
 import numpy as np
 from numpy.testing import assert_array_equal, assert_array_almost_equal
 import pytest
+from azure.storage.blob import BlockBlobService
 
 from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore,
-                          DBMStore, LMDBStore, SQLiteStore, atexit_rmtree, atexit_rmglob,
-                          LRUStoreCache)
+                          DBMStore, LMDBStore, SQLiteStore, ABSStore, atexit_rmtree,
+                          atexit_rmglob, LRUStoreCache)
 from zarr.core import Array
 from zarr.errors import PermissionError
 from zarr.compat import PY2, text_type, binary_type, zip_longest
@@ -1322,6 +1323,28 @@ def test_nbytes_stored(self):
         assert expect_nbytes_stored == z.nbytes_stored
 
 
+class TestArrayWithABSStore(TestArray):
+
+    @staticmethod
+    def absstore():
+        blob_client = BlockBlobService(is_emulated=True)
+        blob_client.delete_container('test')
+        blob_client.create_container('test')
+        store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo',
+                         account_key='bar', blob_service_kwargs={'is_emulated': True})
+        store.rmdir()
+        return store
+
+    def create_array(self, read_only=False, **kwargs):
+        store = self.absstore()
+        kwargs.setdefault('compressor', Zlib(1))
+        cache_metadata = kwargs.pop('cache_metadata', True)
+        cache_attrs = kwargs.pop('cache_attrs', True)
+        init_array(store, **kwargs)
+        return Array(store, read_only=read_only, cache_metadata=cache_metadata,
+                     cache_attrs=cache_attrs)
+
+
 class TestArrayWithNestedDirectoryStore(TestArrayWithDirectoryStore):
 
     @staticmethod
diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index 369cf4b55a..37baecf1ae 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -13,12 +13,13 @@
 import numpy as np
 from numpy.testing import assert_array_equal
 import pytest
+from azure.storage.blob import BlockBlobService
 
 from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array,
                           array_meta_key, group_meta_key, atexit_rmtree,
                           NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore,
-                          atexit_rmglob, LRUStoreCache)
+                          ABSStore, atexit_rmglob, LRUStoreCache)
 from zarr.core import Array
 from zarr.compat import PY2, text_type
 from zarr.hierarchy import Group, group, open_group
@@ -864,6 +865,19 @@ def create_store():
         return store, None
 
 
+class TestGroupWithABSStore(TestGroup):
+
+    @staticmethod
+    def create_store():
+        blob_client = BlockBlobService(is_emulated=True)
+        blob_client.delete_container('test')
+        blob_client.create_container('test')
+        store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo',
+                         account_key='bar', blob_service_kwargs={'is_emulated': True})
+        store.rmdir()
+        return store, None
+
+
 class TestGroupWithNestedDirectoryStore(TestGroup):
 
     @staticmethod
diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index 87273d140c..5dc0e72c44 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -14,14 +14,15 @@
 import numpy as np
 from numpy.testing import assert_array_equal, assert_array_almost_equal
 import pytest
+from azure.storage.blob import BlockBlobService
 
 from zarr.storage import (init_array, array_meta_key, attrs_key, DictStore,
                           DirectoryStore, ZipStore, init_group, group_meta_key, getsize,
                           migrate_1to2, TempStore, atexit_rmtree,
                           NestedDirectoryStore, default_compressor, DBMStore,
-                          LMDBStore, SQLiteStore, MongoDBStore, RedisStore,
-                          atexit_rmglob, LRUStoreCache, ConsolidatedMetadataStore)
+                          LMDBStore, SQLiteStore, ABSStore, atexit_rmglob, LRUStoreCache,
+                          ConsolidatedMetadataStore, MongoDBStore, RedisStore)
 from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT,
                        decode_group_metadata, encode_group_metadata)
 from zarr.compat import PY2
@@ -1370,6 +1371,18 @@ def test_format_compatibility():
         assert compressor.get_config() == z.compressor.get_config()
 
 
+class TestABSStore(StoreTests, unittest.TestCase):
+
+    def create_store(self):
+        blob_client = BlockBlobService(is_emulated=True)
+        blob_client.delete_container('test')
+        blob_client.create_container('test')
+        store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo',
+                         account_key='bar', blob_service_kwargs={'is_emulated': True})
+        store.rmdir()
+        return store
+
+
 class TestConsolidatedMetadataStore(unittest.TestCase):
 
     def test_bad_format(self):
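All three new test classes assume a local blob emulator is available (Azurite
started via Docker on Travis, the Azure Storage Emulator on AppVeyor, per the CI
changes above). The sketch below reproduces their shared setup interactively,
reusing the container and prefix values from the test code; it is only an
illustration of what the fixtures do, not code from this changeset::

    from azure.storage.blob import BlockBlobService
    from zarr.storage import ABSStore

    # same setup as the new create_store()/absstore() helpers: start from a
    # clean 'test' container on the emulator
    blob_client = BlockBlobService(is_emulated=True)
    blob_client.delete_container('test')
    blob_client.create_container('test')

    store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo',
                     account_key='bar', blob_service_kwargs={'is_emulated': True})
    store.rmdir()

    # the basic MutableMapping behaviour that StoreTests then exercises
    store['foo'] = b'hello'
    assert 'foo' in store
    assert store['foo'] == b'hello'
    del store['foo']
    assert len(store) == 0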