diff --git a/.gitignore b/.gitignore index d2d6f360b5d..e8f59a8999c 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,5 @@ nosetests.xml .mr.developer.cfg .project .pydevproject + +doc/_build diff --git a/README.md b/README.md index df617932e0b..b79cd9e0343 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,76 @@ -scidata -======= - -Objects for holding self describing scientific data in python. The goal of this project is to -provide a Common Data Model (http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM/) -allowing users to read write and manipulate netcdf-like data without worrying about where the data -source lives. A dataset that is too large to fit in memory, served from an OpenDAP server, streamed -or stored as NetCDF3, NetCDF4, grib (?), HDF5 and others can all be inspected and manipulated using -the same methods. - -Of course there are already several packages in python that offer similar functionality (netCDF4, -scipy.io, pupynere, iris, ... ) but each of those packages have their own shortcomings: - -netCDF4 - Doesn't allow streaming. If you want to create a new object it needs to live on disk. -scipy.io / pupynere - Only works with NetCDF3 and doesn't support DAP making it difficult to work with large datasets. -iris - is REALLY close to what this project will provide, but iris strays further from the CDM, - than I would like. (if you read then write a netcdf file using iris all global attributes - are pushed down to variable level attributes. +# xray: transparently manipulate scientific datasets in Python + +**xray** is a Python package for working with aligned sets of homogeneous, +n-dimensional arrays. It implements flexible array operations and dataset +manipulation for in-memory datasets within the [Common Data Model][cdm] widely +used for self-describing scientific data (netCDF, OpenDAP, etc.). + +***Warning: xray is still in its early development phase. Expect the API to +change.*** + +## Main Features + + - A `DatasetArray` object that is compatible with NumPy's ndarray and ufuncs + but keeps ancillary variables and metadata intact. + - Array broadcasting based on dimension names and coordinate indices + instead of only shapes. + - Flexible split-apply-combine functionality with the `Array.groupby` method + (patterned after [pandas][pandas]). + - Fast label-based indexing and (limited) time-series functionality built on + [pandas][pandas]. + +## Design Goals + + - Provide a data analysis toolkit as fast and powerful as pandas but + designed for working with datasets of aligned, homogeneous N-dimensional + arrays. + - Whenever possible, build on top of and interoperate with pandas and the + rest of the awesome [scientific python stack][scipy]. + - Provide a uniform API for loading and saving scientific data in a variety + of formats (including streaming data). + - Use metadata according to [conventions][cf] when appropriate, but don't + strictly enforce them. Conflicting attributes (e.g., units) should be + silently dropped instead of causing errors. The onus is on the user to + make sure that operations make sense. + +## Prior Art + + - [Iris][iris] (supported by the UK Met Office) is a similar package + designed for working with geophysical datasets in Python. Iris provided + much of the inspiration for xray (e.g., xray's `DatasetArray` is largely + based on the Iris `Cube`), but it has several limitations that led us to + build xray instead of extending Iris: + 1. 
Iris has essentially one first-class object (the `Cube`) on which it + attempts to build all functionality (`Coord` supports a much more + limited set of functionality). xray has its equivalent of the Cube + (the `DatasetArray` object), but it is only a thin wrapper around the more + primitive building blocks of Dataset and Array objects. + 2. Iris has a strict interpretation of [CF conventions][cf], which, + although a principled choice, we have found to be impractical for + everyday use. With Iris, every quantity has physical (SI) units, all + coordinates have cell-bounds, and all metadata (units, cell-bounds and + other attributes) is required to match before merging or doing + operations on multiple cubes. This means that a lot of time with + Iris is spent figuring out why cubes are incompatible and explicitly + removing possibly conflicting metadata. + 3. Iris can be slow and complex. Strictly interpreting metadata requires + a lot of work, and (in our experience) it can be difficult to build mental + models of how Iris functions work. Moreover, it means that a lot of + logic (e.g., constraint handling) uses non-vectorized operations. For + example, extracting all times within a range can be surprisingly slow + (e.g., 0.3 seconds vs 3 milliseconds in xray to select along a time + dimension with 10000 elements). + - [pandas][pandas] is fast and powerful but oriented around working with + tabular datasets. pandas has experimental N-dimensional panels, but they + don't support aligned math with other objects. We believe the + `DatasetArray`/`Cube` model is better suited to working with scientific + datasets. We use pandas internally in xray to support fast indexing. + - [netCDF4-python][nc4] provides xray's primary interface for working with + netCDF and OpenDAP datasets. + +[pandas]: http://pandas.pydata.org/ +[cdm]: http://www.unidata.ucar.edu/software/thredds/current/netcdf-java/CDM/ +[cf]: http://cf-pcmdi.llnl.gov/documents/cf-conventions/1.6/cf-conventions.html +[scipy]: http://scipy.org/ +[nc4]: http://netcdf4-python.googlecode.com/svn/trunk/docs/netCDF4-module.html +[iris]: http://scitools.org.uk/iris/ diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 00000000000..78e298c02b5 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,177 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scidata.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scidata.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/scidata" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/scidata" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 00000000000..b3774bfff87 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,272 @@ +# -*- coding: utf-8 -*- +# +# xray documentation build configuration file, created by +# sphinx-quickstart on Thu Feb 6 18:57:54 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.viewcode', + 'numpydoc', +] + +numpydoc_class_members_toctree = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'xray' +copyright = u'2014, Stephan Hoyer and Alex Kleeman' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1-dev' +# The full version, including alpha/beta/rc tags. +release = '0.1-dev' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'xraydoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'xray.tex', u'xray Documentation', + u'Stephan Hoyer and Alex Kleeman', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'xray', u'xray Documentation', + [u'Stephan Hoyer and Alex Kleeman'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'xray', u'xray Documentation', + u'Stephan Hoyer and Alex Kleeman', 'xray', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 00000000000..29639696b24 --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,24 @@ +.. scidata documentation master file, created by + sphinx-quickstart on Thu Feb 6 18:57:54 2014. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +xray reference +============== + +Contents: + +.. toctree:: + :maxdepth: 2 + +.. automodule:: scidata + :members: + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/setup.py b/setup.py index 7743a55f8c5..18b124cc994 100644 --- a/setup.py +++ b/setup.py @@ -5,14 +5,15 @@ except: from distutils.core import setup -setup(name='scidata', - version='0.01', +setup(name='xray', + version='0.1-dev', description='Objects for holding self describing scientific data in python', - author='Alex Kleeman', + author='Stephan Hoyer, Alex Kleeman, Eugene Brevdo', author_email='TODO', - install_requires=['scipy >= 0.10.0', 'numpy >= 1.7', 'netCDF4 >= 1.0.6'], + install_requires=['scipy >= 0.10.0', 'numpy >= 1.8', 'netCDF4 >= 1.0.6', + 'pandas >= 0.13.1'], tests_require=['nose >= 1.0'], url='https://github.com/akleeman/scidata', test_suite='nose.collector', - packages=['scidata'], + packages=['xray'], package_dir={'': 'src'}) diff --git a/src/scidata/__init__.py b/src/scidata/__init__.py deleted file mode 100644 index 628427b3273..00000000000 --- a/src/scidata/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from data import Dataset, open_dataset -from variable import Variable - -import backends diff --git a/src/scidata/backends.py b/src/scidata/backends.py deleted file mode 100644 index b548a4a46a1..00000000000 --- a/src/scidata/backends.py +++ /dev/null @@ -1,189 +0,0 @@ -import netCDF4 as nc4 - -from scipy.io import netcdf -from collections import OrderedDict - -import variable - - -class InMemoryDataStore(object): - """ - Stores dimensions, variables and attributes - in ordered dictionaries, making this store - fast compared to stores which store to disk. 
- """ - def __init__(self): - self.dimensions = OrderedDict() - self.variables = OrderedDict() - self.attributes = variable.AttributesDict() - - def unchecked_set_dimensions(self, dimensions): - """Set the dimensions without checking validity""" - self.dimensions.update(dimensions) - - def unchecked_set_attributes(self, attributes): - """Set the attributes without checking validity""" - self.attributes.update(attributes) - - def unchecked_set_variables(self, variables): - """Set the variables without checking validity""" - self.variables.update(variables) - - def unchecked_create_dimension(self, name, length): - """Set a dimension length""" - self.dimensions[name] = length - - def unchecked_add_variable(self, name, variable): - """Add a variable without checks""" - self.variables[name] = variable - return self.variables[name] - - def sync(self): - pass - - -class ScipyVariable(variable.Variable): - def __init__(self, scipy_var): - self._dimensions = scipy_var.dimensions - self._data = scipy_var.data - self._attributes = scipy_var._attributes - - -class ScipyDataStore(object): - """ - Stores data using the scipy.io.netcdf package. - This store has the advantage of being able to - be initialized with a StringIO object, allow for - serialization. - """ - def __init__(self, fobj, *args, **kwdargs): - self.ds = netcdf.netcdf_file(fobj, *args, **kwdargs) - - @property - def variables(self): - return OrderedDict((k, ScipyVariable(v)) - for k, v in self.ds.variables.iteritems()) - - @property - def attributes(self): - return self.ds._attributes - - @property - def dimensions(self): - return self.ds.dimensions - - def unchecked_set_dimensions(self, dimensions): - """Set the dimensions without checking validity""" - for d, l in dimensions.iteritems(): - self.unchecked_create_dimension(d, l) - - def unchecked_set_attributes(self, attributes): - """Set the attributes without checking validity""" - for k, v in attributes.iteritems(): - setattr(self.ds, k, v) - - def unchecked_set_variables(self, variables): - """Set the variables without checking validity""" - for vn, v in variables.iteritems(): - self.unchecked_add_variable(vn, v) - - def unchecked_create_dimension(self, name, length): - """Set a dimension length""" - self.ds.createDimension(name, length) - - def unchecked_add_variable(self, name, variable): - """Add a variable without checks""" - self.ds.createVariable(name, variable.dtype, - variable.dimensions) - self.ds.variables[name][:] = variable.data[:] - for k, v in variable.attributes.iteritems(): - setattr(self.ds.variables[name], k, v) - return variable #self.ds.variables[name] - # return self.ds.variables[name] - - def sync(self): - self.ds.flush() - - -class NetCDF4Variable(variable.Variable): - def __init__(self, nc4_variable): - self._nc4_variable = nc4_variable - self._dimensions = nc4_variable.dimensions - self._data = nc4_variable - self._attributes = None - - @property - def attributes(self): - if self._attributes is None: - # we don't want to see scale_factor and add_offset in the attributes - # since the netCDF4 package automatically scales the data on read. - # If we kept scale_factor and add_offset around and did this: - # - # foo = ncdf4.Dataset('foo.nc') - # ncdf4.dump(foo, 'bar.nc') - # bar = ncdf4.Dataset('bar.nc') - # - # you would find that any packed variables in the original - # netcdf file would now have been scaled twice! 
- packing_attributes = ['scale_factor', 'add_offset'] - keys = [k for k in self._nc4_variable.ncattrs() - if not k in packing_attributes] - attr_dict = variable.AttributesDict( - (k, self._nc4_variable.getncattr(k)) for k in keys) - self._attributes = attr_dict - return self._attributes - - -class NetCDF4DataStore(object): - def __init__(self, filename, *args, **kwdargs): - self.ds = nc4.Dataset(filename, *args, **kwdargs) - - @property - def variables(self): - return OrderedDict((k, NetCDF4Variable(v)) - for k, v in self.ds.variables.iteritems()) - - @property - def attributes(self): - return variable.AttributesDict((k, self.ds.getncattr(k)) - for k in self.ds.ncattrs()) - - @property - def dimensions(self): - return OrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) - - def unchecked_set_dimensions(self, dimensions): - """Set the dimensions without checking validity""" - for d, l in dimensions.iteritems(): - self.unchecked_create_dimension(d, l) - - def unchecked_set_attributes(self, attributes): - """Set the attributes without checking validity""" - self.ds.setncatts(attributes) - - def unchecked_set_variables(self, variables): - """Set the variables without checking validity""" - for vn, v in variables.iteritems(): - self.unchecked_add_variable(vn, v) - - def unchecked_create_dimension(self, name, length): - """Set a dimension length""" - self.ds.createDimension(name, size=length) - - def unchecked_add_variable(self, name, variable): - """Add a variable without checks""" - # netCDF4 will automatically assign a fill value - # depending on the datatype of the variable. Here - # we let the package handle the _FillValue attribute - # instead of setting it ourselves. - fill_value = variable.attributes.pop('_FillValue', None) - self.ds.createVariable(varname=name, - datatype=variable.dtype, - dimensions=variable.dimensions, - fill_value=fill_value) - self.ds.variables[name][:] = variable.data[:] - self.ds.variables[name].setncatts(variable.attributes) - return variable #self.ds.variables[name] - - def sync(self): - self.ds.sync() diff --git a/src/scidata/data.py b/src/scidata/data.py deleted file mode 100644 index 7347ee321cb..00000000000 --- a/src/scidata/data.py +++ /dev/null @@ -1,891 +0,0 @@ -# TODO Use various backend data stores. pytable, ncdf4, scipy.io, iris, memory - -import os -import copy -import numpy as np -import netCDF4 as nc4 - -from cStringIO import StringIO -from collections import OrderedDict - -import conventions, backends, variable, utils - -date2num = nc4.date2num -num2date = nc4.num2date - - -def construct_dimensions(variables): - """ - Given a dictionary of variables, construct a dimensions mapping - - Parameters - ---------- - variables : mapping - Mapping from variable names to Variable objects. - - Returns - ------- - dimensions : mapping - Mapping from dimension names to lengths. - - Raises - ------ - ValueError if variable dimensions are inconsistent. - """ - dimensions = OrderedDict() - for k, var in variables.iteritems(): - for dim, length in zip(var.dimensions, var.shape): - if dim not in dimensions: - dimensions[dim] = length - elif dimensions[dim] != length: - raise ValueError('dimension %r on variable %r has length %s ' - 'but already is saved with length %s' % - (dim, k, length, dimensions[dim])) - return dimensions - - -def check_dims_and_vars_consistency(dimensions, variables): - """ - Validate dimensions and variables are consistent - - Parameters - ---------- - dimensions : mapping - Mapping from dimension names to lengths. 
- variables : mapping - Mapping from variable names to Variable objects. - - Raises - ------ - ValueError if variable dimensions are inconsistent with the provided - dimensions. - """ - for k, var in variables.iteritems(): - if k in dimensions and var.ndim != 1: - raise ValueError('a coordinate variable must be defined with ' - '1-dimensional data') - for dim, length in zip(var.dimensions, var.shape): - if dim not in dimensions: - raise ValueError('dimension %r on variable %r is not one ' - 'of the dataset dimensions %r' % - (dim, k, list(dimensions))) - elif dimensions[dim] != length: - raise ValueError('dimension %r on variable %r has length ' - '%s but in on the dataset has length %s' % - (dim, k, length, dimensions[dim])) - - -def open_dataset(nc, *args, **kwargs): - #TODO: add tests for this function - # move this to a classmethod Dataset.open? - if isinstance(nc, basestring) and not nc.startswith('CDF'): - # If the initialization nc is a string and it doesn't - # appear to be the contents of a netcdf file we load - # it using the netCDF4 package - store = backends.NetCDF4DataStore(nc, *args, **kwargs) - else: - # If nc is a file-like object we read it using - # the scipy.io.netcdf package - store = backends.ScipyDataStore(nc, *args, **kwargs) - return Dataset(store=store) - - -class Dataset(object): - """ - A netcdf-like data object consisting of dimensions, variables and - attributes which together form a self describing data set - - Dataset objects can also be treated as a mapping from variable names to - Variable objects. - - They should be modified by using methods, not by directly changing any of - the attributes listed below: - TODO: change this! - - Attributes - ---------- - dimensions : {name: length, ...} - variables : {name: variable, ...} - coordinates : {name: variable, ...} - Coordinates are simply variables that are also dimensions. They must - all have dimension 1. - noncoordinates : {name: variable, ...} - Variables that are not coordinates. - attributes : dict-like - store : baackends.*DataStore - """ - def __init__(self, variables=None, dimensions=None, attributes=None, - store=None, check_consistency=True): - """ - If dimensions are not provided, they are inferred from the variables. - - Otherwise, variables and dimensions are only checked for consistency - if check_dimensions=True. 
- """ - # TODO: fill out this docstring - if store is None: - store = backends.InMemoryDataStore() - object.__setattr__(self, 'store', store) - - if attributes is not None: - self._unchecked_set_attributes(attributes) - - if dimensions is not None: - self._unchecked_set_dimensions(dimensions) - - if variables is not None: - if dimensions is None: - self._unchecked_set_dimensions(construct_dimensions(variables)) - elif check_consistency: - check_dims_and_vars_consistency(dimensions, variables) - self._unchecked_set_variables(variables) - - def _unchecked_set_dimensions(self, *args, **kwdargs): - self.store.unchecked_set_dimensions(*args, **kwdargs) - - def _unchecked_set_attributes(self, *args, **kwdargs): - self.store.unchecked_set_attributes(*args, **kwdargs) - - def _unchecked_set_variables(self, *args, **kwdargs): - self.store.unchecked_set_variables(*args, **kwdargs) - - def _unchecked_create_dimension(self, *args, **kwdargs): - self.store.unchecked_create_dimension(*args, **kwdargs) - - def _unchecked_add_variable(self, *args, **kwdargs): - return self.store.unchecked_add_variable(*args, **kwdargs) - - def sync(self): - return self.store.sync() - - @property - def variables(self): - return self.store.variables - - @property - def attributes(self): - return self.store.attributes - - @property - def dimensions(self): - return self.store.dimensions - - def copy(self): - """ - Returns a shallow copy of the current object. - """ - return self.__copy__() - - def __copy__(self): - """ - Returns a shallow copy of the current object. - """ - return type(self)(self.variables, self.dimensions, self.attributes, - check_consistency=False) - - def __setattr__(self, attr, value): - """"__setattr__ is overloaded to prevent operations that could - cause loss of data consistency. If you really intend to update - dir(self), use the self.__dict__.update method or the - super(type(a), self).__setattr__ method to bypass.""" - #TODO: remove this hack? - raise AttributeError("__setattr__ is disabled") - - def __contains__(self, key): - """ - The 'in' operator will return true or false depending on - whether 'key' is a varibale in the data object or not. - """ - return key in self.variables - - def __iter__(self): - return iter(self.variables) - - def __getitem__(self, key): - return self.variables[key] - - def __setitem__(self, key, value): - return self.add_variable(key, value) - - def __delitem__(self, key): - # does deleting variables make sense for all backends? 
- raise NotImplementedError - - def __eq__(self, other): - try: - # some stores (e.g., scipy) do not seem to preserve order, so don't - # require matching dimension or variable order for equality - return (dict(self.dimensions) == dict(other.dimensions) - and self.attributes == other.attributes - and all(k1 == k2 and utils.variable_equal(v1, v2) - for (k1, v1), (k2, v2) - in zip(dict(self.variables).items(), - dict(other.variables).items()))) - except AttributeError: - return False - - def __ne__(self, other): - return not self == other - - @property - def coordinates(self): - """Coordinates are variables with names that match dimensions""" - return OrderedDict([(dim, self.variables[dim]) - for dim in self.dimensions - if dim in self.variables and - self.variables[dim].data.ndim == 1 and - self.variables[dim].dimensions == (dim,)]) - - @property - def noncoordinates(self): - """Non-coordinates are variables with names that do not match - dimensions - """ - return OrderedDict([(name, v) - for (name, v) in self.variables.iteritems() - if name not in self.coordinates]) - - def stored_to(self, store): - """ - Store dataset contents to a backends.*DataStore object and return a new - dataset with the contents of the store - """ - target = type(self)(self.variables, self.dimensions, self.attributes, - store=store, check_consistency=False) - target.store.sync() - return target - - def dump(self, filepath, *args, **kwdargs): - """ - Dump dataset contents to a location on disk using the netCDF4 package - """ - nc4_store = backends.NetCDF4DataStore(filepath, mode='w', - *args, **kwdargs) - self.stored_to(nc4_store) - - def dumps(self): - """ - Serialize dataset contents to a string. The serialization creates an - in memory netcdf version 3 string using the scipy.io.netcdf package. - """ - fobj = StringIO() - scipy_store = backends.ScipyDataStore(fobj, mode='w') - self.stored_to(scipy_store) - return fobj.getvalue() - - def __str__(self): - """Create a ncdump-like summary of the object""" - summary = ["dimensions:"] - # prints dims that look like: - # dimension = length - dim_print = lambda d, l : "\t%s = %s" % (conventions.pretty_print(d, 30), - conventions.pretty_print(l, 10)) - # add each dimension to the summary - summary.extend([dim_print(d, l) for d, l in self.dimensions.iteritems()]) - - # Print variables - summary.append("variables:") - for vname, var in self.variables.iteritems(): - # this looks like: - # dtype name(dim1, dim2) - summary.append("\t%s %s(%s)" % (conventions.pretty_print(var.dtype, 8), - conventions.pretty_print(vname, 20), - conventions.pretty_print(', '.join(var.dimensions), 45))) - # attribute:value - summary.extend(["\t\t%s:%s" % (conventions.pretty_print(att, 30), - conventions.pretty_print(val, 30)) - for att, val in var.attributes.iteritems()]) - - summary.append("attributes:") - # attribute:value - summary.extend(["\t%s:%s" % (conventions.pretty_print(att, 30), - conventions.pretty_print(val, 30)) - for att, val in self.attributes.iteritems()]) - # create the actual summary - return '\n'.join(summary) - - def __repr__(self): - dim_summary = ', '.join('%s: %s' % (k, v) for k, v - in self.dimensions.iteritems()) - vars_summary = ' '.join(map(str, self.noncoordinates)) - return '' % (dim_summary, vars_summary) - - def create_dimension(self, name, length): - """Adds a dimension with name dim and length to the object - - Parameters - ---------- - name : string - The name of the new dimension. An exception will be raised if the - object already has a dimension with this name. 
- length : int - The length of the new dimension; must a be non-negative integer. - """ - if name in self.dimensions: - raise ValueError('dimension named %r already exists' % name) - elif not isinstance(length, int): - raise TypeError('length must be an integer') - elif length < 0: - raise ValueError('length must be non-negative') - self._unchecked_create_dimension(name, int(length)) - - def create_variable(self, name, dims, data, attributes=None): - """Create a new variable. - - Parameters - ---------- - name : string - The name of the new variable. An exception will be raised - if the object already has a variable with this name. name - must satisfy netCDF-3 naming rules. If name equals the name - of a dimension, then the new variable is treated as a - coordinate variable and must be 1-dimensional. - dims : tuple - The dimensions of the new variable. Elements must be dimensions of - the object. - data : numpy.ndarray - Data to populate the new variable. - attributes : dict_like or None, optional - Attributes to assign to the new variable. If None (default), an - empty attribute dictionary is initialized. - - Returns - ------- - var : Variable - Reference to the newly created variable. - """ - # any error checking should be taken care of by add_variable - v = variable.Variable(dims, np.asarray(data), attributes) - return self.add_variable(name, v) - - def create_coordinate(self, name, data, attributes=None): - """Create a new dimension and a corresponding coordinate variable. - - This method combines the create_dimension and create_variable methods - for the common case when the variable is a 1-dimensional coordinate - variable with the same name as the dimension. - - Parameters - ---------- - name : string - The name of the new dimension and variable. An exception will be - raised if the object already has a dimension or variable with this - name. - data : array_like - The coordinate values along this dimension; must be 1-dimensional. - The size of data is the length of the new dimension. - attributes : dict_like or None, optional - Attributes to assign to the new variable. If None (default), an - empty attribute dictionary is initialized. - - Returns - ------- - var : Variable - Reference to the newly created coordinate variable. - """ - # We need to be cleanly roll back the effects of - # create_dimension if create_variable fails, otherwise we will - # end up in a partial state. - if name in self.dimensions: - raise ValueError("dimension named '%s' already exists" % name) - var = variable.Variable((name,), np.asarray(data), attributes) - if var.ndim != 1: - raise ValueError("coordinate data must be 1-dimensional (vector)") - self._unchecked_create_dimension(name, var.size) - return self._unchecked_add_variable(name, var) - - def add_variable(self, name, var): - """Add a variable to the dataset - - Parameters - ---------- - name : string - The name under which the variable will be added. - variable : variable.Variable - The variable to be added. If the desired action is to add a copy of - the variable be sure to do so before passing it to this function. 
- - Returns - ------- - variable - The variable object in the underlying datastore - """ - if name in self.variables: - raise ValueError("Variable named %r already exists" % name) - check_dims_and_vars_consistency(self.dimensions, {name: var}) - return self._unchecked_add_variable(name, var) - - def views(self, slicers): - """Return a new object whose contents are a view of a slice from the - current object along a specified dimension - - Parameters - ---------- - slicers : {dim: slice, ...} - A dictionary mapping from dimensions to integers or slice objects. - - Returns - ------- - obj : Data object - The returned object has the same attributes, dimensions, - variable names and variable attributes as the original. - Variables that are not defined along the specified - dimensions are viewed in their entirety. Variables that are - defined along the specified dimension have their data - contents taken along the specified dimension. - - Care must be taken since modifying (most) values in the returned - object will result in modification to the parent object. - - See Also - -------- - view - numpy.take - Variable.take - """ - if not all(k in self.dimensions for k in slicers): - invalid = [k for k in slicers if not k in self.dimensions] - raise KeyError("dimensions %r don't exist" % invalid) - - # slice all variables - variables = OrderedDict() - for (name, var) in self.variables.iteritems(): - var_slicers = dict((k, v) for k, v in slicers.iteritems() - if k in var.dimensions) - variables[name] = var.views(var_slicers) - - def search_dim_len(dim, variables): - # loop through the variables to find the dimension length, or if - # the dimension is not found, return None - for var in variables.values(): - if dim in var.dimensions: - return int(var.shape[var.dimensions.index(dim)]) - return None - - # update dimensions - dimensions = OrderedDict() - for dim in self.dimensions: - new_len = search_dim_len(dim, variables) - if new_len is not None: - # dimension length is defined by a new dataset variable - dimensions[dim] = new_len - elif search_dim_len(dim, self.variables) is None: - # dimension length is also not defined by old dataset variables - # note: dimensions only defined in old dataset variables are be - # dropped - if dim not in slicers: - dimensions[dim] = self.dimensions[dim] - else: - # figure it by slicing temporary coordinate data - temp_data = np.arange(self.dimensions[dim]) - temp_data_sliced = temp_data[slicers[dim]] - new_len = temp_data_sliced.size - if new_len > 0 and temp_data_sliced.ndim > 0: - # drop the dimension if the result of getitem is an - # integer (dimension 0) - dimensions[dim] = new_len - - return type(self)(variables, dimensions, self.attributes, - check_consistency=False) - - def view(self, s, dim): - """Return a new object whose contents are a view of a slice from the - current object along a specified dimension - - Parameters - ---------- - s : slice - The slice representing the range of the values to extract. - dim : string, optional - The dimension to slice along. - - Returns - ------- - obj : Data object - The returned object has the same attributes, dimensions, - variable names and variable attributes as the original. - Variables that are not defined along the specified - dimensions are viewed in their entirety. Variables that are - defined along the specified dimension have their data - contents taken along the specified dimension. - - Care must be taken since modifying (most) values in the returned - object will result in modification to the parent object. 
- - See Also - -------- - views - numpy.take - Variable.take - """ - return self.views({dim: s}) - - def take(self, indices, dim=None): - """Return a new object whose contents are taken from the - current object along a specified dimension - - Parameters - ---------- - indices : array_like - The indices of the values to extract. indices must be compatible - with the ndarray.take() method. - dim : string, optional - The dimension to slice along. If multiple dimensions of a - variable equal dim (e.g. a correlation matrix), then that - variable is sliced only along its first matching dimension. - If None (default), then the object is sliced along its - unlimited dimension; an exception is raised if the object - does not have an unlimited dimension. - - Returns - ------- - obj : Data object - The returned object has the same attributes, dimensions, - variable names and variable attributes as the original. - Variables that are not defined along the specified - dimensions are copied in their entirety. Variables that are - defined along the specified dimension have their data - contents taken along the specified dimension. - - See Also - -------- - numpy.take - Variable.take - """ - if dim is None: - raise ValueError("dim cannot be None") - # Create a new object - obj = type(self)() - # Create fancy-indexed variables and infer the new dimension length - new_length = self.dimensions[dim] - for (name, var) in self.variables.iteritems(): - if dim in var.dimensions: - obj.store.unchecked_add_variable(name, var.take(indices, dim)) - new_length = obj.variables[name].data.shape[ - list(var.dimensions).index(dim)] - else: - obj.store.unchecked_add_variable(name, copy.deepcopy(var)) - # Hard write the dimensions, skipping validation - for d, l in self.dimensions.iteritems(): - if d == dim: - l = new_length - obj.store.unchecked_create_dimension(d, l) - if obj.dimensions[dim] == 0: - raise IndexError( - "take would result in a dimension of length zero") - # Copy attributes - self._unchecked_set_attributes(self.attributes.copy()) - return obj - - def renamed(self, name_dict): - """ - Returns a new object with renamed variables and dimensions - - Parameters - ---------- - name_dict : dict-like - Dictionary-like object whose keys are current variable or dimension - names and whose values are new names. - """ - for k in name_dict: - if k not in self.dimensions and k not in self.variables: - raise ValueError("Cannot rename %r because it is not a " - "variable or dimension in this dataset" % k) - variables = OrderedDict() - for k, v in self.variables.iteritems(): - name = name_dict.get(k, k) - dims = tuple(name_dict.get(dim, dim) for dim in v.dimensions) - #TODO: public interface for renaming a variable without loading - # data - variables[name] = variable.Variable(dims, v._data, v.attributes) - - dimensions = OrderedDict((name_dict.get(k, k), v) - for k, v in self.dimensions.iteritems()) - - return type(self)(variables, dimensions, self.attributes, - check_consistency=False) - - def join(self, other): - """ - Join two datasets into a single new dataset - - Raises ValueError if any variables or dimensions do not match. 
- """ - new_vars = utils.safe_merge(self.variables, other.variables, - compat=utils.variable_equal) - new_dims = utils.safe_merge(self.dimensions, other.dimensions) - new_attr = utils.safe_merge(self.attributes, other.attributes) - return type(self)(new_vars, new_dims, new_attr) - - def select(self, *names): - """Return a new object that contains the specified namesiables, - along with the dimensions on which those variables are defined - and corresponding coordinate variables. - - Parameters - ---------- - *names : str - Names of the variables to include in the returned object. - - Returns - ------- - obj : Data object - The returned object has the same attributes as the - original. A dimension is included if at least one of the - specified variables is defined along that dimension. - Coordinate variables (1-dimensional variables with the same - name as a dimension) that correspond to an included - dimension are also included. All other variables are - dropped. - """ - if not all(k in self.variables for k in names): - raise KeyError( - "One or more of the specified variables does not exist") - - dim_names = (set(self.variables[k].dimensions) for k in names) - names = set(names).union(*dim_names) - - variables = OrderedDict((k, v) for k, v in self.variables.iteritems() - if k in names) - dimensions = OrderedDict((k, v) for k, v in self.dimensions.iteritems() - if k in names) - return type(self)(variables, dimensions, self.attributes, - check_consistency=False) - - def iterator(self, dim=None, views=False): - """Iterator along a data dimension - - Return an iterator yielding (coordinate, data_object) pairs - that are singleton along the specified dimension - - Parameters - ---------- - dim : string, optional - The dimension along which you want to iterate. If None - (default), then the iterator operates along the record - dimension; if there is no record dimension, an exception - will be raised. - views : boolean, optional - If True, the iterator will give views of the data along - the dimension, otherwise copies. - - Returns - ------- - it : iterator - The returned iterator yields pairs of scalar-valued - coordinate variables and data objects. The yielded data - objects contain *copies* onto the underlying numpy arrays of - the original data object. If the data object does not have - a coordinate variable with the same name as the specified - dimension, then the returned coordinate value is None. If - multiple dimensions of a variable equal dim (e.g. a - correlation matrix), then that variable is iterated along - the first matching dimension. 
- - Examples - -------- - >>> d = Data() - >>> d.create_coordinate(name='x', data=numpy.arange(10)) - >>> d.create_coordinate(name='y', data=numpy.arange(20)) - >>> print d - - dimensions: - name | length - =========================== - x | 10 - y | 20 - - variables: - name | dtype | shape | dimensions - ===================================================================== - x | int32 | (10,) | ('x',) - y | int32 | (20,) | ('y',) - - attributes: - None - - >>> i = d.iterator(dim='x') - >>> (a, b) = i.next() - >>> print a - - dtype: - int32 - - dimensions: - name | length - =========================== - x | 1 - - attributes: - None - - >>> print b - - dimensions: - name | length - =========================== - x | 1 - y | 20 - - variables: - name | dtype | shape | dimensions - ===================================================================== - x | int32 | (1,) | ('x',) - y | int32 | (20,) | ('y',) - - attributes: - None - - """ - # Determine the size of the dim we're about to iterate over - n = self.dimensions[dim] - # Iterate over the object - if dim in self.coordinates: - coord = self.variables[dim] - if views: - for i in xrange(n): - s = slice(i, i + 1) - yield (coord.view(s, dim=dim), - self.view(s, dim=dim)) - else: - for i in xrange(n): - indices = np.array([i]) - yield (coord.take(indices, dim=dim), - self.take(indices, dim=dim)) - else: - if views: - for i in xrange(n): - yield (None, self.view(slice(i, i + 1), dim=dim)) - else: - for i in xrange(n): - yield (None, self.take(np.array([i]), dim=dim)) - - def iterarray(self, var, dim=None): - """Iterator along a data dimension returning the corresponding slices - of the underlying data of a variable. - - Return an iterator yielding (scalar, ndarray) pairs that are singleton - along the specified dimension. While iterator is more general, this - method has less overhead and in turn should be considerably faster. - - Parameters - ---------- - var : string - The variable over which you want to iterate. - - dim : string, optional - The dimension along which you want to iterate. If None - (default), then the iterator operates along the record - dimension; if there is no record dimension, an exception - will be raised. - - Returns - ------- - it : iterator - The returned iterator yields pairs of scalar-valued - and ndarray objects. The yielded data objects contain *views* - onto the underlying numpy arrays of the original data object. - - Examples - -------- - >>> d = Data() - >>> d.create_coordinate(name='t', data=numpy.arange(5)) - >>> d.create_dimension(name='h', length=3) - >>> d.create_variable(name='x', dim=('t', 'h'),\ - ... data=numpy.random.random((10, 3,))) - >>> print d['x'].data - [[ 0.33499995 0.47606901 0.41334325] - [ 0.20229308 0.73693437 0.97451746] - [ 0.40020704 0.29763575 0.85588908] - [ 0.44114434 0.79233816 0.59115313] - [ 0.18583972 0.55084889 0.95478946]] - >>> i = d.iterarray(var='x', dim='t') - >>> (a, b) = i.next() - >>> print a - 0 - >>> print b - [[ 0.33499995 0.47606901 0.41334325]] - """ - # Get a reference to the underlying ndarray for the desired variable - # and build a list of slice objects - data = self.variables[var].data - axis = list(self.variables[var].dimensions).index(dim) - slicer = [slice(None)] * data.ndim - # Determine the size of the dim we're about to iterate over - n = self.dimensions[dim] - # Iterate over dim returning views of the variable. 
- if dim in self.coordinates: - coord = self.variables[dim].data - for i in xrange(n): - slicer[axis] = slice(i, i + 1) - yield (coord[i], data[slicer]) - else: - for i in xrange(n): - slicer[axis] = slice(i, i + 1) - yield (None, data[slicer]) - - def squeeze(self, dimension): - """ - Squeezes a dimension of length 1, returning a copy of the object - with that dimension removed. - """ - if self.dimensions[dimension] != 1: - raise ValueError(("Can only squeeze along dimensions with" + - "length one, %s has length %d") % - (dimension, self.dimensions[dimension])) - # Create a new Data instance - obj = type(self)() - # Copy dimensions - for (name, length) in self.dimensions.iteritems(): - if not name == dimension: - obj.create_dimension(name, length) - # Copy variables - for (name, var) in self.variables.iteritems(): - if not name == dimension: - dims = list(var.dimensions) - data = var.data - if dimension in dims: - shape = list(var.data.shape) - index = dims.index(dimension) - shape.pop(index) - dims.pop(index) - data = data.reshape(shape) - obj.create_variable(name=name, - dims=tuple(dims), - data=data, - attributes=var.attributes.copy()) - obj.store.unchecked_set_attributes(self.attributes.copy()) - return obj - - -if __name__ == "__main__": - """ - A bunch of regression tests. - """ - base_dir = os.path.dirname(__file__) - test_dir = os.path.join(base_dir, '..', '..', 'test', ) - write_test_path = os.path.join(test_dir, 'test_output.nc') - ecmwf_netcdf = os.path.join(test_dir, 'ECMWF_ERA-40_subset.nc') - - import time - st = time.time() - nc = Dataset(ecmwf_netcdf) - print "Seconds to read from filepath : ", time.time() - st - - st = time.time() - nc.dump(write_test_path) - print "Seconds to write : ", time.time() - st - - st = time.time() - nc_string = nc.dumps() - print "Seconds to serialize : ", time.time() - st - - st = time.time() - nc = Dataset(nc_string) - print "Seconds to deserialize : ", time.time() - st - - st = time.time() - with open(ecmwf_netcdf, 'r') as f: - nc = Dataset(f) - print "Seconds to read from fobj : ", time.time() - st - diff --git a/src/scidata/utils.py b/src/scidata/utils.py deleted file mode 100644 index ae591fab9bc..00000000000 --- a/src/scidata/utils.py +++ /dev/null @@ -1,66 +0,0 @@ -import operator -from collections import OrderedDict - -import numpy as np - - -def expanded_indexer(key, ndim): - """Given a key for indexing an ndarray, return an equivalent - key which is a tuple with length equal to the number of dimensions - """ - if not isinstance(key, tuple): - key = (key,) - new_key = [slice(None)] * ndim - new_key[:len(key)] = key - return tuple(new_key) - - -def safe_merge(*dicts, **kwargs): - """Merge any number of dictionaries into a new OrderedDict - - Raises ValueError if dictionaries have non-compatible values for any key, - where compatibility is determined by the `compat` function. - - Parameters - ---------- - *dicts : dict-like - Mappings to merge. - compat : function, optional - Binary operator to determine if two values are compatible. By default, - checks for equality. - - Returns - ------- - merged : OrderedDict - Merged contents. 
- """ - compat = kwargs.pop('compat', operator.eq) - merged = OrderedDict() - for d in dicts: - for k, v in d.iteritems(): - if k in merged and not compat(v, merged[k]): - raise ValueError('cannot override values with safe_merge') - merged[k] = v - return merged - - -def variable_equal(v1, v2): - """True if two objects have the same dimensions, attributes and data; - otherwise False - - This function is necessary because `v1 == v2` does element-wise comparison - (like numpy.ndarrays). - """ - if (v1.dimensions == v2.dimensions - and v1.attributes == v2.attributes): - try: - # if _data is identical, skip checking arrays by value - if v1._data is v2._data: - return True - except AttributeError: - # _data is not part of the public interface, so it's okay if its - # missing - pass - return np.array_equal(v1.data, v2.data) - else: - return False diff --git a/src/scidata/variable.py b/src/scidata/variable.py deleted file mode 100644 index 40bb36fa263..00000000000 --- a/src/scidata/variable.py +++ /dev/null @@ -1,450 +0,0 @@ -import copy -import numpy as np - -from collections import OrderedDict -from functools import wraps -import operator -import warnings - -import conventions -from utils import expanded_indexer, safe_merge - - -class AttributesDict(OrderedDict): - """A subclass of OrderedDict whose __setitem__ method automatically - checks and converts values to be valid netCDF attributes - """ - def __init__(self, *args, **kwds): - OrderedDict.__init__(self, *args, **kwds) - - def __setitem__(self, key, value): - if not conventions.is_valid_name(key): - raise ValueError("Not a valid attribute name") - # Strings get special handling because netCDF treats them as - # character arrays. Everything else gets coerced to a numpy - # vector. netCDF treats scalars as 1-element vectors. Arrays of - # non-numeric type are not allowed. - if isinstance(value, basestring): - # netcdf attributes should be unicode - value = unicode(value) - else: - try: - value = conventions.coerce_type(np.atleast_1d(np.asarray(value))) - except: - raise ValueError("Not a valid value for a netCDF attribute") - if value.ndim > 1: - raise ValueError("netCDF attributes must be vectors " + - "(1-dimensional)") - value = conventions.coerce_type(value) - if str(value.dtype) not in conventions.TYPEMAP: - # A plain string attribute is okay, but an array of - # string objects is not okay! - raise ValueError("Can not convert to a valid netCDF type") - OrderedDict.__setitem__(self, key, value) - - def copy(self): - """The copy method of the superclass simply calls the constructor, - which in turn calls the update method, which in turns calls - __setitem__. This subclass implementation bypasses the expensive - validation in __setitem__ for a substantial speedup.""" - obj = self.__class__() - for (attr, value) in self.iteritems(): - OrderedDict.__setitem__(obj, attr, copy.copy(value)) - return obj - - def __deepcopy__(self, memo=None): - """ - Returns a deep copy of the current object. - - memo does nothing but is required for compatability with copy.deepcopy - """ - return self.copy() - - def update(self, *other, **kwargs): - """Set multiple attributes with a mapping object or an iterable of - key/value pairs""" - # Capture arguments in an OrderedDict - args_dict = OrderedDict(*other, **kwargs) - try: - # Attempt __setitem__ - for (attr, value) in args_dict.iteritems(): - self.__setitem__(attr, value) - except: - # A plain string attribute is okay, but an array of - # string objects is not okay! 
- raise ValueError("Can not convert to a valid netCDF type") - # Clean up so that we don't end up in a partial state - for (attr, value) in args_dict.iteritems(): - if self.__contains__(attr): - self.__delitem__(attr) - # Re-raise - raise - - def __eq__(self, other): - if not set(self.keys()) == set(other.keys()): - return False - for (key, value) in self.iteritems(): - if value.__class__ != other[key].__class__: - return False - if isinstance(value, basestring): - if value != other[key]: - return False - else: - if value.tostring() != other[key].tostring(): - return False - return True - - -def _as_compatible_data(data): - """If data does not have the necessary attributes to be the private _data - attribute, convert it to a np.ndarray and raise an warning - """ - # don't check for __len__ or __iter__ so as not to warn if data is a numpy - # numeric type like np.float32 - required = ['dtype', 'shape', 'size', 'ndim'] - if not all(hasattr(data, attr) for attr in required): - warnings.warn('converting data to np.ndarray because it lacks some of ' - 'the necesssary attributes for lazy use', RuntimeWarning, - stacklevel=3) - data = np.asarray(data) - return data - - -class Variable(object): - """ - A netcdf-like variable consisting of dimensions, data and attributes - which describe a single varRiable. A single variable object is not - fully described outside the context of its parent Dataset. - """ - def __init__(self, dims, data, attributes=None): - data = _as_compatible_data(data) - if len(dims) != data.ndim: - raise ValueError('data must have same shape as the number of ' - 'dimensions') - self._dimensions = tuple(dims) - self._data = data - if attributes is None: - attributes = {} - self._attributes = AttributesDict(attributes) - - @property - def dimensions(self): - return self._dimensions - - @property - def data(self): - """ - The variable's data as a numpy.ndarray - """ - if not isinstance(self._data, np.ndarray): - self._data = np.asarray(self._data[...]) - return self._data - - @data.setter - def data(self, value): - value = np.asarray(value) - if value.shape != self.shape: - raise ValueError("replacement data must match the Variable's " - "shape") - self._data = value - - @property - def dtype(self): - return self._data.dtype - - @property - def shape(self): - return self._data.shape - - @property - def size(self): - return self._data.size - - @property - def ndim(self): - return self._data.ndim - - def __len__(self): - return len(self._data) - - def __nonzero__(self): - if self.size == 1: - return bool(self.data) - else: - raise ValueError('ValueError: The truth value of variable with ' - 'more than one element is ambiguous.') - - def __getitem__(self, key): - """ - Return a new Variable object whose contents are consistent with getting - the provided key from the underlying data - """ - key = expanded_indexer(key, self.ndim) - dimensions = [dim for k, dim in zip(key, self.dimensions) - if not isinstance(k, int)] - # always return a Variable, because Variable subtypes may have - # different constructors and may not make sense without an attached - # datastore - return Variable(dimensions, self._data[key], self.attributes) - - def __setitem__(self, key, value): - """__setitem__ is overloaded to access the underlying numpy data""" - self.data[key] = value - - def __iter__(self): - """ - Iterate over the contents of this Variable - """ - for n in range(len(self)): - yield self[n] - - @property - def attributes(self): - return self._attributes - - def copy(self): - """ - Returns a 
shallow copy of the current object. - """ - return self.__copy__() - - def _copy(self, deepcopy=False): - # deepcopies should always be of a numpy view of the data, not the data - # itself, because non-memory backends don't necessarily have deepcopy - # defined sensibly (this is a problem for netCDF4 variables) - data = copy.deepcopy(self.data) if deepcopy else self._data - # note: - # dimensions is already an immutable tuple - # attributes will be copied when the new Variable is created - return Variable(self.dimensions, data, self.attributes) - - def __copy__(self): - """ - Returns a shallow copy of the current object. - """ - return self._copy(deepcopy=False) - - def __deepcopy__(self, memo=None): - """ - Returns a deep copy of the current object. - - memo does nothing but is required for compatability with copy.deepcopy - """ - return self._copy(deepcopy=True) - - # mutable objects should not be hashable - __hash__ = None - - def __str__(self): - """Create a ncdump-like summary of the object""" - summary = ["dimensions:"] - # prints dims that look like: - # dimension = length - dim_print = lambda d, l : "\t%s : %s" % (conventions.pretty_print(d, 30), - conventions.pretty_print(l, 10)) - # add each dimension to the summary - summary.extend([dim_print(d, l) for d, l in zip(self.dimensions, self.shape)]) - summary.append("dtype : %s" % (conventions.pretty_print(self.dtype, 8))) - summary.append("attributes:") - # attribute:value - summary.extend(["\t%s:%s" % (conventions.pretty_print(att, 30), - conventions.pretty_print(val, 30)) - for att, val in self.attributes.iteritems()]) - # create the actual summary - return '\n'.join(summary) - - def __repr__(self): - dim_summary = ', '.join('%s: %s' % (k, v) for k, v - in zip(self.dimensions, self.shape)) - return '' % (dim_summary, self.dtype) - - def views(self, slicers): - """Return a new Variable object whose contents are a view of the object - sliced along a specified dimension. - - Parameters - ---------- - slicers : {dim: slice, ...} - A dictionary mapping from dim to slice, dim represents - the dimension to slice along slice represents the range of the - values to extract. - - Returns - ------- - obj : Variable object - The returned object has the same attributes and dimensions - as the original. Data contents are taken along the - specified dimension. Care must be taken since modifying (most) - values in the returned object will result in modification to the - parent object. - - See Also - -------- - view - take - """ - slices = [slice(None)] * self.data.ndim - for i, dim in enumerate(self.dimensions): - if dim in slicers: - slices[i] = slicers[dim] - return self[tuple(slices)] - - def view(self, s, dim): - """Return a new Variable object whose contents are a view of the object - sliced along a specified dimension. - - Parameters - ---------- - s : slice - The slice representing the range of the values to extract. - dim : string - The dimension to slice along. - - Returns - ------- - obj : Variable object - The returned object has the same attributes and dimensions - as the original. Data contents are taken along the - specified dimension. Care must be taken since modifying (most) - values in the returned object will result in modification to the - parent object. 
- - See Also - -------- - take - """ - return self.views({dim: s}) - - def take(self, indices, dim): - """Return a new Variable object whose contents are sliced from - the current object along a specified dimension - - Parameters - ---------- - indices : array_like - The indices of the values to extract. indices must be compatible - with the ndarray.take() method. - dim : string - The dimension to slice along. If multiple dimensions equal - dim (e.g. a correlation matrix), then the slicing is done - only along the first matching dimension. - - Returns - ------- - obj : Variable object - The returned object has the same attributes and dimensions - as the original. Data contents are taken along the - specified dimension. - - See Also - -------- - numpy.take - """ - indices = np.asarray(indices) - if indices.ndim != 1: - raise ValueError('indices should have a single dimension') - # When dim appears repeatedly in self.dimensions, using the index() - # method gives us only the first one, which is the desired behavior - axis = self.dimensions.index(dim) - # take only works on actual numpy arrays - data = self.data.take(indices, axis=axis) - return Variable(self.dimensions, data, self.attributes) - - -def broadcast_var_data(self, other): - self_data = self.data - if all(hasattr(other, attr) for attr in ['dimensions', 'data']): - # build dimensions for new Variable - other_only_dims = [dim for dim in other.dimensions - if dim not in self.dimensions] - dimensions = list(self.dimensions) + other_only_dims - - # expand self_data's dimensions so it's broadcast compatible after - # adding other's dimensions to the end - for _ in xrange(len(other_only_dims)): - self_data = np.expand_dims(self_data, axis=-1) - - # expand and reorder other_data so the dimensions line up - self_only_dims = [dim for dim in dimensions - if dim not in other.dimensions] - other_data = other.data - for _ in xrange(len(self_only_dims)): - other_data = np.expand_dims(other_data, axis=-1) - other_dims = list(other.dimensions) + self_only_dims - axes = [other_dims.index(dim) for dim in dimensions] - other_data = other_data.transpose(axes) - else: - # rely on numpy broadcasting rules - other_data = other - dimensions = self.dimensions - return self_data, other_data, dimensions - - -def _math_safe_attributes(v): - """Given a variable, return the variables's attributes that are safe for - mathematical operations (e.g., all those except for 'units') - """ - try: - attr = v.attributes - except AttributeError: - return {} - else: - return OrderedDict((k, v) for k, v in attr.items() if k != 'units') - - -def unary_op_wrapper(name): - f = getattr(operator, '__%s__' % name) - @wraps(f) - def func(self): - new_data = f(self.data) - new_attr = _math_safe_attributes(self) - return Variable(self.dimensions, new_data, new_attr) - return func - - -def binary_op(name, reflexive=False): - f = getattr(operator, '__%s__' % name) - @wraps(f) - def func(self, other): - self_data, other_data, new_dims = broadcast_var_data(self, other) - new_data = (f(self_data, other_data) - if not reflexive - else f(other_data, self_data)) - new_attr = safe_merge(_math_safe_attributes(self), - _math_safe_attributes(other)) - return Variable(new_dims, new_data, new_attr) - return func - - -def inplace_binary_op(name): - f = getattr(operator, '__i%s__' % name) - @wraps(f) - def func(self, other): - self_data, other_data, dimensions = broadcast_var_data(self, other) - if dimensions != self.dimensions: - raise ValueError('dimensions cannot change for in-place operations') 
- self.data = f(self_data, other_data) - return self - return func - - -UNARY_OPS = ['neg', 'pos', 'abs', 'invert'] -CMP_BINARY_OPS = ['lt', 'le', 'eq', 'ne', 'ge', 'gt'] -NUM_BINARY_OPS = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', - 'pow', 'and', 'xor', 'or'] - - -def inject_special_operations(cls, priority=10): - # priortize our operations over numpy.ndarray's (priority=1.0) - cls.__array_priority__ = priority - for name in UNARY_OPS: - setattr(cls, '__%s__' % name, unary_op_wrapper(name)) - for name in CMP_BINARY_OPS: - setattr(cls, '__%s__' % name, binary_op(name)) - for name in NUM_BINARY_OPS: - setattr(cls, '__%s__' % name, binary_op(name)) - setattr(cls, '__r%s__' % name, binary_op(name, reflexive=True)) - setattr(cls, '__i%s__' % name, inplace_binary_op(name)) - - -inject_special_operations(Variable) diff --git a/src/xray/__init__.py b/src/xray/__init__.py new file mode 100644 index 00000000000..bd4408b3ddb --- /dev/null +++ b/src/xray/__init__.py @@ -0,0 +1,12 @@ +from .array_ import Array, broadcast_variables +from .dataset import Dataset, open_dataset +from .dataset_array import DatasetArray, align +from .utils import orthogonal_indexer, num2datetimeindex, variable_equal + +from . import backends + +concat = DatasetArray.from_stack + +__all__ = ['open_dataset', 'Dataset', 'DatasetArray', 'Array', 'align', + 'broadcast_variables', 'orthogonal_indexer', 'num2datetimeindex', + 'variable_equal'] diff --git a/src/xray/array_.py b/src/xray/array_.py new file mode 100644 index 00000000000..874b46242cf --- /dev/null +++ b/src/xray/array_.py @@ -0,0 +1,572 @@ +import functools +import warnings +from collections import OrderedDict +from itertools import izip + +import numpy as np + +import conventions +import dataset +import dataset_array +import groupby +import ops +import utils +from common import AbstractArray + + +def _as_compatible_data(data): + """If data does not have the necessary attributes to be the private _data + attribute, convert it to a np.ndarray and raise an warning + """ + # don't check for __len__ or __iter__ so as not to warn if data is a numpy + # numeric type like np.float32 + required = ['dtype', 'shape', 'size', 'ndim'] + if not all(hasattr(data, attr) for attr in required): + data = np.asarray(data) + if data.ndim == 0: + # unpack 0d data + data = data[()] + elif isinstance(data, AbstractArray): + # we don't want nested Array objects + data = data.data + return data + + +class Array(AbstractArray): + """A netcdf-like variable consisting of dimensions, data and attributes + which describe a single Array. A single Array object is not fully described + outside the context of its parent Dataset (if you want such a fully + described object, use a DatasetArray instead). + """ + def __init__(self, dims, data, attributes=None, indexing_mode='numpy'): + """ + Parameters + ---------- + dims : str or sequence of str + Name(s) of the the data dimension(s). Must be either a string (only + for 1D data) or a sequence of strings with length equal to the + number of dimensions. + data : array_like + Data array which supports numpy-like data access. + attributes : dict_like or None, optional + Attributes to assign to the new variable. If None (default), an + empty attribute dictionary is initialized. + indexing_mode : {'numpy', 'orthogonal'} + String indicating how the data parameter handles fancy indexing + (with arrays). 
Two modes are supported: 'numpy' (fancy indexing + like numpy.ndarray objects) and 'orthogonal' (array indexing + accesses different dimensions independently, like netCDF4 + variables). Accessing data from a Array always uses orthogonal + indexing, so `indexing_mode` tells the variable whether index + lookups need to be internally converted to numpy-style indexing. + """ + if isinstance(dims, basestring): + dims = (dims,) + self._dimensions = tuple(dims) + self._data = _as_compatible_data(data) + if len(dims) != self.ndim: + raise ValueError('data and dimensions must have the same ' + 'dimensionality') + if attributes is None: + attributes = {} + self._attributes = OrderedDict(attributes) + self._indexing_mode = indexing_mode + + @property + def data(self): + """The variable's data as a numpy.ndarray""" + if not isinstance(self._data, (np.ndarray, np.string_)): + self._data = np.asarray(self._data[...]) + self._indexing_mode = 'numpy' + return self._data + + @data.setter + def data(self, value): + # allow any array to support pandas.Index objects + value = np.asanyarray(value) + if value.shape != self.shape: + raise ValueError("replacement data must match the Array's " + "shape") + self._data = value + self._indexing_mode = 'numpy' + + @property + def dimensions(self): + return self._dimensions + + def _convert_indexer(self, key, indexing_mode=None): + """Converts an orthogonal indexer into a fully expanded key (of the + same length as dimensions) suitable for indexing a data array with the + given indexing_mode. + + See Also + -------- + utils.expanded_indexer + utils.orthogonal_indexer + """ + if indexing_mode is None: + indexing_mode = self._indexing_mode + key = utils.expanded_indexer(key, self.ndim) + if (indexing_mode == 'numpy' + and any(not isinstance(k, (int, slice)) for k in key)): + # key would trigger fancy indexing + key = utils.orthogonal_indexer(key, self.shape) + return key + + def __getitem__(self, key): + """Return a new Array object whose contents are consistent with + getting the provided key from the underlying data + + NB. __getitem__ and __setitem__ implement "orthogonal indexing" like + netCDF4-python, where the key can only include integers, slices + (including `Ellipsis`) and 1d arrays, each of which are applied + orthogonally along their respective dimensions. + + The difference not matter in most cases unless you are using numpy's + "fancy indexing," which can otherwise result in data arrays + with shapes is inconsistent (or just uninterpretable with) with the + variable's dimensions. + + If you really want to do indexing like `x[x > 0]`, manipulate the numpy + array `x.data` directly. 
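To make the orthogonal indexing described in this docstring concrete, here is a small sketch against the `Array` API added in this diff (the data and variable names are invented):

```python
import numpy as np
from xray import Array

v = Array(('x', 'y'), np.arange(12).reshape(3, 4))

# each indexer is applied to its own dimension, netCDF4-style, so this
# selects a 2 x 2 sub-grid rather than the two points that numpy fancy
# indexing would return for v.data[[0, 2], [1, 3]]
sub = v[[0, 2], [1, 3]]
assert sub.dimensions == ('x', 'y')
assert sub.shape == (2, 2)
```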
+ """ + key = self._convert_indexer(key) + dimensions = [dim for k, dim in zip(key, self.dimensions) + if not isinstance(k, int)] + if len(key) == 1: + # unpack key so it can index a pandas.Index object (pandas.Index + # objects don't like tuples) + key, = key + # do location based indexing if supported by _data + new_data = getattr(self._data, 'iloc', self._data)[key] + # orthogonal indexing should ensure the dimensionality is consistent + if hasattr(new_data, 'ndim'): + assert new_data.ndim == len(dimensions) + else: + assert len(dimensions) == 0 + # return a variable with the same indexing_mode, because data should + # still be the same type as _data + return type(self)(dimensions, new_data, self.attributes, + indexing_mode=self._indexing_mode) + + def __setitem__(self, key, value): + """__setitem__ is overloaded to access the underlying numpy data with + orthogonal indexing (see __getitem__ for more details) + """ + self.data[self._convert_indexer(key, indexing_mode='numpy')] = value + + def __iter__(self): + for n in range(len(self)): + yield self[n] + + @property + def attributes(self): + return self._attributes + + def copy(self): + """Returns a shallow copy of the current object. The data array is + always loaded into memory. + """ + return self.__copy__() + + def _copy(self, deepcopy=False): + # np.array always makes a copy + data = np.array(self._data) if deepcopy else self.data + # note: + # dimensions is already an immutable tuple + # attributes will be copied when the new Array is created + return type(self)(self.dimensions, data, self.attributes) + + def __copy__(self): + return self._copy(deepcopy=False) + + def __deepcopy__(self, memo=None): + # memo does nothing but is required for compatability with + # copy.deepcopy + return self._copy(deepcopy=True) + + # mutable objects should not be hashable + __hash__ = None + + def __str__(self): + """Create a ncdump-like summary of the object""" + summary = ["dimensions:"] + # prints dims that look like: + # dimension = length + dim_print = lambda d, l : "\t%s : %s" % (conventions.pretty_print(d, 30), + conventions.pretty_print(l, 10)) + # add each dimension to the summary + summary.extend([dim_print(d, l) for d, l in zip(self.dimensions, self.shape)]) + summary.append("dtype : %s" % (conventions.pretty_print(self.dtype, 8))) + summary.append("attributes:") + # attribute:value + summary.extend(["\t%s:%s" % (conventions.pretty_print(att, 30), + conventions.pretty_print(val, 30)) + for att, val in self.attributes.iteritems()]) + # create the actual summary + return '\n'.join(summary).replace('\t', ' ' * 4) + + def __repr__(self): + if self.ndim > 0: + dim_summary = ', '.join('%s: %s' % (k, v) for k, v + in zip(self.dimensions, self.shape)) + contents = ' (%s): %s' % (dim_summary, self.dtype) + else: + contents = ': %s' % self.data + return '' % (type(self).__name__, contents) + + def indexed_by(self, **indexers): + """Return a new array indexed along the specified dimension(s) + + Parameters + ---------- + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by integers, slice objects or arrays. + + Returns + ------- + obj : Array object + A new Array with the selected data and dimensions. In general, + the new variable's data will be a view of this variable's data, + unless numpy fancy indexing was triggered by using an array + indexer, in which case the data will be a copy. 
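A short sketch of `indexed_by` in action, relying only on the behaviour documented above (hypothetical data):

```python
import numpy as np
from xray import Array

v = Array(('time', 'x'), np.arange(20).reshape(5, 4))

# index by dimension name, without knowing the axis order; integer
# indexers drop the corresponding dimension, slices keep it
sub = v.indexed_by(time=0, x=slice(0, 2))
assert sub.dimensions == ('x',)
assert sub.shape == (2,)
```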
+ """ + invalid = [k for k in indexers if not k in self.dimensions] + if invalid: + raise ValueError("dimensions %r do not exist" % invalid) + + key = [slice(None)] * self.data.ndim + for i, dim in enumerate(self.dimensions): + if dim in indexers: + key[i] = indexers[dim] + return self[tuple(key)] + + def transpose(self, *dimensions): + """Return a new Array object with transposed dimensions + + Note: Although this operation returns a view of this variable's data, + it is not lazy -- the data will be fully loaded. + + Parameters + ---------- + *dimensions : str, optional + By default, reverse the dimensions. Otherwise, reorder the + dimensions to this order. + + Returns + ------- + transposed : Array + The returned object has transposed data and dimensions with the + same attributes as the original. + + See Also + -------- + numpy.transpose + """ + if len(dimensions) == 0: + dimensions = self.dimensions[::-1] + axes = [self.dimensions.index(dim) for dim in dimensions] + data = self.data.transpose(*axes) + return type(self)(dimensions, data, self.attributes) + + def reduce(self, func, dimension=None, axis=None, **kwargs): + """Reduce this array by applying `func` along some dimension(s) + + Parameters + ---------- + func : function + Function which can be called in the form + `func(x, axis=axis, **kwargs)` to return the result of reducing an + np.ndarray over an integer valued axis. + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `func`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `func`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then the reduction is calculated over the flattened array + (by calling `func(x)` without an axis argument). + **kwargs : dict + Additional keyword arguments passed on to `func`. + + Note + ---- + If `reduce` is called with multiple dimensions (or axes, which + are converted into dimensions), then the reduce operation is + performed repeatedly along each dimension in turn from left to right. + + Returns + ------- + reduced : Array + Array with summarized data and the indicated dimension(s) + removed. 
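For example, `reduce` collapses named dimensions with any numpy aggregation (a sketch with made-up data; the `cell_methods` bookkeeping handled by `_append_to_cell_methods` below records which reduction was applied):

```python
import numpy as np
from xray import Array

v = Array(('time', 'x'), np.random.rand(10, 3))

# collapse a single named dimension
mean = v.reduce(np.mean, dimension='time')
assert mean.dimensions == ('x',)

# with no dimension or axis, reduce over the flattened data
total = v.reduce(np.sum)
assert total.dimensions == ()
```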
+ """ + if dimension is not None and axis is not None: + raise ValueError("cannot supply both 'axis' and 'dimension' " + "arguments") + + if axis is not None: + # determine dimensions + if isinstance(axis, int): + axis = [axis] + dimension = [self.dimensions[i] for i in axis] + + if dimension is not None: + if isinstance(dimension, basestring): + dimension = [dimension] + var = self + for dim in dimension: + var = var._reduce(func, dim, **kwargs) + else: + var = type(self)([], func(self.data, **kwargs), self.attributes) + var._append_to_cell_methods(': '.join(self.dimensions) + + ': ' + func.__name__) + return var + + def _append_to_cell_methods(self, string): + if 'cell_methods' in self.attributes: + base = self.attributes['cell_methods'] + ' ' + else: + base = '' + self.attributes['cell_methods'] = base + string + + def _reduce(self, f, dim, **kwargs): + """Reduce a single dimension""" + axis = self.dimensions.index(dim) + dims = tuple(dim for i, dim in enumerate(self.dimensions) + if axis not in [i, i - self.ndim]) + data = f(self.data, axis=axis, **kwargs) + new_var = type(self)(dims, data, self.attributes) + new_var._append_to_cell_methods(self.dimensions[axis] + + ': ' + f.__name__) + return new_var + + def groupby(self, group_name, group_array, squeeze=True): + """Group this dataset by unique values of the indicated group + + Parameters + ---------- + group_name : str + Name of the group array. + group_array : Array + Array whose unique values should be used to group this array. + squeeze : boolean, optional + If "group" is a coordinate of this array, `squeeze` controls + whether the subarrays have a dimension of length 1 along that + coordinate or if the dimension is squeezed out. + + Returns + ------- + grouped : GroupBy + A `GroupBy` object patterned after `pandas.GroupBy` that can be + iterated over in the form of `(unique_value, grouped_array)` pairs + or over which grouped operations can be applied with the `apply` + and `reduce` methods (and the associated aliases `mean`, `sum`, + `std`, etc.). + """ + return groupby.ArrayGroupBy( + self, group_name, group_array, squeeze=squeeze) + + @classmethod + def from_stack(cls, variables, dimension='stacked_dimension', + stacked_indexers=None, length=None, template=None): + """Stack variables along a new or existing dimension to form a new + variable + + Parameters + ---------- + variables : iterable of Array + Arrays to stack together. Each variable is expected to have + matching dimensions and shape except for along the stacked + dimension. + dimension : str or DatasetArray, optional + Name of the dimension to stack along. This can either be a new + dimension name, in which case it is added along axis=0, or an + existing dimension name, in which case the location of the + dimension is unchanged. Where to insert the new dimension is + determined by the first variable. + stacked_indexers : iterable of indexers, optional + length : int, optional + Length of the new dimension. This is used to allocate the new data + array for the stacked variable data before iterating over all + items, which is thus more memory efficient and a bit faster. + + Returns + ------- + stacked : Array + Stacked variable formed by stacking all the supplied variables + along the new dimension. + """ + if not isinstance(dimension, basestring): + length = dimension.size + dimension, = dimension.dimensions + + if length is None or stacked_indexers is None: + # so much for lazy evaluation! 
we need to look at all the variables + # to figure out the indexers and/or dimensions of the stacked + # variable + variables = list(variables) + steps = [var.shape[var.dimensions.index(dimension)] + if dimension in var.dimensions else 1 + for var in variables] + if length is None: + length = sum(steps) + if stacked_indexers is None: + stacked_indexers = [] + i = 0 + for step in steps: + stacked_indexers.append(slice(i, i + step)) + i += step + if i != length: + raise ValueError('actual length of stacked variables ' + 'along %s is %r but expected length was ' + '%s' % (dimension, i, length)) + + # initialize the stacked variable with empty data + first_var, variables = groupby.peek_at(variables) + if dimension in first_var.dimensions: + axis = first_var.dimensions.index(dimension) + shape = tuple(length if n == axis else s + for n, s in enumerate(first_var.shape)) + dims = first_var.dimensions + else: + axis = 0 + shape = (length,) + first_var.shape + dims = (dimension,) + first_var.dimensions + attr = OrderedDict() if template is None else template.attributes + + stacked = cls(dims, np.empty(shape, dtype=first_var.dtype), attr) + stacked.attributes.update(first_var.attributes) + + alt_dims = tuple(d for d in dims if d != dimension) + + # copy in the data from the variables + for var, indexer in izip(variables, stacked_indexers): + if template is None: + # do sanity checks if we don't have a template + if dimension in var.dimensions: + # transpose verifies that the dimensions are equivalent + if var.dimensions != stacked.dimensions: + var = var.transpose(*stacked.dimensions) + elif var.dimensions != alt_dims: + raise ValueError('inconsistent dimensions') + utils.remove_incompatible_items(stacked.attributes, + var.attributes) + + key = tuple(indexer if n == axis else slice(None) + for n in range(stacked.ndim)) + stacked.data[tuple(key)] = var.data + + return stacked + + def __array_wrap__(self, obj, context=None): + return type(self)(self.dimensions, obj, self.attributes) + + @staticmethod + def _unary_op(f): + @functools.wraps(f) + def func(self, *args, **kwargs): + return type(self)(self.dimensions, f(self.data, *args, **kwargs), + self.attributes) + return func + + @staticmethod + def _binary_op(f, reflexive=False): + @functools.wraps(f) + def func(self, other): + if isinstance(other, dataset_array.DatasetArray): + return NotImplemented + self_data, other_data, dims = _broadcast_variable_data(self, other) + new_data = (f(self_data, other_data) + if not reflexive + else f(other_data, self_data)) + if hasattr(other, 'attributes'): + new_attr = utils.ordered_dict_intersection(self.attributes, + other.attributes) + else: + new_attr = self.attributes + return type(self)(dims, new_data, new_attr) + return func + + @staticmethod + def _inplace_binary_op(f): + @functools.wraps(f) + def func(self, other): + self_data, other_data, dims = _broadcast_variable_data(self, other) + if dims != self.dimensions: + raise ValueError('dimensions cannot change for in-place ' + 'operations') + self.data = f(self_data, other_data) + if hasattr(other, 'attributes'): + utils.remove_incompatible_items(self.attributes, other) + return self + return func + +ops.inject_special_operations(Array) + + +def broadcast_variables(first, second): + """Given two arrays, return two arrays with matching dimensions and numpy + broadcast compatible data + + Parameters + ---------- + first, second : Array + Array objects to broadcast. + + Returns + ------- + first_broadcast, second_broadcast : Array + Broadcast arrays. 
The data on each variable will be a view of the + data on the corresponding original arrays, but dimensions will be + reordered and inserted so that both broadcast arrays have the same + dimensions. The new dimensions are sorted in order of appearence in the + first variable's dimensions followed by the second variable's + dimensions. + """ + # TODO: add unit tests specifically for this function + # validate dimensions + dim_lengths = dict(zip(first.dimensions, first.shape)) + for k, v in zip(second.dimensions, second.shape): + if k in dim_lengths and dim_lengths[k] != v: + raise ValueError('operands could not be broadcast together ' + 'with mismatched lengths for dimension %r: %s' + % (k, (dim_lengths[k], v))) + for dimensions in [first.dimensions, second.dimensions]: + if len(set(dimensions)) < len(dimensions): + raise ValueError('broadcasting requires that neither operand ' + 'has duplicate dimensions: %r' + % list(dimensions)) + + # build dimensions for new Array + second_only_dims = [d for d in second.dimensions + if d not in first.dimensions] + dimensions = list(first.dimensions) + second_only_dims + + # expand first_data's dimensions so it's broadcast compatible after + # adding second's dimensions at the end + first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)] + new_first = Array(dimensions, first_data) + # expand and reorder second_data so the dimensions line up + first_only_dims = [d for d in dimensions if d not in second.dimensions] + second_dims = list(second.dimensions) + first_only_dims + second_data = second.data[(Ellipsis,) + (None,) * len(first_only_dims)] + new_second = Array(second_dims, second_data).transpose(*dimensions) + return new_first, new_second + + +def _broadcast_variable_data(self, other): + if isinstance(other, dataset.Dataset): + raise TypeError('datasets do not support mathematical operations') + elif all(hasattr(other, attr) for attr in ['dimensions', 'data', 'shape']): + # `other` satisfies the xray.Array API + new_self, new_other = broadcast_variables(self, other) + self_data = new_self.data + other_data = new_other.data + dimensions = new_self.dimensions + else: + # rely on numpy broadcasting rules + self_data = self.data + other_data = other + dimensions = self.dimensions + return self_data, other_data, dimensions diff --git a/src/xray/backends.py b/src/xray/backends.py new file mode 100644 index 00000000000..7c72d50dae8 --- /dev/null +++ b/src/xray/backends.py @@ -0,0 +1,222 @@ +"""Backend objects for saving and loading data + +DataStores provide a uniform interface for saving and loading data in different +formats. They should not be used directly, but rather through Dataset objects. +""" +# TODO: implement backend logic directly in OrderedDict subclasses, to allow +# for directly manipulating Dataset.variables and the like? 
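Before moving on to the backend code, here is a brief sketch of the two array helpers defined just above, `Array.from_stack` and `broadcast_variables` (invented data; `concat` in `__init__.py` aliases the `DatasetArray` flavour of `from_stack`):

```python
import numpy as np
from xray import Array, broadcast_variables

# stack 1D slices along a new 'time' dimension
pieces = [Array('x', np.arange(3) + 10 * i) for i in range(4)]
stacked = Array.from_stack(pieces, dimension='time')
assert stacked.dimensions == ('time', 'x')
assert stacked.shape == (4, 3)

# broadcast two arrays with different (but compatible) dimensions
a = Array('x', np.arange(3))
b = Array('y', np.arange(4))
a2, b2 = broadcast_variables(a, b)
assert a2.dimensions == b2.dimensions == ('x', 'y')
assert (a2.data + b2.data).shape == (3, 4)
```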
+import netCDF4 as nc4 +import numpy as np +import pandas as pd + +from scipy.io import netcdf +from collections import OrderedDict + +import array_ +import conventions +from utils import FrozenOrderedDict, Frozen, datetimeindex2num + + +class AbstractDataStore(object): + def set_dimensions(self, dimensions): + for d, l in dimensions.iteritems(): + self.set_dimension(d, l) + + def set_attributes(self, attributes): + for k, v in attributes.iteritems(): + self.set_attribute(k, v) + + def set_variables(self, variables): + for vn, v in variables.iteritems(): + self.set_variable(vn, v) + + +class InMemoryDataStore(AbstractDataStore): + """ + Stores dimensions, variables and attributes + in ordered dictionaries, making this store + fast compared to stores which store to disk. + """ + def __init__(self): + self.dimensions = OrderedDict() + self.variables = OrderedDict() + self.attributes = OrderedDict() + + def set_dimension(self, name, length): + self.dimensions[name] = length + + def set_attribute(self, key, value): + self.attributes[key] = value + + def set_variable(self, name, variable): + self.variables[name] = variable + return self.variables[name] + + def del_attribute(self, key): + del self.attributes[key] + + def sync(self): + pass + + +def convert_to_cf_variable(array): + data = array.data + attributes = array.attributes.copy() + if isinstance(array.data, pd.DatetimeIndex): + (data, units, calendar) = datetimeindex2num(array.data) + attributes['units'] = units + attributes['calendar'] = calendar + return array_.Array(array.dimensions, data, attributes) + + +def convert_scipy_variable(var): + return array_.Array(var.dimensions, var.data, var._attributes) + + +class ScipyDataStore(AbstractDataStore): + """ + Stores data using the scipy.io.netcdf package. + This store has the advantage of being able to + be initialized with a StringIO object, allow for + serialization. + """ + def __init__(self, fobj, *args, **kwdargs): + self.ds = netcdf.netcdf_file(fobj, *args, **kwdargs) + + @property + def variables(self): + return FrozenOrderedDict((k, convert_scipy_variable(v)) + for k, v in self.ds.variables.iteritems()) + + @property + def attributes(self): + return Frozen(self.ds._attributes) + + @property + def dimensions(self): + return Frozen(self.ds.dimensions) + + def set_dimension(self, name, length): + if name in self.dimensions: + raise ValueError('%s does not support modifying dimensions' + % type(self).__name__) + self.ds.createDimension(name, length) + + def _validate_attr_key(self, key): + if not conventions.is_valid_name(key): + raise ValueError("Not a valid attribute name") + + def _cast_attr_value(self, value): + # Strings get special handling because netCDF treats them as + # character arrays. Everything else gets coerced to a numpy + # vector. netCDF treats scalars as 1-element vectors. Arrays of + # non-numeric type are not allowed. + if isinstance(value, basestring): + # netcdf attributes should be unicode + value = unicode(value) + else: + try: + value = conventions.coerce_type(np.atleast_1d(np.asarray(value))) + except: + raise ValueError("Not a valid value for a netCDF attribute") + if value.ndim > 1: + raise ValueError("netCDF attributes must be vectors " + + "(1-dimensional)") + value = conventions.coerce_type(value) + if str(value.dtype) not in conventions.TYPEMAP: + # A plain string attribute is okay, but an array of + # string objects is not okay! 
+ raise ValueError("Can not convert to a valid netCDF type") + return value + + def set_attribute(self, key, value): + self._validate_attr_key(key) + setattr(self.ds, key, self._cast_attr_value(value)) + + def set_variable(self, name, variable): + variable = convert_to_cf_variable(variable) + data = variable.data + dtype_convert = {'int64': 'int32', 'float64': 'float32'} + if str(data.dtype) in dtype_convert: + data = np.asarray(data, dtype=dtype_convert[str(data.dtype)]) + self.ds.createVariable(name, data.dtype, variable.dimensions) + scipy_var = self.ds.variables[name] + scipy_var[:] = data[:] + for k, v in variable.attributes.iteritems(): + self._validate_attr_key(k) + setattr(scipy_var, k, self._cast_attr_value(v)) + + def del_attribute(self, key): + delattr(self.ds, key) + + def sync(self): + self.ds.flush() + + +def convert_nc4_variable(var): + # we don't want to see scale_factor and add_offset in the attributes + # since the netCDF4 package automatically scales the data on read. + # If we kept scale_factor and add_offset around and did this: + # + # foo = ncdf4.Dataset('foo.nc') + # ncdf4.dump(foo, 'bar.nc') + # bar = ncdf4.Dataset('bar.nc') + # + # you would find that any packed variables in the original + # netcdf file would now have been scaled twice! + attr = OrderedDict((k, var.getncattr(k)) for k in var.ncattrs() + if k not in ['scale_factor', 'add_offset']) + return array_.Array(var.dimensions, var, attr, indexing_mode='orthogonal') + + +class NetCDF4DataStore(AbstractDataStore): + def __init__(self, filename, *args, **kwdargs): + # TODO: set auto_maskandscale=True so we can handle the array + # packing/unpacking ourselves (using NaN instead of masked arrays) + self.ds = nc4.Dataset(filename, *args, **kwdargs) + + @property + def variables(self): + return FrozenOrderedDict((k, convert_nc4_variable(v)) + for k, v in self.ds.variables.iteritems()) + + @property + def attributes(self): + return FrozenOrderedDict((k, self.ds.getncattr(k)) + for k in self.ds.ncattrs()) + + @property + def dimensions(self): + return FrozenOrderedDict((k, len(v)) for k, v in self.ds.dimensions.iteritems()) + + def set_dimension(self, name, length): + self.ds.createDimension(name, size=length) + + def set_attribute(self, key, value): + self.ds.setncatts({key: value}) + + def _cast_data(self, data): + if isinstance(data, pd.DatetimeIndex): + data = datetimeindex2num(data) + return data + + def set_variable(self, name, variable): + variable = convert_to_cf_variable(variable) + # netCDF4 will automatically assign a fill value + # depending on the datatype of the variable. Here + # we let the package handle the _FillValue attribute + # instead of setting it ourselves. 
+ fill_value = variable.attributes.pop('_FillValue', None) + self.ds.createVariable(varname=name, + datatype=variable.dtype, + dimensions=variable.dimensions, + fill_value=fill_value) + nc4_var = self.ds.variables[name] + nc4_var[:] = variable.data[:] + nc4_var.setncatts(variable.attributes) + + def del_attribute(self, key): + self.ds.delncattr(key) + + def sync(self): + self.ds.sync() diff --git a/src/xray/common.py b/src/xray/common.py new file mode 100644 index 00000000000..3a6cde96eea --- /dev/null +++ b/src/xray/common.py @@ -0,0 +1,97 @@ + +class ImplementsReduce(object): + @classmethod + def _reduce_method(cls, f, name=None, module=None): + def func(self, dimension=cls._reduce_dimension_default, + axis=cls._reduce_axis_default, **kwargs): + return self.reduce(f, dimension, axis, **kwargs) + if name is None: + name = f.__name__ + func.__name__ = name + func.__doc__ = cls._reduce_method_docstring.format( + name=('' if module is None else module + '.') + name, + cls=cls.__name__) + return func + + +class AbstractArray(ImplementsReduce): + @property + def dtype(self): + return getattr(self._data, 'dtype', object) + + @property + def shape(self): + return getattr(self._data, 'shape', ()) + + @property + def size(self): + return getattr(self._data, 'size', 1) + + @property + def ndim(self): + return getattr(self._data, 'ndim', 0) + + def __len__(self): + return len(self._data) + + def __nonzero__(self): + return bool(self._data) + + def __float__(self): + return float(self._data) + + def __int__(self): + return int(self._data) + + def __complex__(self): + return complex(self._data) + + def __long__(self): + return long(self._data) + + # adapted from pandas.NDFrame + # https://github.com/pydata/pandas/blob/master/pandas/core/generic.py#L699 + + def __array__(self, dtype=None): + return self.data + + # @property + # def __array_interface__(self): + # data = self.data + # return dict(typestr=data.dtype.str, shape=data.shape, data=data) + + @property + def T(self): + return self.transpose() + + _reduce_method_docstring = \ + """Reduce this {cls}'s data' by applying `{name}` along some + dimension(s) + + Parameters + ---------- + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `{name}`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `{name}`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then `{name}` is calculated over the flattened array + (by calling `{name}(x)` without an axis argument). + **kwargs : dict + Additional keyword arguments passed on to `{name}`. + + Note + ---- + If this method is called with multiple dimensions (or axes, which are + converted into dimensions), then `{name}` is performed repeatedly along + each dimension in turn from left to right. + + Returns + ------- + reduced : {cls} + New {cls} object with `{name}` applied to its data and the + indicated dimension(s) removed. 
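The `_reduce_method` factory above is presumably what the `ops` module (not shown in this diff) uses to attach aggregation shortcuts to these classes. A hedged sketch of what such an injected method amounts to:

```python
import numpy as np
from xray import Array

# hypothetical: wire up a reduction method by hand, the way the ops
# module presumably does when injecting operations
Array.ptp = Array._reduce_method(np.ptp, name='ptp', module='numpy')

v = Array(('time', 'x'), np.random.rand(10, 3))
assert v.ptp(dimension='time').dimensions == ('x',)
```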
+ """ + + _reduce_dimension_default = None + _reduce_axis_default = None diff --git a/src/scidata/conventions.py b/src/xray/conventions.py similarity index 100% rename from src/scidata/conventions.py rename to src/xray/conventions.py diff --git a/src/xray/dataset.py b/src/xray/dataset.py new file mode 100644 index 00000000000..a0f46a635e1 --- /dev/null +++ b/src/xray/dataset.py @@ -0,0 +1,638 @@ +import numpy as np +import netCDF4 as nc4 +import pandas as pd + +from cStringIO import StringIO +from collections import OrderedDict, Mapping + +import array_ as array +import backends +import conventions +import groupby +import utils +from dataset_array import DatasetArray +from utils import FrozenOrderedDict, Frozen, remap_loc_indexers + +date2num = nc4.date2num +num2date = nc4.num2date + + +def open_dataset(nc, *args, **kwargs): + # move this to a classmethod Dataset.open? + if isinstance(nc, basestring) and not nc.startswith('CDF'): + # If the initialization nc is a string and it doesn't + # appear to be the contents of a netcdf file we load + # it using the netCDF4 package + store = backends.NetCDF4DataStore(nc, *args, **kwargs) + else: + # If nc is a file-like object we read it using + # the scipy.io.netcdf package + store = backends.ScipyDataStore(nc, *args, **kwargs) + return Dataset.load_store(store) + + +# list of attributes of pd.DatetimeIndex that are ndarrays of time info +_DATETIMEINDEX_COMPONENTS = ['year', 'month', 'day', 'hour', 'minute', + 'second', 'microsecond', 'nanosecond', 'date', + 'time', 'dayofyear', 'weekofyear', 'dayofweek', + 'quarter'] + + +class _VariablesDict(OrderedDict): + """_VariablesDict is an OrderedDict subclass that also implements "virtual" + variables that are created from other variables on demand + + Currently, virtual variables are restricted to attributes of + pandas.DatetimeIndex objects (e.g., 'year', 'month', 'day', etc., plus + 'season' for climatological season), which are accessed by getting the item + 'time.year'. + """ + def _datetimeindices(self): + return [k for k, v in self.iteritems() + if isinstance(v._data, pd.DatetimeIndex)] + + @property + def virtual(self): + """Variables that don't exist in this dataset but for which could be + created on demand (because they can be calculated from other dataset + variables) + """ + virtual_vars = [] + for k in self._datetimeindices(): + for suffix in _DATETIMEINDEX_COMPONENTS + ['season']: + name = '%s.%s' % (k, suffix) + if name not in self: + virtual_vars.append(name) + return virtual_vars + + def _get_virtual_variable(self, key): + split_key = key.split('.') + if len(split_key) == 2: + ref_var, suffix = split_key + if ref_var in self._datetimeindices(): + if suffix == 'season': + # seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) + month = self[ref_var].data.month + data = (month // 3) % 4 + 1 + else: + data = getattr(self[ref_var].data, suffix) + return array.Array(self[ref_var].dimensions, data) + raise KeyError('virtual variable %r not found' % key) + + def __getitem__(self, key): + if key in self: + return OrderedDict.__getitem__(self, key) + elif key in self.virtual: + return self._get_virtual_variable(key) + else: + raise KeyError(repr(key)) + + +class Dataset(Mapping): + """A netcdf-like data object consisting of variables and attributes which + together form a self describing data set + + Dataset implements the mapping interface with keys given by variable names + and values given by DatasetArray objects focused on each variable name. 
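To make the mapping interface and the on-demand virtual time variables above concrete, a small sketch (invented data; `'time.month'` is one of the virtual names derived from `_DATETIMEINDEX_COMPONENTS`):

```python
import numpy as np
import pandas as pd
from xray import Dataset

times = pd.date_range('2000-01-01', periods=365)
ds = Dataset({'time': ('time', times),
              'tmax': ('time', 15 + 8 * np.random.randn(365))})

# mapping interface: keys are variable names, values are DatasetArrays
assert 'tmax' in ds
tmax = ds['tmax']

# virtual variables are computed on demand from the DatetimeIndex coordinate
month = ds['time.month']
```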
+ + Note: the size of dimensions in a dataset cannot be changed. + + Attributes + ---------- + variables : {name: variable, ...} + attributes : {key: value, ...} + dimensions : {name: length, ...} + coordinates : {name: variable, ...} + noncoordinates : {name: variable, ...} + virtual_variables : list + """ + def __init__(self, variables=None, attributes=None): + """To load data from a file or file-like object, use the `open_dataset` + function. + + Parameters + ---------- + variables : dict-like, optional + A mapping from variable names to `xray.Array` objects or sequences + of the form `(dimensions, data[, attributes])` which can be used as + arguments to create a new `xray.Array`. Each dimension must have + the same length in all variables in which it appears. One + dimensional variables with name equal to their dimension are + coordinate variables, which means they are saved in the dataset as + `pandas.Index` objects. + attributes : dict-like, optional + Global attributes to save on this dataset. + """ + self._variables = _VariablesDict() + self._dimensions = OrderedDict() + if variables is not None: + self._set_variables(variables) + if attributes is None: + attributes = {} + self._attributes = OrderedDict(attributes) + + def _as_variable(self, name, var): + if not isinstance(var, array.Array): + try: + var = array.Array(*var) + except TypeError: + raise TypeError('Dataset variables must be of type ' + 'DatasetArray or Array, or a sequence of the ' + 'form (dimensions, data[, attributes])') + + if name in var.dimensions: + # convert the coordinate into a pandas.Index + if var.ndim != 1: + raise ValueError('a coordinate variable must be defined with ' + '1-dimensional data') + attr = var.attributes + if 'units' in attr and 'since' in attr['units']: + var.data = utils.num2datetimeindex(var.data, attr.pop('units'), + attr.pop('calendar', None)) + else: + var.data = pd.Index(var.data) + return var + + def _set_variables(self, variables): + """Set a mapping of variables and update dimensions""" + # save new variables into a temporary list so all the error checking + # can be done before updating _variables + new_variables = [] + for k, var in variables.iteritems(): + var = self._as_variable(k, var) + for dim, size in zip(var.dimensions, var.shape): + if dim not in self._dimensions: + self._dimensions[dim] = size + if dim not in variables and dim not in self._variables: + coord = self._as_variable(dim, (dim, np.arange(size))) + new_variables.append((dim, coord)) + elif self._dimensions[dim] != size: + raise ValueError('dimension %r on variable %r has size %s ' + 'but already is saved with size %s' % + (dim, k, size, self._dimensions[dim])) + new_variables.append((k, var)) + self._variables.update(new_variables) + + @classmethod + def load_store(cls, store): + return cls(store.variables, store.attributes) + + @property + def variables(self): + return Frozen(self._variables) + + @property + def attributes(self): + return self._attributes + + @attributes.setter + def attributes(self, value): + self._attributes = OrderedDict(value) + + @property + def dimensions(self): + return Frozen(self._dimensions) + + def copy(self): + """ + Returns a shallow copy of the current object. + """ + return self.__copy__() + + def __copy__(self): + """ + Returns a shallow copy of the current object. + """ + return type(self)(self.variables, self.attributes) + + def __contains__(self, key): + """ + The 'in' operator will return true or false depending on + whether 'key' is a varibale in the data object or not. 
+ """ + return key in self.variables + + def __len__(self): + return len(self.variables) + + def __iter__(self): + return iter(self.variables) + + @property + def virtual_variables(self): + """Arrays that don't exist in this dataset but for which dataviews + could be created on demand (because they can be calculated from other + dataset variables or dimensions) + """ + return self._variables.virtual + + def __getitem__(self, key): + return DatasetArray(self.select(key), key) + + def __setitem__(self, key, value): + if isinstance(value, DatasetArray): + self.merge(value.renamed(key).dataset, inplace=True) + else: + self._set_variables({key: value}) + + def __delitem__(self, key): + del self._variables[key] + dims = set().union(v.dimensions for v in self._variables.itervalues()) + for dim in self._dimensions: + if dim not in dims: + del self._dimensions[dim] + + # mutable objects should not be hashable + __hash__ = None + + def __eq__(self, other): + try: + # some stores (e.g., scipy) do not seem to preserve order, so don't + # require matching dimension or variable order for equality + return (sorted(self.attributes.items()) + == sorted(other.attributes.items()) + and all(k1 == k2 and utils.variable_equal(v1, v2) + for (k1, v1), (k2, v2) + in zip(sorted(self.variables.items()), + sorted(other.variables.items())))) + except AttributeError: + return False + + def __ne__(self, other): + return not self == other + + @property + def coordinates(self): + """Coordinates are variables with names that match dimensions + + They are always stored internally as arrays with data that is a + pandas.Index object + """ + return FrozenOrderedDict([(dim, self.variables[dim]) + for dim in self.dimensions]) + + @property + def noncoordinates(self): + """Non-coordinates are variables with names that do not match + dimensions + """ + return FrozenOrderedDict([(name, v) + for (name, v) in self.variables.iteritems() + if name not in self.dimensions]) + + def dump_to_store(self, store): + """Store dataset contents to a backends.*DataStore object""" + store.set_dimensions(self.dimensions) + store.set_variables(self.variables) + store.set_attributes(self.attributes) + store.sync() + + def dump(self, filepath, *args, **kwdargs): + """Dump dataset contents to a location on disk using the netCDF4 + package + """ + nc4_store = backends.NetCDF4DataStore(filepath, mode='w', + *args, **kwdargs) + self.dump_to_store(nc4_store) + + def dumps(self): + """Serialize dataset contents to a string. The serialization creates an + in memory netcdf version 3 string using the scipy.io.netcdf package. 
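A sketch of the in-memory round trip this enables, driving the scipy store indirectly through `dumps` and `open_dataset` (the data here is made up):

```python
import numpy as np
from cStringIO import StringIO
import xray

ds = xray.Dataset({'y': ('x', np.linspace(0, 1, 5).astype('float32'))})

# serialize to a netCDF3 byte string entirely in memory ...
serialized = ds.dumps()

# ... and read it back through the scipy backend by wrapping the string
# in a file-like object
roundtripped = xray.open_dataset(StringIO(serialized))
assert 'y' in roundtripped
```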
+ """ + fobj = StringIO() + scipy_store = backends.ScipyDataStore(fobj, mode='w') + self.dump_to_store(scipy_store) + return fobj.getvalue() + + def __str__(self): + """Create a ncdump-like summary of the object""" + summary = ["dimensions:"] + # prints dims that look like: + # dimension = length + dim_print = lambda d, l : "\t%s = %s" % (conventions.pretty_print(d, 30), + conventions.pretty_print(l, 10)) + # add each dimension to the summary + summary.extend([dim_print(d, l) for d, l in self.dimensions.iteritems()]) + + # Print variables + summary.append("variables:") + for vname, var in self.variables.iteritems(): + # this looks like: + # dtype name(dim1, dim2) + summary.append("\t%s %s(%s)" % (conventions.pretty_print(var.dtype, 8), + conventions.pretty_print(vname, 20), + conventions.pretty_print(', '.join(var.dimensions), 45))) + # attribute:value + summary.extend(["\t\t%s:%s" % (conventions.pretty_print(att, 30), + conventions.pretty_print(val, 30)) + for att, val in var.attributes.iteritems()]) + + summary.append("attributes:") + # attribute:value + summary.extend(["\t%s:%s" % (conventions.pretty_print(att, 30), + conventions.pretty_print(val, 30)) + for att, val in self.attributes.iteritems()]) + # create the actual summary + return '\n'.join(summary).replace('\t', ' ' * 4) + + def __repr__(self): + dim_summary = ', '.join('%s: %s' % (k, v) for k, v + in self.dimensions.iteritems()) + return '' % (type(self).__name__, dim_summary, + ' '.join(self.noncoordinates)) + + def indexed_by(self, **indexers): + """Return a new dataset with each array indexed along the specified + dimension(s) + + This method selects values from each array using its `__getitem__` + method, except this method does not require knowing the order of + each array's dimensions. + + Parameters + ---------- + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by integers, slice objects or arrays. + + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + array and dimension is indexed by the appropriate indexers. In + general, each array's data will be a view of the array's data + in this dataset, unless numpy fancy indexing was triggered by using + an array indexer, in which case the data will be a copy. + + See Also + -------- + Dataset.labeled_by + Dataset.indexed_by + Array.indexed_by + """ + invalid = [k for k in indexers if not k in self.dimensions] + if invalid: + raise ValueError("dimensions %r do not exist" % invalid) + + # all indexers should be int, slice or np.ndarrays + indexers = {k: np.asarray(v) if not isinstance(v, (int, slice)) else v + for k, v in indexers.iteritems()} + + variables = OrderedDict() + for name, var in self.variables.iteritems(): + var_indexers = {k: v for k, v in indexers.iteritems() + if k in var.dimensions} + new_var = var.indexed_by(**var_indexers) + if new_var.ndim > 0: + # filter out variables reduced to numbers + variables[name] = new_var + + return type(self)(variables, self.attributes) + + def labeled_by(self, **indexers): + """Return a new dataset with each variable indexed by coordinate labels + along the specified dimension(s) + + In contrast to `Dataset.indexed_by`, indexers for this method should + use coordinate values instead of integers. + + Under the hood, this method is powered by using Panda's powerful Index + objects. This makes label based indexing essentially just as fast as + using integer indexing. 
+ + It also means this method uses pandas's (well documented) logic for + indexing. This means you can use string shortcuts for datetime indexes + (e.g., '2000-01' to select all values in January 2000). It also means + that slices are treated as inclusive of both the start and stop values, + unlike normal Python indexing. + + Parameters + ---------- + **indexers : {dim: indexer, ...} + Keyword arguments with names matching dimensions and values given + by individual, slices or arrays of coordinate values. + + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + variable and dimension is indexed by the appropriate indexers. In + general, each variable's data will be a view of the variable's data + in this dataset, unless numpy fancy indexing was triggered by using + an array indexer, in which case the data will be a copy. + + See Also + -------- + Dataset.labeled_by + Dataset.indexed_by + Array.indexed_by + """ + return self.indexed_by(**remap_loc_indexers(self.variables, indexers)) + + def renamed(self, name_dict): + """ + Returns a new object with renamed variables and dimensions + + Parameters + ---------- + name_dict : dict-like + Dictionary-like object whose keys are current variable or dimension + names and whose values are new names. + """ + for k in name_dict: + if k not in self.variables: + raise ValueError("Cannot rename %r because it is not a " + "variable in this dataset" % k) + variables = OrderedDict() + for k, v in self.variables.iteritems(): + name = name_dict.get(k, k) + dims = tuple(name_dict.get(dim, dim) for dim in v.dimensions) + #TODO: public interface for renaming a variable without loading + # data? + variables[name] = array.Array(dims, v._data, v.attributes, + v._indexing_mode) + + return type(self)(variables, self.attributes) + + def merge(self, other, inplace=False): + """Merge two datasets into a single new dataset + + This method generally not allow for overriding data. Arrays, + dimensions and indices are checked for conflicts. However, conflicting + attributes are removed. + + Parameters + ---------- + other : Dataset + Dataset to merge with this dataset. + inplace : bool, optional + If True, merge the other dataset into this dataset in-place. + + Returns + ------- + merged : Dataset + Merged dataset. + + Raises + ------ + ValueError + If any variables or dimensions conflict. Conflicting attributes + are silently dropped. + """ + # check for conflicts + utils.update_safety_check(self.variables, other.variables, + compat=utils.variable_equal) + # update contents + obj = self if inplace else self.copy() + obj._set_variables(OrderedDict((k, v) for k, v + in other.variables.iteritems() + if k not in obj.variables)) + # remove conflicting attributes + for k, v in other.attributes.iteritems(): + if k in self.attributes and v != self.attributes[k]: + del self.attributes[k] + return obj + + def select(self, *names): + """Returns a new dataset that contains the named variables + + Dimensions on which those variables are defined are also included, as + well as the corresponding coordinate variables, and any variables + listed under the 'coordinates' attribute of the named variables. + + Parameters + ---------- + *names : str + Names of the variables to include in the returned object. + + Returns + ------- + Dataset + The returned object has the same attributes as the original. 
+ Variables are included (recursively) if at least one of the + specified variables refers to that variable in its dimensions or + "coordinates" attribute. All other variables are dropped. + """ + possible_vars = set(self) | set(self.virtual_variables) + if not set(names) <= possible_vars: + raise ValueError( + "One or more of the specified variables does not exist") + + def get_all_associated_names(name): + yield name + if name in possible_vars: + var = self.variables[name] + for dim in var.dimensions: + yield dim + if 'coordinates' in var.attributes: + coords = var.attributes['coordinates'] + if coords != '': + for coord in coords.split(' '): + yield coord + + queue = set(names) + selected_names = set() + while queue: + name = queue.pop() + new_names = set(get_all_associated_names(name)) + queue |= new_names - selected_names + selected_names |= new_names + + variables = OrderedDict((k, self.variables[k]) + for k in list(self) + self.virtual_variables + if k in selected_names) + return type(self)(variables, self.attributes) + + def unselect(self, *names): + """Returns a new dataset without the named variables + + Parameters + ---------- + *names : str + Names of the variables to omit from the returned object. + + Returns + ------- + Dataset + New dataset based on this dataset. Only the named variables are + removed. + """ + if any(k not in self.variables and k not in self.dimensions + for k in names): + raise ValueError('One or more of the specified variable/dimension ' + 'names does not exist on this dataset') + variables = OrderedDict((k, v) for k, v in self.variables.iteritems() + if k not in names) + return type(self)(variables, self.attributes) + + def replace(self, name, variable): + """Returns a new dataset with the variable 'name' replaced with + 'variable' + + Parameters + ---------- + name : str + Name of the variable to replace in this object. + variable : Array + Replacement variable. + + Returns + ------- + Dataset + New dataset based on this dataset. Dimensions are unchanged. + """ + ds = self.unselect(name) + ds[name] = variable + return ds + + def groupby(self, group, squeeze=True): + """Group this dataset by unique values of the indicated group + + Parameters + ---------- + group : str or DatasetArray + Array whose unique values should be used to group this array. If a + string, must be the name of a variable contained in this dataset. + squeeze : boolean, optional + If "group" is a coordinate of this array, `squeeze` controls + whether the subarrays have a dimension of length 1 along that + coordinate or if the dimension is squeezed out. + + Returns + ------- + grouped : GroupBy + A `GroupBy` object patterned after `pandas.GroupBy` that can be + iterated over in the form of `(unique_value, grouped_array)` pairs. + """ + if isinstance(group, basestring): + # merge in the group's dataset to allow group to be a virtual + # variable in this dataset + ds = self.merge(self[group].dataset) + group = DatasetArray(ds, group) + return groupby.GroupBy(self, group.focus, group, squeeze=squeeze) + + def to_dataframe(self): + """Convert this dataset into a pandas.DataFrame + + Non-coordinate variables in this dataset form the columns of the + DataFrame. The DataFrame is be indexed by the Cartesian product of + this dataset's indices. 
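+
+        A sketch of the intended behaviour (illustrative; assumes the
+        constructor accepts `(dims, data)` pairs and that numpy is imported
+        as `np`):
+
+            ds = Dataset({'x': ('x', [0, 1]),
+                          'y': ('y', [10, 20, 30]),
+                          'foo': (('x', 'y'), np.arange(6).reshape(2, 3))})
+            df = ds.to_dataframe()   # one 'foo' column, MultiIndex over (x, y)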
+ """ + index_names = self.coordinates.keys() + columns = self.noncoordinates.keys() + data = [] + # we need a template to broadcast all dataset variables against + # using stride_tricks lets us make the ndarray for broadcasting without + # having to allocate memory + shape = tuple(self.dimensions.values()) + empty_data = np.lib.stride_tricks.as_strided(np.array(0), shape=shape, + strides=[0] * len(shape)) + template = array.Array(self.dimensions.keys(), empty_data) + for k in columns: + _, var = array.broadcast_variables(template, self[k]) + _, var_data = np.broadcast_arrays(template.data, var.data) + data.append(var_data.reshape(-1)) + # note: pd.MultiIndex.from_product is new in pandas-0.13.1 + index = pd.MultiIndex.from_product(self.coordinates.values(), + names=index_names) + return pd.DataFrame(OrderedDict(zip(columns, data)), index=index) diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py new file mode 100644 index 00000000000..84d4daf7f46 --- /dev/null +++ b/src/xray/dataset_array.py @@ -0,0 +1,479 @@ +# TODO: replace aggregate and iterator methods by a 'groupby' method/object +# like pandas +import functools +import re +from collections import OrderedDict + +import numpy as np +import pandas as pd + +import array_ +import dataset as dataset_ +import groupby +import ops +from common import AbstractArray +from utils import expanded_indexer, FrozenOrderedDict, remap_loc_indexers + + +class _LocIndexer(object): + def __init__(self, ds_array): + self.ds_array = ds_array + + def _remap_key(self, key): + indexers = remap_loc_indexers(self.ds_array.dataset.variables, + self.ds_array._key_to_indexers(key)) + return tuple(indexers.values()) + + def __getitem__(self, key): + return self.ds_array[self._remap_key(key)] + + def __setitem__(self, key, value): + self.ds_array[self._remap_key(key)] = value + + +class DatasetArray(AbstractArray): + """Hybrid between Dataset and Array + + Dataset arrays are the primary way to do computations with Dataset + variables. They are designed to make it easy to manipulate arrays in the + context of an intact Dataset object. Indeed, the contents of a DatasetArray + are uniquely defined by its `dataset` and `focus` paramters. + + Getting items from or doing mathematical operations with a dataset array + returns another dataset array. + + The design of DatasetArray is strongly inspired by the Iris Cube. However, + dataset arrays are much lighter weight than cubes. They are simply aligned, + labeled datasets and do not explicitly guarantee or rely on the CF model. + """ + def __init__(self, dataset, focus): + """ + Parameters + ---------- + dataset : xray.Dataset + The dataset on which to build this dataset array. + focus : str + The name of the "focus variable" in `dataset` on which this object + is oriented. This is the variable on which mathematical operations + are applied. 
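+
+        For example (an illustrative sketch, assuming `Dataset` is imported
+        from the top-level package):
+
+            ds = Dataset({'x': ('x', [0, 1, 2]),
+                          't': ('x', [10., 11., 12.])})
+            arr = DatasetArray(ds, 't')   # arr.focus == 't'; arr.dataset is ds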
+ """ + if not isinstance(dataset, dataset_.Dataset): + dataset = dataset_.Dataset(dataset) + if not focus in dataset: + raise ValueError('focus %r is not a variable in dataset %r' + % (focus, dataset)) + self.dataset = dataset + self.focus = focus + + @property + def array(self): + return self.dataset.variables[self.focus] + @array.setter + def array(self, value): + self.dataset[self.focus] = value + + # _data is necessary for AbstractArray + @property + def _data(self): + return self.array._data + + @property + def data(self): + """The array's data as a numpy.ndarray""" + return self.array.data + @data.setter + def data(self, value): + self.array.data = value + + @property + def dimensions(self): + return self.array.dimensions + + def _key_to_indexers(self, key): + return OrderedDict( + zip(self.dimensions, expanded_indexer(key, self.ndim))) + + def __getitem__(self, key): + if isinstance(key, basestring): + # grab another dataset array from the dataset + return self.dataset[key] + else: + # orthogonal array indexing + return self.indexed_by(**self._key_to_indexers(key)) + + def __setitem__(self, key, value): + if isinstance(key, basestring): + # add an array to the dataset + self.dataset[key] = value + else: + # orthogonal array indexing + self.array[key] = value + + def __delitem__(self, key): + del self.dataset[key] + + def __contains__(self, key): + return key in self.dataset + + @property + def loc(self): + """Attribute for location based indexing like pandas + """ + return _LocIndexer(self) + + def __iter__(self): + for n in range(len(self)): + yield self[n] + + @property + def attributes(self): + return self.array.attributes + + @property + def variables(self): + return self.dataset.variables + + @property + def coordinates(self): + return FrozenOrderedDict((k, self.dataset.variables[k]) + for k in self.dimensions) + + def copy(self): + return self.__copy__() + + def __copy__(self): + # shallow copy the underlying dataset + return DatasetArray(self.dataset.copy(), self.focus) + + # mutable objects should not be hashable + __hash__ = None + + def __str__(self): + #TODO: make this less hacky + return re.sub(' {4}(%s\s+%s)' % (self.dtype, self.focus), + r'--> \1', str(self.dataset)) + + def __repr__(self): + if self.ndim > 0: + dim_summary = ', '.join('%s: %s' % (k, v) for k, v + in zip(self.dimensions, self.shape)) + contents = ' (%s): %s' % (dim_summary, self.dtype) + else: + contents = ': %s' % self.data + return '' % (type(self).__name__, self.focus, contents) + + def indexed_by(self, **indexers): + """Return a new dataset array whose dataset is given by indexing along + the specified dimension(s) + + See Also + -------- + Dataset.indexed_by + """ + ds = self.dataset.indexed_by(**indexers) + if self.focus not in ds: + # always keep focus variable in the dataset, even if it was + # unselected because indexing made it a scaler + ds[self.focus] = self.array.indexed_by(**indexers) + return type(self)(ds, self.focus) + + def labeled_by(self, **indexers): + """Return a new dataset array whose dataset is given by selecting + coordinate labels along the specified dimension(s) + + See Also + -------- + Dataset.labeled_by + """ + return self.indexed_by(**remap_loc_indexers(self.dataset.variables, + indexers)) + + def renamed(self, new_name): + """Returns a new DatasetArray with this DatasetArray's focus variable + renamed + """ + renamed_dataset = self.dataset.renamed({self.focus: new_name}) + return type(self)(renamed_dataset, new_name) + + def unselected(self): + """Returns a copy of 
this DatasetArray's dataset with this + DatasetArray's focus variable removed + """ + return self.dataset.unselect(self.focus) + + def unselect(self, *names): + if self.focus in names: + raise ValueError('cannot unselect the focus variable of a ' + 'DatasetArray with unselect. Use the `unselected`' + 'method or the `unselect` method of the dataset.') + return type(self)(self.dataset.unselect(*names), self.focus) + + def refocus(self, new_var, name=None): + """Returns a copy of this DatasetArray's dataset with this + DatasetArray's focus variable replaced by `new_var` + + If `new_var` is a dataset array, its contents will be merged in. + """ + if not hasattr(new_var, 'dimensions'): + new_var = type(self.array)(self.array.dimensions, new_var) + if self.focus not in self.dimensions: + # only unselect the focus from the dataset if it isn't a coordinate + # variable + ds = self.unselected() + else: + ds = self.dataset + if name is None: + name = self.focus + '_' + ds[name] = new_var + return type(self)(ds, name) + + def groupby(self, group, squeeze=True): + """Group this dataset by unique values of the indicated group + + Parameters + ---------- + group : str or DatasetArray + Array whose unique values should be used to group this array. If a + string, must be the name of a variable contained in this dataset. + squeeze : boolean, optional + If "group" is a coordinate of this array, `squeeze` controls + whether the subarrays have a dimension of length 1 along that + coordinate or if the dimension is squeezed out. + + Returns + ------- + grouped : GroupBy + A `GroupBy` object patterned after `pandas.GroupBy` that can be + iterated over in the form of `(unique_value, grouped_array)` pairs + or over which grouped operations can be applied with the `apply` + and `reduce` methods (and the associated aliases `mean`, `sum`, + `std`, etc.). + """ + if isinstance(group, basestring): + # merge in the group's dataset to allow group to be a virtual + # variable in this dataset + ds = self.dataset.merge(self.dataset[group].dataset) + group = DatasetArray(ds, group) + return groupby.ArrayGroupBy(self, group.focus, group, squeeze=squeeze) + + def transpose(self, *dimensions): + """Return a new DatasetArray object with transposed dimensions + + Note: Although this operation returns a view of this array's data, it + is not lazy -- the data will be fully loaded. + + Parameters + ---------- + *dimensions : str, optional + By default, reverse the dimensions. Otherwise, reorder the + dimensions to this order. + + Returns + ------- + transposed : DatasetArray + The returned DatasetArray's array is transposed. + + See Also + -------- + numpy.transpose + Array.transpose + """ + return self.refocus(self.array.transpose(*dimensions), self.focus) + + def reduce(self, func, dimension=None, axis=None, **kwargs): + """Reduce this array by applying `func` along some dimension(s) + + Parameters + ---------- + func : function + Function which can be called in the form + `f(x, axis=axis, **kwargs)` to return the result of reducing an + np.ndarray over an integer valued axis. + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `func`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `func`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then the reduction is calculated over the flattened array + (by calling `f(x)` without an axis argument). 
+ **kwargs : dict + Additional keyword arguments passed on to `func`. + + Note + ---- + If `reduce` is called with multiple dimensions (or axes, which + are converted into dimensions), then the reduce operation is + performed repeatedly along each dimension in turn from left to right. + + Returns + ------- + reduced : DatasetArray + DatasetArray with this object's array replaced with an array with + summarized data and the indicated dimension(s) removed. + """ + var = self.array.reduce(func, dimension, axis, **kwargs) + drop = set(self.dimensions) - set(var.dimensions) + # For now, take an aggressive strategy of removing all variables + # associated with any dropped dimensions + # TODO: save some summary (mean? bounds?) of dropped variables + drop |= {k for k, v in self.dataset.variables.iteritems() + if any(dim in drop for dim in v.dimensions)} + ds = self.dataset.unselect(*drop) + ds[self.focus] = var + return type(self)(ds, self.focus) + + @classmethod + def from_stack(cls, arrays, dimension='stacked_dimension', + stacked_indexers=None, length=None, template=None): + """Stack arrays along a new or existing dimension to form a new + dataview + + Parameters + ---------- + arrays : iterable of Array + Arrays to stack together. Each variable is expected to have + matching dimensions and shape except for along the stacked + dimension. + dimension : str or Array, optional + Name of the dimension to stack along. This can either be a new + dimension name, in which case it is added along axis=0, or an + existing dimension name, in which case the location of the + dimension is unchanged. Where to insert the new dimension is + determined by whether it is found in the first array. + stacked_indexers : optional + length : optional + template : optional + + Returns + ------- + stacked : DatasetArray + Stacked dataset array formed by stacking all the supplied variables + along the new dimension. + """ + ds = dataset_.Dataset() + if isinstance(dimension, basestring): + dim_name = dimension + else: + dim_name, = dimension.dimensions + if hasattr(dimension, 'focus'): + ds[dimension.focus] = dimension + + if template is not None: + # use metadata from the template dataset array + focus = template.focus + old_dim_name, = template.dataset.variables[dim_name].dimensions + drop = {k for k, v in template.dataset.variables.iteritems() + if old_dim_name in v.dimensions} + ds.merge(template.dataset.unselect(*drop), inplace=True) + else: + # figure out metadata by inspecting each array + focus = None + arrays = list(arrays) + for array in arrays: + if isinstance(array, cls): + unselected = array.unselected() + drop = {k for k, v in unselected.variables.iteritems() + if k == dim_name or dim_name in v.dimensions} + if drop: + unselected = unselected.unselect(*drop) + ds.merge(unselected, inplace=True) + if focus is None: + focus = array.focus + elif focus != array.focus: + raise ValueError('DatasetArray.from_stack requires ' + 'that all stacked views have the ' + 'same focus') + if focus is None: + focus = 'stacked_variable' + + ds[focus] = array_.Array.from_stack(arrays, dimension, + stacked_indexers, length, template) + return cls(ds, focus) + + def to_dataframe(self): + """Convert this array into a pandas.DataFrame + + Non-coordinate variables in this array's dataset (which include the + view's data) form the columns of the DataFrame. The DataFrame is be + indexed by the Cartesian product of the dataset's coordinates. 
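+
+        This is a thin wrapper around the dataset method; for example
+        (sketch, with `arr` standing in for any DatasetArray):
+
+            arr.to_dataframe()   # same result as arr.dataset.to_dataframe()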
+ """ + return self.dataset.to_dataframe() + + def to_series(self): + """Conver this array into a pandas.Series + + The Series is be indexed by the Cartesian product of the coordinates. + Unlike `to_dataframe`, only the variable at the focus of this array is + including in the returned series. + """ + index = pd.MultiIndex.from_product(self.coordinates.values(), + names=self.coordinates.keys()) + return pd.Series(self.data.reshape(-1), index=index, name=self.focus) + + def __array_wrap__(self, obj, context=None): + return self.refocus(self.array.__array_wrap__(obj, context)) + + @staticmethod + def _unary_op(f): + @functools.wraps(f) + def func(self, *args, **kwargs): + return self.refocus(f(self.array, *args, **kwargs), + self.focus + '_' + f.__name__) + return func + + def _check_coordinates_compat(self, other): + # TODO: possibly automatically select index intersection instead? + if hasattr(other, 'coordinates'): + for k, v in self.coordinates.iteritems(): + if (k in other.coordinates + and not np.array_equal(v, other.coordinates[k])): + raise ValueError('coordinate %r is not aligned' % k) + + @staticmethod + def _binary_op(f, reflexive=False): + @functools.wraps(f) + def func(self, other): + # TODO: automatically group by other variable dimensions to allow + # for broadcasting dimensions like 'dayofyear' against 'time' + self._check_coordinates_compat(other) + ds = self.unselected() + if hasattr(other, 'unselected'): + ds.merge(other.unselected(), inplace=True) + other_array = getattr(other, 'array', other) + other_focus = getattr(other, 'focus', 'other') + focus = self.focus + '_' + f.__name__ + '_' + other_focus + ds[focus] = (f(self.array, other_array) + if not reflexive + else f(other_array, self.array)) + return type(self)(ds, focus) + return func + + @staticmethod + def _inplace_binary_op(f): + @functools.wraps(f) + def func(self, other): + self._check_coordinates_compat(other) + other_array = getattr(other, 'array', other) + self.array = f(self.array, other_array) + if hasattr(other, 'unselected'): + self.dataset.merge(other.unselected(), inplace=True) + return self + return func + +ops.inject_special_operations(DatasetArray, priority=60) + + +def align(array1, array2): + """Given two Dataset or DatasetArray objects, returns two new objects where + all coordinates found on both datasets are replaced by their intersection, + and thus are aligned for performing mathematical operations. + """ + # TODO: automatically align when doing math with arrays, or better yet + # calculate the union of the indices and fill in the mis-aligned data with + # NaN. + overlapping_coords = {k: (array1.coordinates[k].data + & array2.coordinates[k].data) + for k in array1.coordinates + if k in array2.coordinates} + return tuple(ar.labeled_by(**overlapping_coords) + for ar in [array1, array2]) diff --git a/src/xray/groupby.py b/src/xray/groupby.py new file mode 100644 index 00000000000..bab4ff33298 --- /dev/null +++ b/src/xray/groupby.py @@ -0,0 +1,296 @@ +import itertools + +from common import ImplementsReduce +from ops import inject_reduce_methods +import array_ +import dataset +import numpy as np + + +def unique_value_groups(ar): + """Group an array by its unique values + + Parameters + ---------- + ar : array_like + Input array. This will be flattened if it is not already 1-D. + + Returns + ------- + values : np.ndarray + Sorted, unique values as returned by `np.unique`. 
+ indices : list of lists of int + Each element provides the integer indices in `ar` with values given by + the corresponding value in `unique_values`. + """ + values, inverse = np.unique(ar, return_inverse=True) + groups = [[] for _ in range(len(values))] + for n, g in enumerate(inverse): + groups[g].append(n) + return values, groups + + +def peek_at(iterable): + """Returns the first value from iterable, as well as a new iterable with + the same content as the original iterable + """ + gen = iter(iterable) + peek = gen.next() + return peek, itertools.chain([peek], gen) + + +class GroupBy(object): + """A object that implements the split-apply-combine pattern + + Modeled after `pandas.GroupBy`. The `GroupBy` object can be iterated over + (unique_value, grouped_array) pairs, but the main way to interact with a + groupby object are with the `apply` or `reduce` methods. You can also + directly call numpy methods like `mean` or `std`. + + See Also + -------- + Array.groupby + DatasetArray.groupby + """ + def __init__(self, obj, group_name, group_coord, squeeze=True): + """See Array.groupby and DatasetArray.groupby + """ + if group_coord.ndim != 1: + # TODO: remove this limitation? + raise ValueError('`group_coord` must be 1 dimensional') + + self.obj = obj + self.group_coord = group_coord + self.group_dim, = group_coord.dimensions + + dimensions = obj.dimensions + try: + expected_size = dimensions[self.group_dim] + except TypeError: + expected_size = obj.shape[obj.dimensions.index(self.group_dim)] + + if group_coord.size != expected_size: + raise ValueError('the group variable\'s length does not ' + 'match the length of this variable along its ' + 'dimension') + + if group_name in obj.dimensions: + # assume that group_coord already has sorted, unique values + if group_coord.dimensions != (group_name,): + raise ValueError('`group_coord` is required to be a coordinate ' + 'variable along the `group_name` dimension ' + 'if `group_name` is a dimension in `obj`') + group_indices = np.arange(group_coord.size) + if not squeeze: + # group_indices = group_indices.reshape(-1, 1) + # use slices to do views instead of fancy indexing + group_indices = [slice(i, i + 1) for i in group_indices] + unique_coord = group_coord + else: + # look through group_coord to find the unique values + unique_values, group_indices = unique_value_groups(group_coord) + variables = {group_name: (group_name, unique_values)} + unique_coord = dataset.Dataset(variables)[group_name] + + self.group_indices = group_indices + self.unique_coord = unique_coord + self._groups = None + + @property + def groups(self): + # provided for compatibility with pandas.groupby + if self._groups is None: + self._groups = dict(zip(self.unique_coord, self.group_indices)) + return self._groups + + def __len__(self): + return self.unique_coord.size + + def __iter__(self): + return itertools.izip(self.unique_coord, self.iter_indexed()) + + def iter_indexed(self): + for indices in self.group_indices: + yield self.obj.indexed_by(**{self.group_dim: indices}) + + +class ArrayGroupBy(GroupBy, ImplementsReduce): + def iter_shortcut(self): + """Fast version of `iter_groups` that yields Arrays without metadata + """ + # extract the underlying Array object + array = self.obj + if hasattr(array, 'array'): + array = array.array + + group_axis = array.dimensions.index(self.group_dim) + + # build the new dimensions + index_int = isinstance(self.group_indices[0], int) + if index_int: + dims = tuple(d for n, d in enumerate(array.dimensions) + if n != group_axis) + else: 
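+            # indexing with slices or integer lists keeps the grouped
+            # dimension, so the original dimension names still apply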
+ dims = array.dimensions + + # slice the data and build the new Arrays directly + for indices in self.group_indices: + indexer = tuple(indices if n == group_axis else slice(None) + for n in range(array.ndim)) + data = array.data[indexer] + yield array_.Array(dims, data) + + def apply(self, func, shortcut=False, **kwargs): + """Apply a function over each array in the group and stack them + together into a new array + + `func` is called like `func(ar, *args, **kwargs)` for each array `ar` + in this group. + + Apply uses heuristics (like `pandas.GroupBy.apply`) to figure out how + to stack together the array. The rule is: + 1. If the dimension along which the group coordinate is defined is + still in the first grouped array after applying `func`, then stack + over this dimension. + 2. Otherwise, stack over the new dimension given by name of this + grouping (the argument to the `groupby` function). + + Parameters + ---------- + func : function + Callable to apply to each array. + shortcut : bool, optional + Whether or not to shortcut evaluation under the assumptions that: + (1) The action of `func` does not depend on any of the array + metadata (attributes, indices or other contained arrays) but + only on the data and dimensions. + (2) The action of `func` creates arrays with homogeneous metadata, + that is, with the same dimensions and attributes. + If these conditions are satisfied (and they should be in most + cases), the `shortcut` provides significant speedup for common + groupby operations like applying numpy ufuncs. + **kwargs + Used to call `func(ar, **kwargs)` for each array `ar. + + Returns + ------- + applied : Array + A new Array of the same type from which this grouping was created. + """ + applied = (func(ar, **kwargs) for ar in (self.iter_shortcut() if shortcut + else self.iter_indexed())) + + # peek at applied to determine which coordinate to stack over + applied_example, applied = peek_at(applied) + if self.group_dim in applied_example.dimensions: + stack_coord = self.group_coord + indexers = self.group_indices + else: + stack_coord = self.unique_coord + indexers = np.arange(self.unique_coord.size) + + from_stack_kwargs = {'template': self.obj} if shortcut else {} + stacked = type(self.obj).from_stack(applied, stack_coord, indexers, + **from_stack_kwargs) + + # now, reorder the stacked array's dimensions so that those that + # appeared in the original array appear in the same order they did + # originally + stack_dim, = stack_coord.dimensions + original_dims = [stack_dim if d == self.group_dim else d + for d in self.obj.dimensions + if d in stacked.dimensions or d == self.group_dim] + iter_original_dims = iter(original_dims) + new_order = [iter_original_dims.next() if d in original_dims else d + for d in stacked.dimensions] + return stacked.transpose(*new_order) + + def reduce(self, func, dimension=Ellipsis, axis=Ellipsis, shortcut=True, + **kwargs): + """Reduce this variable by applying `func` along some dimension(s) + + Parameters + ---------- + func : function + Function which can be called in the form + `func(x, axis=axis, **kwargs)` to return the result of collapsing an + np.ndarray over an integer valued axis. + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `func`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `func`. Only one of the + 'dimension' and 'axis' arguments can be supplied. 
If neither are + supplied, then `{name}` is calculated over the axis of the variable + over which the group was formed. + **kwargs : dict + Additional keyword arguments passed on to `func`. + + Note + ---- + If `reduce` is called with multiple dimensions (or axes, which + are converted into dimensions), then the reduce operation is + performed repeatedly along each dimension in turn from left to right. + + `Ellipsis` is used as a sentinel value for the default dimension and + axis to indicate that this operation is applied along the axis over + which the group was formed, instead of all axes. To instead apply + `{name}` simultaneously over all grouped values, use `dimension=None` + (or equivalently `axis=None`). + + Returns + ------- + reduced : Array + Array with summarized data and the indicated dimension(s) + removed. + """ + # Ellipsis is used as a sentinel value for the altered default + if axis is Ellipsis and dimension is Ellipsis: + dimension = self.group_dim + if dimension is Ellipsis: + dimension = None + if axis is Ellipsis: + axis = None + def reduce_array(ar): + return ar.reduce(func, dimension, axis, **kwargs) + return self.apply(reduce_array, shortcut=shortcut) + + _reduce_method_docstring = \ + """Reduce this {cls}'s data' by applying `{name}` along some + dimension(s) + + Parameters + ---------- + dimension : str or sequence of str, optional + Dimension(s) over which to repeatedly apply `{name}`. + axis : int or sequence of int, optional + Axis(es) over which to repeatedly apply `{name}`. Only one of the + 'dimension' and 'axis' arguments can be supplied. If neither are + supplied, then `{name}` is calculated over the axis of the variable + over which the group was formed. + **kwargs : dict + Additional keyword arguments passed on to `{name}`. + + Note + ---- + If this method is called with multiple dimensions (or axes, which are + converted into dimensions), then `{name}` is performed repeatedly along + each dimension in turn from left to right. + + `Ellipsis` is used as a sentinel value for the default dimension and + axis to indicate that this operation is applied along the axis over + which the group was formed, instead of all axes. To instead apply + `{name}` simultaneously over all grouped values, use `dimension=None` + (or equivalently `axis=None`). + + Returns + ------- + reduced : {cls} + New {cls} object with `{name}` applied to its data and the + indicated dimension(s) removed. 
+ """ + + _reduce_dimension_default = Ellipsis + _reduce_axis_default = Ellipsis + + +inject_reduce_methods(ArrayGroupBy) diff --git a/src/xray/ops.py b/src/xray/ops.py new file mode 100644 index 00000000000..4bb6215dede --- /dev/null +++ b/src/xray/ops.py @@ -0,0 +1,71 @@ +import operator + +import numpy as np + + +UNARY_OPS = ['neg', 'pos', 'abs', 'invert'] +CMP_BINARY_OPS = ['lt', 'le', 'eq', 'ne', 'ge', 'gt'] +NUM_BINARY_OPS = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', + 'pow', 'and', 'xor', 'or'] +# methods which should return the standard numpy return value unchanged +# some of these can probably be wrapped +NUMPY_CONVERT_METHODS = ['choose', 'compress', 'flatten', 'item', 'itemset', + 'nonzero', 'ravel', 'repeat', 'reshape', + 'searchsorted', 'squeeze', 'swapaxes', 'take', + 'trace', 'diagonal', 'dot'] +# methods which don't modify the data shape, so the result should still be +# wrapped in an Variable/DataView +NUMPY_UNARY_METHODS = ['argsort', 'clip', 'conj', 'conjugate', 'fill', + 'getfield', 'newbyteorder', 'put', 'round', 'setfield', + 'setflags', 'view'] +# methods which remove an axis +NUMPY_REDUCE_METHODS = ['all', 'any', 'argmax', 'argmin', 'cumprod', + 'cumsum', 'max', 'mean', 'min', 'prod', 'ptp', 'std', + 'sum', 'var'] + + +def _data_method_wrapper(f): + def func(self, *args, **kwargs): + return getattr(self.data, f)(*args, **kwargs) + func.__name__ = f + return func + + +def _method_wrapper(f): + def func(self, *args, **kwargs): + return getattr(self, f)(*args, **kwargs) + func.__name__ = f + return func + + +def inject_reduce_methods(cls): + # TODO: change these to use methods instead of numpy functions + for name in NUMPY_REDUCE_METHODS: + setattr(cls, name, cls._reduce_method(getattr(np, name), + name, 'numpy')) + + +def inject_special_operations(cls, priority=50): + # priortize our operations over those of numpy.ndarray (priority=1) + # and numpy.matrix (priority=10) + cls.__array_priority__ = priority + op_str = lambda name: '__%s__' % name + op = lambda name: getattr(operator, op_str(name)) + # patch in standard special operations + for op_names, op_wrap in [(UNARY_OPS, cls._unary_op), + (CMP_BINARY_OPS + NUM_BINARY_OPS, + cls._binary_op)]: + for name in op_names: + setattr(cls, op_str(name), op_wrap(op(name))) + # only numeric operations have in-place and reflexive variants + for name in NUM_BINARY_OPS: + setattr(cls, op_str('r' + name), + cls._binary_op(op(name), reflexive=True)) + setattr(cls, op_str('i' + name), + cls._inplace_binary_op(op('i' + name))) + # patch in numpy methods + for name in NUMPY_CONVERT_METHODS: + setattr(cls, name, _data_method_wrapper(name)) + for name in NUMPY_UNARY_METHODS: + setattr(cls, name, cls._unary_op(_method_wrapper(name))) + inject_reduce_methods(cls) diff --git a/src/xray/utils.py b/src/xray/utils.py new file mode 100644 index 00000000000..72e44954914 --- /dev/null +++ b/src/xray/utils.py @@ -0,0 +1,291 @@ +import netCDF4 as nc4 +import operator +from collections import OrderedDict, Mapping +from datetime import datetime + +import numpy as np +import pandas as pd + + +def expanded_indexer(key, ndim): + """Given a key for indexing an ndarray, return an equivalent key which is a + tuple with length equal to the number of dimensions + + The expansion is done by replacing all `Ellipsis` items with the right + number of full slices and then padding the key with full slices so that it + reaches the appropriate dimensionality. 
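+
+    For example (values worked out from the rules above):
+
+        expanded_indexer(0, 2)
+        # -> (0, slice(None))
+        expanded_indexer((0, Ellipsis), 4)
+        # -> (0, slice(None), slice(None), slice(None))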
+ """ + if not isinstance(key, tuple): + # numpy treats non-tuple keys equivalent to tuples of length 1 + key = (key,) + new_key = [] + # handling Ellipsis right is a little tricky, see: + # http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing + found_ellipsis = False + for k in key: + if k is Ellipsis: + if not found_ellipsis: + new_key.extend((ndim + 1 - len(key)) * [slice(None)]) + found_ellipsis = True + else: + new_key.append(slice(None)) + else: + new_key.append(k) + new_key.extend((ndim - len(new_key)) * [slice(None)]) + return tuple(new_key) + + +def orthogonal_indexer(key, shape): + """Given a key for orthogonal array indexing, returns an equivalent key + suitable for indexing a numpy.ndarray with fancy indexing + """ + def expand_array(k, length): + if isinstance(k, slice): + return np.arange(k.start or 0, k.stop or length, k.step or 1) + else: + k = np.asarray(k) + if k.ndim != 1: + raise ValueError('orthogonal array indexing only supports ' + '1d arrays') + return k + # replace Ellipsis objects with slices + key = list(expanded_indexer(key, len(shape))) + # replace 1d arrays and slices with broadcast compatible arrays + # note: we treat integers separately (instead of turning them into 1d + # arrays) because integers (and only integers) collapse axes when used with + # __getitem__ + non_int_keys = [n for n, k in enumerate(key) if not isinstance(k, int)] + + def full_slices_unselected(n_list): + def all_full_slices(key_index): + return all(isinstance(key[n], slice) and key[n] == slice(None) + for n in key_index) + if not n_list: + return n_list + elif all_full_slices(range(n_list[0] + 1)): + return full_slices_unselected(n_list[1:]) + elif all_full_slices(range(n_list[-1], len(key))): + return full_slices_unselected(n_list[:-1]) + else: + return n_list + + # However, testing suggests it is OK to keep contiguous sequences of full + # slices at the start or the end of the key. Keeping slices around (when + # possible) instead of converting slices to arrays significantly speeds up + # indexing. + # (Honestly, I don't understand when it's not OK to keep slices even in + # between integer indices if as array is somewhere in the key, but such are + # the admittedly mind-boggling ways of numpy's advanced indexing.) + array_keys = full_slices_unselected(non_int_keys) + + array_indexers = np.ix_(*(expand_array(key[n], shape[n]) + for n in array_keys)) + for i, n in enumerate(array_keys): + key[n] = array_indexers[i] + return tuple(key) + + +def remap_loc_indexers(indices, indexers): + """Given mappings of indices and label based indexers, return equivalent + location based indexers + """ + new_indexers = OrderedDict() + for dim, loc in indexers.iteritems(): + index = indices[dim].data + if isinstance(loc, slice): + indexer = index.slice_indexer(loc.start, loc.stop, loc.step) + else: + try: + indexer = index.get_loc(loc) + except TypeError: + # value is a list or array + indexer = index.get_indexer(np.asarray(loc)) + if np.any(indexer < 0): + raise ValueError('not all values found in index %r' % dim) + new_indexers[dim] = indexer + return new_indexers + + +def num2datetimeindex(num_dates, units, calendar=None): + """Convert an array of numeric dates in netCDF format into a + pandas.DatetimeIndex + + For standard (Gregorian) calendars, this function uses vectorized + operations, which makes it much faster than netCDF4.num2date. 
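+
+    A short sketch of the intended usage (the units string follows the
+    CF/netCDF convention understood by netCDF4.num2date):
+
+        num2datetimeindex([0, 1, 2], 'days since 2000-01-01')
+        # -> pandas Index of 2000-01-01, 2000-01-02 and 2000-01-03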
+ """ + num_dates = np.asarray(num_dates) + if calendar is None: + calendar = 'standard' + start_date = nc4.num2date(num_dates[0], units, calendar) + if (num_dates.size < 2 + or calendar not in ['standard', 'gregorian', 'proleptic_gregorian'] + or (start_date < datetime(1582, 10, 15) + and calendar != 'proleptic_gregorian')): + dates = nc4.num2date(num_dates, units, calendar) + else: + first_dates = nc4.num2date(num_dates[:2], units, calendar) + first_time_delta = np.timedelta64(first_dates[1] - first_dates[0]) + num_delta = (num_dates - num_dates[0]) / (num_dates[1] - num_dates[0]) + dates = first_time_delta * num_delta + np.datetime64(first_dates[0]) + return pd.Index(dates) + + +def guess_time_units(dates): + """Given an array of dates suitable for input to `pandas.DatetimeIndex`, + returns a CF compatible time-unit string of the form "{time_unit} since + {date[0]}", where `time_unit` is 'days', 'hours', 'minutes' or 'seconds' + (the first one that can evenly divide all unique time deltas in `dates`) + """ + dates = pd.DatetimeIndex(dates) + unique_timedeltas = np.unique(np.diff(dates.values)) + for time_unit, delta in [('days', '1 days'), ('hours', '3600s'), + ('minutes', '60s'), ('seconds', '1s')]: + unit_delta = pd.to_timedelta(delta) + diffs = unique_timedeltas / unit_delta + if np.all(diffs == diffs.astype(int)): + break + else: + raise ValueError('could not automatically determine time units') + return '%s since %s' % (time_unit, dates[0]) + + +def datetimeindex2num(dates, units=None, calendar=None): + """Given an array of dates suitable for input to `pandas.DatetimeIndex`, + returns the tuple `(num, units, calendar)` suitable for CF complient time + variable. + """ + dates = pd.DatetimeIndex(dates) + if units is None: + units = guess_time_units(dates) + if calendar is None: + calendar = 'proleptic_gregorian' + # for now, don't bother doing any trickery like num2datetimeindex to + # convert dates to numbers faster + num = nc4.date2num(dates.to_pydatetime(), units, calendar) + return (num, units, calendar) + + +def variable_equal(v1, v2, rtol=1e-05, atol=1e-08): + """True if two objects have the same dimensions, attributes and data; + otherwise False + + This function is necessary because `v1 == v2` for variables and dataviews + does element-wise comparisions (like numpy.ndarrays). + """ + if (v1.dimensions == v2.dimensions + and v1.attributes == v2.attributes): + try: + # if _data is identical, skip checking arrays by value + if v1._data is v2._data: + return True + except AttributeError: + # _data is not part of the public interface, so it's okay if its + # missing + pass + # TODO: replace this with a NaN safe version. + # see: pandas.core.common.array_equivalent + data1 = v1.data + data2 = v2.data + if hasattr(data1, 'equals'): + # handle pandas.Index objects + return data1.equals(data2) + elif np.issubdtype(data1.dtype, (str, object)): + return np.array_equal(data1, data2) + else: + return np.allclose(data1, data2, rtol=rtol, atol=atol) + else: + return False + + +def update_safety_check(first_dict, second_dict, compat=operator.eq): + """Check the safety of updating one dictionary with another + + Raises ValueError if dictionaries have non-compatible values for any key, + where compatibility is determined by identity (they are the same item) or + the `compat` function. + + Parameters + ---------- + first_dict, second_dict : dict-like + All items in the second dictionary are checked against for conflicts + against items in the first dictionary. 
+ compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equality. + """ + for k, v in second_dict.iteritems(): + if (k in first_dict and + not (v is first_dict[k] or compat(v, first_dict[k]))): + raise ValueError('unsafe to merge dictionaries without ' + 'overriding values; conflicting key %r' % k) + + +def remove_incompatible_items(first_dict, second_dict, compat=operator.eq): + """Remove incompatible items from the first dictionary in-place + + Items are retained if their keys are found in both dictionaries and the + values are compatible. + + Parameters + ---------- + first_dict, second_dict : dict-like + Mappings to merge. + compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equality. + """ + for k, v in second_dict.iteritems(): + if k in first_dict and not compat(v, first_dict[k]): + del first_dict[k] + + +def ordered_dict_intersection(first_dict, second_dict, compat=operator.eq): + """Return the intersection of two dictionaries as a new OrderedDict + + Items are retained if their keys are found in both dictionaries and the + values are compatible. + + Parameters + ---------- + first_dict, second_dict : dict-like + Mappings to merge. + compat : function, optional + Binary operator to determine if two values are compatible. By default, + checks for equality. + + Returns + ------- + intersection : OrderedDict + Intersection of the contents. + """ + new_dict = OrderedDict(first_dict) + remove_incompatible_items(new_dict, second_dict, compat) + return new_dict + + +class Frozen(Mapping): + """Wrapper around an object implementing the mapping interface to make it + immutable. If you really want to modify the mapping, the mutable version is + saved under the `mapping` attribute. + """ + def __init__(self, mapping): + self.mapping = mapping + + def __getitem__(self, key): + return self.mapping[key] + + def __iter__(self): + return iter(self.mapping) + + def __len__(self): + return len(self.mapping) + + def __contains__(self, key): + return key in self.mapping + + def __repr__(self): + return '%s(%r)' % (type(self).__name__, self.mapping) + + +def FrozenOrderedDict(*args, **kwargs): + return Frozen(OrderedDict(*args, **kwargs)) diff --git a/test/__init__.py b/test/__init__.py index c8cd6a5ebd8..efb7ef80aaa 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -2,7 +2,7 @@ from numpy.testing import assert_array_equal -from scidata import utils +from xray import utils class TestCase(unittest.TestCase): @@ -12,5 +12,10 @@ def assertVarEqual(self, v1, v2): def assertVarNotEqual(self, v1, v2): self.assertFalse(utils.variable_equal(v1, v2)) - def assertArrayEqual(self, a1, a2): + def assertNDArrayEqual(self, a1, a2): assert_array_equal(a1, a2) + + +class ReturnItem(object): + def __getitem__(self, key): + return key diff --git a/test/test_array.py b/test/test_array.py new file mode 100644 index 00000000000..224a71af4ae --- /dev/null +++ b/test/test_array.py @@ -0,0 +1,266 @@ +from copy import deepcopy +import warnings + +import numpy as np + +from xray import Array, Dataset +from . 
import TestCase + + +class TestArray(TestCase): + def setUp(self): + self.d = np.random.random((10, 3)).astype(np.float64) + + def test_data(self): + v = Array(['time', 'x'], self.d, indexing_mode='not-supported') + self.assertIs(v.data, self.d) + with self.assertRaises(ValueError): + # wrong size + v.data = np.random.random(5) + d2 = np.random.random((10, 3)) + v.data = d2 + self.assertIs(v.data, d2) + self.assertEqual(v._indexing_mode, 'numpy') + + def test_array_equality(self): + d = np.random.rand(10, 3) + v1 = Array(('dim1', 'dim2'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 3]}) + v2 = Array(('dim1', 'dim2'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 3]}) + v3 = Array(('dim1', 'dim3'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 3]}) + v4 = Array(('dim1', 'dim2'), data=d, + attributes={'att1': 3, 'att2': [1, 2, 4]}) + v5 = deepcopy(v1) + v5.data[:] = np.random.rand(10, 3) + self.assertVarEqual(v1, v2) + self.assertVarNotEqual(v1, v3) + self.assertVarNotEqual(v1, v4) + self.assertVarNotEqual(v1, v5) + + def test_properties(self): + v = Array(['time', 'x'], self.d, {'foo': 'bar'}) + self.assertEqual(v.dimensions, ('time', 'x')) + self.assertEqual(v.dtype, float) + self.assertEqual(v.shape, (10, 3)) + self.assertEqual(v.size, 30) + self.assertEqual(v.ndim, 2) + self.assertEqual(len(v), 10) + self.assertEqual(v.attributes, {'foo': u'bar'}) + + def test_repr(self): + v = Array(['time', 'x'], self.d) + self.assertEqual('', + repr(v)) + + def test_items(self): + data = np.random.random((10, 11)) + v = Array(['x', 'y'], data) + # test slicing + self.assertVarEqual(v, v[:]) + self.assertVarEqual(v, v[...]) + self.assertVarEqual(Array(['y'], data[0]), v[0]) + self.assertVarEqual(Array(['x'], data[:, 0]), v[:, 0]) + self.assertVarEqual(Array(['x', 'y'], data[:3, :2]), v[:3, :2]) + # test array indexing + x = Array(['x'], np.arange(10)) + y = Array(['y'], np.arange(11)) + self.assertVarEqual(v, v[x.data]) + self.assertVarEqual(v, v[x]) + self.assertVarEqual(v[:3], v[x < 3]) + self.assertVarEqual(v[:, 3:], v[:, y >= 3]) + self.assertVarEqual(v[:3, 3:], v[x < 3, y >= 3]) + self.assertVarEqual(v[:3, :2], v[x[:3], y[:2]]) + self.assertVarEqual(v[:3, :2], v[range(3), range(2)]) + # test iteration + for n, item in enumerate(v): + self.assertVarEqual(Array(['y'], data[n]), item) + # test setting + v.data[:] = 0 + self.assertTrue(np.all(v.data == 0)) + + def test_indexed_by(self): + v = Array(['time', 'x'], self.d) + self.assertVarEqual(v.indexed_by(time=slice(None)), v) + self.assertVarEqual(v.indexed_by(time=0), v[0]) + self.assertVarEqual(v.indexed_by(time=slice(0, 3)), v[:3]) + self.assertVarEqual(v.indexed_by(x=0), v[:, 0]) + with self.assertRaisesRegexp(ValueError, 'do not exist'): + v.indexed_by(not_a_dim=0) + + def test_transpose(self): + v = Array(['time', 'x'], self.d) + v2 = Array(['x', 'time'], self.d.T) + self.assertVarEqual(v, v2.transpose()) + self.assertVarEqual(v.transpose(), v.T) + x = np.random.randn(2, 3, 4, 5) + w = Array(['a', 'b', 'c', 'd'], x) + w2 = Array(['d', 'b', 'c', 'a'], np.einsum('abcd->dbca', x)) + self.assertEqual(w2.shape, (5, 3, 4, 2)) + self.assertVarEqual(w2, w.transpose('d', 'b', 'c', 'a')) + self.assertVarEqual(w, w2.transpose('a', 'b', 'c', 'd')) + w3 = Array(['b', 'c', 'd', 'a'], np.einsum('abcd->bcda', x)) + self.assertVarEqual(w, w3.transpose('a', 'b', 'c', 'd')) + + def test_1d_math(self): + x = np.arange(5) + y = np.ones(5) + v = Array(['x'], x) + # unary ops + self.assertVarEqual(v, +v) + self.assertVarEqual(v, abs(v)) + 
self.assertNDArrayEqual((-v).data, -x) + # bianry ops with numbers + self.assertVarEqual(v, v + 0) + self.assertVarEqual(v, 0 + v) + self.assertVarEqual(v, v * 1) + self.assertNDArrayEqual((v > 2).data, x > 2) + self.assertNDArrayEqual((0 == v).data, 0 == x) + self.assertNDArrayEqual((v - 1).data, x - 1) + self.assertNDArrayEqual((1 - v).data, 1 - x) + # binary ops with numpy arrays + self.assertNDArrayEqual((v * x).data, x ** 2) + self.assertNDArrayEqual((x * v).data, x ** 2) + self.assertNDArrayEqual(v - y, v - 1) + self.assertNDArrayEqual(y - v, 1 - v) + # verify attributes + v2 = Array(['x'], x, {'units': 'meters'}) + self.assertVarEqual(v2, +v2) + self.assertVarEqual(v2, 0 + v2) + # binary ops with all variables + self.assertNDArrayEqual(v + v, 2 * v) + w = Array(['x'], y, {'foo': 'bar'}) + self.assertVarEqual(v + w, Array(['x'], x + y)) + self.assertNDArrayEqual((v * w).data, x * y) + # something complicated + self.assertNDArrayEqual((v ** 2 * w - 1 + x).data, x ** 2 * y - 1 + x) + + def test_broadcasting_math(self): + x = np.random.randn(2, 3) + v = Array(['a', 'b'], x) + # 1d to 2d broadcasting + self.assertVarEqual( + v * v, + Array(['a', 'b'], np.einsum('ab,ab->ab', x, x))) + self.assertVarEqual( + v * v[0], + Array(['a', 'b'], np.einsum('ab,b->ab', x, x[0]))) + self.assertVarEqual( + v[0] * v, + Array(['b', 'a'], np.einsum('b,ab->ba', x[0], x))) + self.assertVarEqual( + v[0] * v[:, 0], + Array(['b', 'a'], np.einsum('b,a->ba', x[0], x[:, 0]))) + # higher dim broadcasting + y = np.random.randn(3, 4, 5) + w = Array(['b', 'c', 'd'], y) + self.assertVarEqual( + v * w, Array(['a', 'b', 'c', 'd'], + np.einsum('ab,bcd->abcd', x, y))) + self.assertVarEqual( + w * v, Array(['b', 'c', 'd', 'a'], + np.einsum('bcd,ab->bcda', y, x))) + self.assertVarEqual( + v * w[0], Array(['a', 'b', 'c', 'd'], + np.einsum('ab,cd->abcd', x, y[0]))) + + def test_broadcasting_failures(self): + a = Array(['x'], np.arange(10)) + b = Array(['x'], np.arange(5)) + c = Array(['x', 'x'], np.arange(100).reshape(10, 10)) + with self.assertRaisesRegexp(ValueError, 'mismatched lengths'): + a + b + with self.assertRaisesRegexp(ValueError, 'duplicate dimensions'): + a + c + + def test_inplace_math(self): + x = np.arange(5) + v = Array(['x'], x) + v2 = v + v2 += 1 + self.assertIs(v, v2) + # since we provided an ndarray for data, it is also modified in-place + self.assertIs(v.data, x) + self.assertNDArrayEqual(v.data, np.arange(5) + 1) + + def test_array_interface(self): + x = np.arange(5) + v = Array(['x'], x) + self.assertNDArrayEqual(np.asarray(v), x) + # test patched in methods + self.assertNDArrayEqual(v.take([2, 3]), x.take([2, 3])) + self.assertVarEqual(v.argsort(), v) + self.assertVarEqual(v.clip(2, 3), Array('x', x.clip(2, 3))) + # test ufuncs + self.assertVarEqual(np.sin(v), Array(['x'], np.sin(x))) + + def test_reduce(self): + v = Array(['time', 'x'], self.d) + # intentionally test with an operation for which order matters + self.assertVarEqual(v.reduce(np.std, 'time'), + Array(['x'], self.d.std(axis=0), + {'cell_methods': 'time: std'})) + self.assertVarEqual(v.reduce(np.std, axis=0), + v.reduce(np.std, dimension='time')) + self.assertVarEqual(v.reduce(np.std, ['x', 'time']), + Array([], self.d.std(axis=1).std(axis=0), + {'cell_methods': 'x: std time: std'})) + self.assertVarEqual(v.reduce(np.std), + Array([], self.d.std(), + {'cell_methods': 'time: x: std'})) + self.assertVarEqual(v.mean('time'), v.reduce(np.mean, 'time')) + + def test_groupby(self): + agg_var = Array(['y'], np.array(['a', 'a', 'b'])) + v = 
Array(['x', 'y'], self.d) + + expected_unique = Array(['abc'], np.array(['a', 'b'])) + expected_aggregated = Array(['x', 'abc'], + np.array([self.d[:, :2].sum(axis=1), + self.d[:, 2:].sum(axis=1)]).T, + {'cell_methods': 'y: sum'}) + + x = Array('x', np.arange(10)) + y = Array('y', np.arange(3)) + self.assertVarEqual(v, v.groupby('y', y).apply(lambda x: x)) + self.assertVarEqual(v, v.groupby('x', x).apply(lambda x: x)) + + grouped = v.groupby('abc', agg_var) + self.assertVarEqual(expected_unique, grouped.unique_coord) + self.assertVarEqual(v, grouped.apply(lambda x: x)) + self.assertVarEqual(expected_aggregated, grouped.reduce(np.sum)) + + actual = list(grouped) + expected = zip(expected_unique, [v[:, :2], v[:, 2:]]) + self.assertEqual(len(expected), len(actual)) + for (ke, ve), (ka, va) in zip(expected, actual): + self.assertVarEqual(ke, ka) + self.assertVarEqual(ve, va) + + def test_from_stack(self): + x = np.arange(5) + y = np.ones(5) + v = Array(['a'], x) + w = Array(['a'], y) + self.assertVarEqual(Array(['b', 'a'], np.array([x, y])), + Array.from_stack([v, w], 'b')) + self.assertVarEqual(Array(['b', 'a'], np.array([x, y])), + Array.from_stack((v, w), 'b')) + self.assertVarEqual(Array(['b', 'a'], np.array([x, y])), + Array.from_stack((v, w), 'b', length=2)) + with self.assertRaisesRegexp(ValueError, 'actual length'): + Array.from_stack([v, w], 'b', length=1) + with self.assertRaisesRegexp(ValueError, 'actual length'): + Array.from_stack([v, w, w], 'b', length=4) + with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): + Array.from_stack([v, Array(['c'], y)], 'b') + # test concatenating along a dimension + v = Array(['time', 'x'], np.random.random((10, 8))) + self.assertVarEqual(v, Array.from_stack([v[:5], v[5:]], 'time')) + self.assertVarEqual(v, Array.from_stack([v[:5], v[5], v[6:]], 'time')) + self.assertVarEqual(v, Array.from_stack([v[0], v[1:]], 'time')) + # test dimension order + self.assertVarEqual(v, Array.from_stack([v[:, :5], v[:, 5:]], 'x')) + self.assertVarEqual(v.transpose(), + Array.from_stack([v[:, 0], v[:, 1:]], 'x')) diff --git a/test/test_data.py b/test/test_data.py deleted file mode 100644 index 22b43fde163..00000000000 --- a/test/test_data.py +++ /dev/null @@ -1,381 +0,0 @@ -from collections import OrderedDict -from copy import deepcopy -from cStringIO import StringIO -import os.path -import unittest - -import numpy as np - -from scidata import Dataset, Variable, backends -from . 
import TestCase - - -_dims = {'dim1':100, 'dim2':50, 'dim3':10} -_vars = {'var1':['dim1', 'dim2'], - 'var2':['dim1', 'dim2'], - 'var3':['dim3', 'dim1'], - } -_testvar = sorted(_vars.keys())[0] -_testdim = sorted(_dims.keys())[0] - -def create_test_data(store=None): - obj = Dataset(store=store) - obj.create_dimension('time', 10) - for d, l in sorted(_dims.items()): - obj.create_dimension(d, l) - var = obj.create_variable(name=d, dims=(d,), - data=np.arange(l, dtype=np.int32), - attributes={'units':'integers'}) - for v, dims in sorted(_vars.items()): - var = obj.create_variable(name=v, dims=tuple(dims), - data=np.random.normal(size=tuple([_dims[d] for d in dims]))) - var.attributes['foo'] = 'variable' - return obj - -class DataTest(TestCase): - #TODO: test constructor - - def get_store(self): - return None - - def test_iterator(self): - data = create_test_data(self.get_store()) - # iterate over the first dim - iterdim = _testdim - for t, sub in data.iterator(dim=iterdim): - ind = int(np.where(data.variables[iterdim].data == t.data)[0]) - # make sure all the slices match - for v in _vars.keys(): - if iterdim in data[v].dimensions: - dim_axis = list(data[v].dimensions).index(iterdim) - expected = data[v].data.take( - [ind], axis=dim_axis).reshape(sub[v].data.shape) - np.testing.assert_array_equal(sub[v].data, expected) - self.assertEquals(sub.dimensions[iterdim], 1) - # test that the yielded objects are copies of the original - for (t, sub) in data.iterator(dim=iterdim): - sub[_testvar][:] = -71 - self.assertTrue((data[_testvar].data != -71).all()) - - def test_iterarray(self): - data = create_test_data(self.get_store()) - # iterate over the first dim - iterdim = _testdim - for t, d in data.iterarray(dim=iterdim, var=_testvar): - ind = int(np.where(data.variables[iterdim].data == t)[0]) - # make sure all the slices match - dim_axis = list(data[_testvar].dimensions).index(iterdim) - expected = data[_testvar].data.take([ind], axis=dim_axis) - np.testing.assert_array_equal(d, expected) - # test that the yielded objects are views of the original - # This test doesn't make sense for the netCDF4 backend - # for (t, d) in data.iterarray(dim=iterdim, var=_testvar): - # d[:] = -71 - # self.assertTrue((data[_testvar].data == -71).all()) - - def test_dimension(self): - a = Dataset() - a.create_dimension('time', 10) - a.create_dimension('x', 5) - # prevent duplicate creation - self.assertRaises(ValueError, a.create_dimension, 'time', 0) - # length must be integer - self.assertRaises(TypeError, a.create_dimension, 'foo', 'a') - self.assertRaises(TypeError, a.create_dimension, 'foo', [1,]) - self.assertRaises(ValueError, a.create_dimension, 'foo', -1) - self.assertTrue('foo' not in a.dimensions) - - def test_variable(self): - a = Dataset() - a.create_dimension('time', 10) - a.create_dimension('x', 3) - d = np.random.random((10, 3)) - a.create_variable(name='foo', dims=('time', 'x',), data=d) - self.assertTrue('foo' in a.variables) - self.assertTrue('foo' in a) - a.create_variable(name='bar', dims=('time', 'x',), data=d) - # order of creation is preserved - self.assertTrue(a.variables.keys() == ['foo', 'bar']) - self.assertTrue(all([a['foo'][i].data == d[i] - for i in np.ndindex(*d.shape)])) - # prevent duplicate creation - self.assertRaises(ValueError, a.create_variable, - name='foo', dims=('time', 'x',), data=d) - # dimension must be defined - self.assertRaises(ValueError, a.create_variable, - name='qux', dims=('time', 'missing_dim',), data=d) - # try to add variable with dim (10,3) with data that's (3,10) 
- self.assertRaises(ValueError, a.create_variable, - name='qux', dims=('time', 'x'), data=d.T) - # Variable equality - d = np.random.rand(10, 3) - v1 = Variable(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v2 = Variable(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v5 = Variable(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v3 = Variable(('dim1','dim3'), data=d, - attributes={'att1': 3, 'att2': [1,2,3]}) - v4 = Variable(('dim1','dim2'), data=d, - attributes={'att1': 3, 'att2': [1,2,4]}) - v5 = deepcopy(v1) - v5.data[:] = np.random.rand(10,3) - self.assertVarEqual(v1, v2) - self.assertVarNotEqual(v1, v3) - self.assertVarNotEqual(v1, v4) - self.assertVarNotEqual(v1, v5) - - def test_coordinate(self): - a = Dataset() - vec = np.random.random((10,)) - attributes = {'foo': 'bar'} - a.create_coordinate('x', data=vec, attributes=attributes) - self.assertTrue('x' in a.coordinates) - self.assertVarEqual(a.coordinates['x'], a.variables['x']) - b = Dataset() - b.create_dimension('x', vec.size) - b.create_variable('x', dims=('x',), data=vec, attributes=attributes) - self.assertVarEqual(a['x'], b['x']) - self.assertEquals(a.dimensions, b.dimensions) - arr = np.random.random((10, 1,)) - scal = np.array(0) - self.assertRaises(ValueError, a.create_coordinate, - name='y', data=arr) - self.assertRaises(ValueError, a.create_coordinate, - name='y', data=scal) - self.assertTrue('y' not in a.dimensions) - - def test_attributes(self): - a = Dataset() - a.attributes['foo'] = 'abc' - a.attributes['bar'] = 1 - # numeric scalars are stored as length-1 vectors - self.assertTrue(isinstance(a.attributes['bar'], np.ndarray) and - a.attributes['bar'].ndim == 1) - # __contains__ method - self.assertTrue('foo' in a.attributes) - self.assertTrue('bar' in a.attributes) - self.assertTrue('baz' not in a.attributes) - # user-defined attributes are not object attributes - self.assertRaises(AttributeError, object.__getattribute__, a, 'foo') - # different ways of setting attributes ought to be equivalent - b = Dataset() - b.attributes.update(foo='abc') - self.assertEquals(a.attributes['foo'], b.attributes['foo']) - b = Dataset() - b.attributes.update([('foo', 'abc')]) - self.assertEquals(a.attributes['foo'], b.attributes['foo']) - b = Dataset() - b.attributes.update({'foo': 'abc'}) - self.assertEquals(a.attributes['foo'], b.attributes['foo']) - # attributes can be overwritten - b.attributes['foo'] = 'xyz' - self.assertEquals(b.attributes['foo'], 'xyz') - # attributes can be deleted - del b.attributes['foo'] - self.assertTrue('foo' not in b.attributes) - # attributes can be cleared - b.attributes.clear() - self.assertTrue(len(b.attributes) == 0) - # attributes can be compared - a = Dataset() - b = Dataset() - a.attributes['foo'] = 'bar' - b.attributes['foo'] = np.nan - self.assertFalse(a == b) - a.attributes['foo'] = np.nan - self.assertTrue(a == b) - # attribute names/values must be netCDF-compatible - self.assertRaises(ValueError, b.attributes.__setitem__, '/', 0) - self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', np.zeros((2, 2))) - self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', dict()) - - def test_view(self): - data = create_test_data(self.get_store()) - slicedim = _testdim - self.assertEqual(data.view(slice(10), slicedim), - data.views({slicedim: slice(10)})) - - def test_views(self): - data = create_test_data(self.get_store()) - slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} - ret = data.views(slicers) - - # 
Verify that only the specified dimension was altered - self.assertItemsEqual(data.dimensions, ret.dimensions) - for d in data.dimensions: - if d in slicers: - self.assertEqual(ret.dimensions[d], - np.arange(data.dimensions[d])[slicers[d]].size) - else: - self.assertEqual(data.dimensions[d], ret.dimensions[d]) - # Verify that the data is what we expect - for v in data.variables: - self.assertEqual(data[v].dimensions, ret[v].dimensions) - self.assertEqual(data[v].attributes, ret[v].attributes) - slice_list = [slice(None)] * data[v].data.ndim - for d, s in slicers.iteritems(): - if d in data[v].dimensions: - inds = np.nonzero(np.array(data[v].dimensions) == d)[0] - for ind in inds: - slice_list[ind] = s - expected = data[v].data[slice_list] - actual = ret[v].data - np.testing.assert_array_equal(expected, actual) - # Test that our view accesses the same underlying array - # This test doesn't make sense for the netCDF4 backend - # actual.fill(np.pi) - # np.testing.assert_array_equal(expected, actual) - - self.assertRaises(KeyError, data.views, - {'not_a_dim': slice(0, 2)}) - - ret = data.views({'dim1': 0}) - self.assertEqual({'time': 10, 'dim2': 50, 'dim3': 10}, ret.dimensions) - - ret = data.views({'time': slice(2), 'dim1': 0, 'dim2': slice(5)}) - self.assertEqual({'time': 2, 'dim2': 5, 'dim3': 10}, ret.dimensions) - - ret = data.views({'time': 0, 'dim1': 0, 'dim2': slice(5)}) - self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions) - - - def test_take(self): - data = create_test_data(self.get_store()) - slicedim = _testdim - # using a list - ret = data.take(indices=range(2, 5), dim=slicedim) - self.assertEquals(len(ret[slicedim].data), 3) - # using a numpy vector - ret = data.take(indices=np.array([2, 3, 4,]), dim=slicedim) - self.assertEquals(len(ret[slicedim].data), 3) - # With a random index - indices = np.random.randint(data.dimensions[slicedim], size=10) - ret = data.take(indices=indices, dim=slicedim) - # Verify that only the specified dimension was altered - for d in data.dimensions: - if d == slicedim: - self.assertEqual(ret.dimensions[d], indices.size) - else: - self.assertEqual(data.dimensions[d], ret.dimensions[d]) - # Verify that the data is what we expect - for v in data.variables: - self.assertEqual(data[v].dimensions, ret[v].dimensions) - self.assertEqual(data[v].attributes, ret[v].attributes) - if slicedim in data[v].dimensions: - expected = data[v].data.take( - indices, axis=data[v].dimensions.index(slicedim)) - else: - expected = data[v].data[:] - actual = ret[v].data - np.testing.assert_array_equal(expected, actual) - # Test that our take is a copy - ret[v].data.fill(np.pi) - self.assertTrue(not (data[v].data == np.pi).any()) - self.assertRaises(KeyError, data.take, - indices=indices, dim='not_a_dim') - self.assertRaises(IndexError, data.take, - indices=[data.dimensions[slicedim] + 10], - dim=slicedim) - - def test_squeeze(self): - data = create_test_data(self.get_store()) - singleton = data.take([1], 'dim2') - squeezed = singleton.squeeze('dim2') - assert not 'dim2' in squeezed.dimensions - for x in [v for v, d in _vars.iteritems() if 'dim2' in d]: - np.testing.assert_array_equal(singleton[x].data.flatten(), - squeezed[x].data) - - def test_select(self): - data = create_test_data(self.get_store()) - ret = data.select(_testvar) - self.assertVarEqual(data[_testvar], ret[_testvar]) - self.assertTrue(_vars.keys()[1] not in ret.variables) - self.assertRaises(KeyError, data.select, (_testvar, 'not_a_var')) - - def test_copy(self): - data = 
create_test_data(self.get_store()) - var = data.variables[_testvar] - var.attributes['foo'] = 'hello world' - var_copy = var.__deepcopy__() - self.assertEqual(var.data[2, 3], var_copy.data[2, 3]) - var_copy.data[2, 3] = np.pi - self.assertNotEqual(var.data[2, 3], np.pi) - self.assertEqual(var_copy.attributes['foo'], var.attributes['foo']) - var_copy.attributes['foo'] = 'xyz' - self.assertNotEqual(var_copy.attributes['foo'], var.attributes['foo']) - self.assertEqual(var_copy.attributes['foo'], 'xyz') - self.assertNotEqual(id(var), id(var_copy)) - self.assertNotEqual(id(var.data), id(var_copy.data)) - self.assertNotEqual(id(var.attributes), id(var_copy.attributes)) - - def test_rename(self): - data = create_test_data(self.get_store()) - newnames = {'var1': 'renamed_var1', 'dim2': 'renamed_dim2'} - renamed = data.renamed(newnames) - - variables = OrderedDict(data.variables) - for k, v in newnames.iteritems(): - variables[v] = variables.pop(k) - - for k, v in variables.iteritems(): - self.assertTrue(k in renamed.variables) - self.assertEqual(v.attributes, renamed.variables[k].attributes) - dims = list(v.dimensions) - for name, newname in newnames.iteritems(): - if name in dims: - dims[dims.index(name)] = newname - self.assertEqual(dims, list(renamed.variables[k].dimensions)) - self.assertTrue(np.all(v.data == renamed.variables[k].data)) - self.assertEqual(v.attributes, renamed.variables[k].attributes) - - self.assertTrue('var1' not in renamed.variables) - self.assertTrue('var1' not in renamed.dimensions) - self.assertTrue('dim2' not in renamed.variables) - self.assertTrue('dim2' not in renamed.dimensions) - - def test_join(self): - data = create_test_data(self.get_store()) - ds1 = data.select('var1') - ds2 = data.select('var3') - expected = data.select('var1', 'var3') - actual = ds1.join(ds2) - self.assertEqual(expected, actual) - with self.assertRaises(ValueError): - ds1.join(ds2.view(0, 'dim1')) - with self.assertRaises(ValueError): - ds1.join(ds2.renamed({'var3': 'var1'})) - - -class NetCDF4DataTest(DataTest): - def get_store(self): - tmp_file = './delete_me.nc' - if os.path.exists(tmp_file): - os.remove(tmp_file) - return backends.NetCDF4DataStore(tmp_file, mode='w') - - -class ScipyDataTest(DataTest): - def get_store(self): - fobj = StringIO() - return backends.ScipyDataStore(fobj, 'w') - - -class StoreTest(TestCase): - def test_stored_to_consistency(self): - store = backends.InMemoryDataStore() - expected = create_test_data(store) - - mem_nc = deepcopy(expected) - self.assertTrue(isinstance(mem_nc.store, backends.InMemoryDataStore)) - - fobj = StringIO() - store = backends.ScipyDataStore(fobj, 'w') - actual = mem_nc.stored_to(store) - self.assertTrue(actual == expected) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_dataset.py b/test/test_dataset.py new file mode 100644 index 00000000000..29ff0a54701 --- /dev/null +++ b/test/test_dataset.py @@ -0,0 +1,364 @@ +from collections import OrderedDict +from copy import deepcopy +from cStringIO import StringIO +import os.path +import unittest +import tempfile + +import numpy as np +import pandas as pd + +from xray import Dataset, DatasetArray, Array, backends, open_dataset +from . 
import TestCase + + +_dims = {'dim1':100, 'dim2':50, 'dim3':10} +_vars = {'var1':['dim1', 'dim2'], + 'var2':['dim1', 'dim2'], + 'var3':['dim3', 'dim1'], + } +_testvar = sorted(_vars.keys())[0] +_testdim = sorted(_dims.keys())[0] + + +def create_test_data(store=None): + obj = Dataset() if store is None else Dataset.load_store(store) + obj['time'] = ('time', pd.date_range('2000-01-01', periods=1000)) + for k, d in sorted(_dims.items()): + obj[k] = (k, np.arange(d)) + for v, dims in sorted(_vars.items()): + data = np.random.normal(size=tuple(_dims[d] for d in dims)) + obj[v] = (dims, data, {'foo': 'variable'}) + return obj + + +class DataTest(TestCase): + def get_store(self): + return backends.InMemoryDataStore() + + def test_repr(self): + data = create_test_data(self.get_store()) + self.assertEqual('', repr(data)) + + def test_init(self): + var1 = Array('x', np.arange(100)) + var2 = Array('x', np.arange(1000)) + var3 = Array(['x', 'y'], np.arange(1000).reshape(100, 10)) + with self.assertRaisesRegexp(ValueError, 'but already is saved'): + Dataset({'a': var1, 'b': var2}) + with self.assertRaisesRegexp(ValueError, 'must be defined with 1-d'): + Dataset({'a': var1, 'x': var3}) + + def test_groupby(self): + data = create_test_data(self.get_store()) + for n, (t, sub) in enumerate(list(data.groupby('dim1'))[:3]): + self.assertEqual(data['dim1'][n], t) + self.assertVarEqual(data['var1'][n], sub['var1']) + self.assertVarEqual(data['var2'][n], sub['var2']) + self.assertVarEqual(data['var3'][:, n], sub['var3']) + + def test_variable(self): + a = Dataset() + d = np.random.random((10, 3)) + a['foo'] = (('time', 'x',), d) + self.assertTrue('foo' in a.variables) + self.assertTrue('foo' in a) + a['bar'] = (('time', 'x',), d) + # order of creation is preserved + self.assertTrue(a.variables.keys() == ['time', 'x', 'foo', 'bar']) + self.assertTrue(all([a.variables['foo'][i].data == d[i] + for i in np.ndindex(*d.shape)])) + # try to add variable with dim (10,3) with data that's (3,10) + with self.assertRaises(ValueError): + a['qux'] = (('time', 'x'), d.T) + + def test_coordinate(self): + a = Dataset() + vec = np.random.random((10,)) + attributes = {'foo': 'bar'} + a['x'] = ('x', vec, attributes) + self.assertTrue('x' in a.coordinates) + self.assertIsInstance(a.coordinates['x'].data, pd.Index) + self.assertVarEqual(a.coordinates['x'], a.variables['x']) + b = Dataset() + b['x'] = ('x', vec, attributes) + self.assertVarEqual(a['x'], b['x']) + self.assertEquals(a.dimensions, b.dimensions) + with self.assertRaises(ValueError): + a['x'] = ('x', vec[:5]) + arr = np.random.random((10, 1,)) + scal = np.array(0) + with self.assertRaises(ValueError): + a['y'] = ('y', arr) + with self.assertRaises(ValueError): + a['y'] = ('y', scal) + self.assertTrue('y' not in a.dimensions) + + @unittest.skip('attribute checks are not yet backend specific') + def test_attributes(self): + a = Dataset() + a.attributes['foo'] = 'abc' + a.attributes['bar'] = 1 + # numeric scalars are stored as length-1 vectors + self.assertTrue(isinstance(a.attributes['bar'], np.ndarray) and + a.attributes['bar'].ndim == 1) + # __contains__ method + self.assertTrue('foo' in a.attributes) + self.assertTrue('bar' in a.attributes) + self.assertTrue('baz' not in a.attributes) + # user-defined attributes are not object attributes + self.assertRaises(AttributeError, object.__getattribute__, a, 'foo') + # different ways of setting attributes ought to be equivalent + b = Dataset() + b.attributes.update(foo='abc') + self.assertEquals(a.attributes['foo'], 
b.attributes['foo']) + b = Dataset() + b.attributes.update([('foo', 'abc')]) + self.assertEquals(a.attributes['foo'], b.attributes['foo']) + b = Dataset() + b.attributes.update({'foo': 'abc'}) + self.assertEquals(a.attributes['foo'], b.attributes['foo']) + # attributes can be overwritten + b.attributes['foo'] = 'xyz' + self.assertEquals(b.attributes['foo'], 'xyz') + # attributes can be deleted + del b.attributes['foo'] + self.assertTrue('foo' not in b.attributes) + # attributes can be cleared + b.attributes.clear() + self.assertTrue(len(b.attributes) == 0) + # attributes can be compared + a = Dataset() + b = Dataset() + a.attributes['foo'] = 'bar' + b.attributes['foo'] = np.nan + self.assertFalse(a == b) + a.attributes['foo'] = np.nan + self.assertTrue(a == b) + # attribute names/values must be netCDF-compatible + self.assertRaises(ValueError, b.attributes.__setitem__, '/', 0) + self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', np.zeros((2, 2))) + self.assertRaises(ValueError, b.attributes.__setitem__, 'foo', dict()) + + def test_indexed_by(self): + data = create_test_data(self.get_store()) + slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} + ret = data.indexed_by(**slicers) + + # Verify that only the specified dimension was altered + self.assertItemsEqual(data.dimensions, ret.dimensions) + for d in data.dimensions: + if d in slicers: + self.assertEqual(ret.dimensions[d], + np.arange(data.dimensions[d])[slicers[d]].size) + else: + self.assertEqual(data.dimensions[d], ret.dimensions[d]) + # Verify that the data is what we expect + for v in data.variables: + self.assertEqual(data[v].dimensions, ret[v].dimensions) + self.assertEqual(data[v].attributes, ret[v].attributes) + slice_list = [slice(None)] * data[v].data.ndim + for d, s in slicers.iteritems(): + if d in data[v].dimensions: + inds = np.nonzero(np.array(data[v].dimensions) == d)[0] + for ind in inds: + slice_list[ind] = s + expected = data[v].data[slice_list] + actual = ret[v].data + np.testing.assert_array_equal(expected, actual) + + with self.assertRaises(ValueError): + data.indexed_by(not_a_dim=slice(0, 2)) + + ret = data.indexed_by(dim1=0) + self.assertEqual({'time': 1000, 'dim2': 50, 'dim3': 10}, ret.dimensions) + + ret = data.indexed_by(time=slice(2), dim1=0, dim2=slice(5)) + self.assertEqual({'time': 2, 'dim2': 5, 'dim3': 10}, ret.dimensions) + + ret = data.indexed_by(time=0, dim1=0, dim2=slice(5)) + self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dimensions) + + def test_labeled_by(self): + data = create_test_data(self.get_store()) + int_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 2)} + loc_slicers = {'dim1': slice(None, None, 2), 'dim2': slice(0, 1)} + self.assertEqual(data.indexed_by(**int_slicers), + data.labeled_by(**loc_slicers)) + data['time'] = ('time', np.arange(1000, dtype=np.int32), + {'units': 'days since 2000-01-01'}) + self.assertEqual(data.indexed_by(time=0), + data.labeled_by(time='2000-01-01')) + self.assertEqual(data.indexed_by(time=slice(10)), + data.labeled_by(time=slice('2000-01-01', + '2000-01-10'))) + self.assertEqual(data, data.labeled_by(time=slice('1999', '2005'))) + self.assertEqual(data.indexed_by(time=slice(3)), + data.labeled_by( + time=pd.date_range('2000-01-01', periods=3))) + + def test_variable_indexing(self): + data = create_test_data(self.get_store()) + v = data['var1'] + d1 = data['dim1'] + d2 = data['dim2'] + self.assertVarEqual(v, v[d1.data]) + self.assertVarEqual(v, v[d1]) + self.assertVarEqual(v[:3], v[d1 < 3]) + self.assertVarEqual(v[:, 3:], 
v[:, d2 >= 3]) + self.assertVarEqual(v[:3, 3:], v[d1 < 3, d2 >= 3]) + self.assertVarEqual(v[:3, :2], v[d1[:3], d2[:2]]) + self.assertVarEqual(v[:3, :2], v[range(3), range(2)]) + + def test_select(self): + data = create_test_data(self.get_store()) + ret = data.select(_testvar) + self.assertVarEqual(data[_testvar], ret[_testvar]) + self.assertTrue(_vars.keys()[1] not in ret.variables) + self.assertRaises(ValueError, data.select, (_testvar, 'not_a_var')) + + @unittest.skip('need to write this test') + def test_unselect(self): + pass + + def test_copy(self): + data = create_test_data(self.get_store()) + var = data.variables[_testvar] + var.attributes['foo'] = 'hello world' + var_copy = var.__deepcopy__() + self.assertEqual(var.data[2, 3], var_copy.data[2, 3]) + var_copy.data[2, 3] = np.pi + self.assertNotEqual(var.data[2, 3], np.pi) + self.assertEqual(var_copy.attributes['foo'], var.attributes['foo']) + var_copy.attributes['foo'] = 'xyz' + self.assertNotEqual(var_copy.attributes['foo'], var.attributes['foo']) + self.assertEqual(var_copy.attributes['foo'], 'xyz') + self.assertNotEqual(id(var), id(var_copy)) + self.assertNotEqual(id(var.data), id(var_copy.data)) + self.assertNotEqual(id(var.attributes), id(var_copy.attributes)) + + def test_rename(self): + data = create_test_data(self.get_store()) + newnames = {'var1': 'renamed_var1', 'dim2': 'renamed_dim2'} + renamed = data.renamed(newnames) + + variables = OrderedDict(data.variables) + for k, v in newnames.iteritems(): + variables[v] = variables.pop(k) + + for k, v in variables.iteritems(): + self.assertTrue(k in renamed.variables) + self.assertEqual(v.attributes, renamed.variables[k].attributes) + dims = list(v.dimensions) + for name, newname in newnames.iteritems(): + if name in dims: + dims[dims.index(name)] = newname + self.assertEqual(dims, list(renamed.variables[k].dimensions)) + self.assertTrue(np.all(v.data == renamed.variables[k].data)) + self.assertEqual(v.attributes, renamed.variables[k].attributes) + + self.assertTrue('var1' not in renamed.variables) + self.assertTrue('var1' not in renamed.dimensions) + self.assertTrue('dim2' not in renamed.variables) + self.assertTrue('dim2' not in renamed.dimensions) + + def test_merge(self): + data = create_test_data(self.get_store()) + ds1 = data.select('var1') + ds2 = data.select('var3') + expected = data.select('var1', 'var3') + actual = ds1.merge(ds2) + self.assertEqual(expected, actual) + with self.assertRaises(ValueError): + ds1.merge(ds2.indexed_by(dim1=slice(2))) + with self.assertRaises(ValueError): + ds1.merge(ds2.renamed({'var3': 'var1'})) + + def test_getitem(self): + data = create_test_data(self.get_store()) + data['time'] = ('time', np.arange(1000, dtype=np.int32), + {'units': 'days since 2000-01-01'}) + self.assertIsInstance(data['var1'], DatasetArray) + self.assertVarEqual(data['var1'], data.variables['var1']) + self.assertItemsEqual(data['var1'].dataset.variables, + {'var1', 'dim1', 'dim2'}) + # access virtual variables + self.assertVarEqual(data['time.dayofyear'][:300], + Array('time', 1 + np.arange(300))) + self.assertNDArrayEqual(data['time.month'].data, + data.variables['time'].data.month) + + def test_setitem(self): + # assign a variable + var = Array(['dim1'], np.random.randn(100)) + data1 = create_test_data(self.get_store()) + data1['A'] = var + data2 = data1.copy() + data2['A'] = var + self.assertEqual(data1, data2) + # assign a dataset array + dv = 2 * data2['A'] + data1['B'] = dv.array + data2['B'] = dv + self.assertEqual(data1, data2) + # assign an array + with 
self.assertRaisesRegexp(TypeError, 'variables must be of type'): + data2['C'] = var.data + + def test_write_store(self): + expected = create_test_data() + store = self.get_store() + expected.dump_to_store(store) + actual = Dataset.load_store(store) + self.assertEquals(expected, actual) + + def test_to_dataframe(self): + x = np.random.randn(10) + y = np.random.randn(10) + ds = Dataset({'a': Array('t', x), 'b': Array('t', y)}) + expected = pd.DataFrame(np.array([x, y]).T, columns=['a', 'b'], + index=pd.Index(np.arange(10), name='t')) + actual = ds.to_dataframe() + # use the .equals method to check all DataFrame metadata + self.assertTrue(expected.equals(actual)) + + +class NetCDF4DataTest(DataTest): + def get_store(self): + f, self.tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + return backends.NetCDF4DataStore(self.tmp_file, mode='w') + + def test_dump_and_open_dataset(self): + data = create_test_data(self.get_store()) + f, tmp_file = tempfile.mkstemp(suffix='.nc') + os.close(f) + data.dump(tmp_file) + + expected = data.copy() + actual = open_dataset(tmp_file) + self.assertEquals(expected, actual) + os.remove(tmp_file) + + def tearDown(self): + if hasattr(self, 'tmp_file') and os.path.exists(self.tmp_file): + os.remove(self.tmp_file) + + +class ScipyDataTest(DataTest): + def get_store(self): + fobj = StringIO() + return backends.ScipyDataStore(fobj, 'w') + + def test_dump_and_open_dataset(self): + data = create_test_data(self.get_store()) + serialized = data.dumps() + + expected = data.copy() + actual = open_dataset(StringIO(serialized)) + self.assertEquals(expected, actual) + + def test_repr(self): + # scipy.io.netcdf does not keep track of dimension order :( + pass diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py new file mode 100644 index 00000000000..90428e412cc --- /dev/null +++ b/test/test_dataset_array.py @@ -0,0 +1,244 @@ +import numpy as np + +from xray import Dataset, DatasetArray, Array, align +from . 
import TestCase, ReturnItem + + +class TestDatasetArray(TestCase): + def assertDSArrayEqual(self, ar1, ar2): + self.assertEqual(ar1.dataset, ar2.dataset) + self.assertEqual(ar1.focus, ar2.focus) + + def assertDSArrayEquiv(self, ar1, ar2): + random_name = 'randomly-renamed-variable' + self.assertEqual(ar1.renamed(random_name).dataset, + ar2.renamed(random_name).dataset) + + def setUp(self): + self.x = np.random.random((10, 20)) + self.v = Array(['x', 'y'], self.x) + self.ds = Dataset({'foo': self.v}) + self.dv = DatasetArray(self.ds, 'foo') + + def test_properties(self): + self.assertIs(self.dv.dataset, self.ds) + self.assertEqual(self.dv.focus, 'foo') + self.assertVarEqual(self.dv.array, self.v) + self.assertNDArrayEqual(self.dv.data, self.v.data) + for attr in ['dimensions', 'dtype', 'shape', 'size', 'ndim', + 'attributes']: + self.assertEqual(getattr(self.dv, attr), getattr(self.v, attr)) + self.assertEqual(len(self.dv), len(self.v)) + self.assertVarEqual(self.dv, self.v) + self.assertEqual(list(self.dv.coordinates), list(self.ds.coordinates)) + for k, v in self.dv.coordinates.iteritems(): + self.assertNDArrayEqual(v, self.ds.coordinates[k]) + + def test_items(self): + # strings pull out dataviews + self.assertDSArrayEqual(self.dv, self.ds['foo']) + x = self.dv['x'] + y = self.dv['y'] + self.assertDSArrayEqual(DatasetArray(self.ds.select('x'), 'x'), x) + self.assertDSArrayEqual(DatasetArray(self.ds.select('y'), 'y'), y) + # integer indexing + I = ReturnItem() + for i in [I[:], I[...], I[x.data], I[x.array], I[x], I[x, y], + I[x.data > -1], I[x.array > -1], I[x > -1], + I[x > -1, y > -1]]: + self.assertVarEqual(self.dv, self.dv[i]) + for i in [I[0], I[:, 0], I[:3, :2], + I[x.data[:3]], I[x.array[:3]], I[x[:3]], I[x[:3], y[:4]], + I[x.data > 3], I[x.array > 3], I[x > 3], I[x > 3, y > 3]]: + self.assertVarEqual(self.v[i], self.dv[i]) + # make sure we always keep the array around, even if it's a scalar + self.assertVarEqual(self.dv[0, 0], self.dv.array[0, 0]) + self.assertEqual(self.dv[0, 0].dataset, + Dataset({'foo': self.dv.array[0, 0]})) + + def test_indexed_by(self): + self.assertEqual(self.dv[0].dataset, self.ds.indexed_by(x=0)) + self.assertEqual(self.dv[:3, :5].dataset, + self.ds.indexed_by(x=slice(3), y=slice(5))) + self.assertDSArrayEqual(self.dv, self.dv.indexed_by(x=slice(None))) + self.assertDSArrayEqual(self.dv[:3], self.dv.indexed_by(x=slice(3))) + + def test_labeled_by(self): + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + self.assertDSArrayEqual(self.dv, self.dv.labeled_by(x=slice(None))) + self.assertDSArrayEqual(self.dv[1], self.dv.labeled_by(x='b')) + self.assertDSArrayEqual(self.dv[:3], self.dv.labeled_by(x=slice('c'))) + + def test_loc(self): + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + self.assertDSArrayEqual(self.dv[:3], self.dv.loc[:'c']) + self.assertDSArrayEqual(self.dv[1], self.dv.loc['b']) + self.assertDSArrayEqual(self.dv[:3], self.dv.loc[['a', 'b', 'c']]) + self.assertDSArrayEqual(self.dv[:3, :4], + self.dv.loc[['a', 'b', 'c'], np.arange(4)]) + self.dv.loc['a':'j'] = 0 + self.assertTrue(np.all(self.dv.data == 0)) + + def test_renamed(self): + renamed = self.dv.renamed('bar') + self.assertEqual(renamed.dataset, self.ds.renamed({'foo': 'bar'})) + self.assertEqual(renamed.focus, 'bar') + + def test_refocus(self): + self.assertVarEqual(self.dv, self.dv.refocus(self.v)) + self.assertVarEqual(self.dv, self.dv.refocus(self.x)) + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + self.assertVarEqual(self.dv.coordinates['x'], + self.dv['x'].refocus( + 
np.arange(10)).coordinates['x']) + + def test_dataset_getitem(self): + dv = self.ds['foo'] + self.assertDSArrayEqual(dv, self.dv) + + def test_array_interface(self): + self.assertNDArrayEqual(np.asarray(self.dv), self.x) + # test patched in methods + self.assertNDArrayEqual(self.dv.take([2, 3]), self.x.take([2, 3])) + self.assertDSArrayEquiv(self.dv.argsort(), + self.dv.refocus(self.x.argsort())) + self.assertDSArrayEquiv(self.dv.clip(2, 3), + self.dv.refocus(self.x.clip(2, 3))) + # test ufuncs + self.assertDSArrayEquiv(np.sin(self.dv), + self.dv.refocus(np.sin(self.x))) + self.assertDSArrayEquiv(self.dv, np.maximum(self.v, self.dv)) + self.ds['bar'] = Array(['x', 'y'], np.zeros((10, 20))) + self.assertDSArrayEquiv(self.dv, np.maximum(self.dv, self.ds['bar'])) + + def test_math(self): + x = self.x + v = self.v + a = self.dv + # variable math was already tested extensively, so let's just make sure + # that all types are properly converted here + self.assertDSArrayEquiv(a, +a) + self.assertDSArrayEquiv(a, a + 0) + self.assertDSArrayEquiv(a, 0 + a) + self.assertDSArrayEquiv(a, a + 0 * v) + self.assertDSArrayEquiv(a, 0 * v + a) + self.assertDSArrayEquiv(a, a + 0 * x) + self.assertDSArrayEquiv(a, 0 * x + a) + self.assertDSArrayEquiv(a, a + 0 * a) + self.assertDSArrayEquiv(a, 0 * a + a) + # test different indices + ds2 = self.ds.replace('x', Array(['x'], 3 + np.arange(10))) + b = DatasetArray(ds2, 'foo') + with self.assertRaisesRegexp(ValueError, 'not aligned'): + a + b + with self.assertRaisesRegexp(ValueError, 'not aligned'): + b + a + + def test_item_math(self): + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + self.assertVarEqual(self.dv + self.dv[0, 0], + self.dv + self.dv[0, 0].data) + new_data = self.x[0][None, :] + self.x[:, 0][:, None] + self.assertVarEqual(self.dv[:, 0] + self.dv[0], + Array(['x', 'y'], new_data)) + self.assertVarEqual(self.dv[0] + self.dv[:, 0], + Array(['y', 'x'], new_data.T)) + + def test_inplace_math(self): + x = self.x + v = self.v + a = self.dv + b = a + b += 1 + self.assertIs(b, a) + self.assertIs(b.array, v) + self.assertIs(b.data, x) + self.assertIs(b.dataset, self.ds) + + def test_reduce(self): + self.assertVarEqual(self.dv.reduce(np.mean, 'x'), + self.v.reduce(np.mean, 'x')) + # needs more... 
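# one possible follow-up check (illustrative sketch only, assuming reduce drops
# the reduced dimension from the result, in the pandas aggregation style):
#     reduced = self.dv.reduce(np.mean, 'x')
#     self.assertNotIn('x', reduced.dimensions)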
+ # should check which extra dimensions are dropped + + def test_groupby_iter(self): + for ((act_x, act_dv), (exp_x, exp_ds)) in \ + zip(self.dv.groupby('y'), self.ds.groupby('y')): + self.assertVarEqual(exp_x, act_x) + self.assertDSArrayEqual(DatasetArray(exp_ds, 'foo'), act_dv) + for ((_, exp_dv), act_dv) in zip(self.dv.groupby('x'), self.dv): + self.assertDSArrayEqual(exp_dv, act_dv) + + def test_groupby(self): + agg_var = Array(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)) + self.dv['abc'] = agg_var + self.dv['y'] = 20 + 100 * self.ds['y'].array + + identity = lambda x: x + for g in ['x', 'y']: + for shortcut in [True, False]: + for squeeze in [True, False]: + expected = self.dv + actual = self.dv.groupby(g, squeeze=squeeze).apply( + identity, shortcut=shortcut) + self.assertDSArrayEqual(expected, actual) + + grouped = self.dv.groupby('abc') + + expected_sum_all = DatasetArray(Dataset( + {'foo': Array(['abc'], np.array([self.x[:, :9].sum(), + self.x[:, 10:].sum(), + self.x[:, 9:10].sum()]).T, + {'cell_methods': 'x: y: sum'}), + 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') + self.assertDSArrayEqual(expected_sum_all, + grouped.reduce(np.sum, dimension=None)) + self.assertDSArrayEqual(expected_sum_all, grouped.sum(dimension=None)) + + grouped = self.dv.groupby('abc', squeeze=False) + self.assertDSArrayEqual(expected_sum_all, grouped.sum(dimension=None)) + + expected_sum_axis1 = DatasetArray(Dataset( + {'foo': Array(['x', 'abc'], np.array([self.x[:, :9].sum(1), + self.x[:, 10:].sum(1), + self.x[:, 9:10].sum(1)]).T, + {'cell_methods': 'y: sum'}), + 'x': self.ds.variables['x'], + 'abc': Array(['abc'], np.array(['a', 'b', 'c']))}), 'foo') + self.assertDSArrayEqual(expected_sum_axis1, grouped.reduce(np.sum)) + self.assertDSArrayEqual(expected_sum_axis1, grouped.sum()) + + self.assertDSArrayEqual(self.dv, grouped.apply(identity)) + + def test_from_stack(self): + self.ds['bar'] = Array(['x', 'y'], np.random.randn(10, 20)) + foo = self.ds['foo'] + bar = self.ds['bar'].renamed('foo') + # from dataviews: + self.assertVarEqual(Array(['w', 'x', 'y'], + np.array([foo.data, bar.data])), + DatasetArray.from_stack([foo, bar], 'w')) + # from variables: + self.assertVarEqual(Array(['w', 'x', 'y'], + np.array([foo.data, bar.data])), + DatasetArray.from_stack([foo.array, + bar.array], 'w')) + # from iteration: + stacked = DatasetArray.from_stack((v for _, v in foo.groupby('x')), + self.ds['x']) + self.assertDSArrayEqual(foo, stacked) + + def test_align(self): + self.ds['x'] = ('x', np.array(list('abcdefghij'))) + with self.assertRaises(ValueError): + self.dv + self.dv[:5] + dv1, dv2 = align(self.dv, self.dv[:5]) + self.assertDSArrayEqual(dv1, self.dv[:5]) + self.assertDSArrayEqual(dv2, self.dv[:5]) + + def test_to_series(self): + expected = self.dv.to_dataframe()['foo'] + actual = self.dv.to_series() + self.assertNDArrayEqual(expected.values, actual.values) + self.assertNDArrayEqual(expected.index.values, actual.index.values) + self.assertEqual('foo', actual.name) diff --git a/test/test_utils.py b/test/test_utils.py index 2783485bde2..bd7e3438512 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,33 +1,115 @@ +import netCDF4 as nc4 import numpy as np +import pandas as pd -from scidata import utils -from . import TestCase +from xray import utils +from . 
import TestCase, ReturnItem -class ReturnItem(object): - def __getitem__(self, key): - return key +class TestIndexers(TestCase): + def set_to_zero(self, x, i): + x = x.copy() + x[i] = 0 + return x + def test_expanded_indexer(self): + x = np.random.randn(10, 11, 12, 13, 14) + y = np.arange(5) + I = ReturnItem() + for i in [I[:], I[...], I[0, :, 10], I[..., 10], I[:5, ..., 0], + I[y], I[y, y], I[..., y, y], I[..., 0, 1, 2, 3, 4]]: + j = utils.expanded_indexer(i, x.ndim) + self.assertNDArrayEqual(x[i], x[j]) + self.assertNDArrayEqual(self.set_to_zero(x, i), + self.set_to_zero(x, j)) -class TestExpandedIndexer(TestCase): - def test(self): - x = np.random.randn(10, 20, 30) - i = ReturnItem() - for i in [i[:], i[...], i[0, :, 10], i[:5, ...], i[np.arange(5)]]: - j = utils.expanded_indexer(i, 3) - self.assertArrayEqual(x[i], x[j]) + def test_orthogonal_indexer(self): + x = np.random.randn(10, 11, 12, 13, 14) + y = np.arange(5) + I = ReturnItem() + # orthogonal and numpy indexing should be equivalent, because we only + # use at most one array and it never in between two slice objects + # (i.e., we try to avoid numpy's mind-boggling "partial indexing" + # http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html) + for i in [I[:], I[0], I[0, 0], I[:5], I[2:5], I[2:5:-1], I[:3, :4], + I[:3, 0, :4], I[:3, 0, :4, 0], I[y], I[:, y], I[0, y], + I[:2, :3, y], I[0, y, :, :4, 0]]: + j = utils.orthogonal_indexer(i, x.shape) + self.assertNDArrayEqual(x[i], x[j]) + self.assertNDArrayEqual(self.set_to_zero(x, i), + self.set_to_zero(x, j)) + # for more complicated cases, check orthogonal indexing is still + # equivalent to slicing + z = np.arange(2, 8, 2) + for i, j, shape in [ + (I[y, y], I[:5, :5], (5, 5, 12, 13, 14)), + (I[y, z], I[:5, 2:8:2], (5, 3, 12, 13, 14)), + (I[0, y, y], I[0, :5, :5], (5, 5, 13, 14)), + (I[y, 0, z], I[:5, 0, 2:8:2], (5, 3, 13, 14)), + (I[y, :, z], I[:5, :, 2:8:2], (5, 11, 3, 13, 14)), + (I[0, :2, y, y, 0], I[0, :2, :5, :5, 0], (2, 5, 5)), + (I[0, :, y, :, 0], I[0, :, :5, :, 0], (11, 5, 13)), + (I[:, :, y, :, 0], I[:, :, :5, :, 0], (10, 11, 5, 13)), + (I[:, :, y, z, :], I[:, :, :5, 2:8:2], (10, 11, 5, 3, 14))]: + k = utils.orthogonal_indexer(i, x.shape) + self.assertEqual(shape, x[k].shape) + self.assertNDArrayEqual(x[j], x[k]) + self.assertNDArrayEqual(self.set_to_zero(x, j), + self.set_to_zero(x, k)) + # standard numpy (non-orthogonal) indexing doesn't work anymore + with self.assertRaisesRegexp(ValueError, 'only supports 1d'): + utils.orthogonal_indexer(x > 0, x.shape) -class TestSafeMerge(TestCase): +class TestDatetime(TestCase): + def test_num2datetimeindex(self): + for num_dates, units in [ + (np.arange(1000), 'days since 2000-01-01'), + (12300 + np.arange(500), 'hours since 1680-01-01 00:00:00')]: + for calendar in ['standard', 'gregorian', 'proleptic_gregorian']: + expected = pd.Index(nc4.num2date(num_dates, units, calendar)) + actual = utils.num2datetimeindex(num_dates, units, calendar) + self.assertNDArrayEqual(expected, actual) + + def test_guess_time_units(self): + for dates, expected in [(pd.date_range('1900-01-01', periods=5), + 'days since 1900-01-01 00:00:00'), + (pd.date_range('1900-01-01 12:00:00', freq='H', + periods=2), + 'hours since 1900-01-01 12:00:00'), + (['1900-01-01', '1900-01-02', + '1900-01-02 00:00:01'], + 'seconds since 1900-01-01 00:00:00')]: + self.assertEquals(expected, utils.guess_time_units(dates)) + + +class TestDictionaries(TestCase): def setUp(self): self.x = {'a': 'A', 'b': 'B'} self.y = {'c': 'C', 'b': 'B'} + self.z = {'a': 'Z'} - def 
test_good_merge(self): - actual = utils.safe_merge(self.x, self.y) - self.x.update(self.y) - self.assertEqual(self.x, actual) + def test_safe(self): + # should not raise exception: + utils.update_safety_check(self.x, self.y) - def test_bad_merge(self): + def test_unsafe(self): with self.assertRaises(ValueError): - utils.safe_merge(self.x, {'a': 'Z'}) + utils.update_safety_check(self.x, self.z) + + def test_ordered_dict_intersection(self): + self.assertEquals({'a': 'A', 'b': 'B'}, + utils.ordered_dict_intersection(self.x, self.y)) + self.assertEquals({'b': 'B'}, + utils.ordered_dict_intersection(self.x, self.z)) + + def test_frozen(self): + x = utils.Frozen(self.x) + with self.assertRaises(TypeError): + x['foo'] = 'bar' + with self.assertRaises(TypeError): + del x['a'] + with self.assertRaises(AttributeError): + x.update(self.y) + self.assertEquals(x.mapping, self.x) + diff --git a/test/test_variable.py b/test/test_variable.py deleted file mode 100644 index 3282ddc3679..00000000000 --- a/test/test_variable.py +++ /dev/null @@ -1,134 +0,0 @@ -import warnings - -import numpy as np - -from scidata import Variable -from . import TestCase - - -class TestVariable(TestCase): - def setUp(self): - self.d = np.random.random((10, 3)) - - def test_data(self): - v = Variable(['time', 'x'], self.d) - self.assertIs(v.data, self.d) - with self.assertRaises(ValueError): - # wrong size - v.data = np.random.random(5) - d2 = np.random.random((10, 3)) - v.data = d2 - self.assertIs(v.data, d2) - - with warnings.catch_warnings(record=True) as w: - v = Variable(['x'], range(5)) - self.assertIn("converting data to np.ndarray", str(w[-1].message)) - self.assertIsInstance(v.data, np.ndarray) - with warnings.catch_warnings(record=True) as w: - # don't warn for numpy numbers - v = Variable([], np.float32(1)) - self.assertFalse(w) - - def test_properties(self): - v = Variable(['time', 'x'], self.d, {'foo': 'bar'}) - self.assertEqual(v.dimensions, ('time', 'x')) - self.assertEqual(v.dtype, float) - self.assertEqual(v.shape, (10, 3)) - self.assertEqual(v.size, 30) - self.assertEqual(v.ndim, 2) - self.assertEqual(len(v), 10) - self.assertEqual(v.attributes, {'foo': u'bar'}) - - def test_items(self): - v = Variable(['time', 'x'], self.d) - self.assertVarEqual(v, v[:]) - self.assertVarEqual(v, v[...]) - self.assertVarEqual(Variable(['x'], self.d[0]), v[0]) - self.assertVarEqual( - Variable(['time'], self.d[:, 0]), v[:, 0]) - self.assertVarEqual( - Variable(['time', 'x'], self.d[:3, :2]), v[:3, :2]) - for n, item in enumerate(v): - self.assertVarEqual(Variable(['x'], self.d[n]), item) - v.data[:] = 0 - self.assertTrue(np.all(v.data == 0)) - - def test_views(self): - v = Variable(['time', 'x'], self.d) - self.assertVarEqual(v.views({'time': slice(None)}), v) - self.assertVarEqual(v.views({'time': 0}), v[0]) - self.assertVarEqual(v.views({'time': slice(0, 3)}), v[:3]) - self.assertVarEqual(v.views({'x': 0}), v[:, 0]) - - def test_1d_math(self): - x = np.arange(5) - y = np.ones(5) - v = Variable(['x'], x) - # unary ops - self.assertVarEqual(v, +v) - self.assertVarEqual(v, abs(v)) - self.assertArrayEqual((-v).data, -x) - # verify attributes - v2 = Variable(['x'], x, {'units': 'meters'}) - self.assertVarEqual(v, +v2) - v3 = Variable(['x'], x, {'some': 'attribute'}) - self.assertVarEqual(v3, +v3) - # bianry ops with numbers - self.assertVarEqual(v, v + 0) - self.assertVarEqual(v, 0 + v) - self.assertVarEqual(v, v * 1) - self.assertArrayEqual((v > 2).data, x > 2) - self.assertArrayEqual((0 == v).data, 0 == x) - 
self.assertArrayEqual((v - 1).data, x - 1) - self.assertArrayEqual((1 - v).data, 1 - x) - # binary ops with numpy arrays - self.assertArrayEqual((v * x).data, x ** 2) - self.assertArrayEqual((x * v).data, x ** 2) - self.assertArrayEqual(v - y, v - 1) - self.assertArrayEqual(y - v, 1 - v) - # binary ops with all variables - self.assertArrayEqual(v + v, 2 * v) - w = Variable(['x'], y, {'foo': 'bar'}) - self.assertVarEqual(v + w, Variable(['x'], x + y, {'foo': 'bar'})) - self.assertArrayEqual((v * w).data, x * y) - # something complicated - self.assertArrayEqual((v ** 2 * w - 1 + x).data, x ** 2 * y - 1 + x) - - def test_broadcasting_math(self): - x = np.random.randn(2, 3) - v = Variable(['a', 'b'], x) - # 1d to 2d broadcasting - self.assertVarEqual( - v * v, - Variable(['a', 'b'], np.einsum('ab,ab->ab', x, x))) - self.assertVarEqual( - v * v[0], - Variable(['a', 'b'], np.einsum('ab,b->ab', x, x[0]))) - self.assertVarEqual( - v[0] * v, - Variable(['b', 'a'], np.einsum('b,ab->ba', x[0], x))) - self.assertVarEqual( - v[0] * v[:, 0], - Variable(['b', 'a'], np.einsum('b,a->ba', x[0], x[:, 0]))) - # higher dim broadcasting - y = np.random.randn(3, 4, 5) - w = Variable(['b', 'c', 'd'], y) - self.assertVarEqual( - v * w, Variable(['a', 'b', 'c', 'd'], - np.einsum('ab,bcd->abcd', x, y))) - self.assertVarEqual( - w * v, Variable(['b', 'c', 'd', 'a'], - np.einsum('bcd,ab->bcda', y, x))) - self.assertVarEqual( - v * w[0], Variable(['a', 'b', 'c', 'd'], - np.einsum('ab,cd->abcd', x, y[0]))) - - def test_inplace_math(self): - x = np.arange(5) - v = Variable(['x'], x) - v2 = v - v2 += 1 - self.assertIs(v, v2) - # since we provided an ndarray for data, it is also modified in-place - self.assertIs(v.data, x) - self.assertArrayEqual(v.data, np.arange(5) + 1)
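The new test modules above exercise the reworked Dataset/DatasetArray API. As a quick orientation, here is a minimal usage sketch assembled from the patterns in those tests (illustrative only, not part of this diff; it assumes the names used by the tests, e.g. indexed_by, labeled_by and groupby, and may not match the released API exactly):

    import numpy as np
    import pandas as pd
    from xray import Dataset

    # build a dataset by assigning (dims, data[, attributes]) tuples,
    # as create_test_data does in test_dataset.py
    ds = Dataset()
    ds['time'] = ('time', pd.date_range('2000-01-01', periods=10))
    ds['x'] = ('x', np.arange(5))
    ds['temp'] = (('time', 'x'), np.random.randn(10, 5), {'units': 'K'})

    # positional indexing by dimension name, and label-based indexing via pandas
    first_three = ds.indexed_by(time=slice(3))
    new_years = ds.labeled_by(time='2000-01-01')

    # split-apply-combine: iterate over groups along a coordinate
    for label, group in ds.groupby('x'):
        pass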