diff --git a/.gitignore b/.gitignore
index 320f03a0171a2..eb26b3cedc724 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ MANIFEST
 *.cpp
 *.so
 *.pyd
+*.h5
 pandas/version.py
 doc/source/generated
 doc/source/_static
diff --git a/RELEASE.rst b/RELEASE.rst
index 49f45fce13381..7746d8bd587ea 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -57,6 +57,7 @@ pandas 0.10.0
   - `obj.fillna()` is no longer valid; make `method='pad'` no longer the
     default option, to be more explicit about what kind of filling to
     perform. Add `ffill/bfill` convenience functions per above (#2284)
+  - `HDFStore.keys()` now returns an absolute path-name for each key

 **Improvements to existing features**

@@ -68,6 +69,7 @@ pandas 0.10.0
   - Add ``normalize`` option to Series/DataFrame.asfreq (#2137)
   - SparseSeries and SparseDataFrame construction from empty and scalar
     values now no longer create dense ndarrays unnecessarily (#2322)
+  - ``HDFStore`` now supports hierarchical keys (#2397)
   - Support multiple query selection formats for ``HDFStore tables`` (#1996)
   - Support ``del store['df']`` syntax to delete HDFStores
   - Add multi-dtype support for ``HDFStore tables``
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 272e35fc7400d..a81899078f3ae 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -865,6 +865,25 @@ after data is already in the table (this may become automatic in the future or a
    store.create_table_index('df')
    store.handle.root.df.table

+Hierarchical Keys
+~~~~~~~~~~~~~~~~~
+
+Keys to a store can be specified as a string. These can be in a hierarchical path-name-like format (e.g. ``foo/bar/bah``), which will generate a hierarchy of sub-stores (or ``Groups`` in PyTables parlance). Keys can be specified without the leading '/' and are ALWAYS absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove everything in the sub-store and BELOW, so be *careful*.
+
+.. ipython:: python
+
+   store.put('foo/bar/bah', df)
+   store.append('food/orange', df)
+   store.append('food/apple', df)
+   store
+
+   # a list of keys is returned
+   store.keys()
+
+   # remove all nodes under this level
+   store.remove('food')
+   store
+
 Storing Mixed Types in a Table
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt
index 98eb4746c7d79..cb6711f4679a9 100644
--- a/doc/source/v0.10.0.txt
+++ b/doc/source/v0.10.0.txt
@@ -63,6 +63,19 @@ Updated PyTables Support

 **Enhancements**

+  - added ability to use hierarchical keys
+
+    .. ipython:: python
+
+       store.put('foo/bar/bah', df)
+       store.append('food/orange', df)
+       store.append('food/apple', df)
+       store
+
+       # remove all nodes under this level
+       store.remove('food')
+       store
+
   - added mixed-dtype support!

   .. ipython:: python
@@ -77,7 +90,7 @@ Updated PyTables Support

   - performance improvments on table writing
   - support for arbitrarily indexed dimensions
-  - ``SparseSeries`` now has a ``density`` property (#2384) 
+  - ``SparseSeries`` now has a ``density`` property (#2384)

 **Bug Fixes**

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 5a5d9d2942ace..8af7151cb898c 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -90,7 +90,6 @@ def _tables():

     return _table_mod

-
 @contextmanager
 def get_store(path, mode='a', complevel=None, complib=None,
               fletcher32=False):
@@ -197,6 +196,11 @@ def __init__(self, path, mode='a', complevel=None, complib=None,
         self.filters = None
         self.open(mode=mode, warn=False)

+    @property
+    def root(self):
+        """ return the root node """
+        return self.handle.root
+
     def __getitem__(self, key):
         return self.get(key)

@@ -207,26 +211,39 @@ def __delitem__(self, key):
         return self.remove(key)

     def __contains__(self, key):
-        return hasattr(self.handle.root, key)
+        """ check for existence of this key
+            can match the exact pathname or the pathname w/o the leading '/'
+        """
+        node = self.get_node(key)
+        if node is not None:
+            name = node._v_pathname
+            return re.search(key,name) is not None
+        return False

     def __len__(self):
-        return len(self.handle.root._v_children)
+        return len(self.groups())

     def __repr__(self):
         output = '%s\nFile path: %s\n' % (type(self), self.path)

-        if len(self) > 0:
-            keys = []
+        groups = self.groups()
+        if len(groups) > 0:
+            keys = []
             values = []
-            for k, v in sorted(self.handle.root._v_children.iteritems()):
-                kind = getattr(v._v_attrs,'pandas_type',None)
+            for n in sorted(groups, key = lambda x: x._v_name):
+                kind = getattr(n._v_attrs,'pandas_type',None)

-                keys.append(str(k))
+                keys.append(str(n._v_pathname))

-                if kind is None:
+                # a table
+                if _is_table_type(n):
+                    values.append(str(create_table(self, n)))
+
+                # a group
+                elif kind is None:
                     values.append('unknown type')
-                elif _is_table_type(v):
-                    values.append(str(create_table(self, v)))
+
+                # another type of pandas object
                 else:
                     values.append(_NAME_MAP[kind])
@@ -239,9 +256,9 @@ def keys(self):
         """
         Return a (potentially unordered) list of the keys corresponding to the
-        objects stored in the HDFStore
+        objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. have the leading '/')
         """
-        return self.handle.root._v_children.keys()
+        return [ n._v_pathname for n in self.groups() ]

     def open(self, mode='a', warn=True):
         """
@@ -304,12 +321,10 @@ def get(self, key):
         -------
         obj : type of object stored in file
         """
-        exc_type = _tables().NoSuchNodeError
-        try:
-            group = getattr(self.handle.root, key)
-            return self._read_group(group)
-        except (exc_type, AttributeError):
+        group = self.get_node(key)
+        if group is None:
             raise KeyError('No object named %s in the file' % key)
+        return self._read_group(group)

     def select(self, key, where=None):
         """
@@ -322,11 +337,12 @@ def select(self, key, where=None):
         where : list of Term (or convertable) objects, optional

         """
-        group = getattr(self.handle.root, key, None)
+        group = self.get_node(key)
+        if group is None:
+            raise KeyError('No object named %s in the file' % key)
         if where is not None and not _is_table_type(group):
             raise Exception('can only select with where on objects written as tables')
-        if group is not None:
-            return self._read_group(group, where)
+        return self._read_group(group, where)

     def put(self, key, value, table=False, append=False,
             compression=None, **kwargs):
@@ -352,9 +368,6 @@ def put(self, key, value, table=False, append=False,
         self._write_to_group(key, value, table=table, append=append,
                              comp=compression, **kwargs)

-    def _get_handler(self, op, kind):
-        return getattr(self, '_%s_%s' % (op, kind))
-
     def remove(self, key, where=None):
         """
         Remove pandas object partially by specifying the where condition
@@ -372,15 +385,21 @@
         number of rows removed (or None if not a Table)

         """
-        if where is None:
-            self.handle.removeNode(self.handle.root, key, recursive=True)
-        else:
-            group = getattr(self.handle.root, key, None)
-            if group is not None:
+        group = self.get_node(key)
+        if group is not None:
+
+            # remove the node
+            if where is None or not len(where):
+                group = self.get_node(key)
+                group._f_remove(recursive=True)
+
+            # delete from the table
+            else:
                 if not _is_table_type(group):
                     raise Exception('can only remove with where on objects written as tables')
                 t = create_table(self, group)
                 return t.delete(where)

+        return None

     def append(self, key, value, **kwargs):
@@ -416,20 +435,50 @@ def create_table_index(self, key, **kwargs):

         if not _table_supports_index:
             raise("PyTables >= 2.3 is required for table indexing")

-        group = getattr(self.handle.root, key, None)
+        group = self.get_node(key)
         if group is None: return
         if not _is_table_type(group):
             raise Exception("cannot create table index on a non-table")
         create_table(self, group).create_index(**kwargs)

+    def groups(self):
+        """ return a list of all the groups (that are not themselves a pandas storage object) """
+        return [ g for g in self.handle.walkGroups() if getattr(g._v_attrs,'pandas_type',None) ]
+
+    def get_node(self, key):
+        """ return the node with the key or None if it does not exist """
+        try:
+            if not key.startswith('/'):
+                key = '/' + key
+            return self.handle.getNode(self.root,key)
+        except:
+            return None
+
+    ###### private methods ######
+
+    def _get_handler(self, op, kind):
+        return getattr(self, '_%s_%s' % (op, kind))
+
     def _write_to_group(self, key, value, table=False, append=False,
                         comp=None, **kwargs):
-        root = self.handle.root
-        if key not in root._v_children:
-            group = self.handle.createGroup(root, key)
-        else:
-            group = getattr(root, key)
+        group = self.get_node(key)
+        if group is None:
+            paths = key.split('/')
+
+            # recursively create the groups
+            path = '/'
+            for p in paths:
+                if not len(p):
+                    continue
+                new_path = path
+                if not path.endswith('/'):
+                    new_path += '/'
+                new_path += p
+                group = self.get_node(new_path)
+                if group is None:
+                    group = self.handle.createGroup(path, p)
+                path = new_path

         kind = _TYPE_MAP[type(value)]
         if table or (append and _is_table_type(group)):
@@ -1306,6 +1355,9 @@ class LegacyTable(Table):
     _indexables = [Col(name = 'index'),Col(name = 'column', index_kind = 'columns_kind'), DataCol(name = 'fields', cname = 'values', kind_attr = 'fields') ]
     table_type = 'legacy'

+    def write(self, **kwargs):
+        raise Exception("write operations are not allowed on legacy tables!")
+
     def read(self, where=None):
         """ we have 2 indexable columns, with an arbitrary number of data axes """

@@ -1380,6 +1432,21 @@ def read(self, where=None):

         return wp

+class LegacyFrameTable(LegacyTable):
+    """ support the legacy frame table """
+    table_type = 'legacy_frame'
+    def read(self, *args, **kwargs):
+        return super(LegacyFrameTable, self).read(*args, **kwargs)['value']
+
+class LegacyPanelTable(LegacyTable):
+    """ support the legacy panel table """
+    table_type = 'legacy_panel'
+
+class AppendableTable(LegacyTable):
+    """ support the new appendable table formats """
+    _indexables = None
+    table_type = 'appendable'
+
     def write(self, axes_to_index, obj, append=False, compression=None,
               complevel=None, min_itemsize = None, **kwargs):
@@ -1488,22 +1555,6 @@ def delete(self, where = None):

         # return the number of rows removed
         return ln

-
-class LegacyFrameTable(LegacyTable):
-    """ support the legacy frame table """
-    table_type = 'legacy_frame'
-    def read(self, *args, **kwargs):
-        return super(LegacyFrameTable, self).read(*args, **kwargs)['value']
-
-class LegacyPanelTable(LegacyTable):
-    """ support the legacy panel table """
-    table_type = 'legacy_panel'
-
-class AppendableTable(LegacyTable):
-    """ suppor the new appendable table formats """
-    _indexables = None
-    table_type = 'appendable'
-
 class AppendableFrameTable(AppendableTable):
     """ suppor the new appendable table formats """
     table_type = 'appendable_frame'
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index ca2ea2e7089a0..b4ad98b8cb437 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -51,13 +51,14 @@ def test_factory_fun(self):

         os.remove(self.scratchpath)

-    def test_len_keys(self):
+    def test_keys(self):
         self.store['a'] = tm.makeTimeSeries()
         self.store['b'] = tm.makeStringSeries()
         self.store['c'] = tm.makeDataFrame()
         self.store['d'] = tm.makePanel()
-        self.assertEquals(len(self.store), 4)
-        self.assert_(set(self.store.keys()) == set(['a', 'b', 'c', 'd']))
+        self.store['foo/bar'] = tm.makePanel()
+        self.assertEquals(len(self.store), 5)
+        self.assert_(set(self.store.keys()) == set(['/a', '/b', '/c', '/d', '/foo/bar']))

     def test_repr(self):
         repr(self.store)
@@ -65,6 +66,7 @@ def test_repr(self):
         self.store['b'] = tm.makeStringSeries()
         self.store['c'] = tm.makeDataFrame()
         self.store['d'] = tm.makePanel()
+        self.store['foo/bar'] = tm.makePanel()
         self.store.append('e', tm.makePanel())
         repr(self.store)
         str(self.store)
@@ -72,9 +74,14 @@ def test_contains(self):
         self.store['a'] = tm.makeTimeSeries()
         self.store['b'] = tm.makeDataFrame()
+        self.store['foo/bar'] = tm.makeDataFrame()
         self.assert_('a' in self.store)
         self.assert_('b' in self.store)
         self.assert_('c' not in self.store)
+        self.assert_('foo/bar' in self.store)
+        self.assert_('/foo/bar' in self.store)
+        self.assert_('/foo/b' not in self.store)
+        self.assert_('bar' not in self.store)

     def test_reopen_handle(self):
         self.store['a'] = tm.makeTimeSeries()
@@ -92,6 +99,10 @@ def test_get(self):
         right = self.store['a']
         tm.assert_series_equal(left, right)

+        left = self.store.get('/a')
+        right = self.store['/a']
+        tm.assert_series_equal(left, right)
+
         self.assertRaises(KeyError, self.store.get, 'b')

     def test_put(self):
@@ -99,6 +110,9 @@
         ts = tm.makeTimeSeries()
         df = tm.makeTimeDataFrame()
         self.store['a'] = ts
         self.store['b'] = df[:10]
+        self.store['foo/bar/bah'] = df[:10]
+        self.store['foo'] = df[:10]
+        self.store['/foo'] = df[:10]
         self.store.put('c', df[:10], table=True)

         # not OK, not a table
@@ -156,6 +170,19 @@ def test_append(self):
             store.append('df2', df[10:])
             tm.assert_frame_equal(store['df2'], df)

+            store.append('/df3', df[:10])
+            store.append('/df3', df[10:])
+            tm.assert_frame_equal(store['df3'], df)
+
+            # this is allowed but you almost always don't want to do it
+            import warnings
+            import tables
+            warnings.filterwarnings('ignore', category=tables.NaturalNameWarning)
+            store.append('/df3 foo', df[:10])
+            store.append('/df3 foo', df[10:])
+            tm.assert_frame_equal(store['df3 foo'], df)
+            warnings.filterwarnings('always', category=tables.NaturalNameWarning)
+
             wp = tm.makePanel()
             store.append('wp1', wp.ix[:,:10,:])
             store.append('wp1', wp.ix[:,10:,:])
@@ -293,6 +320,18 @@ def test_remove(self):
         self.store.remove('b')
         self.assertEquals(len(self.store), 0)

+        # pathing
+        self.store['a'] = ts
+        self.store['b/foo'] = df
+        self.store.remove('foo')
+        self.store.remove('b/foo')
+        self.assertEquals(len(self.store), 1)
+
+        self.store['a'] = ts
+        self.store['b/foo'] = df
+        self.store.remove('b')
+        self.assertEquals(len(self.store), 1)
+
         # __delitem__
         self.store['a'] = ts
         self.store['b'] = df
@@ -315,6 +354,17 @@ def test_remove_where(self):
         expected = wp.reindex(minor_axis = ['B','C'])
         tm.assert_panel_equal(rs,expected)

+        # empty where
+        self.store.remove('wp')
+        self.store.put('wp', wp, table=True)
+        self.store.remove('wp', [])
+
+        # non - empty where
+        self.store.remove('wp')
+        self.store.put('wp', wp, table=True)
+        self.assertRaises(Exception, self.store.remove,
+                          'wp', ['foo'])
+
         # selectin non-table with a where
         self.store.put('wp2', wp, table=False)
         self.assertRaises(Exception, self.store.remove,
@@ -846,6 +896,19 @@ def test_legacy_table_read(self):
         store.select('wp1')
         store.close()

+    def test_legacy_table_write(self):
+        # legacy table types
+        pth = curpath()
+        df = tm.makeDataFrame()
+        wp = tm.makePanel()
+
+        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'a')
+
+        self.assertRaises(Exception, store.append, 'df1', df)
+        self.assertRaises(Exception, store.append, 'wp1', wp)
+
+        store.close()
+
     def test_store_datetime_fractional_secs(self):
         dt = datetime(2012, 1, 2, 3, 4, 5, 123456)
         series = Series([0], [dt])
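As a quick end-to-end illustration of the behaviour this patch adds (hierarchical keys, absolute path-names from keys(), membership with or without the leading '/', and recursive removal of a sub-store), here is a minimal usage sketch. It assumes a pandas build that includes this patch plus PyTables; the file name store_example.h5 and the random frame are hypothetical and only for illustration.

    import numpy as np
    import pandas as pd

    # a small frame to store; the scratch file name is only for illustration
    df = pd.DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'])
    store = pd.HDFStore('store_example.h5', mode='w')

    # '/' in a key creates the intermediate groups as needed
    store.put('foo/bar/bah', df)
    store.append('food/orange', df)
    store.append('food/apple', df)

    # keys() now returns absolute path-names, e.g. '/foo/bar/bah'
    print(store.keys())

    # membership checks work with or without the leading '/'
    print('foo/bar/bah' in store, '/foo/bar/bah' in store)

    # removing a bare group name drops everything stored below it
    store.remove('food')
    print(store.keys())

    store.close()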