From d2b85ffe794d44d9cb7f62c6024a4b9ec1ac490e Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Sun, 21 Jul 2013 22:10:16 +0200 Subject: [PATCH 1/8] Typo in error message --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 401a7746953cb..6376b94d1ae99 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5705,7 +5705,7 @@ def extract_index(data): raw_lengths.append(len(v)) if not indexes and not raw_lengths: - raise ValueError('If using all scalar values, you must must pass' + raise ValueError('If using all scalar values, you must pass' ' an index') if have_series or have_dicts: From e6c3cd6b796a8df0c545d7862c482d5464c1e8c9 Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Sun, 21 Jul 2013 23:12:03 +0200 Subject: [PATCH 2/8] Tentative fix for #4297. The problem finally seem to be an odd restriction: acceptable sequences were list, tuple, or numpy.ndarray. A guess is that strings in Python wanted to be considered scalars, but the above restriction was implemented rather than take anything with a __len__ except str (or unicode for Python 2.x) in. Considerations about the use of the buffer interface are not needed for now. note: I cannot run tests on Python 2.7. ``` nosetests pandas ``` ends with: ``` from . import hashtable, tslib, lib ImportError: cannot import name hashtable ``` --- pandas/core/frame.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6376b94d1ae99..24b4e8cabac2d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5700,9 +5700,20 @@ def extract_index(data): elif isinstance(v, dict): have_dicts = True indexes.append(v.keys()) - elif isinstance(v, (list, tuple, np.ndarray)): - have_raw_arrays = True - raw_lengths.append(len(v)) + elif not (isinstance(v, str) or \ + isinstance(v, unicode)): + # Although strings have a __len__, + # they are likely to be considered scalars. + try: + l = len(v) + except TypeError: + continue + # Anything else with a __len__ is considered + # a sequence (to be safe, we check + # that there is __getitem__) + if hasattr(v, '__getitem__'): + have_raw_arrays = True + raw_lengths.append(l) if not indexes and not raw_lengths: raise ValueError('If using all scalar values, you must pass' From 209c01c393e7be9ab8064386d37896422d1a45af Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Mon, 22 Jul 2013 13:10:04 +0200 Subject: [PATCH 3/8] - Comment the source - Upper case for the first letter in exception messages --- pandas/core/frame.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 24b4e8cabac2d..0f19c7f518163 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5680,6 +5680,9 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): return create_block_manager_from_arrays(arrays, arr_names, axes) def extract_index(data): + # Slightly misleading name. + # Indexes are only extracted for elements in the iterable + # `data` inheriting from Series. from pandas.core.index import _union_indexes index = None @@ -5693,6 +5696,8 @@ def extract_index(data): have_series = False have_dicts = False + # Loop over the element, such as vectors `v` corresponding + # to columns in the DataFrame for v in data: if isinstance(v, Series): have_series = True @@ -5707,6 +5712,11 @@ def extract_index(data): try: l = len(v) except TypeError: + # Item v silently ignored (to conserve + # the original behaviour - see also + # test of __getitem__ below). + # This behaviour is kept, but I think + # that an exception (TypeError) should be raised instead. continue # Anything else with a __len__ is considered # a sequence (to be safe, we check @@ -5714,6 +5724,10 @@ def extract_index(data): if hasattr(v, '__getitem__'): have_raw_arrays = True raw_lengths.append(l) + else: + # The original code skipped silently invalid + # objects (see note at TypeError above). + pass if not indexes and not raw_lengths: raise ValueError('If using all scalar values, you must pass' @@ -5725,7 +5739,7 @@ def extract_index(data): if have_raw_arrays: lengths = list(set(raw_lengths)) if len(lengths) > 1: - raise ValueError('arrays must all be same length') + raise ValueError('Arrays must all be same length') if have_dicts: raise ValueError('Mixing dicts with non-Series may lead to ' @@ -5733,7 +5747,7 @@ def extract_index(data): if have_series: if lengths[0] != len(index): - msg = ('array length %d does not match index length %d' + msg = ('Array length %d does not match index length %d' % (lengths[0], len(index))) raise ValueError(msg) else: From 04732e026d3b39d3688a435c4641e9956305b804 Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Mon, 22 Jul 2013 16:33:54 +0200 Subject: [PATCH 4/8] - Fixed unit test for commit d2b85ff - Added unit test for new capabilities introduced with commit e6c3cd6 (ran only the unit tests for test_frame.py - `nosetests pandas` aborts with an import error) --- pandas/tests/test_frame.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a9df56a498f63..7853af0964cfd 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2236,7 +2236,7 @@ def testit(): def testit(): DataFrame({'a': False, 'b': True}) - assertRaisesRegexp(ValueError, 'If using all scalar values, you must must pass an index', testit) + assertRaisesRegexp(ValueError, 'If using all scalar values, you must pass an index', testit) def test_insert_error_msmgs(self): @@ -2774,6 +2774,17 @@ def test_constructor_from_items(self): # pass some columns recons = DataFrame.from_items(items, columns=['C', 'B', 'A']) assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']]) + # not any column either a dict, a list, a tuple, or a numpy.ndarray + import array + recons_ar = DataFrame.from_items([('A', array.array('i', range(10)))]) + recons_rg = DataFrame.from_items([('A', range(10))]) + recons_np = DataFrame.from_items([('A', np.array(range(10)))]) + self.assertEquals(tuple(recons_ar['A']), + tuple(recons_rg['A'])) + self.assertEquals(tuple(recons_ar['A']), + tuple(recons_np['A'])) + + # orient='index' From 9172436fcb05bb56d7e91420b63e21e813c659ff Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Mon, 22 Jul 2013 20:51:54 +0200 Subject: [PATCH 5/8] refactored to use core.common._is_sequence --- pandas/core/frame.py | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f19c7f518163..2c640960eceb4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5705,29 +5705,19 @@ def extract_index(data): elif isinstance(v, dict): have_dicts = True indexes.append(v.keys()) - elif not (isinstance(v, str) or \ - isinstance(v, unicode)): + elif com._is_sequence(v): + # This is a sequence-but-not-a-string # Although strings have a __len__, - # they are likely to be considered scalars. - try: - l = len(v) - except TypeError: - # Item v silently ignored (to conserve - # the original behaviour - see also - # test of __getitem__ below). - # This behaviour is kept, but I think - # that an exception (TypeError) should be raised instead. - continue - # Anything else with a __len__ is considered - # a sequence (to be safe, we check - # that there is __getitem__) - if hasattr(v, '__getitem__'): - have_raw_arrays = True - raw_lengths.append(l) - else: - # The original code skipped silently invalid - # objects (see note at TypeError above). - pass + # they will be considered scalar. + have_raw_arrays = True + raw_lengths.append(len(v)) + else: + # Item v silently ignored (to conserve + # the original behaviour - see also + # test of __getitem__ below). + # This behaviour is kept, but I think + # that an exception (TypeError) should be raised instead. + pass if not indexes and not raw_lengths: raise ValueError('If using all scalar values, you must pass' From 4f71e9888d43808fd3b2b9e0c103292ab1de257f Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Tue, 23 Jul 2013 13:10:14 +0200 Subject: [PATCH 6/8] Corrected a comment in the code that was suggesting incorrectly that an exception should be raised --- pandas/core/frame.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c640960eceb4..d80402d776ac6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5712,11 +5712,9 @@ def extract_index(data): have_raw_arrays = True raw_lengths.append(len(v)) else: - # Item v silently ignored (to conserve - # the original behaviour - see also - # test of __getitem__ below). - # This behaviour is kept, but I think - # that an exception (TypeError) should be raised instead. + # Item v is silently ignored, + # as it is not anything an index can be inferred + # from. pass if not indexes and not raw_lengths: From f131b719db0386f3c05408703729e4fc6b7084d7 Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Tue, 23 Jul 2013 15:05:02 +0200 Subject: [PATCH 7/8] removed unnecessary test --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d80402d776ac6..6ec28f9141c05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5688,7 +5688,7 @@ def extract_index(data): index = None if len(data) == 0: index = Index([]) - elif len(data) > 0 and index is None: + elif len(data) > 0: raw_lengths = [] indexes = [] From 8787ef5b9bc175fbac3e07888299408b41e47ee3 Mon Sep 17 00:00:00 2001 From: Laurent Gautier Date: Wed, 24 Jul 2013 17:14:35 +0200 Subject: [PATCH 8/8] Fixed behaviour for mapping vs sequence The use of _is_sequence() is convenient but not optimal (not basestring implicitly tested twice) --- pandas/core/frame.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6ec28f9141c05..26877752017e3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5694,7 +5694,7 @@ def extract_index(data): have_raw_arrays = False have_series = False - have_dicts = False + have_mappings = False # Loop over the element, such as vectors `v` corresponding # to columns in the DataFrame @@ -5702,26 +5702,35 @@ def extract_index(data): if isinstance(v, Series): have_series = True indexes.append(v.index) - elif isinstance(v, dict): - have_dicts = True - indexes.append(v.keys()) - elif com._is_sequence(v): - # This is a sequence-but-not-a-string - # Although strings have a __len__, - # they will be considered scalar. - have_raw_arrays = True - raw_lengths.append(len(v)) else: - # Item v is silently ignored, - # as it is not anything an index can be inferred - # from. - pass + # When an OrderedDict, the mapping aspect + # is given priority, although there is a warning when + # mixture of sequences and mapping. The unit tests + # show that this is the desired behaviour. + # Also, shouldn't a `bytes` object be considered a scalar ? + is_mapping = isinstance(v, collections.Mapping) + is_sequence = (isinstance(v, collections.Sequence) or \ + _is_sequence(v)) and not isinstance(v, basestring) + if is_mapping: + have_mappings = True + indexes.append(v.keys()) + elif is_sequence: + # This is a sequence-but-not-a-string + # Although strings have a __len__, + # they will be considered scalar. + have_raw_arrays = True + raw_lengths.append(len(v)) + else: + # Item v is silently ignored, + # as it is not anything an index can be inferred + # from. + pass if not indexes and not raw_lengths: raise ValueError('If using all scalar values, you must pass' ' an index') - if have_series or have_dicts: + if have_series or have_mappings: index = _union_indexes(indexes) if have_raw_arrays: @@ -5729,7 +5738,7 @@ def extract_index(data): if len(lengths) > 1: raise ValueError('Arrays must all be same length') - if have_dicts: + if have_mappings: raise ValueError('Mixing dicts with non-Series may lead to ' 'ambiguous ordering.')