pandas-dev · Aug 26, 2018
diff --git a/‎doc/source/whatsnew/v0.24.0.txt
Lines changed: 27 additions & 0 deletions b/‎doc/source/whatsnew/v0.24.0.txt
Lines changed: 27 additions & 0 deletions
diff --git a/‎pandas/core/indexes/multi.py
Lines changed: 46 additions & 14 deletions b/‎pandas/core/indexes/multi.py
Lines changed: 46 additions & 14 deletions
diff --git a/‎pandas/io/formats/printing.py
Lines changed: 65 additions & 15 deletions b/‎pandas/io/formats/printing.py
Lines changed: 65 additions & 15 deletions
diff --git a/‎pandas/tests/indexes/multi/test_format.py
Lines changed: 133 additions & 44 deletions b/‎pandas/tests/indexes/multi/test_format.py
Lines changed: 133 additions & 44 deletions
diff --git a/‎pandas/tests/util/test_testing.py
Lines changed: 7 additions & 4 deletions b/‎pandas/tests/util/test_testing.py
Lines changed: 7 additions & 4 deletions
@@ -159,6 +159,33 @@ This is the same behavior as ``Series.values`` for categorical data. See
 :ref:`whatsnew_0240.api_breaking.interval_values` for more.
 
 
+.. _whatsnew_0240.enhancements.multi_index_repr:
+
+Better repr for MultiIndex
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Previously, outputting a :class:`MultiIndex` printed the levels/labels of the
+``MultiIndex``. This was visually unappealing and made it difficult to understand
+the structure of the ``MultiIndex``. Also, this could be a problem for large
+indices as the output could be slow to print and make the console output
+difficult to navigate.
+
+The repr of a ``MultiIndex`` now shows each row as a tuple and ensures
+that the tuple items are vertically aligned, so it is now much easier to
+understand the structure of the ``MultiIndex`` (:issue:`13480`):
+
+.. ipython:: python
+
+   index1 = range(1000)
+   index2 = pd.Index(['a'] * 500 + ['abc'] * 500)
+   pd.MultiIndex.from_arrays([index1, index2])
+
+If the number of rows does not exceed :attr:`options.display.max_seq_items`
+(default: 100 items), all rows are shown; otherwise the output is truncated
+vertically. Horizontally, each row is truncated if it is longer than
+:attr:`options.display.width` (default: 80 characters).
+This solves the problem with outputting large ``MultiIndex`` instances to the console.
+
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements
 
@@ -7,7 +7,7 @@
 import numpy as np
 from pandas._libs import algos as libalgos, index as libindex, lib, Timestamp
 
-from pandas.compat import range, zip, lrange, lzip, map
+from pandas.compat import range, zip, lrange, lzip, map, u
 from pandas.compat.numpy import function as nv
 from pandas import compat
 
@@ -31,7 +31,8 @@
 import pandas.core.common as com
 import pandas.core.missing as missing
 import pandas.core.algorithms as algos
-from pandas.io.formats.printing import pprint_thing
+from pandas.io.formats.printing import (format_object_summary,
+                                        default_pprint, pprint_thing)
 
 from pandas.core.config import get_option
 
@@ -607,27 +608,58 @@ def _nbytes(self, deep=False):
         result += self._engine.sizeof(deep=deep)
         return result
 
+    def _formatter_func(self, tup):
+        """
+        Format one row of the MultiIndex, level by level.
+
+        Parameters
+        ----------
+        tup : tuple
+            Values of a single row; paired positionally with ``self.levels``.
+
+        Returns
+        -------
+        tuple
+            The result of applying each level's ``_formatter_func`` to the
+            value at the same position in ``tup``.
+        """
+        return tuple(level._formatter_func(value)
+                     for level, value in zip(self.levels, tup))
+
     def _format_attrs(self):
         """
-        Return a list of tuples of the (attr,formatted_value)
+        Return a list of tuples of the (attr, formatted_value).
+
+        ``dtype`` is always reported. ``names`` is added when at least one
+        level name is not None. ``length`` is added when the index has more
+        entries than the ``display.max_seq_items`` option.
         """
-        attrs = [
-            ('levels', ibase.default_pprint(self._levels,
-                                            max_seq_items=False)),
-            ('labels', ibase.default_pprint(self._labels,
-                                            max_seq_items=False))]
-        if com._any_not_none(*self.names):
-            attrs.append(('names', ibase.default_pprint(self.names)))
-        if self.sortorder is not None:
-            attrs.append(('sortorder', ibase.default_pprint(self.sortorder)))
+        attrs = [('dtype', "'{}'".format(self.dtype))]
+        # Keep the previous "any name is not None" check instead of a
+        # truthiness test, so falsy-but-valid level names such as ``0`` or
+        # ``''`` are still shown in the repr.
+        if com._any_not_none(*self.names):
+            attrs.append(('names', default_pprint(self.names)))
+        # A falsy option value (e.g. None) means "no limit".
+        max_seq_items = get_option('display.max_seq_items') or len(self)
+        if len(self) > max_seq_items:
+            attrs.append(('length', len(self)))
         return attrs
 
     def _format_space(self):
-        return "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
+        # Text placed after the comma when __unicode__ joins the formatted
+        # attributes; a single space keeps the attributes on one line.
+        return " "
 
     def _format_data(self, name=None):
-        # we are formatting thru the attributes
-        return None
+        """
+        Return the formatted data as a unicode string.
+
+        Parameters
+        ----------
+        name : optional
+            Forwarded unchanged to ``format_object_summary`` as ``name``.
+
+        Returns
+        -------
+        The summary produced by ``format_object_summary``, called with
+        ``self._formatter_func`` as the per-row formatter and
+        ``is_multi=True``.
+        """
+        return format_object_summary(self, self._formatter_func,
+                                     name=name, is_multi=True)
+
+    def __unicode__(self):
+        """
+        Return a string representation for this MultiIndex.
+
+        The result has the form ``ClassName(<data><attrs>)``: ``<data>`` is
+        whatever ``_format_data`` returns (empty if it returns None) and
+        ``<attrs>`` is the ``key=value`` list from ``_format_attrs``, joined
+        by a comma followed by ``_format_space``.
+
+        Yields a Unicode String in both py2/py3.
+        """
+        cls_name = self.__class__.__name__
+        body = self._format_data()
+        pairs = self._format_attrs()
+        joiner = u(",%s") % self._format_space()
+
+        if body is None:
+            # no data provided, just attributes
+            body = ''
+
+        attr_text = joiner.join(u("%s=%s") % (key, value)
+                                for key, value in pairs)
+        return u("%s(%s%s)") % (cls_name, body, attr_text)
 
     def __len__(self):
         return len(self.labels[0])
 
@@ -268,7 +268,8 @@ class TableSchemaFormatter(BaseFormatter):
                  max_seq_items=max_seq_items)
 
 
-def format_object_summary(obj, formatter, is_justify=True, name=None):
+def format_object_summary(obj, formatter, is_justify=True,
+                          name=None, is_multi=False):
     """
     Return the formatted obj as a unicode string
 
@@ -280,8 +281,10 @@ def format_object_summary(obj, formatter, is_justify=True, name=None):
         string formatter for an element
     is_justify : boolean
         should justify the display
-    name : name, optiona
+    name : name, optional
         defaults to the class name of the obj
+    is_multi : bool, default False
+        Is ``obj`` a :class:`MultiIndex` or not
 
     Returns
     -------
@@ -301,7 +304,7 @@ def format_object_summary(obj, formatter, is_justify=True, name=None):
     space2 = "\n%s" % (' ' * (len(name) + 2))
 
     n = len(obj)
-    sep = ','
+    sep = ',' if not is_multi else (',\n ' + ' ' * len(name))
     max_seq_items = get_option('display.max_seq_items') or n
 
     # are we a truncated display
@@ -327,10 +330,10 @@ def best_len(values):
 
     if n == 0:
         summary = '[], '
-    elif n == 1:
+    elif n == 1 and not is_multi:
         first = formatter(obj[0])
         summary = '[%s], ' % first
-    elif n == 2:
+    elif n == 2 and not is_multi:
         first = formatter(obj[0])
         last = formatter(obj[-1])
         summary = '[%s, %s], ' % (first, last)
@@ -346,15 +349,16 @@ def best_len(values):
 
         # adjust all values to max length if needed
         if is_justify:
-
-            # however, if we are not truncated and we are only a single
-            # line, then don't justify
-            if (is_truncated or
-                    not (len(', '.join(head)) < display_width and
-                         len(', '.join(tail)) < display_width)):
-                max_len = max(best_len(head), best_len(tail))
-                head = [x.rjust(max_len) for x in head]
-                tail = [x.rjust(max_len) for x in tail]
+            head, tail = _justify(head, tail, display_width, best_len,
+                                  is_truncated, is_multi)
+        if is_multi:
+            max_space = display_width - len(space2)
+            item = tail[0]
+            for i in reversed(range(1, len(item) + 1)):
+                if len(_pprint_seq(item, max_seq_items=i)) < max_space:
+                    break
+            head = [_pprint_seq(x, max_seq_items=i) for x in head]
+            tail = [_pprint_seq(x, max_seq_items=i) for x in tail]
 
         summary = ""
         line = space2
@@ -380,7 +384,7 @@ def best_len(values):
         summary += line
         summary += '],'
 
-        if len(summary) > (display_width):
+        if len(summary) > (display_width) or is_multi:
             summary += space1
         else:  # one row
             summary += ' '
@@ -391,6 +395,52 @@ def best_len(values):
     return summary
 
 
+def _justify(head, tail, display_width, best_len,
+             is_truncated=False, is_multi=False):
+    """
+    Right-justify the items of ``head`` and ``tail`` so they line up.
+
+    Parameters
+    ----------
+    head, tail : list
+        Formatted items: strings, or tuples of strings when ``is_multi``.
+    display_width : int
+        Width the comma-joined ``head`` and ``tail`` are compared against.
+    best_len : callable
+        Called on ``head`` and on ``tail``; the larger result is the padding
+        width in the non-multi case.
+    is_truncated : bool, default False
+        If True, always justify in the non-multi case.
+    is_multi : bool, default False
+        If True, pad position by position inside each tuple.
+
+    Returns
+    -------
+    head, tail : tuple of two lists
+    """
+    if is_multi:
+        widths = _max_level_item_length(head + tail)
+
+        def pad_row(row):
+            return tuple(cell.rjust(width)
+                         for cell, width in zip(row, widths))
+
+        return [pad_row(row) for row in head], [pad_row(row) for row in tail]
+
+    # not truncated and both parts fit on a single line: leave untouched
+    if (not is_truncated and
+            len(', '.join(head)) < display_width and
+            len(', '.join(tail)) < display_width):
+        return head, tail
+
+    width = max(best_len(head), best_len(tail))
+    return ([item.rjust(width) for item in head],
+            [item.rjust(width) for item in tail])
+
+
+def _max_level_item_length(seq):
+    """
+    For each position for the sequences in ``seq``, find the largest length.
+
+    Used for justifying individual values in a :class:`pandas.MultiIndex`.
+
+    Parameters
+    ----------
+    seq : list-like of list-likes of strings
+
+    Returns
+    -------
+    max_length : list of ints
+        One entry per position; empty when ``seq`` is empty.
+
+    Examples
+    --------
+    >>> _max_level_item_length([['s', 'ab'], ['abc', 'a']])
+    [3, 2]
+    >>> _max_level_item_length([])
+    []
+    """
+    # zip(*seq) walks the inner sequences position by position and yields
+    # nothing for an empty ``seq``, so there is no ``seq[0]`` lookup to fail.
+    return [max(len(item) for item in column) for column in zip(*seq)]
+
+
 def format_object_attrs(obj):
     """
     Return a list of tuples of the (attr, formatted_value)
 
@@ -3,10 +3,12 @@
 
 import warnings
 
+import pytest
+
 import pandas as pd
 import pandas.util.testing as tm
 from pandas import MultiIndex, compat
-from pandas.compat import PY3, range, u
+from pandas.compat import PY3, PY2, u
 
 
 def test_dtype_str(indices):
@@ -57,49 +59,6 @@ def test_repr_with_unicode_data():
         assert "\\u" not in repr(index)  # we don't want unicode-escaped
 
 
-def test_repr_roundtrip():
-
-    mi = MultiIndex.from_product([list('ab'), range(3)],
-                                 names=['first', 'second'])
-    str(mi)
-
-    if PY3:
-        tm.assert_index_equal(eval(repr(mi)), mi, exact=True)
-    else:
-        result = eval(repr(mi))
-        # string coerces to unicode
-        tm.assert_index_equal(result, mi, exact=False)
-        assert mi.get_level_values('first').inferred_type == 'string'
-        assert result.get_level_values('first').inferred_type == 'unicode'
-
-    mi_u = MultiIndex.from_product(
-        [list(u'ab'), range(3)], names=['first', 'second'])
-    result = eval(repr(mi_u))
-    tm.assert_index_equal(result, mi_u, exact=True)
-
-    # formatting
-    if PY3:
-        str(mi)
-    else:
-        compat.text_type(mi)
-
-    # long format
-    mi = MultiIndex.from_product([list('abcdefg'), range(10)],
-                                 names=['first', 'second'])
-
-    if PY3:
-        tm.assert_index_equal(eval(repr(mi)), mi, exact=True)
-    else:
-        result = eval(repr(mi))
-        # string coerces to unicode
-        tm.assert_index_equal(result, mi, exact=False)
-        assert mi.get_level_values('first').inferred_type == 'string'
-        assert result.get_level_values('first').inferred_type == 'unicode'
-
-    result = eval(repr(mi_u))
-    tm.assert_index_equal(result, mi_u, exact=True)
-
-
 def test_unicode_string_with_unicode():
     d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
     idx = pd.DataFrame(d).set_index(["a", "b"]).index
@@ -126,3 +85,133 @@ def test_repr_max_seq_item_setting(idx):
     with pd.option_context("display.max_seq_items", None):
         repr(idx)
         assert '...' not in str(idx)
+
+
+@pytest.mark.skipif(PY2, reason="repr output is different for python2")
+class TestRepr(object):
+    """
+    Exact-output checks for the tuple-per-row ``MultiIndex`` repr.
+    """
+
+    def setup_class(self):
+        # 2 * n rows: n entries of 'a' followed by n entries of 'abc', so
+        # the first level holds strings of two different widths.
+        n = 1000
+        ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n))
+        dti = pd.date_range('2000-01-01', freq='s', periods=n * 2)
+        # three levels
+        self.narrow_mi = pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti],
+                                                   names=['a', 'b', 'dti'])
+
+        # five levels: the same data with the datetime level repeated
+        levels = [ci, ci.codes + 9, dti, dti, dti]
+        names = ['a', 'b', 'dti_1', 'dti_2', 'dti_3']
+        self.wide_mi = pd.MultiIndex.from_arrays(levels, names=names)
+
+    def test_repr(self, idx):
+        """
+        Repr of the ``idx`` fixture: a single row, all rows, and all rows
+        with ``display.max_seq_items`` set to 5.
+        """
+        result = idx[:1].__repr__()
+        expected = """MultiIndex([('foo', 'one')],
+           dtype='object', names=['first', 'second'])"""
+        assert result == expected
+
+        result = idx.__repr__()
+        expected = """MultiIndex([('foo', 'one'),
+            ('foo', 'two'),
+            ('bar', 'one'),
+            ('baz', 'two'),
+            ('qux', 'one'),
+            ('qux', 'two')],
+           dtype='object', names=['first', 'second'])"""
+        assert result == expected
+
+        with pd.option_context('display.max_seq_items', 5):
+            result = idx.__repr__()
+            expected = """MultiIndex([('foo', 'one'),
+            ('foo', 'two'),
+            ...
+            ('qux', 'one'),
+            ('qux', 'two')],
+           dtype='object', names=['first', 'second'], length=6)"""
+            assert result == expected
+
+    def test_rjust(self):
+        """
+        Repr of ``narrow_mi``: a single row is printed unpadded, while rows
+        of differing widths are right-justified position by position.
+        """
+        result = self.narrow_mi[:1].__repr__()
+        expected = """\
+MultiIndex([('a', 9, '2000-01-01 00:00:00')],
+           dtype='object', names=['a', 'b', 'dti'])"""
+        assert result == expected
+
+        result = self.narrow_mi[::500].__repr__()
+        expected = """\
+MultiIndex([(  'a',  9, '2000-01-01 00:00:00'),
+            (  'a',  9, '2000-01-01 00:08:20'),
+            ('abc', 10, '2000-01-01 00:16:40'),
+            ('abc', 10, '2000-01-01 00:25:00')],
+           dtype='object', names=['a', 'b', 'dti'])"""
+        assert result == expected
+
+        result = self.narrow_mi.__repr__()
+        expected = """\
+MultiIndex([(  'a',  9, '2000-01-01 00:00:00'),
+            (  'a',  9, '2000-01-01 00:00:01'),
+            (  'a',  9, '2000-01-01 00:00:02'),
+            (  'a',  9, '2000-01-01 00:00:03'),
+            (  'a',  9, '2000-01-01 00:00:04'),
+            (  'a',  9, '2000-01-01 00:00:05'),
+            (  'a',  9, '2000-01-01 00:00:06'),
+            (  'a',  9, '2000-01-01 00:00:07'),
+            (  'a',  9, '2000-01-01 00:00:08'),
+            (  'a',  9, '2000-01-01 00:00:09'),
+            ...
+            ('abc', 10, '2000-01-01 00:33:10'),
+            ('abc', 10, '2000-01-01 00:33:11'),
+            ('abc', 10, '2000-01-01 00:33:12'),
+            ('abc', 10, '2000-01-01 00:33:13'),
+            ('abc', 10, '2000-01-01 00:33:14'),
+            ('abc', 10, '2000-01-01 00:33:15'),
+            ('abc', 10, '2000-01-01 00:33:16'),
+            ('abc', 10, '2000-01-01 00:33:17'),
+            ('abc', 10, '2000-01-01 00:33:18'),
+            ('abc', 10, '2000-01-01 00:33:19')],
+           dtype='object', names=['a', 'b', 'dti'], length=2000)"""
+        assert result == expected
+
+    def test_tuple_width(self):
+        """
+        Repr of the five-level ``wide_mi``: each printed row is cut short
+        and ends in ``...`` instead of listing every level.
+        """
+        result = self.wide_mi[:1].__repr__()
+        expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)],
+           dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])"""
+        assert result == expected
+
+        result = self.wide_mi[:10].__repr__()
+        expected = """\
+MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...),
+            ('a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...),
+            ('a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...),
+            ('a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...),
+            ('a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...),
+            ('a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...),
+            ('a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...),
+            ('a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...),
+            ('a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...),
+            ('a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...)],
+           dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])"""
+        assert result == expected
+
+        result = self.wide_mi.__repr__()
+        expected = """\
+MultiIndex([(  'a',  9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...),
+            (  'a',  9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...),
+            (  'a',  9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...),
+            (  'a',  9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...),
+            (  'a',  9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...),
+            (  'a',  9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...),
+            (  'a',  9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...),
+            (  'a',  9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...),
+            (  'a',  9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...),
+            (  'a',  9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...),
+            ...
+            ('abc', 10, '2000-01-01 00:33:10', '2000-01-01 00:33:10', ...),
+            ('abc', 10, '2000-01-01 00:33:11', '2000-01-01 00:33:11', ...),
+            ('abc', 10, '2000-01-01 00:33:12', '2000-01-01 00:33:12', ...),
+            ('abc', 10, '2000-01-01 00:33:13', '2000-01-01 00:33:13', ...),
+            ('abc', 10, '2000-01-01 00:33:14', '2000-01-01 00:33:14', ...),
+            ('abc', 10, '2000-01-01 00:33:15', '2000-01-01 00:33:15', ...),
+            ('abc', 10, '2000-01-01 00:33:16', '2000-01-01 00:33:16', ...),
+            ('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...),
+            ('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...),
+            ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)],
+           dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)"""  # noqa
+        assert result == expected
@@ -367,12 +367,15 @@ def test_index_equal_message(self):
 
 Index levels are different
 \\[left\\]:  1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\)
-\\[right\\]: 2, MultiIndex\\(levels=\\[\\[u?'A', u?'B'\\], \\[1, 2, 3, 4\\]\\],
-           labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)"""
+\\[right\\]: 2, MultiIndex\\(\\[\\(10, 1\\),
+            \\(20, 2\\),
+            \\(30, 3\\),
+            \\(40, 4\\)\\],
+           dtype='object'\\)"""
 
         idx1 = pd.Index([1, 2, 3])
-        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
-                                          ('B', 3), ('B', 4)])
+        idx2 = pd.MultiIndex.from_tuples([(10, 1), (20, 2),
+                                          (30, 3), (40, 4)])
         with tm.assert_raises_regex(AssertionError, expected):
             assert_index_equal(idx1, idx2, exact=False)