Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 661e3be

Browse files
committedAug 26, 2018
ENH: better MultiIndex.__repr__
1 parent 9f6c02d commit 661e3be

File tree

5 files changed

+278
-77
lines changed

5 files changed

+278
-77
lines changed
 

‎doc/source/whatsnew/v0.24.0.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,33 @@ This is the same behavior as ``Series.values`` for categorical data. See
159159
:ref:`whatsnew_0240.api_breaking.interval_values` for more.
160160

161161

162+
.. _whatsnew_0240.enhancements.multi_index_repr:
163+
164+
Better repr for MultiIndex
165+
^^^^^^^^^^^^^^^^^^^^^^^^^^
166+
167+
Previously, outputting a :class:`MultiIndex` printed the levels/labels of the
168+
multiindex. This was visually unappealing and made it difficult to understand
169+
the structure of the MultiIndex. Also, this could be a problem for large
170+
indices as the output could be slow to print and make the console output
171+
difficult to navigate.
172+
173+
The repr of a ``MultiIndex`` now shows each row as a tuple and ensures
174+
that the tuple items are vertically aligned, so it is now much easier to
175+
understand the structure of the ``MultiIndex`` (:issue:`13480`):
176+
177+
.. ipython:: python
178+
179+
index1=range(1000)
180+
index2 = pd.Index(['a'] * 500 + ['abc'] * 500)
181+
pd.MultiIndex.from_arrays([index1, index2])
182+
183+
If the number of rows does not exceed :attr:`options.display.max_seq_items`, all
184+
rows are shown (default: 100 items). Horizontally, each row is truncated
185+
if it is longer than :attr:`options.display.width` (default: 80 characters).
186+
This solves the problem with outputting large MultiIndex instances to the console.
187+
188+
162189
.. _whatsnew_0240.enhancements.other:
163190

164191
Other Enhancements

‎pandas/core/indexes/multi.py

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numpy as np
88
from pandas._libs import algos as libalgos, index as libindex, lib, Timestamp
99

10-
from pandas.compat import range, zip, lrange, lzip, map
10+
from pandas.compat import range, zip, lrange, lzip, map, u
1111
from pandas.compat.numpy import function as nv
1212
from pandas import compat
1313

@@ -31,7 +31,8 @@
3131
import pandas.core.common as com
3232
import pandas.core.missing as missing
3333
import pandas.core.algorithms as algos
34-
from pandas.io.formats.printing import pprint_thing
34+
from pandas.io.formats.printing import (format_object_summary,
35+
default_pprint, pprint_thing)
3536

3637
from pandas.core.config import get_option
3738

@@ -607,27 +608,58 @@ def _nbytes(self, deep=False):
607608
result += self._engine.sizeof(deep=deep)
608609
return result
609610

611+
def _formatter_func(self, tup):
    """
    Format a single row of the MultiIndex.

    Each value in ``tup`` is passed through the ``_formatter_func`` of the
    level at the same position, and the formatted values are returned as a
    tuple in the original order.
    """
    return tuple(level._formatter_func(value)
                 for level, value in zip(self.levels, tup))
617+
610618
def _format_attrs(self):
    """
    Return a list of tuples of the (attr, formatted_value).

    The list always starts with ``dtype``. ``names`` is added when at
    least one level name is not None, and ``length`` is added when the
    index has more rows than the ``display.max_seq_items`` option.
    """
    attrs = [('dtype', "'{}'".format(self.dtype))]

    # Test against None explicitly rather than relying on truthiness:
    # falsy names such as 0 or '' are still names and must be displayed.
    if (self.names is not None and
            any(name is not None for name in self.names)):
        attrs.append(('names', default_pprint(self.names)))

    # A falsy option value (None or 0) falls back to len(self), so the
    # comparison below is never true and ``length`` is not reported.
    max_seq_items = get_option('display.max_seq_items') or len(self)
    if len(self) > max_seq_items:
        attrs.append(('length', len(self)))
    return attrs
624630

625631
def _format_space(self):
626-
return "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
632+
return " "
627633

628634
def _format_data(self, name=None):
629-
# we are formatting thru the attributes
630-
return None
635+
"""
636+
Return the formatted data as a unicode string
637+
"""
638+
return format_object_summary(self, self._formatter_func,
639+
name=name, is_multi=True)
640+
641+
def __unicode__(self):
    """
    Build the unicode representation of this MultiIndex.

    The result has the form ``ClassName(<data><attr>=<value>, ...)``, where
    the data part comes from ``_format_data`` and the attribute pairs come
    from ``_format_attrs``, joined by a comma followed by
    ``_format_space``.

    Invoked by unicode(df) in py2 only. Yields a Unicode String in both
    py2/py3.
    """
    body = self._format_data()
    attr_pairs = self._format_attrs()
    separator = u(",%s") % self._format_space()

    # no data provided, just attributes
    if body is None:
        body = ''

    formatted_attrs = separator.join(
        u("%s=%s") % (key, value) for key, value in attr_pairs)
    return u("%s(%s%s)") % (type(self).__name__, body, formatted_attrs)
631663

632664
def __len__(self):
633665
return len(self.labels[0])

‎pandas/io/formats/printing.py

Lines changed: 65 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,8 @@ class TableSchemaFormatter(BaseFormatter):
268268
max_seq_items=max_seq_items)
269269

270270

271-
def format_object_summary(obj, formatter, is_justify=True, name=None):
271+
def format_object_summary(obj, formatter, is_justify=True,
272+
name=None, is_multi=False):
272273
"""
273274
Return the formatted obj as a unicode string
274275
@@ -280,8 +281,10 @@ def format_object_summary(obj, formatter, is_justify=True, name=None):
280281
string formatter for an element
281282
is_justify : boolean
282283
should justify the display
283-
name : name, optiona
284+
name : name, optional
284285
defaults to the class name of the obj
286+
is_multi : bool, default False
287+
Is ``obj`` a :class:`MultiIndex` or not
285288
286289
Returns
287290
-------
@@ -301,7 +304,7 @@ def format_object_summary(obj, formatter, is_justify=True, name=None):
301304
space2 = "\n%s" % (' ' * (len(name) + 2))
302305

303306
n = len(obj)
304-
sep = ','
307+
sep = ',' if not is_multi else (',\n ' + ' ' * len(name))
305308
max_seq_items = get_option('display.max_seq_items') or n
306309

307310
# are we a truncated display
@@ -327,10 +330,10 @@ def best_len(values):
327330

328331
if n == 0:
329332
summary = '[], '
330-
elif n == 1:
333+
elif n == 1 and not is_multi:
331334
first = formatter(obj[0])
332335
summary = '[%s], ' % first
333-
elif n == 2:
336+
elif n == 2 and not is_multi:
334337
first = formatter(obj[0])
335338
last = formatter(obj[-1])
336339
summary = '[%s, %s], ' % (first, last)
@@ -346,15 +349,16 @@ def best_len(values):
346349

347350
# adjust all values to max length if needed
348351
if is_justify:
349-
350-
# however, if we are not truncated and we are only a single
351-
# line, then don't justify
352-
if (is_truncated or
353-
not (len(', '.join(head)) < display_width and
354-
len(', '.join(tail)) < display_width)):
355-
max_len = max(best_len(head), best_len(tail))
356-
head = [x.rjust(max_len) for x in head]
357-
tail = [x.rjust(max_len) for x in tail]
352+
head, tail = _justify(head, tail, display_width, best_len,
353+
is_truncated, is_multi)
354+
if is_multi:
355+
max_space = display_width - len(space2)
356+
item = tail[0]
357+
for i in reversed(range(1, len(item) + 1)):
358+
if len(_pprint_seq(item, max_seq_items=i)) < max_space:
359+
break
360+
head = [_pprint_seq(x, max_seq_items=i) for x in head]
361+
tail = [_pprint_seq(x, max_seq_items=i) for x in tail]
358362

359363
summary = ""
360364
line = space2
@@ -380,7 +384,7 @@ def best_len(values):
380384
summary += line
381385
summary += '],'
382386

383-
if len(summary) > (display_width):
387+
if len(summary) > (display_width) or is_multi:
384388
summary += space1
385389
else: # one row
386390
summary += ' '
@@ -391,6 +395,52 @@ def best_len(values):
391395
return summary
392396

393397

398+
def _justify(head, tail, display_width, best_len,
             is_truncated=False, is_multi=False):
    """
    Right-justify the items of ``head`` and ``tail`` so they line up.

    When ``is_multi`` is true, ``head`` and ``tail`` hold sequences of
    strings; every position is padded to the widest string found at that
    position across both lists, and each row is returned as a tuple.

    Otherwise ``head`` and ``tail`` hold plain strings. They are padded to
    a common width only if the output is truncated or if either list,
    joined with ', ', does not fit within ``display_width``; if not, they
    are returned unchanged.

    Returns
    -------
    head, tail : the (possibly padded) lists
    """
    if is_multi:
        widths = _max_level_item_length(head + tail)

        def pad_row(row):
            return tuple(cell.rjust(width)
                         for cell, width in zip(row, widths))

        return [pad_row(row) for row in head], [pad_row(row) for row in tail]

    def fits(parts):
        return len(', '.join(parts)) < display_width

    # short-circuits exactly like the one-line check: nothing is joined
    # when the display is already known to be truncated
    if is_truncated or not (fits(head) and fits(tail)):
        width = max(best_len(head), best_len(tail))
        head = [item.rjust(width) for item in head]
        tail = [item.rjust(width) for item in tail]

    return head, tail
416+
417+
418+
def _max_level_item_length(seq):
    """
    For each position for the sequences in ``seq``, find the largest length.

    Used for justifying individual values in a :class:`pandas.MultiIndex`.

    Parameters
    ----------
    seq : list-like of list-likes of strings

    Returns
    -------
    max_length : list of ints
        One entry per position; an empty list when ``seq`` is empty.

    Examples
    --------
    >>> _max_level_item_length([['s', 'ab'], ['abc', 'a']])
    [3, 2]
    >>> _max_level_item_length([])
    []
    """
    # zip(*seq) transposes the rows into per-position columns; with no rows
    # it yields nothing, so empty input gives [] instead of an IndexError.
    return [max(len(item) for item in column) for column in zip(*seq)]
442+
443+
394444
def format_object_attrs(obj):
395445
"""
396446
Return a list of tuples of the (attr, formatted_value)

‎pandas/tests/indexes/multi/test_format.py

Lines changed: 133 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33

44
import warnings
55

6+
import pytest
7+
68
import pandas as pd
79
import pandas.util.testing as tm
810
from pandas import MultiIndex, compat
9-
from pandas.compat import PY3, range, u
11+
from pandas.compat import PY3, PY2, u
1012

1113

1214
def test_dtype_str(indices):
@@ -57,49 +59,6 @@ def test_repr_with_unicode_data():
5759
assert "\\u" not in repr(index) # we don't want unicode-escaped
5860

5961

60-
def test_repr_roundtrip():
61-
62-
mi = MultiIndex.from_product([list('ab'), range(3)],
63-
names=['first', 'second'])
64-
str(mi)
65-
66-
if PY3:
67-
tm.assert_index_equal(eval(repr(mi)), mi, exact=True)
68-
else:
69-
result = eval(repr(mi))
70-
# string coerces to unicode
71-
tm.assert_index_equal(result, mi, exact=False)
72-
assert mi.get_level_values('first').inferred_type == 'string'
73-
assert result.get_level_values('first').inferred_type == 'unicode'
74-
75-
mi_u = MultiIndex.from_product(
76-
[list(u'ab'), range(3)], names=['first', 'second'])
77-
result = eval(repr(mi_u))
78-
tm.assert_index_equal(result, mi_u, exact=True)
79-
80-
# formatting
81-
if PY3:
82-
str(mi)
83-
else:
84-
compat.text_type(mi)
85-
86-
# long format
87-
mi = MultiIndex.from_product([list('abcdefg'), range(10)],
88-
names=['first', 'second'])
89-
90-
if PY3:
91-
tm.assert_index_equal(eval(repr(mi)), mi, exact=True)
92-
else:
93-
result = eval(repr(mi))
94-
# string coerces to unicode
95-
tm.assert_index_equal(result, mi, exact=False)
96-
assert mi.get_level_values('first').inferred_type == 'string'
97-
assert result.get_level_values('first').inferred_type == 'unicode'
98-
99-
result = eval(repr(mi_u))
100-
tm.assert_index_equal(result, mi_u, exact=True)
101-
102-
10362
def test_unicode_string_with_unicode():
10463
d = {"a": [u("\u05d0"), 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
10564
idx = pd.DataFrame(d).set_index(["a", "b"]).index
@@ -126,3 +85,133 @@ def test_repr_max_seq_item_setting(idx):
12685
with pd.option_context("display.max_seq_items", None):
12786
repr(idx)
12887
assert '...' not in str(idx)
88+
89+
90+
@pytest.mark.skipif(PY2, reason="repr output is different for python2")
91+
class TestRepr(object):
92+
93+
def setup_class(self):
94+
n = 1000
95+
ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n))
96+
dti = pd.date_range('2000-01-01', freq='s', periods=n * 2)
97+
self.narrow_mi = pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti],
98+
names=['a', 'b', 'dti'])
99+
100+
levels = [ci, ci.codes + 9, dti, dti, dti]
101+
names = ['a', 'b', 'dti_1', 'dti_2', 'dti_3']
102+
self.wide_mi = pd.MultiIndex.from_arrays(levels, names=names)
103+
104+
def test_repr(self, idx):
105+
result = idx[:1].__repr__()
106+
expected = """MultiIndex([('foo', 'one')],
107+
dtype='object', names=['first', 'second'])"""
108+
assert result == expected
109+
110+
result = idx.__repr__()
111+
expected = """MultiIndex([('foo', 'one'),
112+
('foo', 'two'),
113+
('bar', 'one'),
114+
('baz', 'two'),
115+
('qux', 'one'),
116+
('qux', 'two')],
117+
dtype='object', names=['first', 'second'])"""
118+
assert result == expected
119+
120+
with pd.option_context('display.max_seq_items', 5):
121+
result = idx.__repr__()
122+
expected = """MultiIndex([('foo', 'one'),
123+
('foo', 'two'),
124+
...
125+
('qux', 'one'),
126+
('qux', 'two')],
127+
dtype='object', names=['first', 'second'], length=6)"""
128+
assert result == expected
129+
130+
def test_rjust(self):
131+
result = self.narrow_mi[:1].__repr__()
132+
expected = """\
133+
MultiIndex([('a', 9, '2000-01-01 00:00:00')],
134+
dtype='object', names=['a', 'b', 'dti'])"""
135+
assert result == expected
136+
137+
result = self.narrow_mi[::500].__repr__()
138+
expected = """\
139+
MultiIndex([( 'a', 9, '2000-01-01 00:00:00'),
140+
( 'a', 9, '2000-01-01 00:08:20'),
141+
('abc', 10, '2000-01-01 00:16:40'),
142+
('abc', 10, '2000-01-01 00:25:00')],
143+
dtype='object', names=['a', 'b', 'dti'])"""
144+
assert result == expected
145+
146+
result = self.narrow_mi.__repr__()
147+
expected = """\
148+
MultiIndex([( 'a', 9, '2000-01-01 00:00:00'),
149+
( 'a', 9, '2000-01-01 00:00:01'),
150+
( 'a', 9, '2000-01-01 00:00:02'),
151+
( 'a', 9, '2000-01-01 00:00:03'),
152+
( 'a', 9, '2000-01-01 00:00:04'),
153+
( 'a', 9, '2000-01-01 00:00:05'),
154+
( 'a', 9, '2000-01-01 00:00:06'),
155+
( 'a', 9, '2000-01-01 00:00:07'),
156+
( 'a', 9, '2000-01-01 00:00:08'),
157+
( 'a', 9, '2000-01-01 00:00:09'),
158+
...
159+
('abc', 10, '2000-01-01 00:33:10'),
160+
('abc', 10, '2000-01-01 00:33:11'),
161+
('abc', 10, '2000-01-01 00:33:12'),
162+
('abc', 10, '2000-01-01 00:33:13'),
163+
('abc', 10, '2000-01-01 00:33:14'),
164+
('abc', 10, '2000-01-01 00:33:15'),
165+
('abc', 10, '2000-01-01 00:33:16'),
166+
('abc', 10, '2000-01-01 00:33:17'),
167+
('abc', 10, '2000-01-01 00:33:18'),
168+
('abc', 10, '2000-01-01 00:33:19')],
169+
dtype='object', names=['a', 'b', 'dti'], length=2000)"""
170+
assert result == expected
171+
172+
def test_tuple_width(self):
173+
result = self.wide_mi[:1].__repr__()
174+
expected = """MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...)],
175+
dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])"""
176+
assert result == expected
177+
178+
result = self.wide_mi[:10].__repr__()
179+
expected = """\
180+
MultiIndex([('a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...),
181+
('a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...),
182+
('a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...),
183+
('a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...),
184+
('a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...),
185+
('a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...),
186+
('a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...),
187+
('a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...),
188+
('a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...),
189+
('a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...)],
190+
dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'])"""
191+
assert result == expected
192+
193+
result = self.wide_mi.__repr__()
194+
expected = """\
195+
MultiIndex([( 'a', 9, '2000-01-01 00:00:00', '2000-01-01 00:00:00', ...),
196+
( 'a', 9, '2000-01-01 00:00:01', '2000-01-01 00:00:01', ...),
197+
( 'a', 9, '2000-01-01 00:00:02', '2000-01-01 00:00:02', ...),
198+
( 'a', 9, '2000-01-01 00:00:03', '2000-01-01 00:00:03', ...),
199+
( 'a', 9, '2000-01-01 00:00:04', '2000-01-01 00:00:04', ...),
200+
( 'a', 9, '2000-01-01 00:00:05', '2000-01-01 00:00:05', ...),
201+
( 'a', 9, '2000-01-01 00:00:06', '2000-01-01 00:00:06', ...),
202+
( 'a', 9, '2000-01-01 00:00:07', '2000-01-01 00:00:07', ...),
203+
( 'a', 9, '2000-01-01 00:00:08', '2000-01-01 00:00:08', ...),
204+
( 'a', 9, '2000-01-01 00:00:09', '2000-01-01 00:00:09', ...),
205+
...
206+
('abc', 10, '2000-01-01 00:33:10', '2000-01-01 00:33:10', ...),
207+
('abc', 10, '2000-01-01 00:33:11', '2000-01-01 00:33:11', ...),
208+
('abc', 10, '2000-01-01 00:33:12', '2000-01-01 00:33:12', ...),
209+
('abc', 10, '2000-01-01 00:33:13', '2000-01-01 00:33:13', ...),
210+
('abc', 10, '2000-01-01 00:33:14', '2000-01-01 00:33:14', ...),
211+
('abc', 10, '2000-01-01 00:33:15', '2000-01-01 00:33:15', ...),
212+
('abc', 10, '2000-01-01 00:33:16', '2000-01-01 00:33:16', ...),
213+
('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...),
214+
('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...),
215+
('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)],
216+
dtype='object', names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" # noqa
217+
assert result == expected

‎pandas/tests/util/test_testing.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -367,12 +367,15 @@ def test_index_equal_message(self):
367367
368368
Index levels are different
369369
\\[left\\]: 1, Int64Index\\(\\[1, 2, 3\\], dtype='int64'\\)
370-
\\[right\\]: 2, MultiIndex\\(levels=\\[\\[u?'A', u?'B'\\], \\[1, 2, 3, 4\\]\\],
371-
labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)"""
370+
\\[right\\]: 2, MultiIndex\\(\\[\\(10, 1\\),
371+
\\(20, 2\\),
372+
\\(30, 3\\),
373+
\\(40, 4\\)\\],
374+
dtype='object'\\)"""
372375

373376
idx1 = pd.Index([1, 2, 3])
374-
idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
375-
('B', 3), ('B', 4)])
377+
idx2 = pd.MultiIndex.from_tuples([(10, 1), (20, 2),
378+
(30, 3), (40, 4)])
376379
with tm.assert_raises_regex(AssertionError, expected):
377380
assert_index_equal(idx1, idx2, exact=False)
378381

0 commit comments

Comments
 (0)
Please sign in to comment.