Speed up multi-index html repr + add display_values_threshold option (#6400)

benbovy · web-flow · commit d8fc34660f40 · 2022-03-29T09:05:31.000+02:00
* add _repr_html_ for PandasMultiIndexingAdapter

This may greatly speed up the html repr of Xarray objects with
multi-indexes.

This optimized _repr_html_ is now used for formatting the detailed array
view of multi-index coordinates, instead of converting the full index /
levels to numpy arrays before formatting them.

* update release notes

* nit

* add display_values_threshold

* fix last merge main
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -32,6 +32,10 @@ New Features
 - Multi-index levels are now accessible through their own, regular coordinates
   instead of virtual coordinates (:pull:`5692`).
   By `Benoît Bovy <https://github.com/benbovy>`_.
+- Add a ``display_values_threshold`` option to control the total number of array
+  elements which trigger summarization rather than full repr in (numpy) array
+  detailed views of the html repr (:pull:`6400`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
@@ -60,6 +64,8 @@ Bug fixes
 - Fixed "unhashable type" error trying to read NetCDF file with variable having its 'units'
   attribute not ``str`` (e.g. ``numpy.ndarray``) (:issue:`6368`).
   By `Oleh Khoma <https://github.com/okhoma>`_.
+- Fixed the poor html repr performance on large multi-indexes (:pull:`6400`).
+  By `Benoît Bovy <https://github.com/benbovy>`_.
 - Allow fancy indexing of duck dask arrays along multiple dimensions. (:pull:`6414`)
   By `Justus Magin <https://github.com/keewis>`_.
 
diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py
@@ -520,7 +520,11 @@ def short_numpy_repr(array):
 
     # default to lower precision so a full (abbreviated) line can fit on
     # one line with the default display_width
-    options = {"precision": 6, "linewidth": OPTIONS["display_width"], "threshold": 200}
+    options = {
+        "precision": 6,
+        "linewidth": OPTIONS["display_width"],
+        "threshold": OPTIONS["display_values_threshold"],
+    }
     if array.ndim < 3:
         edgeitems = 3
     elif array.ndim == 3:
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -5,6 +5,7 @@
 from contextlib import suppress
 from dataclasses import dataclass, field
 from datetime import timedelta
+from html import escape
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -25,6 +26,7 @@
 
 from . import duck_array_ops, nputils, utils
 from .npcompat import DTypeLike
+from .options import OPTIONS
 from .pycompat import dask_version, integer_types, is_duck_dask_array, sparse_array_type
 from .types import T_Xarray
 from .utils import either_dict_or_kwargs, get_valid_numpy_dtype
@@ -1507,23 +1509,31 @@ def __repr__(self) -> str:
             )
             return f"{type(self).__name__}{props}"
 
-    def _repr_inline_(self, max_width) -> str:
-        # special implementation to speed-up the repr for big multi-indexes
+    def _get_array_subset(self) -> np.ndarray:
+        # used to speed-up the repr for big multi-indexes
+        threshold = max(100, OPTIONS["display_values_threshold"] + 2)
+        if self.size > threshold:
+            pos = threshold // 2
+            indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)])
+            subset = self[OuterIndexer((indices,))]
+        else:
+            subset = self
+
+        return np.asarray(subset)
+
+    def _repr_inline_(self, max_width: int) -> str:
+        from .formatting import format_array_flat
+
         if self.level is None:
             return "MultiIndex"
         else:
-            from .formatting import format_array_flat
+            return format_array_flat(self._get_array_subset(), max_width)
 
-            if self.size > 100 and max_width < self.size:
-                n_values = max_width
-                indices = np.concatenate(
-                    [np.arange(0, n_values), np.arange(-n_values, 0)]
-                )
-                subset = self[OuterIndexer((indices,))]
-            else:
-                subset = self
+    def _repr_html_(self) -> str:
+        from .formatting import short_numpy_repr
 
-            return format_array_flat(np.asarray(subset), max_width)
+        array_repr = short_numpy_repr(self._get_array_subset())
+        return f"<pre>{escape(array_repr)}</pre>"
 
     def copy(self, deep: bool = True) -> "PandasMultiIndexingAdapter":
         # see PandasIndexingAdapter.copy
diff --git a/xarray/core/options.py b/xarray/core/options.py
@@ -15,6 +15,7 @@ class T_Options(TypedDict):
     cmap_divergent: Union[str, "Colormap"]
     cmap_sequential: Union[str, "Colormap"]
     display_max_rows: int
+    display_values_threshold: int
     display_style: Literal["text", "html"]
     display_width: int
     display_expand_attrs: Literal["default", True, False]
@@ -33,6 +34,7 @@ class T_Options(TypedDict):
     "cmap_divergent": "RdBu_r",
     "cmap_sequential": "viridis",
     "display_max_rows": 12,
+    "display_values_threshold": 200,
     "display_style": "html",
     "display_width": 80,
     "display_expand_attrs": "default",
@@ -57,6 +59,7 @@ def _positive_integer(value):
 _VALIDATORS = {
     "arithmetic_join": _JOIN_OPTIONS.__contains__,
     "display_max_rows": _positive_integer,
+    "display_values_threshold": _positive_integer,
     "display_style": _DISPLAY_OPTIONS.__contains__,
     "display_width": _positive_integer,
     "display_expand_attrs": lambda choice: choice in [True, False, "default"],
@@ -154,6 +157,9 @@ class set_options:
         * ``default`` : to expand unless over a pre-defined limit
     display_max_rows : int, default: 12
         Maximum display rows.
+    display_values_threshold : int, default: 200
+        Total number of array elements which trigger summarization rather
+        than full repr for variable data views (numpy arrays).
     display_style : {"text", "html"}, default: "html"
         Display style to use in jupyter for xarray objects.
     display_width : int, default: 80
diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py
@@ -479,6 +479,12 @@ def test_short_numpy_repr() -> None:
         num_lines = formatting.short_numpy_repr(array).count("\n") + 1
         assert num_lines < 30
 
+    # threshold option (default: 200)
+    array = np.arange(100)
+    assert "..." not in formatting.short_numpy_repr(array)
+    with xr.set_options(display_values_threshold=10):
+        assert "..." in formatting.short_numpy_repr(array)
+
 
 def test_large_array_repr_length() -> None: