pandas-dev · VincentAntoine · Sep 4, 2017 · Sep 16, 2017 · Sep 26, 2017 · Sep 28, 2017
diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
@@ -581,26 +581,58 @@ each point:
    @savefig scatter_plot_colored.png
    df.plot.scatter(x='a', y='b', c='c', s=50);
 
+The keyword ``s`` may be given as the name of a column to define the size of
+each point, making the plot a bubble plot:
 
 .. ipython:: python
-   :suppress:
 
-   plt.close('all')
+   @savefig scatter_plot_bubble_without_size_factor.png
+   df.plot.scatter(x='a', y='b', s='c');
 
-You can pass other keywords supported by matplotlib 
-:meth:`scatter <matplotlib.axes.Axes.scatter>`. The example  below shows a 
-bubble chart using a column of the ``DataFrame`` as the bubble size.
+By default, bubble sizes are scaled so that the largest bubble (the one for the
+largest value in the size column) has size 200. The keyword ``size_factor`` may
+be given to multiply all bubble sizes displayed on the graph by a given factor:
 
 .. ipython:: python
 
-   @savefig scatter_plot_bubble.png
-   df.plot.scatter(x='a', y='b', s=df['c']*200);
+   @savefig scatter_plot_bubble_with_size_factor.png
+   df.plot.scatter(x='a', y='b', s='c', size_factor=0.2);
+
+The column given as ``s`` can also have an ordered categorical dtype:
+
+.. ipython:: python
+
+   surf_area = np.concatenate([40.0 + 80.0 * np.random.rand(30),
+                               80.0 + 160.0 * np.random.rand(20),
+                               100.0 + 200.0 * np.random.rand(10)])
+
+   types = np.array(30 * ['Flat'] + 20 * ['House'] + 10 * ['Castle'])
+
+   prices = 0.01 * surf_area * (np.random.rand(60) + 1.5) / 2
+   prices *= np.array([1] * 30 + [1.4] * 20 + [2] * 10)
+
+   categories = ['Flat', 'House', 'Castle']
+   property_types = pd.Categorical(types, categories=categories, ordered=True)
+
+   df = pd.DataFrame({
+      'Surface area (sqm)': surf_area,
+      'Price (M€)': prices,
+      'Property type': property_types
+   })
+
+   @savefig scatter_plot_bubble_categorical.png
+   df.plot.scatter(x='Surface area (sqm)', y='Price (M€)',
+                   s='Property type', alpha=.5);
+
+You can pass other keywords supported by matplotlib
+:meth:`scatter <matplotlib.axes.Axes.scatter>`.
 
 .. ipython:: python
    :suppress:
 
    plt.close('all')
 
+
 See the :meth:`scatter <matplotlib.axes.Axes.scatter>` method and the
 `matplotlib scatter documentation <http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.scatter>`__ for more.
 

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -12,6 +12,8 @@ v0.24.0 (Month XX, 2018)
 
 New features
 ~~~~~~~~~~~~
+- :func:`DataFrame.plot.scatter` now accepts a column name for the ``s`` argument to produce a plot where the marker sizes reflect the values in the column. (:issue:`16827`)
+
 - :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`)
 
 
@@ -159,6 +161,24 @@ This is the same behavior as ``Series.values`` for categorical data. See
 :ref:`whatsnew_0240.api_breaking.interval_values` for more.
 
 
+.. _whatsnew_0240.enhancements.bubble_plots:
+
+Scatter plots with varying marker sizes reflecting data
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Scatter plots in which marker sizes reflect the values in a column of the ``DataFrame`` (sometimes called bubble plots) can now be produced with :func:`DataFrame.plot.scatter` by passing a column name as the ``s`` argument. Marker sizes are scaled relative to the maximum of the data, and a legend showing reference marker sizes is added to the plot. (:issue:`22441`)
+
+.. ipython:: python
+
+   df = pd.DataFrame(np.arange(0, 2 * np.pi, np.pi/24), columns=['a'])
+   df['b'] = 10 * np.cos(df['a'])
+   df['size'] = df['b'].abs()
+   df.head()
+   @savefig scatter_bubble_whatsnew.png
+   df.plot.scatter(x='a', y='b', s='size', alpha=.5, title='Simple bubble plot');
+
+Bubble plots can also be produced in this way with ordered categorical data for the bubble sizes.
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements

diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -24,6 +24,8 @@
     is_integer,
     is_number,
     is_hashable,
+    is_numeric_dtype,
+    is_categorical_dtype,
     is_iterator)
 from pandas.core.dtypes.generic import (
     ABCSeries, ABCDataFrame, ABCPeriodIndex, ABCMultiIndex, ABCIndexClass)
@@ -861,11 +863,22 @@ def _plot_colorbar(self, ax, **kwds):
 class ScatterPlot(PlanePlot):
     _kind = 'scatter'
 
-    def __init__(self, data, x, y, s=None, c=None, **kwargs):
+    def __init__(self, data, x, y, s=None, c=None, size_factor=1, **kwargs):
         if s is None:
-            # hide the matplotlib default for size, in case we want to change
-            # the handling of this argument later
-            s = 20
+            # Set default size if no argument is given.
+            s = 20 * size_factor
+        elif is_hashable(s) and s in data.columns:
+            # Handle the case where s is a label of a column of the df.
+            # The data is normalized to 200 * size_factor.
+            self.size_title = s
+            n_bubble_points = 200
+            size_data = data[s]
+            s = self._get_plot_bubbles(size_data, n_bubble_points, size_factor)
+            self.bubble_legend_sizes, self.bubble_legend_labels = (
+                self._get_legend_bubbles(size_data,
+                                         n_bubble_points,
+                                         size_factor)
+            )
         super(ScatterPlot, self).__init__(data, x, y, s=s, **kwargs)
         if is_integer(c) and not self.data.columns.holds_integer():
             c = self.data.columns[c]
@@ -874,7 +887,6 @@ def __init__(self, data, x, y, s=None, c=None, **kwargs):
     def _make_plot(self):
         x, y, c, data = self.x, self.y, self.c, self.data
         ax = self.axes[0]
-
         c_is_column = is_hashable(c) and c in self.data.columns
 
         # plot a colorbar only if a colormap is provided or necessary
@@ -919,6 +931,108 @@ def _make_plot(self):
             ax.errorbar(data[x].values, data[y].values,
                         linestyle='none', **err_kwds)
 
+    @staticmethod
+    def _get_plot_bubbles(size_data, n_bubble_points=200, size_factor=1):
+        if is_categorical_dtype(size_data):
+            if size_data.cat.ordered:
+                size_data_codes = size_data.cat.codes + 1
+                s_data_max = size_data_codes.max()
+                s = (n_bubble_points * size_factor
+                     * size_data_codes**2 / s_data_max**2)
+            else:
+                raise TypeError(
+                    "'s' must be numeric or ordered categorical dtype")
+        elif is_numeric_dtype(size_data):
+            s_data_max = size_data.max()
+            s = n_bubble_points * size_factor * size_data / s_data_max
+        else:
+            raise TypeError("'s' must be numeric or ordered categorical dtype")
+        return s
+
+    @classmethod
+    def _sci_notation(cls, num):
+        """
+        Returns mantissa and exponent of the number passed in argument.
+        Example:
+        >>> _sci_notation(89278.8924)
+        (8.9, 4.0)
+        """
+        scientific_notation = '{:e}'.format(num)
+        regexp = re.compile(r'^([+-]?\d\.\d).*e([+-]\d*)$')
+        mantis, expnt = regexp.search(scientific_notation).groups()
+        return float(mantis), float(expnt)
+
+    @staticmethod
+    def _get_legend_bubbles(size_data, n_bubble_points=200, size_factor=1):
+        """
+        Computes and returns appropriate bubble sizes and labels for the
+        legend of a bubble plot.
+
+        If bubble size represents numerical data, creates 4 bubbles with
+        round values for the labels, the largest of which is close to the
+        maximum of the data.
+
+        If bubble size represents ordered categorical data, creates one bubble
+        per category in the data. Sizes are determined by category codes.
+        """
+        if is_categorical_dtype(size_data):
+            if size_data.cat.ordered:
+                size_data_codes = size_data.cat.codes + 1
+                labels = list(size_data.cat.categories)[::-1]
+                n_categories = len(labels)
+                sizes = ((np.array(range(n_categories)) + 1)**2
+                         * n_bubble_points * size_factor
+                         / size_data_codes.max()**2)
+                sizes = sizes[::-1]
+            else:
+                raise TypeError(
+                    "'s' must be numeric or ordered categorical dtype")
+        elif is_numeric_dtype(size_data):
+            s_data_max = size_data.max()
+            coef, expnt = ScatterPlot._sci_notation(s_data_max)
+            labels_catalog = {
+                (9, 10): [10, 5, 2.5, 1],
+                (7, 9): [8, 4, 2, 0.5],
+                (5.5, 7): [6, 3, 1.5, 0.5],
+                (4.5, 5.5): [5, 2, 1, 0.2],
+                (3.5, 4.5): [4, 2, 1, 0.2],
+                (2.5, 3.5): [3, 1, 0.5, 0.2],
+                (1.5, 2.5): [2, 1, 0.5, 0.2],
+                (0, 1.5): [1, 0.5, 0.25, 0.1]
+            }
+            for lower_bound, upper_bound in labels_catalog:
+                if (coef >= lower_bound) and (coef < upper_bound):
+                    labels = 10**expnt * np.array(labels_catalog[lower_bound,
+                                                                 upper_bound])
+                    sizes = list(n_bubble_points * size_factor
+                                 * labels / s_data_max)
+                    labels = ['{:g}'.format(l) for l in labels]
+
+        else:
+            raise TypeError("'s' must be numeric or ordered categorical dtype")
+        return (sizes, labels)
+
+    def _make_legend(self):
+        if hasattr(self, "size_title"):
+            ax = self.axes[0]
+            import matplotlib.legend as legend
+            from matplotlib.collections import CircleCollection
+            sizes, labels = self.bubble_legend_sizes, self.bubble_legend_labels
+            color = self.plt.rcParams['axes.facecolor'],
+            edgecolor = self.plt.rcParams['axes.edgecolor']
+            bubbles = []
+            for size in sizes:
+                bubbles.append(CircleCollection(sizes=[size],
+                                                color=color,
+                                                edgecolor=edgecolor))
+            bubble_legend = legend.Legend(ax,
+                                          handles=bubbles,
+                                          labels=labels,
+                                          loc='best')
+            bubble_legend.set_title(self.size_title)
+            ax.add_artist(bubble_legend)
+        super(ScatterPlot, self)._make_legend()
+
 
 class HexBinPlot(PlanePlot):
     _kind = 'hexbin'
@@ -3458,7 +3572,7 @@ def pie(self, y=None, **kwds):
         """
         return self(kind='pie', y=y, **kwds)
 
-    def scatter(self, x, y, s=None, c=None, **kwds):
+    def scatter(self, x, y, s=None, c=None, size_factor=1, **kwds):
         """
         Create a scatter plot with varying marker point size and color.
 
@@ -3477,7 +3591,7 @@ def scatter(self, x, y, s=None, c=None, **kwds):
         y : int or str
             The column name or column position to be used as vertical
             coordinates for each point.
-        s : scalar or array_like, optional
+        s : str, scalar or array-like, optional
             The size of each point. Possible values are:
 
             - A single scalar so all points have the same size.
@@ -3486,6 +3600,12 @@ def scatter(self, x, y, s=None, c=None, **kwds):
               recursively. For instance, when passing [2,14] all points size
               will be either 2 or 14, alternatively.
 
+            - The name of a column containing numeric or ordered categorical
+              data that will be represented by the size of each point. This
+              turns the scatter plot into a bubble plot.
+
+              .. versionadded:: 0.24.0
+
         c : str, int or array_like, optional
             The color of each point. Possible values are:
 
@@ -3500,6 +3620,12 @@ def scatter(self, x, y, s=None, c=None, **kwds):
             - A column name or position whose values will be used to color the
               marker points according to a colormap.
 
+        size_factor : scalar, optional
+            A multiplication factor to change the size of points.
+
+            .. versionadded:: 0.24.0
+
+
         **kwds
             Keyword arguments to pass on to :meth:`pandas.DataFrame.plot`.
 
@@ -3537,7 +3663,8 @@ def scatter(self, x, y, s=None, c=None, **kwds):
             ...                       c='species',
             ...                       colormap='viridis')
         """
-        return self(kind='scatter', x=x, y=y, c=c, s=s, **kwds)
+        return self(kind='scatter', x=x, y=y, c=c, s=s,
+                    size_factor=size_factor, **kwds)
 
     def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None,
                **kwds):

diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
@@ -1237,6 +1237,41 @@ def test_scatter_colors(self):
         tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0],
                                     np.array([1, 1, 1, 1], dtype=np.float64))
 
+    @pytest.mark.slow
+    def test_plot_scatter_with_s(self):
+        data = np.array([[3.1, 4.2, 1.9],
+                        [1.9, 2.8, 3.1],
+                        [5.4, 4.32, 2.0],
+                        [0.4, 3.4, 0.46],
+                        [4.4, 4.9, 0.8],
+                        [2.7, 6.2, 1.49]])
+        df = DataFrame(data,
+                       columns=['x', 'y', 'z'])
+        ax = df.plot.scatter(x='x', y='y', s='z', size_factor=4)
+        bubbles = ax.collections[0]
+        bubble_sizes = bubbles.get_sizes()
+        max_data = df['z'].max()
+        expected_sizes = 200 * 4 * df['z'].values / max_data
+        tm.assert_numpy_array_equal(bubble_sizes, expected_sizes)
+
+    @pytest.mark.slow
+    def test_plot_scatter_with_categorical_s(self):
+        data = np.array([[3.1, 4.2],
+                        [1.9, 2.8],
+                        [5.4, 4.32],
+                        [0.4, 3.4],
+                        [4.4, 4.9],
+                        [2.7, 6.2]])
+        df = DataFrame(data, columns=['x', 'y'])
+        df['z'] = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True)
+        ax = df.plot.scatter(x='x', y='y', s='z', size_factor=4)
+        bubbles = ax.collections[0]
+        bubble_sizes = bubbles.get_sizes()
+        max_data = df['z'].cat.codes.max() + 1.0
+        expected_sizes = (200.0 * 4 * (df['z'].cat.codes.values + 1)**2
+                          / max_data**2)
+        tm.assert_numpy_array_equal(bubble_sizes, expected_sizes)
+
     @pytest.mark.slow
     def test_plot_bar(self):
         df = DataFrame(randn(6, 4),