From 494a01ba512a9916d5039e80414e356cc2162b65 Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 3 Oct 2024 23:58:04 -0400 Subject: [PATCH] Made hist scale properly by default --- datascience/tables.py | 107 ++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 55 deletions(-) diff --git a/datascience/tables.py b/datascience/tables.py index bd93b6dec..ca0b7ca9e 100644 --- a/datascience/tables.py +++ b/datascience/tables.py @@ -5349,15 +5349,14 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co >>> t.hist('value', group='category') # doctest: +SKIP """ + # Matplotlib has deprecated the normed keyword. - # TODO consider changing this function to use density= instead too if 'normed' not in vargs and 'density' not in vargs: vargs['density'] = True elif 'normed' in vargs and 'density' not in vargs: vargs['density'] = vargs.pop('normed') elif 'normed' in vargs and 'density' in vargs: - raise ValueError("You can't specify both normed and density. " - "Use one or the other.") + raise ValueError("You can't specify both normed and density. 
Use one or the other.") global _INTERACTIVE_PLOTS if _INTERACTIVE_PLOTS: @@ -5366,18 +5365,18 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co return self.ihist( *columns, - overlay = overlay, - bins = bins, - bin_column = bin_column, - unit = unit, - counts = counts, - group = group, - side_by_side = side_by_side, - left_end = left_end, - right_end = right_end, - width = width, - height = height, - rug = rug, + overlay=overlay, + bins=bins, + bin_column=bin_column, + unit=unit, + counts=counts, + group=group, + side_by_side=side_by_side, + left_end=left_end, + right_end=right_end, + width=width, + height=height, + rug=rug, **vargs ) @@ -5389,7 +5388,7 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co if counts is not None and bin_column is None: warnings.warn("counts arg of hist is deprecated; use bin_column") - bin_column=counts + bin_column = counts if columns: columns_included = list(columns) if bin_column is not None: @@ -5399,19 +5398,15 @@ def hist(self, *columns, overlay=True, bins=None, bin_column=None, unit=None, co self = self.select(*columns_included) if group is not None: if bin_column is not None: - raise ValueError("Using bin_column and group together is " - "currently unsupported.") + raise ValueError("Using bin_column and group together is currently unsupported.") if len(columns) > 1: - raise ValueError("Using group with multiple histogram value " - "columns is currently unsupported.") + raise ValueError("Using group with multiple histogram value columns is currently unsupported.") # Check for non-numerical values and raise a ValueError if any found for col in self: if col != group and any(isinstance(cell, np.flexible) for cell in self[col]): - raise ValueError("The column '{0}' contains non-numerical " - "values. A histogram cannot be drawn for this table." - .format(col)) - + raise ValueError("The column '{0}' contains non-numerical values. 
A histogram cannot be drawn for this table." + .format(col)) if bin_column is not None and bins is None: bins = np.unique(self.column(bin_column)) @@ -5431,13 +5426,11 @@ def prepare_hist_with_group(group): grouped = self.group(group, np.array) if grouped.num_rows > 20: warnings.warn("It looks like you're making a grouped histogram with " - "a lot of groups ({:d}), which is probably incorrect." - .format(grouped.num_rows)) + "a lot of groups ({:d}), which is probably incorrect." + .format(grouped.num_rows)) return [("{}={}".format(group, k), (v[0][1],)) for k, v in grouped.index_by(group).items()] - # Populate values_dict: An ordered dict from column name to singleton - # tuple of array of values or a (values, weights) pair of arrays. If - # any values have weights, they all must have weights. + # Populate values_dict: An ordered dict from column name to data arrays if bin_column is not None: values_dict = prepare_hist_with_bin_column(bin_column) elif group is not None: @@ -5445,6 +5438,7 @@ def prepare_hist_with_group(group): else: values_dict = [(k, (self.column(k),)) for k in self.labels] values_dict = collections.OrderedDict(values_dict) + if left_end is not None or right_end is not None: if left_end is None: if bins is not None and bins[0]: @@ -5458,26 +5452,23 @@ def prepare_hist_with_group(group): right_end = max([max(self.column(k)) for k in self.labels if np.issubdtype(self.column(k).dtype, np.number)]) def draw_hist(values_dict): - # Check if np.printoptions is set to legacy. Throw UserWarning if not if np.get_printoptions()['legacy'] != '1.13': - warnings.warn("We've detected you're not using the '1.13' legacy setting for `np.printoptions`. " - "This may cause excessive error terms in your plots. We recommend solving this by running the " - "following code: `np.set_printoptions(legacy='1.13')`", UserWarning) - # This code is factored as a function for clarity only. 
+ warnings.warn("We've detected you're not using the '1.13' legacy setting for `np.printoptions`. " + "This may cause excessive error terms in your plots. We recommend solving this by running the " + "following code: `np.set_printoptions(legacy='1.13')`", UserWarning) n = len(values_dict) colors = [rgb_color + (self.default_alpha,) for rgb_color in - itertools.islice(itertools.cycle(self.chart_colors), n)] + itertools.islice(itertools.cycle(self.chart_colors), n)] hist_names = list(values_dict.keys()) values = [v[0] for v in values_dict.values()] weights = [v[1] for v in values_dict.values() if len(v) > 1] if n > len(weights) > 0: - raise ValueError("Weights were provided for some columns, but not " - " all, and that's not supported.") + raise ValueError("Weights were provided for some columns, but not all, and that's not supported.") if rug and overlay and n > 1: warnings.warn("Cannot plot overlaid rug plots; rug=True ignored", UserWarning) if vargs['density']: y_label = 'Percent per ' + (unit if unit else 'unit') - percentage = plt.FuncFormatter(lambda x, _: "{:g}".format(100*x)) + percentage = plt.FuncFormatter(lambda x, _: "{:g}".format(100 * x)) else: y_label = 'Count' @@ -5490,32 +5481,31 @@ def draw_hist(values_dict): vargs['weights'] = weights if not side_by_side: vargs.setdefault('histtype', 'stepfilled') - figure = plt.figure(figsize=(width, height)) - plt.hist(values, color=colors, **vargs) - # if rug: - # plt.scatter(values, np.zeros_like(values), marker="|", color=colors) - axis = figure.get_axes()[0] - _vertical_x(axis) - axis.set_ylabel(y_label) + # Added to ensure proper default dimensions + plt.figure(figsize=(width, height)) + plt.hist(values, color=colors, label=hist_names, **vargs) + plt.legend(loc=2, bbox_to_anchor=(1.05, 1)) + _vertical_x(plt.gca()) + plt.ylabel(y_label) if vargs['density']: - axis.yaxis.set_major_formatter(percentage) + plt.gca().yaxis.set_major_formatter(percentage) x_unit = ' (' + unit + ')' if unit else '' if group is not 
None and len(self.labels) == 2: - #There's a grouping in place but we're only plotting one column's values + # There's a grouping in place but we're only plotting one column's values label_not_grouped = [l for l in self.labels if l != group][0] - axis.set_xlabel(label_not_grouped + x_unit, fontsize=16) + plt.xlabel(label_not_grouped + x_unit, fontsize=16) else: - axis.set_xlabel(x_unit, fontsize=16) - plt.legend(hist_names, loc=2, bbox_to_anchor=(1.05, 1)) - type(self).plots.append(axis) + plt.xlabel(x_unit, fontsize=16) + type(self).plots.append(plt.gca()) else: - _, axes = plt.subplots(n, 1, figsize=(width, height * n)) + num_plots = n + fig, axes = plt.subplots(num_plots, 1, figsize=(width, height * num_plots)) if 'bins' in vargs: bins = vargs['bins'] if isinstance(bins, numbers.Integral) and bins > 76 or hasattr(bins, '__len__') and len(bins) > 76: # Use stepfilled when there are too many bins vargs.setdefault('histtype', 'stepfilled') - if n == 1: + if num_plots == 1: axes = [axes] for i, (axis, hist_name, values_for_hist, color) in enumerate(zip(axes, hist_names, values, colors)): axis.set_ylabel(y_label) @@ -5529,14 +5519,21 @@ def draw_hist(values_dict): if left_end is not None and right_end is not None: x_shade, height_shade, width_shade = _compute_shading(heights, bins.copy(), left_end, right_end) axis.bar(x_shade, height_shade, width=width_shade, - color=self.chart_colors[1], align="edge") + color=self.chart_colors[1], align="edge") _vertical_x(axis) if rug: axis.scatter(values_for_hist, np.zeros_like(values_for_hist), marker="|", - color="black", s=100, zorder=10) + color="black", s=100, zorder=10) type(self).plots.append(axis) + # Added to ensure proper axis label dimensions + plt.tight_layout() + draw_hist(values_dict) + # Added to make sure graph displays + plt.show() + + def hist_of_counts(self, *columns, overlay=True, bins=None, bin_column=None, group=None, side_by_side=False, width=None, height=None, **vargs):