# pylint: disable=invalid-name
"""Kernel density estimation functions for ArviZ."""
import warnings

import numpy as np
from scipy.fftpack import fft
from scipy.optimize import brentq
from scipy.signal import convolve
from scipy.signal.windows import gaussian  # scipy.signal.gaussian was moved to scipy.signal.windows
from scipy.special import ive  # pylint: disable=no-name-in-module

from .stats.stats_utils import histogram


def _bw_scott(x, x_std=None, **kwargs):  # pylint: disable=unused-argument
    """Scott's Rule."""
    if x_std is None:
        x_std = np.std(x)
    bw = 1.06 * x_std * len(x) ** (-0.2)
    return bw


def _bw_silverman(x, x_std=None, **kwargs):  # pylint: disable=unused-argument
    """Silverman's Rule."""
    if x_std is None:
        x_std = np.std(x)
    q75, q25 = np.percentile(x, [75, 25])
    x_iqr = q75 - q25
    a = min(x_std, x_iqr / 1.34)
    bw = 0.9 * a * len(x) ** (-0.2)
    return bw


def _bw_isj(x, grid_counts=None, x_std=None, x_range=None):
    """Improved Sheather-Jones bandwidth estimation.

    Improved Sheather and Jones method as explained in [1]_.
    This is an internal version intended to be used by the KDE estimator.
    When used internally computation time is saved because things like minimums,
    maximums and the grid are pre-computed.

    References
    ----------
    .. [1] Kernel density estimation via diffusion.
       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
       Ann. Statist. 38 (2010), no. 5, 2916--2957.
    """
    x_len = len(x)
    # The data extremes are needed both to derive `x_range` and to build the
    # default binning grid. Compute them whenever either consumer needs them
    # (previously, passing `x_range` without `grid_counts` raised NameError
    # because x_min/x_max were left undefined).
    if x_range is None or grid_counts is None:
        x_min = np.min(x)
        x_max = np.max(x)
    if x_range is None:
        x_range = x_max - x_min

    # Relative frequency per bin
    if grid_counts is None:
        x_std = np.std(x)
        grid_len = 256
        grid_min = x_min - 0.5 * x_std
        grid_max = x_max + 0.5 * x_std
        grid_counts, _, _ = histogram(x, grid_len, (grid_min, grid_max))
    else:
        grid_len = len(grid_counts) - 1

    grid_relfreq = grid_counts / x_len

    # Discrete cosine transform of the data
    a_k = _dct1d(grid_relfreq)

    k_sq = np.arange(1, grid_len) ** 2
    a_sq = a_k[range(1, grid_len)] ** 2

    t = _root(_fixed_point, x_len, args=(x_len, k_sq, a_sq), x=x)
    h = t ** 0.5 * x_range
    return h


def _bw_experimental(x, grid_counts=None, x_std=None, x_range=None):
    """Experimental bandwidth estimator: average of Silverman's rule and ISJ."""
    bw_silverman = _bw_silverman(x, x_std=x_std)
    bw_isj = _bw_isj(x, grid_counts=grid_counts, x_range=x_range)
    return 0.5 * (bw_silverman + bw_isj)


def _bw_taylor(x):
    """Taylor's rule for circular bandwidth estimation.

    This function implements a rule-of-thumb for choosing the bandwidth of
    a von Mises kernel density estimator that assumes the underlying
    distribution is von Mises as introduced in [1]_.
    It is analogous to Scott's rule for the Gaussian KDE.

    Circular bandwidth has a different scale from linear bandwidth.
    Unlike linear scale, low bandwidths are associated with oversmoothing
    while high values are associated with undersmoothing.

    References
    ----------
    .. [1] C.C Taylor (2008). Automatic bandwidth selection for circular
       density estimation.
       Computational Statistics and Data Analysis, 52, 7, 3493-3500.
    """
    x_len = len(x)
    kappa = _kappa_mle(x)
    num = 3 * x_len * kappa ** 2 * ive(2, 2 * kappa)
    den = 4 * np.pi ** 0.5 * ive(0, kappa) ** 2
    return (num / den) ** 0.4


_BW_METHODS_LINEAR = {
    "scott": _bw_scott,
    "silverman": _bw_silverman,
    "isj": _bw_isj,
    "experimental": _bw_experimental,
}


def _get_bw(x, bw, grid_counts=None, x_std=None, x_range=None):
    """Compute bandwidth for a given data `x` and `bw`.

    Also checks `bw` is correctly specified.

    Parameters
    ----------
    x : 1-D numpy array
        1 dimensional array of sample data from the
        variable for which a density estimate is desired.
    bw: int, float or str
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth.

    Returns
    -------
    bw: float
        Bandwidth
    """
    # bool is a subclass of int, so it must be rejected before the numeric check.
    if isinstance(bw, bool):
        raise ValueError(
            (
                "`bw` must not be of type `bool`.\n"
                "Expected a positive numeric or one of the following strings:\n"
                "{}."
            ).format(list(_BW_METHODS_LINEAR.keys()))
        )
    if isinstance(bw, (int, float)):
        # Zero is as invalid as a negative bandwidth.
        if bw <= 0:
            raise ValueError("Numeric `bw` must be positive.\nInput: {:.4f}.".format(bw))
    elif isinstance(bw, str):
        bw_lower = bw.lower()

        if bw_lower not in _BW_METHODS_LINEAR:
            raise ValueError(
                (
                    "Unrecognized bandwidth method.\n" "Input is: {}.\n" "Expected one of: {}."
                ).format(bw_lower, list(_BW_METHODS_LINEAR.keys()))
            )

        bw_fun = _BW_METHODS_LINEAR[bw_lower]
        # Keyword arguments are required here: `_bw_scott` and `_bw_silverman`
        # accept `(x, x_std=None, **kwargs)`, so a positional call would bind
        # `grid_counts` to `x_std` and fail on the extra positional argument.
        bw = bw_fun(x, grid_counts=grid_counts, x_std=x_std, x_range=x_range)
    else:
        raise ValueError(
            (
                "Unrecognized `bw` argument.\n"
                "Expected a positive numeric or one of the following strings:\n"
                "{}."
            ).format(list(_BW_METHODS_LINEAR.keys()))
        )
    return bw


def _vonmises_pdf(x, mu, kappa):
    """Calculate the von Mises probability density function.

    Uses the exponentially scaled Bessel function `ive` so the expression
    stays numerically stable for large `kappa`.
    """
    if kappa <= 0:
        raise ValueError("Argument 'kappa' must be positive.")
    pdf = 1 / (2 * np.pi * ive(0, kappa)) * np.exp(np.cos(x - mu) - 1) ** kappa
    return pdf


def _a1inv(x):
    """Compute inverse function.

    Inverse function of the ratio of the first and
    zeroth order Bessel functions of the first kind.

    Returns the value k, such that a1inv(x) = k, i.e. a1(k) = x.
    """
    # Piecewise rational approximations; see Fisher (1993), Statistical
    # Analysis of Circular Data.
    if 0 <= x < 0.53:
        return 2 * x + x ** 3 + (5 * x ** 5) / 6
    elif x < 0.85:
        return -0.4 + 1.39 * x + 0.43 / (1 - x)
    else:
        return 1 / (x ** 3 - 4 * x ** 2 + 3 * x)


def _kappa_mle(x):
    """Maximum likelihood estimate of the von Mises concentration parameter."""
    # Relative (package-internal) import; imported lazily to avoid a circular
    # import, since numeric_utils imports from this module.
    from .numeric_utils import _circular_mean

    mean = _circular_mean(x)
    kappa = _a1inv(np.mean(np.cos(x - mean)))
    return kappa


def _dct1d(x):
    """Discrete Cosine Transform in 1 Dimension.

    Parameters
    ----------
    x : numpy array
        1 dimensional array of values for which the
        DCT is desired

    Returns
    -------
    output : DTC transformed values
    """
    x_len = len(x)

    even_increasing = np.arange(0, x_len, 2)
    odd_decreasing = np.arange(x_len - 1, 0, -2)

    x = np.concatenate((x[even_increasing], x[odd_decreasing]))

    w_1k = np.r_[1, (2 * np.exp(-(0 + 1j) * (np.arange(1, x_len)) * np.pi / (2 * x_len)))]
    output = np.real(w_1k * fft(x))

    return output


def _fixed_point(t, N, k_sq, a_sq):
    """Calculate t-zeta*gamma^[l](t).

    Implementation of the function t-zeta*gamma^[l](t) derived from equation (30) in [1].

    References
    ----------
    .. [1] Kernel density estimation via diffusion.
       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
       Ann. Statist. 38 (2010), no. 5, 2916--2957.
    """
    k_sq = np.asarray(k_sq, dtype=np.float64)
    a_sq = np.asarray(a_sq, dtype=np.float64)

    l = 7
    f = np.sum(np.power(k_sq, l) * a_sq * np.exp(-k_sq * np.pi ** 2 * t))
    f *= 0.5 * np.pi ** (2.0 * l)

    for j in reversed(range(2, l)):
        c1 = (1 + 0.5 ** (j + 0.5)) / 3
        # np.product is deprecated; np.prod is the supported spelling.
        c2 = np.prod(np.arange(1.0, 2 * j + 1, 2, dtype=np.float64))
        c2 /= (np.pi / 2) ** 0.5
        t_j = np.power((c1 * (c2 / (N * f))), (2.0 / (3.0 + 2.0 * j)))
        f = np.sum(k_sq ** j * a_sq * np.exp(-k_sq * np.pi ** 2.0 * t_j))
        f *= 0.5 * np.pi ** (2 * j)

    out = t - (2 * N * np.pi ** 0.5 * f) ** (-0.4)
    return out


def _root(function, N, args, x):
    """Find the root of `function` in (0, 0.01] for the ISJ bandwidth.

    Falls back to (a squared, normalized) Silverman's rule when Brent's
    method fails to converge with an acceptable tolerance.
    """
    # The right bound is at most 0.01
    found = False
    N = max(min(1050, N), 50)
    tol = 10e-12 + 0.01 * (N - 50) / 1000

    while not found:
        try:
            bw, res = brentq(function, 0, 0.01, args=args, full_output=True, disp=False)
            found = res.converged
        except ValueError:
            bw = 0
            tol *= 2.0
            found = False
        if bw <= 0 or tol >= 1:
            # Improved Sheather-Jones did not converge as expected:
            # use (normalized, squared) Silverman's rule instead.
            bw = (_bw_silverman(x) / np.ptp(x)) ** 2
            return bw
    return bw


def _check_type(x):
    """Check the input is of the correct type.

    It only accepts numeric lists/numpy arrays of 1 dimension or something that
    can be flattened to 1 dimension.

    Parameters
    ----------
    x : Object whose type is checked before computing the KDE.

    Returns
    -------
    x : 1-D numpy array
        If no error is thrown, a 1 dimensional array of
        sample data from the variable for which a density estimate is desired.
    """
    # Will raise an error if `x` can't be casted to numeric or flattened to one dimension.
    try:
        x = np.asarray(x, dtype=float).flatten()
    except Exception as e:
        warnings.warn(
            "The following exception occurred while trying to convert `x`"
            "to a 1 dimensional float array."
        )
        raise e

    x = x[np.isfinite(x)]

    if x.size == 0:
        raise ValueError("`x` does not contain any finite number.")
    if x.size == 1:
        raise ValueError("`x` is of length 1. Can't produce a KDE with only one data point.")
    return x


def _check_custom_lims(custom_lims, x_min, x_max):
    """Check if `custom_lims` are of the correct type.

    It accepts numeric lists/tuples of length 2, where `None` entries are
    replaced by the corresponding data extreme.

    Parameters
    ----------
    custom_lims : Object whose type is checked.

    Returns
    -------
    custom_lims : list
        Validated, fully numeric `[lower, upper]` limits.
    """
    if not isinstance(custom_lims, (list, tuple)):
        raise TypeError(
            (
                "`custom_lims` must be a numeric list or tuple of length 2.\n"
                "Not an object of {}."
            ).format(type(custom_lims))
        )

    if len(custom_lims) != 2:
        raise AttributeError("`len(custom_lims)` must be 2, not {}.".format(len(custom_lims)))

    any_bool = any(isinstance(i, bool) for i in custom_lims)
    if any_bool:
        raise TypeError("Elements of `custom_lims` must be numeric or None, not bool.")

    custom_lims = list(custom_lims)  # convert to a mutable object
    if custom_lims[0] is None:
        custom_lims[0] = x_min

    if custom_lims[1] is None:
        custom_lims[1] = x_max

    # np.float was removed from modern numpy; np.floating covers all float dtypes.
    all_numeric = all(isinstance(i, (int, float, np.integer, np.floating)) for i in custom_lims)
    if not all_numeric:
        raise TypeError(
            ("Elements of `custom_lims` must be numeric or None.\n" "At least one of them is not.")
        )

    if not custom_lims[0] < custom_lims[1]:
        raise AttributeError("`custom_lims[0]` must be smaller than `custom_lims[1]`.")

    return custom_lims


def _get_grid(
    x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend=True, bound_correction=False
):
    """Compute the grid that bins the data used to estimate the density function.

    Parameters
    ----------
    x_min : float
        Minimum value of the data
    x_max: float
        Maximum value of the data.
    x_std: float
        Standard deviation of the data.
    extend_fct: float
        Indicates the factor by which `x_std` is multiplied
        to extend the range of the data.
    grid_len: int
        Number of bins
    custom_lims: tuple or list
        Custom limits for the domain of the density estimation.
        Must be numeric of length 2. Overrides `extend`.
    extend: bool, optional
        Whether to extend the range of the data or not.
        Default is True.
    bound_correction: bool, optional
        Whether the density estimations performs boundary correction or not.
        This does not impact directly in the output, but is used
        to override `extend`. Overrides `extend`.
        Default is False.

    Returns
    -------
    grid_min: float
        Minimum value of the grid
    grid_max: float
        Maximum value of the grid
    grid_len: int
        Number of bins
    """
    # Set up number of bins (at least 100).
    if grid_len < 100:
        grid_len = 100
    grid_len = int(grid_len)

    # Set up domain
    if custom_lims is not None:
        custom_lims = _check_custom_lims(custom_lims, x_min, x_max)
        grid_min = custom_lims[0]
        grid_max = custom_lims[1]
    elif extend and not bound_correction:
        grid_extend = extend_fct * x_std
        grid_min = x_min - grid_extend
        grid_max = x_max + grid_extend
    else:
        grid_min = x_min
        grid_max = x_max
    return grid_min, grid_max, grid_len


def _kde(x, circular=False, **kwargs):
    """One dimensional density estimation.

    It is a wrapper around `_kde_linear()` and `_kde_circular()`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    circular: bool, optional
        Whether `x` is a circular variable or not. Defaults to False.
    **kwargs: Arguments passed to `_kde_linear()` and `_kde_circular()`.
        See their documentation for more info.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.

    Examples
    --------
    Default density estimation for linear data

    .. plot::
        :context: close-figs

        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> from arviz.kde_utils import _kde
        >>>
        >>> rvs = np.random.gamma(shape=1.8, size=1000)
        >>> grid, pdf = _kde(rvs)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with Silverman's rule bandwidth

    .. plot::
        :context: close-figs

        >>> grid, pdf = _kde(rvs, bw="silverman")
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with scaled bandwidth

    .. plot::
        :context: close-figs

        >>> # bw_fct > 1 means more smoothness.
        >>> grid, pdf = _kde(rvs, bw_fct=2.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with extended limits

    .. plot::
        :context: close-figs

        >>> grid, pdf = _kde(rvs, bound_correction=False, extend=True, extend_fct=0.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with custom limits

    .. plot::
        :context: close-figs

        >>> # It accepts tuples and lists of length 2.
        >>> grid, pdf = _kde(rvs, bound_correction=False, custom_lims=(0, 10))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for circular data

    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500)
        >>> grid, pdf = _kde(rvs, circular=True)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with scaled bandwidth

    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500)
        >>> # bw_fct > 1 means less smoothness.
        >>> grid, pdf = _kde(rvs, circular=True, bw_fct=3)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with custom limits

    .. plot::
        :context: close-figs

        >>> # This is still experimental, does not always work.
        >>> rvs = np.random.vonmises(mu=0, kappa=30, size=500)
        >>> grid, pdf = _kde(rvs, circular=True, custom_lims=(-1, 1))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    See Also
    --------
    plot_kde : Compute and plot a kernel density estimate.
    """
    if circular:
        kde_fun = _kde_circular
    else:
        kde_fun = _kde_linear

    return kde_fun(x, **kwargs)


def _kde_linear(
    x,
    bw="experimental",
    adaptive=False,
    extend=False,
    bound_correction=True,
    extend_fct=0,
    bw_fct=1,
    bw_return=False,
    custom_lims=None,
    cumulative=False,
    grid_len=512,
):
    """One dimensional density estimation for linear data.

    Given an array of data points `x` it returns an estimate of
    the probability density function that generated the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    bw: int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be
        one of "scott", "silverman", "isj" or "experimental".
        Defaults to "experimental".
    adaptive: boolean, optional
        Indicates if the bandwidth is adaptative or not.
        It is the recommended approach when there are multiple modalities
        with different spread.
        It is not compatible with convolution. Defaults to False.
    extend: boolean, optional
        Whether to extend the observed range for `x` in the estimation.
        It extends each bound by a multiple of the standard deviation of `x`
        given by `extend_fct`. Defaults to False.
    bound_correction: boolean, optional
        Whether to perform boundary correction on the bounds of `x` or not.
        Defaults to True.
    extend_fct: float, optional
        Number of standard deviations used to widen the
        lower and upper bounds of `x`. Defaults to 0 (no widening).
    bw_fct: float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand.
        Must be positive. Values below 1 decrease smoothness while values
        above 1 increase it. Defaults to 1 (no modification).
    bw_return: bool, optional
        Whether to return the estimated bandwidth in addition to the
        other objects. Defaults to False.
    custom_lims: list or tuple, optional
        A list or tuple of length 2 indicating custom bounds
        for the range of `x`. Defaults to None which disables custom bounds.
    cumulative: bool, optional
        Whether return the PDF or the cumulative PDF. Defaults to False.
    grid_len: int, optional
        The number of intervals used to bin the data points
        (a.k.a. the length of the grid used in the estimation)
        Defaults to 512.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.
    """
    # Check `x` is from appropriate type
    try:
        x = _check_type(x)
    except ValueError as e:
        warnings.warn("Something failed: " + str(e))
        return np.array([np.nan]), np.array([np.nan])

    # Check `bw_fct` is numeric and positive
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(
            "`bw_fct` must be a positive number, not an object of {}.".format(type(bw_fct))
        )

    if bw_fct <= 0:
        raise ValueError("`bw_fct` must be a positive number, not {}.".format(bw_fct))

    # Preliminary calculations
    x_len = len(x)
    x_min = x.min()
    x_max = x.max()
    # Population standard deviation via E[x^2] - E[x]^2 (single pass).
    x_std = (((x ** 2).sum() / x_len) - (x.sum() / x_len) ** 2) ** 0.5
    x_range = x_max - x_min

    # Determine grid
    grid_min, grid_max, grid_len = _get_grid(
        x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend, bound_correction
    )
    grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max))

    # Bandwidth estimation
    bw = bw_fct * _get_bw(x, bw, grid_counts, x_std, x_range)

    # Density estimation
    if adaptive:
        grid, pdf = _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction)
    else:
        grid, pdf = _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    if bw_return:
        return grid, pdf, bw
    else:
        return grid, pdf


def _kde_circular(
    x, bw="taylor", bw_fct=1, bw_return=False, custom_lims=None, cumulative=False, grid_len=512,
):
    """One dimensional density estimation for circular data.

    Given an array of data points `x` measured in radians,
    it returns an estimate of the probability density function that generated
    the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
        Theoretically it is a random sample obtained from $f$,
        the true probability density function we aim to estimate.
    bw: int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be
        "taylor" since it is the only option supported so far. Defaults to "taylor".
    bw_fct: float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand.
        Must be positive. In circular scale, values above 1 decrease smoothness
        while values below 1 increase it. Defaults to 1 (no modification).
    bw_return: bool, optional
        Whether to return the estimated bandwidth in addition to the
        other objects. Defaults to False.
    custom_lims: list or tuple, optional
        A list or tuple of length 2 indicating custom bounds
        for the range of `x`. Defaults to None which means the estimation
        limits are [-pi, pi].
    cumulative: bool, optional
        Whether return the PDF or the cumulative PDF. Defaults to False.
    grid_len: int, optional
        The number of intervals used to bin the data points
        (a.k.a. the length of the grid used in the estimation)
        Defaults to 512.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.
    """
    try:
        x = _check_type(x)
    except ValueError as e:
        warnings.warn("Something failed: " + str(e))
        return np.array([np.nan]), np.array([np.nan])

    # All values between -pi and pi.
    # Relative (package-internal) import; imported lazily to avoid a circular
    # import, since numeric_utils imports from this module.
    from .numeric_utils import _normalize_angle

    x = _normalize_angle(x)

    # Check `bw_fct` is numeric and positive
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(
            "`bw_fct` must be a positive number, not an object of {}.".format(type(bw_fct))
        )

    if bw_fct <= 0:
        raise ValueError("`bw_fct` must be a positive number, not {}.".format(bw_fct))

    # Determine bandwidth (bool rejected first since bool is a subclass of int)
    if isinstance(bw, bool):
        raise ValueError(
            ("`bw` can't be of type `bool`.\n" "Expected a positive numeric or 'taylor'")
        )
    if isinstance(bw, (int, float)):
        if bw < 0:
            raise ValueError("Numeric `bw` must be positive.\nInput: {:.4f}.".format(bw))
    if isinstance(bw, str):
        if bw == "taylor":
            bw = _bw_taylor(x)
        else:
            raise ValueError(("`bw` must be a positive numeric or `taylor`, not {}".format(bw)))
    bw *= bw_fct

    # Determine grid. ValueError (not assert) so validation survives `python -O`.
    if custom_lims is not None:
        custom_lims = _check_custom_lims(custom_lims, x.min(), x.max())
        grid_min = custom_lims[0]
        grid_max = custom_lims[1]
        if grid_min < -np.pi:
            raise ValueError("Lower limit can't be smaller than -pi")
        if grid_max > np.pi:
            raise ValueError("Upper limit can't be larger than pi")
    else:
        grid_min = -np.pi
        grid_max = np.pi

    bins = np.linspace(grid_min, grid_max, grid_len + 1)
    bin_counts, _, bin_edges = histogram(x, bins=bins)
    grid = 0.5 * (bin_edges[1:] + bin_edges[:-1])

    # Von Mises kernel applied via circular convolution in Fourier space.
    kern = _vonmises_pdf(x=grid, mu=0, kappa=bw)
    pdf = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kern) * np.fft.rfft(bin_counts)))
    pdf /= len(x)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    if bw_return:
        return grid, pdf, bw
    else:
        return grid, pdf


def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction):
    """Kernel density with convolution.

    One dimensional Gaussian kernel density estimation via
    convolution of the binned relative frequencies and a Gaussian filter.
    This is an internal function used by `_kde()`.
    """
    # Calculate relative frequencies per bin
    bin_width = grid_edges[1] - grid_edges[0]
    f = grid_counts / bin_width / len(x)

    # Bandwidth must consider the bin width
    bw /= bin_width

    # See: https://stackoverflow.com/questions/2773606/gaussian-filter-in-matlab
    kernel_n = int(bw * 2 * np.pi)

    # Guard against a zero-length kernel for very small bandwidths.
    if kernel_n == 0:
        kernel_n = 1
    kernel = gaussian(kernel_n, bw)

    if bound_correction:
        # Reflect the tails of the histogram to reduce boundary bias.
        npad = int(grid_len / 5)
        f = np.concatenate([f[npad - 1 :: -1], f, f[grid_len : grid_len - npad - 1 : -1]])
        pdf = convolve(f, kernel, mode="same", method="direct")[npad : npad + grid_len]
    else:
        pdf = convolve(f, kernel, mode="same", method="direct")
    pdf /= bw * (2 * np.pi) ** 0.5

    grid = (grid_edges[1:] + grid_edges[:-1]) / 2
    return grid, pdf


def _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction):
    """Compute Adaptive Kernel Density Estimation.

    One dimensional adaptive Gaussian kernel density estimation.
    The implementation uses the binning technique.
    Since there is not an unique `bw`, the convolution is not possible.
    The alternative implemented in this function is known as Abramson's method.
    This is an internal function used by `_kde()`.
    """
    # Pilot computations used for bandwidth adjustment
    pilot_grid, pilot_pdf = _kde_convolution(
        x, bw, grid_edges, grid_counts, grid_len, bound_correction
    )

    # Adds to avoid np.log(0) and zero division
    pilot_pdf += 1e-9

    # Determine the modification factors
    pdf_interp = np.interp(x, pilot_grid, pilot_pdf)
    geom_mean = np.exp(np.mean(np.log(pdf_interp)))

    # Power of c = 0.5 -> Abramson's method
    adj_factor = (geom_mean / pilot_pdf) ** 0.5
    bw_adj = bw * adj_factor

    # Estimation of Gaussian KDE via binned method (convolution not possible)
    grid = pilot_grid

    if bound_correction:
        grid_npad = int(grid_len / 5)
        grid_width = grid_edges[1] - grid_edges[0]
        grid_pad = grid_npad * grid_width
        grid_padded = np.linspace(
            grid_edges[0] - grid_pad,
            grid_edges[grid_len - 1] + grid_pad,
            num=grid_len + 2 * grid_npad,
        )
        grid_counts = np.concatenate(
            [
                grid_counts[grid_npad - 1 :: -1],
                grid_counts,
                grid_counts[grid_len : grid_len - grid_npad - 1 : -1],
            ]
        )
        bw_adj = np.concatenate(
            [bw_adj[grid_npad - 1 :: -1], bw_adj, bw_adj[grid_len : grid_len - grid_npad - 1 : -1]]
        )
        pdf_mat = (grid_padded - grid_padded[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat[:, grid_npad : grid_npad + grid_len], axis=0) / len(x)

    else:
        pdf_mat = (grid - grid[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat, axis=0) / len(x)

    return grid, pdf
def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None):  # pylint: disable=unused-argument
    """Kernel Density Estimate, deprecated in favor of `_kde()`.

    Thin backwards-compatibility shim: forwards to `arviz.kde_utils._kde`
    and returns ``(grid, pdf)`` instead of the old ``(density, xmin, xmax)``.

    Parameters
    ----------
    x : numpy array or list
        Sample data.
    cumulative : bool
        If true, estimate the cdf instead of the pdf.
    bw : float
        Forwarded verbatim to `_kde`.
        NOTE(review): the legacy default 4.5 was a *scaling factor* for the
        old FFT estimator, while `_kde` treats a numeric `bw` as an absolute
        bandwidth — confirm callers relying on the default are updated.
    xmin : float
        Manually set lower limit.
    xmax : float
        Manually set upper limit.

    Returns
    -------
    grid : numpy array of x values.
    pdf : numpy array of density estimates.
    """
    # Warn up front so the deprecation is visible even when `_kde` raises.
    warnings.warn("_fast_kde() has been replaced by _kde() in kde_utils.py", FutureWarning)

    # `_check_custom_lims` inside `_kde` replaces a None entry with the
    # corresponding data extreme, so a half-specified limit pair is fine.
    if xmin is None and xmax is None:
        custom_lims = None
    else:
        custom_lims = (xmin, xmax)

    grid, pdf = _kde(x, cumulative=cumulative, bw=bw, custom_lims=custom_lims)
    return grid, pdf
@@ -213,3 +154,30 @@ def _sturges_formula(dataset, mult=1): Number of bins to use """ return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1) + + +def _circular_mean(x, na_rm=False): + """Compute mean of circular variable measured in radians. + + The result is between -pi and pi. + """ + if na_rm: + x = x[~np.isnan(x)] + + sinr = np.sum(np.sin(x)) + cosr = np.sum(np.cos(x)) + mean = np.arctan2(sinr, cosr) + + return mean + + +def _normalize_angle(x, zero_centered=True): + """Normalize angles. + + Take angles in radians and normalize them to [-pi, pi) or [0, 2 * pi) + depending on `zero_centered`. + """ + if zero_centered: + return (x + np.pi) % (2 * np.pi) - np.pi + else: + return x % (2 * np.pi) diff --git a/arviz/plots/backends/bokeh/bpvplot.py b/arviz/plots/backends/bokeh/bpvplot.py index bcb5c8ac28..75157de06e 100644 --- a/arviz/plots/backends/bokeh/bpvplot.py +++ b/arviz/plots/backends/bokeh/bpvplot.py @@ -14,7 +14,7 @@ is_valid_quantile, vectorized_to_hex, ) -from ....numeric_utils import _fast_kde +from ....kde_utils import _kde def plot_bpv( @@ -91,8 +91,7 @@ def plot_bpv( if kind == "p_value": tstat_pit = np.mean(pp_vals <= obs_vals, axis=-1) - tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit) - x_s = np.linspace(xmin, xmax, len(tstat_pit_dens)) + x_s, tstat_pit_dens = _kde(tstat_pit) ax_i.line(x_s, tstat_pit_dens, line_width=linewidth, line_color=color) # ax_i.set_yticks([]) if reference is not None: @@ -112,8 +111,7 @@ def plot_bpv( elif kind == "u_value": tstat_pit = np.mean(pp_vals <= obs_vals, axis=0) - tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit) - x_s = np.linspace(xmin, xmax, len(tstat_pit_dens)) + x_s, tstat_pit_dens = _kde(tstat_pit) ax_i.line(x_s, tstat_pit_dens, line_color=color) if reference is not None: if reference == "analytical": diff --git a/arviz/plots/backends/bokeh/densityplot.py b/arviz/plots/backends/bokeh/densityplot.py index a5c53a7757..8910912dd1 100644 --- a/arviz/plots/backends/bokeh/densityplot.py +++ 
b/arviz/plots/backends/bokeh/densityplot.py @@ -16,7 +16,8 @@ vectorized_to_hex, ) from ....stats import hdi -from ....numeric_utils import _fast_kde, histogram, get_bins +from ....numeric_utils import histogram, get_bins +from ....kde_utils import _kde def plot_density( @@ -25,6 +26,7 @@ def plot_density( to_plot, colors, bw, + circular, figsize, length_plotters, rows, @@ -97,6 +99,7 @@ def plot_density( label, colors[m_idx], bw, + circular, line_width, markersize, hdi_prob, @@ -124,6 +127,7 @@ def _d_helper( vname, color, bw, + circular, line_width, markersize, hdi_prob, @@ -144,11 +148,10 @@ def _d_helper( else: new_vec = vec - density, xmin, xmax = _fast_kde(new_vec, bw=bw) + x, density = _kde(new_vec, circular=circular, bw=bw) density *= hdi_prob - x = np.linspace(xmin, xmax, len(density)) - ymin = density[0] - ymax = density[-1] + xmin, xmax = x[0], x[-1] + ymin, ymax = density[0], density[-1] if outline: plotted.append(ax.line(x, density, line_color=color, line_width=line_width, **extra)) @@ -229,7 +232,7 @@ def _d_helper( plotted.append(ax.diamond(xmax, 0, line_color="black", fill_color=color, size=markersize)) if point_estimate is not None: - est = calculate_point_estimate(point_estimate, vec, bw) + est = calculate_point_estimate(point_estimate, vec, bw, circular) plotted.append(ax.circle(est, 0, fill_color=color, line_color="black", size=markersize)) _title = Title() diff --git a/arviz/plots/backends/bokeh/distplot.py b/arviz/plots/backends/bokeh/distplot.py index 8e5b0520da..bc851146e8 100644 --- a/arviz/plots/backends/bokeh/distplot.py +++ b/arviz/plots/backends/bokeh/distplot.py @@ -20,6 +20,7 @@ def plot_dist( rotated, rug, bw, + circular, quantiles, contour, fill_last, @@ -89,6 +90,7 @@ def plot_dist( rug=rug, label=label, bw=bw, + circular=circular, quantiles=quantiles, rotated=rotated, contour=contour, diff --git a/arviz/plots/backends/bokeh/forestplot.py b/arviz/plots/backends/bokeh/forestplot.py index 79b5cec338..ce8c19323d 100644 --- 
a/arviz/plots/backends/bokeh/forestplot.py +++ b/arviz/plots/backends/bokeh/forestplot.py @@ -16,7 +16,8 @@ from ....rcparams import rcParams from ....stats import hdi from ....stats.diagnostics import _ess, _rhat -from ....numeric_utils import _fast_kde, histogram, get_bins +from ....numeric_utils import histogram, get_bins +from ....kde_utils import _kde from ....utils import conditional_jit @@ -570,9 +571,8 @@ def ridgeplot(self, mult, ridgeplot_kind): _, density, x = histogram(values, bins=bins) x = x[:-1] elif kind == "density": - density, lower, upper = _fast_kde(values) + x, density = _kde(values) density_q = density.cumsum() / density.sum() - x = np.linspace(lower, upper, len(density)) xvals.append(x) pdfs.append(density) diff --git a/arviz/plots/backends/bokeh/loopitplot.py b/arviz/plots/backends/bokeh/loopitplot.py index d198822d95..134e2765c4 100644 --- a/arviz/plots/backends/bokeh/loopitplot.py +++ b/arviz/plots/backends/bokeh/loopitplot.py @@ -7,7 +7,7 @@ from . import backend_kwarg_defaults from .. 
import show_layout -from ...kdeplot import _fast_kde +from ....kde_utils import _kde from ...plot_utils import _scale_fig_size @@ -184,8 +184,7 @@ def plot_loo_pit( ) else: for idx in range(n_unif): - unif_density, xmin, xmax = _fast_kde(unif[idx, :]) - x_s = np.linspace(xmin, xmax, len(unif_density)) + x_s, unif_density = _kde(unif[idx, :]) ax.line( x_s, unif_density, diff --git a/arviz/plots/backends/bokeh/posteriorplot.py b/arviz/plots/backends/bokeh/posteriorplot.py index b7f10960c9..e7a17f1c8e 100644 --- a/arviz/plots/backends/bokeh/posteriorplot.py +++ b/arviz/plots/backends/bokeh/posteriorplot.py @@ -28,6 +28,7 @@ def plot_posterior( figsize, plotters, bw, + circular, bins, kind, point_estimate, @@ -75,6 +76,7 @@ def plot_posterior( selection, ax=ax_, bw=bw, + circular=circular, bins=bins, kind=kind, point_estimate=point_estimate, @@ -104,6 +106,7 @@ def _plot_posterior_op( selection, ax, bw, + circular, linewidth, bins, kind, @@ -192,7 +195,7 @@ def display_rope(max_data): def display_point_estimate(max_data): if not point_estimate: return - point_value = calculate_point_estimate(point_estimate, values, bw) + point_value = calculate_point_estimate(point_estimate, values, bw, circular) sig_figs = format_sig_figs(point_value, round_to) point_text = "{point_estimate}={point_value:.{sig_figs}g}".format( point_estimate=point_estimate, point_value=point_value, sig_figs=sig_figs @@ -234,6 +237,7 @@ def format_axes(): plot_kde( values, bw=bw, + circular=circular, fill_kwargs={"fill_alpha": kwargs.pop("fill_alpha", 0)}, plot_kwargs=kwargs, ax=ax, diff --git a/arviz/plots/backends/bokeh/ppcplot.py b/arviz/plots/backends/bokeh/ppcplot.py index c4e9461e6e..d8c41cdc10 100644 --- a/arviz/plots/backends/bokeh/ppcplot.py +++ b/arviz/plots/backends/bokeh/ppcplot.py @@ -3,9 +3,10 @@ from . import backend_kwarg_defaults from .. 
import show_layout -from ...kdeplot import plot_kde, _fast_kde +from ...kdeplot import plot_kde from ...plot_utils import _create_axes_grid, _scale_fig_size from ....numeric_utils import histogram, get_bins +from ....kde_utils import _kde def plot_ppc( @@ -93,8 +94,7 @@ def plot_ppc( for vals in pp_sampled_vals: vals = np.array([vals]).flatten() if dtype == "f": - pp_density, lower, upper = _fast_kde(vals) - pp_x = np.linspace(lower, upper, len(pp_density)) + pp_x, pp_density = _kde(vals) pp_densities.append(pp_density) pp_xs.append(pp_x) else: diff --git a/arviz/plots/backends/bokeh/violinplot.py b/arviz/plots/backends/bokeh/violinplot.py index e13f40cc3a..48ccfc3bb6 100644 --- a/arviz/plots/backends/bokeh/violinplot.py +++ b/arviz/plots/backends/bokeh/violinplot.py @@ -4,7 +4,8 @@ from . import backend_kwarg_defaults from .. import show_layout -from ....numeric_utils import _fast_kde, histogram, get_bins +from ....numeric_utils import histogram, get_bins +from ....kde_utils import _kde from ...plot_utils import make_label, _create_axes_grid, _scale_fig_size from ....stats import hdi @@ -23,6 +24,7 @@ def plot_violin( rug_kwargs, bw, textsize, + circular, hdi_prob, quartiles, backend_kwargs, @@ -64,7 +66,7 @@ def plot_violin( if val[0].dtype.kind == "i": dens = cat_hist(val, rug, shade, ax_, **shade_kwargs) else: - dens = _violinplot(val, rug, shade, bw, ax_, **shade_kwargs) + dens = _violinplot(val, rug, shade, bw, circular, ax_, **shade_kwargs) if rug: rug_x = -np.abs(np.random.normal(scale=max(dens) / 3.5, size=len(val))) @@ -99,10 +101,14 @@ def plot_violin( return ax -def _violinplot(val, rug, shade, bw, ax, **shade_kwargs): +def _violinplot(val, rug, shade, bw, circular, ax, **shade_kwargs): """Auxiliary function to plot violinplots.""" - density, low_b, up_b = _fast_kde(val, bw=bw) - x = np.linspace(low_b, up_b, len(density)) + if bw == "default": + if circular: + bw = "taylor" + else: + bw = "experimental" + x, density = _kde(val, circular=circular, bw=bw) 
if not rug: x = np.concatenate([x, x[::-1]]) diff --git a/arviz/plots/backends/matplotlib/bpvplot.py b/arviz/plots/backends/matplotlib/bpvplot.py index d02824a06b..08c695da7e 100644 --- a/arviz/plots/backends/matplotlib/bpvplot.py +++ b/arviz/plots/backends/matplotlib/bpvplot.py @@ -13,7 +13,7 @@ is_valid_quantile, matplotlib_kwarg_dealiaser, ) -from ....numeric_utils import _fast_kde +from ....kde_utils import _kde def plot_bpv( @@ -77,8 +77,7 @@ def plot_bpv( if kind == "p_value": tstat_pit = np.mean(pp_vals <= obs_vals, axis=-1) - tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit) - x_s = np.linspace(xmin, xmax, len(tstat_pit_dens)) + x_s, tstat_pit_dens = _kde(tstat_pit) ax_i.plot(x_s, tstat_pit_dens, linewidth=linewidth, color=color) ax_i.set_yticks([]) if reference is not None: @@ -97,8 +96,7 @@ def plot_bpv( elif kind == "u_value": tstat_pit = np.mean(pp_vals <= obs_vals, axis=0) - tstat_pit_dens, xmin, xmax = _fast_kde(tstat_pit) - x_s = np.linspace(xmin, xmax, len(tstat_pit_dens)) + x_s, tstat_pit_dens = _kde(tstat_pit) ax_i.plot(x_s, tstat_pit_dens, color=color) if reference is not None: if reference == "analytical": diff --git a/arviz/plots/backends/matplotlib/densityplot.py b/arviz/plots/backends/matplotlib/densityplot.py index 843078e044..bfd803c248 100644 --- a/arviz/plots/backends/matplotlib/densityplot.py +++ b/arviz/plots/backends/matplotlib/densityplot.py @@ -12,7 +12,8 @@ _scale_fig_size, calculate_point_estimate, ) -from ....numeric_utils import _fast_kde, get_bins +from ....numeric_utils import get_bins +from ....kde_utils import _kde def plot_density( @@ -21,6 +22,7 @@ def plot_density( to_plot, colors, bw, + circular, figsize, length_plotters, rows, @@ -74,6 +76,7 @@ def plot_density( label, colors[m_idx], bw, + circular, titlesize, xt_labelsize, linewidth, @@ -102,6 +105,7 @@ def _d_helper( vname, color, bw, + circular, titlesize, xt_labelsize, linewidth, @@ -123,10 +127,11 @@ def _d_helper( variable name color : str matplotlib color - bw : 
float - Bandwidth scaling factor. Should be larger than 0. The higher this number the smoother the - KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule of thumb - (the default used rule by SciPy). + bw: float or str, optional + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental" when `circular` is False + and "taylor" (for now) when `circular` is True. titlesize : float font size for title xt_labelsize : float @@ -152,11 +157,10 @@ def _d_helper( else: new_vec = vec - density, xmin, xmax = _fast_kde(new_vec, bw=bw) + x, density = _kde(new_vec, circular=circular, bw=bw) density *= hdi_prob - x = np.linspace(xmin, xmax, len(density)) - ymin = density[0] - ymax = density[-1] + xmin, xmax = x[0], x[-1] + ymin, ymax = density[0], density[-1] if outline: ax.plot(x, density, color=color, lw=linewidth) diff --git a/arviz/plots/backends/matplotlib/distplot.py b/arviz/plots/backends/matplotlib/distplot.py index c6fe73cae2..bbcc064235 100644 --- a/arviz/plots/backends/matplotlib/distplot.py +++ b/arviz/plots/backends/matplotlib/distplot.py @@ -18,6 +18,7 @@ def plot_dist( rotated, rug, bw, + circular, quantiles, contour, fill_last, @@ -81,6 +82,7 @@ def plot_dist( rug=rug, label=label, bw=bw, + circular=circular, quantiles=quantiles, rotated=rotated, contour=contour, diff --git a/arviz/plots/backends/matplotlib/forestplot.py b/arviz/plots/backends/matplotlib/forestplot.py index 2f0aabf445..93a134f5cd 100644 --- a/arviz/plots/backends/matplotlib/forestplot.py +++ b/arviz/plots/backends/matplotlib/forestplot.py @@ -9,7 +9,8 @@ from . 
import backend_kwarg_defaults, backend_show from ....stats import hdi from ....stats.diagnostics import _ess, _rhat -from ....numeric_utils import _fast_kde, histogram, get_bins +from ....numeric_utils import histogram, get_bins +from ....kde_utils import _kde from ...plot_utils import _scale_fig_size, xarray_var_iter, make_label from ....utils import conditional_jit @@ -518,9 +519,8 @@ def ridgeplot(self, mult, ridgeplot_kind): density_q = density.cumsum() / density.sum() x = x[:-1] elif kind == "density": - density, lower, upper = _fast_kde(values) + x, density = _kde(values) density_q = density.cumsum() / density.sum() - x = np.linspace(lower, upper, len(density)) xvals.append(x) pdfs.append(density) diff --git a/arviz/plots/backends/matplotlib/loopitplot.py b/arviz/plots/backends/matplotlib/loopitplot.py index 8ef0568238..a6f442a929 100644 --- a/arviz/plots/backends/matplotlib/loopitplot.py +++ b/arviz/plots/backends/matplotlib/loopitplot.py @@ -10,7 +10,7 @@ _scale_fig_size, matplotlib_kwarg_dealiaser, ) -from ....numeric_utils import _fast_kde +from ....kde_utils import _kde def plot_loo_pit( @@ -118,8 +118,7 @@ def plot_loo_pit( ax.axhspan(*hdi_odds, **hdi_kwargs) else: for idx in range(n_unif): - unif_density, xmin, xmax = _fast_kde(unif[idx, :]) - x_s = np.linspace(xmin, xmax, len(unif_density)) + x_s, unif_density = _kde(unif[idx, :]) x_ss[idx] = x_s u_dens[idx] = unif_density ax.plot(x_ss.T, u_dens.T, **plot_unif_kwargs) diff --git a/arviz/plots/backends/matplotlib/posteriorplot.py b/arviz/plots/backends/matplotlib/posteriorplot.py index 94820a7ffc..09f4004105 100644 --- a/arviz/plots/backends/matplotlib/posteriorplot.py +++ b/arviz/plots/backends/matplotlib/posteriorplot.py @@ -26,6 +26,7 @@ def plot_posterior( figsize, plotters, bw, + circular, bins, kind, point_estimate, @@ -67,6 +68,7 @@ def plot_posterior( selection, ax=ax_, bw=bw, + circular=circular, bins=bins, kind=kind, point_estimate=point_estimate, @@ -95,6 +97,7 @@ def _plot_posterior_op( 
selection, ax, bw, + circular, linewidth, bins, kind, @@ -204,7 +207,7 @@ def display_rope(): def display_point_estimate(): if not point_estimate: return - point_value = calculate_point_estimate(point_estimate, values, bw) + point_value = calculate_point_estimate(point_estimate, values, bw, circular) sig_figs = format_sig_figs(point_value, round_to) point_text = "{point_estimate}={point_value:.{sig_figs}g}".format( point_estimate=point_estimate, point_value=point_value, sig_figs=sig_figs @@ -269,6 +272,7 @@ def format_axes(): plot_kde( values, bw=bw, + circular=circular, fill_kwargs={"alpha": kwargs.pop("fill_alpha", 0)}, plot_kwargs=kwargs, ax=ax, diff --git a/arviz/plots/backends/matplotlib/ppcplot.py b/arviz/plots/backends/matplotlib/ppcplot.py index 8ebdfc40ce..de5e060b38 100644 --- a/arviz/plots/backends/matplotlib/ppcplot.py +++ b/arviz/plots/backends/matplotlib/ppcplot.py @@ -12,7 +12,8 @@ _create_axes_grid, _scale_fig_size, ) -from ....numeric_utils import _fast_kde, histogram, get_bins +from ....numeric_utils import histogram, get_bins +from ....kde_utils import _kde _log = logging.getLogger(__name__) @@ -149,8 +150,7 @@ def plot_ppc( for vals in pp_sampled_vals: vals = np.array([vals]).flatten() if dtype == "f": - pp_density, lower, upper = _fast_kde(vals) - pp_x = np.linspace(lower, upper, len(pp_density)) + pp_x, pp_density = _kde(vals) pp_densities.append(pp_density) pp_xs.append(pp_x) else: @@ -368,18 +368,13 @@ def _set_animation( if kind == "kde": length = len(pp_sampled_vals) if dtype == "f": - y_vals, lower, upper = _fast_kde(pp_sampled_vals[0]) - x_vals = np.linspace(lower, upper, len(y_vals)) - - max_max = max([max(_fast_kde(pp_sampled_vals[i])[0]) for i in range(length)]) - + x_vals, y_vals = _kde(pp_sampled_vals[0]) + max_max = max([max(_kde(pp_sampled_vals[i])[1]) for i in range(length)]) ax.set_ylim(0, max_max) - (line,) = ax.plot(x_vals, y_vals, **plot_kwargs) def animate(i): - y_vals, lower, upper = _fast_kde(pp_sampled_vals[i]) - x_vals = 
np.linspace(lower, upper, len(y_vals)) + x_vals, y_vals = _kde(pp_sampled_vals[i]) line.set_data(x_vals, y_vals) return line diff --git a/arviz/plots/backends/matplotlib/violinplot.py b/arviz/plots/backends/matplotlib/violinplot.py index 78ac2620b7..e5e3a0b9ca 100644 --- a/arviz/plots/backends/matplotlib/violinplot.py +++ b/arviz/plots/backends/matplotlib/violinplot.py @@ -4,7 +4,8 @@ from . import backend_show from ....stats import hdi -from ....numeric_utils import _fast_kde, histogram, get_bins +from ....kde_utils import _kde +from ....numeric_utils import histogram, get_bins from ...plot_utils import make_label, _create_axes_grid, matplotlib_kwarg_dealiaser, _scale_fig_size @@ -22,6 +23,7 @@ def plot_violin( rug_kwargs, bw, textsize, + circular, hdi_prob, quartiles, backend_kwargs, @@ -59,7 +61,7 @@ def plot_violin( if val[0].dtype.kind == "i": dens = cat_hist(val, rug, shade, ax_, **shade_kwargs) else: - dens = _violinplot(val, rug, shade, bw, ax_, **shade_kwargs) + dens = _violinplot(val, rug, shade, bw, circular, ax_, **shade_kwargs) if rug: rug_x = -np.abs(np.random.normal(scale=max(dens) / 3.5, size=len(val))) @@ -84,10 +86,14 @@ def plot_violin( return ax -def _violinplot(val, rug, shade, bw, ax, **shade_kwargs): +def _violinplot(val, rug, shade, bw, circular, ax, **shade_kwargs): """Auxiliary function to plot violinplots.""" - density, low_b, up_b = _fast_kde(val, bw=bw) - x = np.linspace(low_b, up_b, len(density)) + if bw == "default": + if circular: + bw = "taylor" + else: + bw = "experimental" + x, density = _kde(val, circular=circular, bw=bw) if not rug: x = np.concatenate([x, x[::-1]]) diff --git a/arviz/plots/densityplot.py b/arviz/plots/densityplot.py index 8c4e1a1fb4..3fdaba47c1 100644 --- a/arviz/plots/densityplot.py +++ b/arviz/plots/densityplot.py @@ -25,7 +25,8 @@ def plot_density( outline=True, hdi_markers="", shade=0.0, - bw=4.5, + bw="default", + circular=False, figsize=None, textsize=None, ax=None, @@ -77,10 +78,16 @@ def plot_density( 
shade : Optional[float] Alpha blending value for the shaded area under the curve, between 0 (no shade) and 1 (opaque). Defaults to 0. - bw : Optional[float] - Bandwidth scaling factor for the KDE. Should be larger than 0. The higher this number the - smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule - of thumb (the default rule used by SciPy). + bw: Optional[float or str] + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental" when `circular` is False + and "taylor" (for now) when `circular` is True. + Defaults to "default" which means "experimental" when variable is not circular + and "taylor" when it is. + circular: Optional[bool] + If True, it interprets the values passed are from a circular variable measured in radians + and a circular KDE is used. Only valid for 1D KDE. Defaults to False. figsize : Optional[Tuple[int, int]] Figure size. If None it will be defined automatically. textsize: Optional[float] @@ -212,12 +219,19 @@ def plot_density( length_plotters = max_plots rows, cols = default_grid(length_plotters, max_cols=3) + if bw == "default": + if circular: + bw = "taylor" + else: + bw = "experimental" + plot_density_kwargs = dict( ax=ax, all_labels=all_labels, to_plot=to_plot, colors=colors, bw=bw, + circular=circular, figsize=figsize, length_plotters=length_plotters, rows=rows, diff --git a/arviz/plots/distplot.py b/arviz/plots/distplot.py index bf222810d2..2472386cdc 100644 --- a/arviz/plots/distplot.py +++ b/arviz/plots/distplot.py @@ -15,7 +15,8 @@ def plot_dist( label=None, rotated=False, rug=False, - bw=4.5, + bw="default", + circular=False, quantiles=None, contour=True, fill_last=True, @@ -58,10 +59,16 @@ def plot_dist( Whether to rotate the 1D KDE plot 90 degrees. rug : bool If True adds a rugplot. Defaults to False. 
Ignored for 2D KDE - bw : float - Bandwidth scaling factor for 1D KDE. Should be larger than 0. The higher this number the - smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's - rule of thumb (the default rule used by SciPy). + bw: Optional[float or str] + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental" when `circular` is False + and "taylor" (for now) when `circular` is True. + Defaults to "default" which means "experimental" when variable is not circular + and "taylor" when it is. + circular: Optional[bool] + If True, it interprets the values passed are from a circular variable measured in radians + and a circular KDE is used. Only valid for 1D KDE. Defaults to False. quantiles : list Quantiles in ascending order used to segment the KDE. Use [.25, .5, .75] for quartiles. Defaults to None. @@ -171,6 +178,7 @@ def plot_dist( rotated=rotated, rug=rug, bw=bw, + circular=circular, quantiles=quantiles, contour=contour, fill_last=fill_last, diff --git a/arviz/plots/energyplot.py b/arviz/plots/energyplot.py index c607d19086..91da33dc03 100644 --- a/arviz/plots/energyplot.py +++ b/arviz/plots/energyplot.py @@ -12,7 +12,7 @@ def plot_energy( legend=True, fill_alpha=(1, 0.75), fill_color=("C0", "C5"), - bw=4.5, + bw="experimental", textsize=None, fill_kwargs=None, plot_kwargs=None, @@ -43,10 +43,11 @@ def plot_energy( fill_color : tuple of valid matplotlib color Color for Marginal energy distribution and Energy transition distribution. Defaults to ('C0', 'C5') - bw : float - Bandwidth scaling factor for the KDE. Should be larger than 0. The higher this number the - smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule - of thumb (the default rule used by SciPy). 
Only works if `kind='kde'` + bw: float or str, optional + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental". Defaults to "experimental" + Only works if `kind='kde'` textsize: float Text size scaling factor for labels, titles and lines. If None it will be autoscaled based on figsize. diff --git a/arviz/plots/kdeplot.py b/arviz/plots/kdeplot.py index c883c36247..b65a2c86b1 100644 --- a/arviz/plots/kdeplot.py +++ b/arviz/plots/kdeplot.py @@ -3,7 +3,8 @@ import xarray as xr from ..data import InferenceData -from ..numeric_utils import _fast_kde, _fast_kde_2d +from ..numeric_utils import _fast_kde_2d +from ..kde_utils import _kde from .plot_utils import get_plotting_function from ..rcparams import rcParams @@ -14,7 +15,9 @@ def plot_kde( cumulative=False, rug=False, label=None, - bw=4.5, + bw="default", + adaptive=False, + circular=False, quantiles=None, rotated=False, contour=True, @@ -50,17 +53,27 @@ def plot_kde( If True adds a rugplot. Defaults to False. Ignored for 2D KDE label : string Text to include as part of the legend - bw : float - Bandwidth scaling factor for 1D KDE. Should be larger than 0. The higher this number the - smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's - rule of thumb (the default rule used by SciPy). + bw: float or str, optional + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental" when `circular` is False + and "taylor" (for now) when `circular` is True. + Defaults to "default" which means "experimental" when variable is not circular + and "taylor" when it is. + adaptive: bool, optional. + If True, an adaptive bandwidth is used. Only valid for 1D KDE. + Defaults to False. + circular: bool, optional. 
+ If True, it interprets `values` as a circular variable measured in radians + and a circular KDE is used. Only valid for 1D KDE. Defaults to False. quantiles : list - Quantiles in ascending order used to segment the KDE. Use [.25, .5, .75] for quartiles. - Defaults to None. + Quantiles in ascending order used to segment the KDE. + Use [.25, .5, .75] for quartiles. Defaults to None. rotated : bool Whether to rotate the 1D KDE plot 90 degrees. contour : bool - If True plot the 2D KDE using contours, otherwise plot a smooth 2D KDE. Defaults to True. + If True plot the 2D KDE using contours, otherwise plot a smooth 2D KDE. + Defaults to True. fill_last : bool If True fill the last contour of the 2D KDE plot. Defaults to False. textsize: float @@ -127,6 +140,35 @@ def plot_kde( >>> az.plot_kde(mu_posterior, rug=True) + Plot KDE with adaptive bandwidth + + .. plot:: + :context: close-figs + + >>> az.plot_kde(mu_posterior, adaptive=True) + + Plot KDE with a different bandwidth estimator + + .. plot:: + :context: close-figs + + >>> az.plot_kde(mu_posterior, bw="scott") + + Plot KDE with a bandwidth specified manually + + .. plot:: + :context: close-figs + + >>> az.plot_kde(mu_posterior, bw=0.4) + + Plot KDE for a circular variable + + .. plot:: + :context: close-figs + + >>> rvs = np.random.vonmises(mu=np.pi, kappa=2, size=500) + >>> az.plot_kde(rvs, circular=True) + Plot a cumulative distribution @@ -179,6 +221,10 @@ def plot_kde( >>> az.plot_kde(mu_posterior, values2=tau_posterior, contour=False) + See Also + -------- + kde : One dimensional density estimation. 
+ """ if isinstance(values, xr.Dataset): raise ValueError( @@ -192,7 +238,15 @@ def plot_kde( ) if values2 is None: - density, lower, upper = _fast_kde(values, cumulative, bw) + + if bw == "default": + if circular: + bw = "taylor" + else: + bw = "experimental" + + grid, density = _kde(values, circular, bw=bw, adaptive=adaptive, cumulative=cumulative) + lower, upper = grid[0], grid[-1] if cumulative: density_q = density diff --git a/arviz/plots/loopitplot.py b/arviz/plots/loopitplot.py index 9d6268fa32..395c90cdf1 100644 --- a/arviz/plots/loopitplot.py +++ b/arviz/plots/loopitplot.py @@ -3,7 +3,7 @@ import scipy.stats as stats from ..stats import loo_pit as _loo_pit -from ..numeric_utils import _fast_kde +from ..kde_utils import _kde from .plot_utils import get_plotting_function from ..rcparams import rcParams @@ -161,10 +161,9 @@ def plot_loo_pit( ) unif_ecdf = unif_ecdf / n_data_points else: - loo_pit_kde, xmin, xmax = _fast_kde(loo_pit) + x_vals, loo_pit_kde = _kde(loo_pit) unif = np.random.uniform(size=(n_unif, loo_pit.size)) - x_vals = np.linspace(xmin, xmax, len(loo_pit_kde)) if use_hdi: n_obs = loo_pit.size hdi_ = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - credible_interval) / 2) diff --git a/arviz/plots/plot_utils.py b/arviz/plots/plot_utils.py index 2a52cab39c..f6818c12d8 100644 --- a/arviz/plots/plot_utils.py +++ b/arviz/plots/plot_utils.py @@ -13,7 +13,7 @@ from scipy.stats import mode import xarray as xr -from ..numeric_utils import _fast_kde +from ..kde_utils import _kde from ..rcparams import rcParams KwargSpec = Dict[str, Any] @@ -611,7 +611,7 @@ def get_plotting_function(plot_name, plot_module, backend): return plotting_method -def calculate_point_estimate(point_estimate, values, bw=4.5): +def calculate_point_estimate(point_estimate, values, bw="default", circular=False): """Validate and calculate the point estimate. Parameters @@ -620,10 +620,16 @@ def calculate_point_estimate(point_estimate, values, bw=4.5): Plot point estimate per variable. 
Values should be 'mean', 'median', 'mode' or None. Defaults to 'auto' i.e. it falls back to default set in rcParams. values : 1-d array - bw : float - Bandwidth scaling factor. Should be larger than 0. The higher this number the smoother the - KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule of thumb - (the default used rule by SciPy). + bw: Optional[float or str] + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental" when `circular` is False + and "taylor" (for now) when `circular` is True. + Defaults to "default" which means "experimental" when variable is not circular + and "taylor" when it is. + circular: Optional[bool] + If True, it interprets the values passed are from a circular variable measured in radians + and a circular KDE is used. Only valid for 1D KDE. Defaults to False. Returns ------- @@ -643,8 +649,12 @@ def calculate_point_estimate(point_estimate, values, bw=4.5): point_value = values.mean() elif point_estimate == "mode": if isinstance(values[0], float): - density, lower, upper = _fast_kde(values, bw=bw) - x = np.linspace(lower, upper, len(density)) + if bw == "default": + if circular: + bw = "taylor" + else: + bw = "experimental" + x, density = _kde(values, circular=circular, bw=bw) point_value = x[np.argmax(density)] else: point_value = mode(values)[0][0] @@ -713,8 +723,7 @@ def sample_reference_distribution(dist, shape): densities = [] dist_rvs = dist.rvs(size=shape) for idx in range(shape[1]): - density, xmin, xmax = _fast_kde(dist_rvs[:, idx]) - x_s = np.linspace(xmin, xmax, len(density)) + x_s, density = _kde(dist_rvs[:, idx]) x_ss.append(x_s) densities.append(density) return np.array(x_ss).T, np.array(densities).T diff --git a/arviz/plots/posteriorplot.py b/arviz/plots/posteriorplot.py index 04d0307f6a..6cf43c7d9f 100644 --- a/arviz/plots/posteriorplot.py +++ 
b/arviz/plots/posteriorplot.py @@ -26,7 +26,8 @@ def plot_posterior( rope=None, ref_val=None, kind="kde", - bw=4.5, + bw="default", + circular=False, bins=None, ax=None, backend=None, @@ -82,10 +83,17 @@ def plot_posterior( kind: str Type of plot to display (kde or hist) For discrete variables this argument is ignored and a histogram is always used. - bw: float - Bandwidth scaling factor for the KDE. Should be larger than 0. The higher this number the - smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule - of thumb (the default rule used by SciPy). Only works if `kind == kde`. + bw: float or str, optional + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental" when `circular` is False + and "taylor" (for now) when `circular` is True. + Defaults to "default" which means "experimental" when variable is not circular + and "taylor" when it is. Only works if `kind == kde`. + circular: bool, optional + If True, it interprets the values passed are from a circular variable measured in radians + and a circular KDE is used. Only valid for 1D KDE. Defaults to False. + Only works if `kind == kde`. bins: integer or sequence or 'auto', optional Controls the number of bins, accepts the same keywords `matplotlib.hist()` does. Only works if `kind == hist`. 
If None (default) it will use `auto` for continuous variables and @@ -224,6 +232,7 @@ def plot_posterior( figsize=figsize, plotters=plotters, bw=bw, + circular=circular, bins=bins, kind=kind, point_estimate=point_estimate, diff --git a/arviz/plots/violinplot.py b/arviz/plots/violinplot.py index e69dd9012b..5037a3bec0 100644 --- a/arviz/plots/violinplot.py +++ b/arviz/plots/violinplot.py @@ -19,7 +19,8 @@ def plot_violin( rug=False, hdi_prob=None, shade=0.35, - bw=4.5, + bw="default", + circular=False, sharex=True, sharey=True, figsize=None, @@ -63,10 +64,16 @@ def plot_violin( shade: float Alpha blending value for the shaded area under the curve, between 0 (no shade) and 1 (opaque). Defaults to 0 - bw: float - Bandwidth scaling factor. Should be larger than 0. The higher this number the smoother the - KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule of thumb - (the default rule used by SciPy). + bw: float or str, optional + If numeric, indicates the bandwidth and must be positive. + If str, indicates the method to estimate the bandwidth and must be + one of "scott", "silverman", "isj" or "experimental" when `circular` is False + and "taylor" (for now) when `circular` is True. + Defaults to "default" which means "experimental" when variable is not circular + and "taylor" when it is. + circular: bool, optional. + If True, it interprets `values` is a circular variable measured in radians + and a circular KDE is used. Defaults to False. figsize: tuple Figure size. If None it will be defined automatically. 
textsize: int @@ -151,6 +158,7 @@ def plot_violin( rug_kwargs=rug_kwargs, bw=bw, textsize=textsize, + circular=circular, hdi_prob=hdi_prob, quartiles=quartiles, backend_kwargs=backend_kwargs, diff --git a/arviz/stats/stats.py b/arviz/stats/stats.py index f7dd0b4639..c5b0370f4f 100644 --- a/arviz/stats/stats.py +++ b/arviz/stats/stats.py @@ -22,8 +22,10 @@ stats_variance_2d as svar, _circular_standard_deviation, get_log_likelihood as _get_log_likelihood, + histogram, ) -from ..numeric_utils import _fast_kde, histogram, get_bins +from ..numeric_utils import get_bins +from ..kde_utils import _kde from ..utils import _var_names, Numba, _numba_var, get_coords, credible_interval_warning from ..rcparams import rcParams @@ -545,10 +547,10 @@ def _hdi_multimodal(ary, hdi_prob, skipna, max_modes): ary = ary[~np.isnan(ary)] if ary.dtype.kind == "f": - density, lower, upper = _fast_kde(ary) + bins, density = _kde(ary) + lower, upper = bins[0], bins[-1] range_x = upper - lower dx = range_x / len(density) - bins = np.linspace(lower, upper, len(density)) else: bins = get_bins(ary) _, density, _ = histogram(ary, bins=bins) diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py index b6e6d49064..08f4e44139 100644 --- a/arviz/tests/base_tests/test_plots_matplotlib.py +++ b/arviz/tests/base_tests/test_plots_matplotlib.py @@ -45,7 +45,7 @@ plot_bpv, ) from ...utils import _cov -from ...numeric_utils import _fast_kde +from ...kde_utils import _kde rcParams["data.load"] = "eager" @@ -923,23 +923,25 @@ def test_plot_hdi_dataset_error(models): @pytest.mark.parametrize("limits", [(-10.0, 10.0), (-5, 5), (None, None)]) -def test_fast_kde_scipy(limits): +def test_kde_scipy(limits): + """ + Evaluates if sum of density is the same for our implementation + and the implementation in scipy + """ data = np.random.normal(0, 1, 10000) - if limits[0] is None: - x = np.linspace(data.min(), data.max(), 200) # pylint: disable=no-member - else: - x 
= np.linspace(*limits, 500) - density = gaussian_kde(data).evaluate(x) - density_fast = _fast_kde(data, xmin=limits[0], xmax=limits[1])[0] - - np.testing.assert_almost_equal(density_fast.sum(), density.sum(), 1) + grid, density_own = _kde(data, custom_lims=limits) + density_sp = gaussian_kde(data).evaluate(grid) + np.testing.assert_almost_equal(density_own.sum(), density_sp.sum(), 1) @pytest.mark.parametrize("limits", [(-10.0, 10.0), (-5, 5), (None, None)]) -def test_fast_kde_cumulative(limits): +def test_kde_cumulative(limits): + """ + Evaluates if last value of cumulative density is 1 + """ data = np.random.normal(0, 1, 1000) - density_fast = _fast_kde(data, xmin=limits[0], xmax=limits[1], cumulative=True)[0] - np.testing.assert_almost_equal(round(density_fast[-1], 3), 1) + density = _kde(data, custom_lims=limits, cumulative=True)[1] + np.testing.assert_almost_equal(round(density[-1], 3), 1) @pytest.mark.parametrize( diff --git a/arviz/tests/base_tests/test_utils.py b/arviz/tests/base_tests/test_utils.py index 2d507d9121..cef49000f6 100644 --- a/arviz/tests/base_tests/test_utils.py +++ b/arviz/tests/base_tests/test_utils.py @@ -5,6 +5,7 @@ from unittest.mock import Mock import numpy as np import pytest +import scipy.stats as st from arviz.data.base import dict_to_dataset from ...utils import ( @@ -17,6 +18,7 @@ _subset_list, ) from ...data import load_arviz_data, from_dict +from ...numeric_utils import _circular_mean, _normalize_angle @pytest.fixture(scope="session") @@ -271,3 +273,23 @@ def test_flatten_inference_data_to_dict( assert not any("sample_stats" in item for item in res_dict) else: assert not any("prior" in item for item in res_dict) + + +@pytest.mark.parametrize("mean", [0, np.pi, 4 * np.pi, -2 * np.pi, -10 * np.pi]) +def test_circular_mean_scipy(mean): + """Test our `_circular_mean()` function gives same result than Scipy version.""" + rvs = st.vonmises.rvs(loc=mean, kappa=1, size=1000) + mean_az = _circular_mean(rvs) + mean_sp = st.circmean(rvs, 
low=-np.pi, high=np.pi) + np.testing.assert_almost_equal(mean_az, mean_sp) + + +@pytest.mark.parametrize("mean", [0, np.pi, 4 * np.pi, -2 * np.pi, -10 * np.pi]) +def test_normalize_angle(mean): + """Test that _normalize_angle() returns values between the expected bounds""" + rvs = st.vonmises.rvs(loc=mean, kappa=1, size=1000) + values = _normalize_angle(rvs, zero_centered=True) + assert ((-np.pi <= values) & (values <= np.pi)).all() + + values = _normalize_angle(rvs, zero_centered=False) + assert ((values >= 0) & (values <= 2 * np.pi)).all()