Source code for mlpy.stats._stats

from __future__ import division, print_function, absolute_import
# noinspection PyUnresolvedReferences
from six.moves import range

import numpy as np
# noinspection PyPackageRequirements
from sklearn.utils.extmath import logsumexp

from ..auxiliary.array import nunique

__all__ = ['is_posdef', 'randpd', 'stacked_randpd', 'normalize_logspace', 'sq_distance',
           'partitioned_mean', 'partitioned_cov', 'partitioned_sum', 'shrink_cov',
           'canonize_labels']


[docs]def is_posdef(a): """Test if matrix `a` is positive definite. The method uses Cholesky decomposition to determine if the matrix is positive definite. Parameters ---------- a : ndarray A matrix. Returns ------- bool : Whether the matrix is positive definite. Examples -------- >>> is_posdef() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ try: np.linalg.cholesky(np.asarray(a)) return True except np.linalg.LinAlgError: return False
[docs]def randpd(dim): """Create a random positive definite matrix of size `dim`-by-`dim`. Parameters ---------- dim : int The dimension of the matrix to create. Returns ------- ndarray : A `dim`-by-`dim` positive definite matrix. Examples -------- >>> randpd() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ x = np.random.randn(dim, dim) a = x * x.T while not is_posdef(a): a = a + np.diag(0.001 * np.ones(dim)) return a
[docs]def stacked_randpd(dim, k, p=0): """Create stacked positive definite matrices. Create multiple random positive definite matrices of size dim-by-dim and stack them. Parameters ---------- dim : int The dimension of each matrix. k : int The number of matrices. p : int The diagonal value of each matrix. Returns ------- ndarray : Multiple stacked random positive definite matrices. Examples -------- >>> stacked_randpd() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ s = np.zeros((k, dim, dim)) for i in range(k): s[i] = randpd(dim) + np.diag(p * np.ones(dim)) return s
[docs]def normalize_logspace(a): """Normalizes the array `a` in the log domain. Each row of `a` is a log discrete distribution. Returns the array normalized in the log domain while minimizing the possibility of numerical underflow. Parameters ---------- a : ndarray The array to normalize in the log domain. Returns ------- a : ndarray The array normalized in the log domain. lnorm : float log normalization constant. Examples -------- >>> normalize_logspace() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ l = logsumexp(a, 1) y = a.T - l return y.T, l
[docs]def sq_distance(p, q, p_sos=None, q_sos=None): """Efficiently compute squared Euclidean distances between stats of vectors. Compute the squared Euclidean distances between every d-dimensional point in `p` to every `d`-dimensional point in q. Both `p` and `q` are n-point-by-n-dimensions. Parameters ---------- p : array_like, shape (`n`, `dim`) Array where `n` is the number of points and `dim` is the number of dimensions. q : array_like, shape (`n`, `dim`) Array where `n` is the number of points and `dim` is the number of dimensions. p_sos : array_like, shape (`dim`,) q_sos : array_like, shape (`dim`,) Returns ------- ndarray : The squared Euclidean distance. Examples -------- >>> sq_distance() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ p_sos = np.sum(np.power(p, 2), 1) if p_sos is None else p_sos # noinspection PyTypeChecker q_sos = np.sum(np.power(q, 2), 1) if q_sos is None else q_sos # noinspection PyUnresolvedReferences n = q_sos.shape[0] # noinspection PyUnresolvedReferences return (q_sos.reshape((n, 1)) + p_sos).T - 2 * np.dot(p, q.T)
[docs]def partitioned_mean(x, y, c=None, return_counts=False): """Mean of groups. Groups the rows of `x` according to the class labels in y and takes the mean of each group. Parameters ---------- x : array_like, shape (`n`, `dim`) The data to group, where `n` is the number of data points and `dim` is the dimensionality of each data point. y : array_like, shape (`n`,) The class label for each data point. return_counts : bool Whether to return the number of elements in each group or not. Returns ------- mean : array_like The mean of each group. counts : int The number of elements in each group. Examples -------- >>> partitioned_mean() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ c = nunique(y) if c is None else c dim = x.shape[1] m = np.zeros((c, dim)) for i in range(c): ndx = y == i m[i] = np.mean(x[ndx], 0) if not return_counts: ret = m else: ret = (m,) # noinspection PyTupleAssignmentBalance _, counts = np.unique(y, return_counts=True) ret += (counts,) return ret
[docs]def partitioned_cov(x, y, c=None): """Covariance of groups. Partition the rows of `x` according to class labels in `y` and take the covariance of each group. Parameters ---------- x : array_like, shape (`n`, `dim`) The data to group, where `n` is the number of data points and `dim` is the dimensionality of each data point. y : array_like, shape (`n`,) The class label for each data point. c : int The number of components in `y`. Returns ------- cov : array_like The covariance of each group. Examples -------- >>> partitioned_cov() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ .. warning:: Implementation of this function is not finished yet. """ c = nunique(y) if c is None else c dim = x.shape[1] cov = np.zeros((c, dim, dim)) for i in range(c): cov[i] = np.cov(x[y == c])
[docs]def partitioned_sum(x, y, c=None): """Sums of groups. Groups the rows of `x` according to the class labels in `y` and sums each group. Parameters ---------- x : array_like, shape (`n`, `dim`) The data to group, where `n` is the number of data points and `dim` is the dimensionality of each data point. y : array_like, shape (`n`,) The class label for each data point. c : int The number of components in `y`. Returns ------- sums : array_like The sum of each group. Examples -------- >>> partitioned_sum() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ c = nunique(y) if c is None else c # noinspection PyTypeChecker return np.dot(np.arange(0, c).reshape(c, 1) == y, x)
[docs]def shrink_cov(x, return_lambda=False, return_estimate=False): """Covariance shrinkage estimation. Ledoit-Wolf optimal shrinkage estimator for cov(X) :math:`C = \\lambda*t + (1 - \\lambda) * s` using the diagonal variance 'target' t=np.diag(s) with the unbiased sample cov `s` as the unconstrained estimate. Parameters ---------- x : array_like, shape (`n`, `dim`) The data, where `n` is the number of data points and `dim` is the dimensionality of each data point. return_lambda : bool Whether to return lambda or not. return_estimate : bool Whether to return the unbiased estimate or not. Returns ------- C : array The shrunk final estimate lambda_ : float, optional Lambda estimate : array, optional Unbiased estimate. Examples -------- >>> shrink_cov() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ """ optional_returns = return_lambda or return_estimate n, p = x.shape x_mean = np.mean(x, 0) x = x - x_mean # noinspection PyTypeChecker s = np.asarray(np.dot(x.T, x) / (n - 1)) # unbiased estimate s_bar = (n - 1) * s / n s_var = np.zeros((p, p)) for i in range(n): # noinspection PyTypeChecker s_var += np.power(x[i].reshape(p, 1) * x[i] - s_bar, 2) s_var = np.true_divide(n, (n - 1)**3) * s_var # calculate optimal shrinkage o_shrink = np.triu(np.ones((p, p))) - np.eye(p) # Ledoit-Wolf formula lambda_ = np.sum(s_var[o_shrink.astype(np.bool)]) / np.sum(np.power(s[o_shrink.astype(np.bool)], 2)) # bound-constrain lambda lambda_ = np.max([0, np.min([1, lambda_])]) # shrunk final estimate C c = lambda_ * np.diag(np.diag(s)) + (1 - lambda_) * s if not optional_returns: ret = c else: ret = (c,) if return_lambda: ret += (lambda_,) if return_estimate: ret += (s,) return ret
[docs]def canonize_labels(labels, support=None): """Transform labels to 1:k. The size of canonized is the same as ladles but every label is transformed to its corresponding 1:k. If labels does not span the support, specify the support explicitly as the 2nd argument. Parameters ---------- labels : array_like support : optional Returns ------- Transformed labels. Examples -------- >>> canonize_labels() .. note:: Adapted from Matlab: | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_. | Copyright (2010) Kevin Murphy and Matt Dunham | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_ .. warning:: This is only a stub function. Implementation is still missing """ raise NotImplementedError