Source code for mlpy.stats._stats

from __future__ import division, print_function, absolute_import
# noinspection PyUnresolvedReferences
from six.moves import range

import numpy as np
# noinspection PyPackageRequirements
from sklearn.utils.extmath import logsumexp

from ..auxiliary.array import nunique

__all__ = ['is_posdef', 'randpd', 'stacked_randpd', 'normalize_logspace', 'sq_distance',
           'partitioned_mean', 'partitioned_cov', 'partitioned_sum', 'shrink_cov',
           'canonize_labels']


[docs]def is_posdef(a):
    """Test if matrix `a` is positive definite.

    The method uses Cholesky decomposition to determine if
    the matrix is positive definite.

    Parameters
    ----------
    a : ndarray
        A matrix.

    Returns
    -------
    bool :
        Whether the matrix is positive definite.

    Examples
    --------
    >>> is_posdef()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    try:
        np.linalg.cholesky(np.asarray(a))
        return True
    except np.linalg.LinAlgError:
        return False


[docs]def randpd(dim):
    """Create a random positive definite matrix of size `dim`-by-`dim`.

    Parameters
    ----------
    dim : int
        The dimension of the matrix to create.

    Returns
    -------
    ndarray :
        A `dim`-by-`dim` positive definite matrix.

    Examples
    --------
    >>> randpd()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    x = np.random.randn(dim, dim)
    a = x * x.T
    while not is_posdef(a):
        a = a + np.diag(0.001 * np.ones(dim))

    return a


[docs]def stacked_randpd(dim, k, p=0):
    """Create stacked positive definite matrices.

    Create multiple random positive definite matrices of size
    dim-by-dim and stack them.

    Parameters
    ----------
    dim : int
        The dimension of each matrix.
    k : int
        The number of matrices.
    p : int
        The diagonal value of each matrix.

    Returns
    -------
    ndarray :
        Multiple stacked random positive definite matrices.

    Examples
    --------
    >>> stacked_randpd()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    s = np.zeros((k, dim, dim))
    for i in range(k):
        s[i] = randpd(dim) + np.diag(p * np.ones(dim))

    return s


[docs]def normalize_logspace(a):
    """Normalizes the array `a` in the log domain.

    Each row of `a` is a log discrete distribution. Returns
    the array normalized in the log domain while minimizing the
    possibility of numerical underflow.

    Parameters
    ----------
    a : ndarray
        The array to normalize in the log domain.

    Returns
    -------
    a : ndarray
        The array normalized in the log domain.
    lnorm : float
        log normalization constant.

    Examples
    --------
    >>> normalize_logspace()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    l = logsumexp(a, 1)
    y = a.T - l
    return y.T, l


[docs]def sq_distance(p, q, p_sos=None, q_sos=None):
    """Efficiently compute squared Euclidean distances between stats of vectors.

    Compute the squared Euclidean distances between every d-dimensional point
    in `p` to every `d`-dimensional point in q. Both `p` and `q` are n-point-by-n-dimensions.

    Parameters
    ----------
    p : array_like, shape (`n`, `dim`)
        Array where `n` is the number of points and `dim` is the number of
        dimensions.
    q : array_like, shape (`n`, `dim`)
        Array where `n` is the number of points and `dim` is the number of
        dimensions.
    p_sos : array_like, shape (`dim`,)
    q_sos : array_like, shape (`dim`,)

    Returns
    -------
    ndarray :
        The squared Euclidean distance.

    Examples
    --------
    >>> sq_distance()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    p_sos = np.sum(np.power(p, 2), 1) if p_sos is None else p_sos
    # noinspection PyTypeChecker
    q_sos = np.sum(np.power(q, 2), 1) if q_sos is None else q_sos

    # noinspection PyUnresolvedReferences
    n = q_sos.shape[0]
    # noinspection PyUnresolvedReferences
    return (q_sos.reshape((n, 1)) + p_sos).T - 2 * np.dot(p, q.T)


[docs]def partitioned_mean(x, y, c=None, return_counts=False):
    """Mean of groups.

    Groups the rows of `x` according to the class labels in y and
    takes the mean of each group.

    Parameters
    ----------
    x : array_like, shape (`n`, `dim`)
        The data to group, where `n` is the number of data points and
        `dim` is the dimensionality of each data point.
    y : array_like, shape (`n`,)
        The class label for each data point.
    return_counts : bool
        Whether to return the number of elements in each group or not.

    Returns
    -------
    mean : array_like
        The mean of each group.
    counts : int
        The number of elements in each group.

    Examples
    --------
    >>> partitioned_mean()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    c = nunique(y) if c is None else c

    dim = x.shape[1]
    m = np.zeros((c, dim))

    for i in range(c):
        ndx = y == i
        m[i] = np.mean(x[ndx], 0)

    if not return_counts:
        ret = m
    else:
        ret = (m,)
        # noinspection PyTupleAssignmentBalance
        _, counts = np.unique(y, return_counts=True)
        ret += (counts,)

    return ret


[docs]def partitioned_cov(x, y, c=None):
    """Covariance of groups.

    Partition the rows of `x` according to class labels in `y` and
    take the covariance of each group.

    Parameters
    ----------
    x : array_like, shape (`n`, `dim`)
        The data to group, where `n` is the number of data points and
        `dim` is the dimensionality of each data point.
    y : array_like, shape (`n`,)
        The class label for each data point.
    c : int
        The number of components in `y`.

    Returns
    -------
    cov : array_like
        The covariance of each group.

    Examples
    --------
    >>> partitioned_cov()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    .. warning::
        Implementation of this function is not finished yet.

    """
    c = nunique(y) if c is None else c

    dim = x.shape[1]
    cov = np.zeros((c, dim, dim))

    for i in range(c):
        cov[i] = np.cov(x[y == c])


[docs]def partitioned_sum(x, y, c=None):
    """Sums of groups.

    Groups the rows of `x` according to the class labels in `y`
    and sums each group.

    Parameters
    ----------
    x : array_like, shape (`n`, `dim`)
        The data to group, where `n` is the number of data points and
        `dim` is the dimensionality of each data point.
    y : array_like, shape (`n`,)
        The class label for each data point.
    c : int
        The number of components in `y`.

    Returns
    -------
    sums : array_like
        The sum of each group.

    Examples
    --------
    >>> partitioned_sum()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    c = nunique(y) if c is None else c
    # noinspection PyTypeChecker
    return np.dot(np.arange(0, c).reshape(c, 1) == y, x)


[docs]def shrink_cov(x, return_lambda=False, return_estimate=False):
    """Covariance shrinkage estimation.

    Ledoit-Wolf optimal shrinkage estimator for cov(X)
    :math:`C = \\lambda*t + (1 - \\lambda) * s`
    using the diagonal variance 'target' t=np.diag(s) with the
    unbiased sample cov `s` as the unconstrained estimate.

    Parameters
    ----------
    x : array_like, shape (`n`, `dim`)
        The data, where `n` is the number of data points and
        `dim` is the dimensionality of each data point.
    return_lambda : bool
        Whether to return lambda or not.
    return_estimate : bool
        Whether to return the unbiased estimate or not.

    Returns
    -------
    C : array
        The shrunk final estimate
    lambda_ : float, optional
        Lambda
    estimate : array, optional
        Unbiased estimate.

    Examples
    --------
    >>> shrink_cov()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    """
    optional_returns = return_lambda or return_estimate

    n, p = x.shape

    x_mean = np.mean(x, 0)
    x = x - x_mean

    # noinspection PyTypeChecker
    s = np.asarray(np.dot(x.T, x) / (n - 1))      # unbiased estimate
    s_bar = (n - 1) * s / n

    s_var = np.zeros((p, p))
    for i in range(n):
        # noinspection PyTypeChecker
        s_var += np.power(x[i].reshape(p, 1) * x[i] - s_bar, 2)
    s_var = np.true_divide(n, (n - 1)**3) * s_var

    # calculate optimal shrinkage
    o_shrink = np.triu(np.ones((p, p))) - np.eye(p)

    # Ledoit-Wolf formula
    lambda_ = np.sum(s_var[o_shrink.astype(np.bool)]) / np.sum(np.power(s[o_shrink.astype(np.bool)], 2))

    # bound-constrain lambda
    lambda_ = np.max([0, np.min([1, lambda_])])

    # shrunk final estimate C
    c = lambda_ * np.diag(np.diag(s)) + (1 - lambda_) * s

    if not optional_returns:
        ret = c
    else:
        ret = (c,)
        if return_lambda:
            ret += (lambda_,)
        if return_estimate:
            ret += (s,)
    return ret


[docs]def canonize_labels(labels, support=None):
    """Transform labels to 1:k.

    The size of canonized is the same as ladles but every label is
    transformed to its corresponding 1:k. If labels does not span
    the support, specify the support explicitly as the 2nd argument.

    Parameters
    ----------
    labels : array_like
    support : optional

    Returns
    -------
    Transformed labels.

    Examples
    --------
    >>> canonize_labels()

    .. note::
        Adapted from Matlab:

        | Project: `Probabilistic Modeling Toolkit for Matlab/Octave <https://github.com/probml/pmtk3>`_.
        | Copyright (2010) Kevin Murphy and Matt Dunham
        | License: `MIT <https://github.com/probml/pmtk3/blob/5fefd068a2e84ae508684d3e4750bd72a4164ba0/license.txt>`_

    .. warning::
        This is only a stub function. Implementation is still missing
    """
    raise NotImplementedError