# Source code for mlpy.auxiliary.datasets

"""
.. module:: mlpy.auxiliary.datasets
   :platform: Unix, Windows
   :synopsis: Manages recording of data.

.. moduleauthor:: Astrid Jackson <ajackson@eecs.ucf.edu>
"""
from __future__ import division, print_function, absolute_import

import numpy as np
from .io import load_from_file, save_to_file


class DataSet(object):
    """The data set.

    The data set class is a container for tracked data. Data can be tracked
    by adding a ``field`` for the data of interest. A :class:`numpy.ndarray`
    is created for every field that is added for recording. Optionally a
    `description` and a :class:`numpy.dtype` can be associated with the field.

    Every field is stored as a one-dimensional object array with one entry
    per sequence; each entry is a two-dimensional array holding one data
    record per column.

    Parameters
    ----------
    capacity : int
        The initial capacity of the record, i.e. the number of sequences
        for which storage is pre-allocated. Defaults to 10.
    filename : str
        The name of the file to load from/save to the record.
    append : bool
        Whether to append to the existing records loaded from file
        or to overwrite data. Defaults to ``False``.

    Examples
    --------
    Creating a new dataset that stores its records in ``my_history.pkl``:

    >>> history = DataSet(capacity=2, filename="my_history.pkl")

    Adding a new field:

    >>> history.add_field("state", 3, dtype=DataSet.DTYPE_FLOAT)
    >>> print(history)
    state: dim(2,)
    []

    Adding a new data record:

    >>> import numpy as np
    >>> history.append("state", np.ones(3))

    Add a new sequence:

    >>> history.new_sequence()

    Save the dataset to file:

    >>> history.save()

    """
    # ``np.object`` was only an alias of the builtin ``object`` and has been
    # removed from recent NumPy releases; the builtin works everywhere.
    DTYPE_OBJECT = object
    DTYPE_FLOAT = np.float64
    DTYPE_INT = np.int32

    # Number of record columns pre-allocated for every new sequence.
    _INITIAL_SEQUENCE_SIZE = 50
    # Suffix under which field descriptions are stored in the saved file.
    _DESCRIPTION_SUFFIX = "_descr"

    # ``basestring`` only exists on Python 2; fall back to ``str`` otherwise.
    try:
        _STRING_TYPES = (basestring,)  # noqa: F821
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, capacity=None, filename=None, append=None):
        # Index of the sequence currently being recorded; -1 until the first
        # field is added or the first sequence is started.
        self._sequence_index = -1

        self._description = {}
        self._data = {}
        # Per field: number of records written to the current sequence.
        self._endmarker = {}
        self._dtype = {}

        self._capacity = capacity if capacity is not None else 10
        self._append = append if append is not None else False
        self._filename = filename

    def load(self, filename=None):
        """Load the records from file.

        If filename is ``None``, the record is loaded from the
        member variable filename. Nothing is loaded unless the data set
        was created with ``append=True``.

        Parameters
        ----------
        filename : str
            The name of the file.

        Raises
        ------
        ValueError
            If no filename is passed to the function and the member
            variable filename is ``None``.

        Notes
        -----
        An :class:`IOError` raised while reading the file is ignored and
        the data set is left unchanged.

        """
        if self._append is False:
            return

        if filename is None:
            filename = self._filename
        if filename is None:
            raise ValueError("No filename specified.")

        try:
            data = load_from_file(filename)
        except IOError:
            # Best effort: without a readable file recording starts afresh.
            return

        suffix = self._DESCRIPTION_SUFFIX
        loaded_capacity = None
        for name in data:
            entry = data[name]
            if name.endswith(suffix):
                self._description[name[:-len(suffix)]] = entry
                continue

            # Recording resumes in the last stored sequence.
            last_sequence = entry[-1]
            self._data[name] = entry
            # The stored array already carries the dtype the field was
            # recorded with, so there is no need to guess it from a record.
            self._dtype[name] = last_sequence.dtype
            self._endmarker[name] = last_sequence.shape[1]

            if loaded_capacity is None:
                loaded_capacity = entry.shape[0]

        # Only move the sequence pointer if the file actually held fields.
        if loaded_capacity is not None:
            self._capacity = loaded_capacity
            self._sequence_index = loaded_capacity - 1

    def save(self, filename=None):
        """Save the record to file.

        If filename is ``None``, the record is saved to the member
        variable filename.

        Parameters
        ----------
        filename : str
            The name of the file

        Raises
        ------
        ValueError
            If no filename is passed to the function and the member variable
            filename is ``None``.

        Notes
        -----
        Writing is delegated to ``save_to_file``; errors are not handled
        here.

        """
        if filename is None:
            filename = self._filename
        if filename is None:
            raise ValueError("No filename specified.")

        save_to_file(filename, self._export())

    def _export(self):
        """Build the dictionary that is written to file by :meth:`save`.

        Returns
        -------
        dict :
            For every field the sequences recorded so far, with the current
            sequence trimmed to the records actually written, plus a
            ``<name>_descr`` entry for every field with a description.

        """
        current = self._sequence_index

        exported = {}
        for name in self.get_field_names():
            # Copy the (shallow) sequence array: trimming through a plain
            # slice would be a view and shrink the live recording buffer.
            sequences = self._data[name][:current + 1].copy()
            sequences[current] = self._reduce(self._data[name][current],
                                              self._endmarker[name], axis=1)
            exported[name] = sequences

        for name in self.get_field_names():
            if name in self._description:
                exported[name + self._DESCRIPTION_SUFFIX] = self._description[name]

        return exported

    def __str__(self):
        # NOTE(review): the slice below is taken over the sequence axis but
        # bounded by the record counter of the current sequence; it is kept
        # unchanged to preserve the printed output - confirm intended format.
        parts = []
        for name in self._data:
            field = self._data[name]
            parts.append(name + ": dim" + str(field.shape) + "\n" +
                         str(field[:self._endmarker[name]]) + "\n\n")
        return "".join(parts)

    def __getitem__(self, name):
        return self.get_field(name)

    def get_field_names(self):
        """Returns all field names.

        Returns
        -------
        tuple[str] :
            A tuple of field names.

        """
        return tuple(self._data.keys())

    def get_field(self, name):
        """Returns the field with the given name.

        Parameters
        ----------
        name : str
            The name of the field.

        Returns
        -------
        ndarray or None :
            If a field with that name exists, the field data is returned,
            otherwise ``None``.

        """
        return self._data.get(name)

    def has_field(self, name):
        """Checks if a field with that name exists.

        Parameters
        ----------
        name : str
            The name of the field.

        Returns
        -------
        bool :
            Whether a field with that name exists.

        """
        return name in self._data

    def add_field(self, name, dim, dtype=None, description=None):
        """Add a field with the given specifications.

        Adding a field that already exists leaves the existing field
        untouched.

        Parameters
        ----------
        name : str
            The name of the field.
        dim : int
            The dimensions of the field
        dtype : dtype
            The :class:`numpy.dtype` for the underlying :class:`numpy.ndarray`.
            Defaults to ``DataSet.DTYPE_FLOAT``.
        description : str
            An optional description of the field.

        """
        # The first field implicitly starts the first sequence.
        if self._sequence_index < 0:
            self._sequence_index += 1

        if name in self._data:
            return

        if description is not None:
            self._description[name] = description

        dtype = dtype if dtype is not None else self.DTYPE_FLOAT
        sequences = np.zeros((self._capacity,), dtype=object)
        sequences[self._sequence_index] = np.zeros((dim, self._INITIAL_SEQUENCE_SIZE), dtype=dtype)

        self._data[name] = sequences
        self._dtype[name] = dtype
        self._endmarker[name] = 0

    def append(self, name, data):
        """Append a new data record.

        Append a new data record to the current sequence of samples of
        the field with the given `name`.

        Parameters
        ----------
        name : str
            The name of the field.
        data : str or int or float or ndarray
            The data record.

        Raises
        ------
        KeyError
            If no field with the given name has been added.

        """
        if name not in self._data:
            raise KeyError("Field '%s' not registered." % name)

        sequences = self._data[name]
        current = self._sequence_index
        count = self._endmarker[name]

        dim, size = sequences[current].shape
        if size <= count:
            # Double the buffer; a zero-width buffer (e.g. an empty sequence
            # that was loaded from file) must still grow by one column.
            sequences[current] = self._resize(sequences[current],
                                              shape=(dim, max(size * 2, 1)), axis=1,
                                              dtype=self._dtype[name])
        if dim > 1:
            data = np.asarray(data, dtype=self._dtype[name])
        sequences[current][:, count] = data
        self._endmarker[name] = count + 1

    def new_sequence(self):
        """Adds a new sequence.

        Adds a new sequence of samples for all fields and
        increments the sequence counter. The sequence that is closed is
        trimmed to the records actually written to it.

        """
        self._sequence_index += 1
        current = self._sequence_index

        grow = self._capacity <= current
        if grow:
            # Doubling alone cannot grow from a capacity of zero.
            self._capacity = max(self._capacity * 2, current + 1)

        for name in self.get_field_names():
            if grow:
                self._data[name] = self._resize(self._data[name], shape=(self._capacity,), dtype=object)

            sequences = self._data[name]
            sequences[current - 1] = self._reduce(sequences[current - 1],
                                                  self._endmarker[name], axis=1)

            dim = sequences[current - 1].shape[0]
            sequences[current] = np.zeros((dim, self._INITIAL_SEQUENCE_SIZE), dtype=self._dtype[name])
            self._endmarker[name] = 0

    # noinspection PyMethodMayBeStatic
    def _resize(self, a, shape, axis=0, dtype=None):
        """Return a zero-filled array of `shape` with `a` copied to its start.

        Parameters
        ----------
        a : ndarray
            The array to enlarge.
        shape : tuple[int]
            The shape of the new array.
        axis : int
            The axis (0 or 1) along which `a` is grown.
        dtype : dtype
            The dtype of the new array. Defaults to ``np.float64``.

        Returns
        -------
        ndarray :
            The enlarged copy of `a`.

        """
        dtype = dtype if dtype is not None else np.float64

        size = a.shape[axis]
        enlarged = np.zeros(shape, dtype=dtype)
        if axis == 0:
            enlarged[:size] = a
        elif axis == 1:
            enlarged[:, :size] = a
        return enlarged

    # noinspection PyMethodMayBeStatic
    def _reduce(self, a, endmarker, axis=0):
        """Return the first `endmarker` entries of `a` along `axis`.

        The result is a slice (view) of `a`, not a copy.

        """
        if axis == 0:
            return a[:endmarker]

        return a[:, :endmarker]

    # noinspection PyMethodMayBeStatic
    def _get_record_info(self, record):
        """Derive dimension and dtype of a single data record.

        Parameters
        ----------
        record : str or int or float or ndarray
            The data record.

        Returns
        -------
        dim : int
            The number of entries of the record; 1 for scalars.
        dtype : dtype
            ``object`` for strings, ``np.int32`` for integers and
            ``np.float64`` otherwise (including empty records).

        """
        dim = 1
        dtype = np.float64

        try:
            r = record.tolist()
        except AttributeError:
            # Plain Python values have no ``tolist``.
            r = record

        if isinstance(r, list):
            dim = len(r)
            if not r:
                # Nothing to inspect in an empty record.
                return dim, dtype
            r = r[0]
        if isinstance(r, self._STRING_TYPES):
            dtype = object
        if isinstance(r, int):
            dtype = np.int32
        return dim, dtype