Source code for mlpy.learners.online.rl

from __future__ import division, print_function, absolute_import

import os

from ...tools.log import LoggingMgr
from ...planners.explorers.discrete import DiscreteExplorer
from ...mdp.discrete import DiscreteModel
from . import IOnlineLearner

__all__ = ['RLLearner', 'QLearner', 'RLDTLearner']


# noinspection PyAbstractClass
class RLLearner(IOnlineLearner):
    """The reinforcement learning learner interface.

    Parameters
    ----------
    max_steps : int, optional
        The maximum number of steps in an iteration. Default is 100.
    filename : str, optional
        The name of the file to save the learner state to after each
        iteration. If None is given, the learner state is not saved.
        Default is None.
    profile : bool, optional
        Turn on profiling at which point profiling data is collected
        and saved to a text file. Default is False.

    """
    def __init__(self, max_steps=None, filename=None, profile=False):
        super(RLLearner, self).__init__(filename)
        self._logger = LoggingMgr().get_logger(self._mid)

        self._step_iter = 0
        self._episode_cntr = 1

        self._cum_reward = 0
        self._num_wins = 0

        self._max_steps = max_steps if max_steps is not None else 100
        self._profile = profile

    def __getstate__(self):
        data = super(RLLearner, self).__getstate__()
        data.update(self.__dict__.copy())

        remove_list = ('_id', '_logger')
        for key in remove_list:
            if key in data:
                del data[key]

        return data

    def __setstate__(self, d):
        super(RLLearner, self).__setstate__(d)

        for name, value in d.items():
            setattr(self, name, value)

        self._logger = LoggingMgr().get_logger(self._mid)
        self._logger.debug("Episode=%d", self._episode_cntr)

    def reset(self, t, **kwargs):
        """Reset reinforcement learner.

        Parameters
        ----------
        t : float
            The current time (sec).
        kwargs : dict, optional
            Non-positional parameters.

        """
        super(RLLearner, self).reset(t, **kwargs)
        self._step_iter = 0

    def save(self, filename):
        """Save the learner's state.

        If profiling is turned on, profiling information is saved to a
        `txt` file with the same name.

        Parameters
        ----------
        filename : str
            The filename to save the information to.

        """
        super(RLLearner, self).save(filename)

        if self._profile:
            filename = os.path.splitext(self._filename)[0]

            with open(filename + ".txt", "a") as f:
                win_ratio = float(self._num_wins) / float(self._episode_cntr)
                f.write("%d, %d, %.2f, %.2f\n" % (self._episode_cntr, self._num_wins, self._cum_reward, win_ratio))
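
    # Note: each profiling record appended by `save` above is a plain CSV line of
    # the form "<episode>, <wins>, <cumulative reward>, <win ratio>", e.g.
    # "12, 7, 34.50, 0.58" (illustrative values, not taken from a real run).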

    def learn(self, experience=None):
        """Learn a policy from the experience.

        Parameters
        ----------
        experience : Experience
            The agent's experience consisting of the previous state, the action
            performed in that state, the current state and the reward awarded.

        """
        self._logger.info(experience)

        if self._profile and experience.reward is not None:
            if experience.reward > 0.0:
                self._num_wins += 1

            self._cum_reward += experience.reward
            self._logger.debug("cumReward: %.2f", self._cum_reward)


class QLearner(RLLearner):
    """Performs Q-learning.

    Q-learning is a reinforcement learning variant.

    Parameters
    ----------
    explorer : Explorer, optional
        The exploration strategy used. Default is no exploration.
    max_steps : int, optional
        The maximum number of steps in an iteration. Default is 100.
    alpha : float, optional
        The learning rate. Default is 0.5.
    gamma : float, optional
        The discounting factor. Default is 0.9.
    filename : str, optional
        The name of the file to save the learner state to after each
        iteration. If None is given, the learner state is not saved.
        Default is None.
    profile : bool, optional
        Turn on profiling at which point profiling data is collected
        and saved to a text file. Default is False.

    """
    def __init__(self, explorer=None, max_steps=None, alpha=None, gamma=None, filename=None, profile=False):
        super(QLearner, self).__init__(max_steps, filename, profile)

        self._model = DiscreteModel()

        self._explorer = explorer if explorer is not None else DiscreteExplorer()
        """:type: Explorer"""

        self._alpha = alpha if alpha is not None else 0.5
        self._gamma = gamma if gamma is not None else 0.9

    def execute(self, experience):
        """Execute learning specific updates.

        Learning specific updates are performed, e.g. model updates.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of previous state,
            the action performed in that state, the current state, and the
            reward awarded.

        """
        self._model.update(experience)

    def learn(self, experience=None):
        """Learn a policy from the experience.

        The Q-table is updated according to the experience, which gradually
        improves the policy.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of previous state,
            the action performed in that state, the current state, and the
            reward awarded.

        """
        super(QLearner, self).learn(experience)

        info = self._model.statespace[experience.state]
        info2 = self._model.statespace[experience.next_state]

        qvalue = info.q[experience.action]
        maxq = max([info2.q[a] for a in self._model.get_actions(experience.next_state)])

        delta = experience.reward + self._gamma * maxq - qvalue
        info.q[experience.action] = qvalue + self._alpha * delta

        self._logger.debug("%s action=%s reward=%.2f %s d=%.2f", experience.state, experience.action,
                           experience.reward, experience.next_state, delta)
        self._logger.debug("\tq_old=%.2f visits=%d", qvalue, info.models[experience.action].visits)
        self._logger.debug("\tq_new=%.2f", info.q[experience.action])
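
    # The update in `learn` above is the standard Q-learning rule: with
    # learning rate alpha and discount factor gamma,
    #     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    # where `delta` is the temporal-difference error in the parentheses.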

    def choose_action(self, state):
        """Choose the next action.

        The next action is chosen according to the current policy and the
        selected exploration strategy.

        Parameters
        ----------
        state : State
            The current state.

        Returns
        -------
        Action :
            The chosen action.

        """
        self._model.add_state(state)

        action = None
        if self._step_iter < self._max_steps:
            actions = self._model.get_actions(state)
            info = self._model.statespace[state]

            action = self._explorer.choose_action(actions, [info.q[a] for a in actions])
            self._logger.debug("state=%s act=%s value=%.2f", state, action, self._model.statespace[state].q[action])

        return action
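

# --- Usage sketch (illustrative only, not part of mlpy) ---------------------
# A minimal, hedged example of an agent loop around QLearner. The `Experience`
# constructor and the `env` environment object below are assumptions made for
# illustration; the actual signatures live elsewhere in mlpy and may differ.
#
#     learner = QLearner(max_steps=200, alpha=0.5, gamma=0.9)
#
#     state = env.reset()                             # hypothetical environment
#     for _ in range(200):
#         action = learner.choose_action(state)
#         if action is None:                          # max_steps reached
#             break
#         next_state, reward = env.step(action)       # hypothetical
#         experience = Experience(state, action, next_state, reward)   # assumed signature
#         learner.execute(experience)                 # update the discrete model
#         learner.learn(experience)                   # Q-table update
#         state = next_state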


class RLDTLearner(RLLearner):
    """Performs reinforcement learning using decision trees.

    Reinforcement learning using decision trees (RL-DT) uses decision trees
    to build the transition and reward models as described by Todd Hester
    and Peter Stone [1]_.

    Parameters
    ----------
    planner : IPlanner
        The planner to use to determine the best action.
    max_steps : int, optional
        The maximum number of steps in an iteration. Default is 100.
    filename : str, optional
        The name of the file to save the learner state to after each
        iteration. If None is given, the learner state is not saved.
        Default is None.
    profile : bool, optional
        Turn on profiling at which point profiling data is collected
        and saved to a text file. Default is False.

    References
    ----------
    .. [1] Hester, Todd, and Peter Stone. "Generalized model learning for
       reinforcement learning in factored domains." Proceedings of The 8th
       International Conference on Autonomous Agents and Multiagent
       Systems-Volume 2. International Foundation for Autonomous Agents and
       Multiagent Systems, 2009.

    """
    def __init__(self, planner, max_steps=None, filename=None, profile=False):
        super(RLDTLearner, self).__init__(max_steps, filename, profile)

        self._do_plan = True
        self._planner = planner

    def __getstate__(self):
        data = super(RLDTLearner, self).__getstate__()
        data.update({'_planner': self._planner})
        return data

    def __setstate__(self, d):
        super(RLDTLearner, self).__setstate__(d)

        for name, value in d.items():
            setattr(self, name, value)

        self._do_plan = False

    def execute(self, experience):
        """Execute learning specific updates.

        Learning specific updates are performed, e.g. model updates.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of previous state,
            the action performed in that state, the current state, and the
            reward awarded.

        """
        self._do_plan = self._planner.model.update(experience)

    def learn(self, experience=None):
        """Learn a policy from the experience.

        A policy is learned from the experience by building the MDP model.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of previous state,
            the action performed in that state, the current state, and the
            reward awarded.

        """
        super(RLDTLearner, self).learn(experience)

        if self._do_plan:
            self._planner.plan()

    def choose_action(self, state):
        """Choose the next action.

        The next action is chosen according to the current policy and the
        selected exploration strategy.

        Parameters
        ----------
        state : State
            The current state.

        Returns
        -------
        Action :
            The chosen action.

        """
        action = None
        if self._step_iter < self._max_steps:
            action = self._planner.get_next_action(state)
            self._step_iter += 1

        return action
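

# --- Usage sketch (illustrative only, not part of mlpy) ---------------------
# A hedged sketch of the RL-DT control flow: `execute` feeds the experience to
# the planner's decision-tree model, and `learn` triggers re-planning only when
# the model reported a change. The `planner`, `state`, and `experience` objects
# below are assumptions made for illustration.
#
#     learner = RLDTLearner(planner, max_steps=100)   # `planner` is an IPlanner
#
#     action = learner.choose_action(state)           # planner picks the action
#     ...                                             # act in the environment
#     learner.execute(experience)                     # update transition/reward trees
#     learner.learn(experience)                       # re-plan if the model changed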