Source code for mlpy.learners.online.rl

from __future__ import division, print_function, absolute_import

import numpy as np
from scipy.stats import bernoulli

from ...planners.explorers.discrete import DiscreteExplorer
from ...planners.explorers.discrete import IExplorer
from ...mdp.discrete import DiscreteModel
from ...mdp.stateaction import MDPState, MDPAction
from ...stats.models.ann.neuralnet import NeuralNetwork
from . import IOnlineLearner

__all__ = ['QLearner', 'ModelBasedLearner']


class QLearner(IOnlineLearner):
    """Performs q-learning.

    Q-learning is a reinforcement learning variant.

    Parameters
    ----------
    explorer : Explorer, optional
        The exploration strategy used. Default is no exploration.
    alpha : float, optional
        The learning rate. Default is 0.5.
    gamma : float, optional
        The discounting factor. Default is 0.9.
    filename : str, optional
        The name of the file to save the learner state to after each iteration.
        If None is given, the learner state is not saved. Default is None.

    """

    @property
    def type(self):
        return super(QLearner, self).type

    def __init__(self, explorer=None, alpha=None, gamma=None, filename=None):
        super(QLearner, self).__init__(filename)

        self._model = DiscreteModel()

        self._explorer = explorer if explorer is not None else DiscreteExplorer()
        """:type: Explorer"""
        if not isinstance(self._explorer, IExplorer):
            raise TypeError("'explorer' must be of type 'IExplorer'")

        self._alpha = alpha if alpha is not None else 0.5
        self._gamma = gamma if gamma is not None else 0.9

    def init(self):
        """Initialize the learner."""
        self._model.init()

    def step(self, experience):
        """Execute learning specific updates.

        Learning specific updates are performed, e.g. model updates.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of the previous state, the action
            performed in that state, the current state, and the reward awarded.

        """
        self._model.update(experience)

    def learn(self, experience):
        """Learn a policy from the experience.

        By updating the Q table according to the experience a policy is learned.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of the previous state, the action
            performed in that state, the current state, and the reward awarded.

        """
        info = self._model.statespace[experience.state]
        info2 = self._model.statespace[experience.next_state]

        qvalue = info.q[experience.action]
        maxq = max([info2.q[a] for a in self._model.get_actions(experience.next_state)])

        delta = experience.reward + self._gamma * maxq - qvalue
        info.q[experience.action] = qvalue + self._alpha * delta

        self._logger.debug("%s action=%s reward=%.2f %s d=%.2f", experience.state, experience.action,
                           experience.reward, experience.next_state, delta)
        self._logger.debug("\tq_old=%.2f visits=%d", qvalue, info.models[experience.action].visits)
        self._logger.debug("\tq_new=%.2f", info.q[experience.action])

    def choose_action(self, state):
        """Choose the next action.

        The next action is chosen according to the current policy and the
        selected exploration strategy.

        Parameters
        ----------
        state : MDPState
            The current state.

        Returns
        -------
        MDPAction :
            The chosen action.

        """
        self._model.add_state(state)

        actions = self._model.get_actions(state)
        info = self._model.statespace[state]

        action = self._explorer.choose_action(actions, [info.q[a] for a in actions])
        self._logger.debug("state=%s act=%s value=%.2f", state, action, self._model.statespace[state].q[action])
        return action

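The following sketch illustrates how the pieces of QLearner fit together over one episode. It is not part of the module: the environment interface (reset()/step(action) returning a 3-tuple) and the Experience stand-in are assumptions made only for this example; mlpy supplies its own Experience type.

from collections import namedtuple

Experience = namedtuple("Experience", ["state", "action", "next_state", "reward"])


def run_qlearning_episode(env, learner, max_steps=100):
    """Act, record the transition, and update the Q-table once per step."""
    state = env.reset()
    for _ in range(max_steps):
        action = learner.choose_action(state)      # current policy plus exploration
        next_state, reward, done = env.step(action)
        experience = Experience(state, action, next_state, reward)
        learner.step(experience)                   # update the discrete model
        learner.learn(experience)                  # Q(s, a) += alpha * delta
        state = next_state
        if done:
            break
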
class ModelBasedLearner(IOnlineLearner):
    """Performs model based reinforcement learning.

    Model based reinforcement learning uses the model and planner provided
    to make decisions on which action to perform next.

    Parameters
    ----------
    planner : IPlanner
        The planner to use to determine the best action.
    filename : str, optional
        The name of the file to save the learner state to after each iteration.
        If None is given, the learner state is not saved. Default is None.

    """

    @property
    def type(self):
        return super(ModelBasedLearner, self).type

    def __init__(self, planner, filename=None):
        super(ModelBasedLearner, self).__init__(filename)

        self._do_plan = True
        self._planner = planner

    def __setstate__(self, d):
        super(ModelBasedLearner, self).__setstate__(d)
        self._do_plan = False

    def init(self):
        """Initialize the learner."""
        self._planner.init()

    def step(self, experience):
        """Execute learning specific updates.

        Learning specific updates are performed, e.g. model updates.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of the previous state, the action
            performed in that state, the current state, and the reward awarded.

        """
        self._do_plan = self._planner.model.update(experience)

    def learn(self, experience):
        """Learn a policy from the experience.

        A policy is learned from the experience by building the MDP model.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of the previous state, the action
            performed in that state, the current state, and the reward awarded.

        """
        if self._do_plan:
            self._planner.plan()

    def choose_action(self, state):
        """Choose the next action.

        The next action is chosen according to the current policy and the
        selected exploration strategy.

        Parameters
        ----------
        state : MDPState
            The current state.

        Returns
        -------
        MDPAction :
            The chosen action.

        """
        return self._planner.choose_action(state)

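A similar sketch for ModelBasedLearner: action selection is delegated to the planner, and plan() is re-run only when the model update reports a change. The planner argument is any concrete IPlanner implementation; the environment interface and the Experience stand-in from the sketch above are assumptions made only for this example.

def run_modelbased_episode(env, planner, max_steps=100):
    """Drive a ModelBasedLearner: model update, conditional re-planning, acting."""
    learner = ModelBasedLearner(planner)
    learner.init()
    state = env.reset()
    for _ in range(max_steps):
        action = learner.choose_action(state)      # planner's current policy
        next_state, reward, done = env.step(action)
        experience = Experience(state, action, next_state, reward)
        learner.step(experience)                   # update the MDP model, set _do_plan
        learner.learn(experience)                  # calls planner.plan() only if needed
        state = next_state
        if done:
            break
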
class Cacla(IOnlineLearner):
    """Continuous actor-critic learning automaton (CACLA).

    The actor and critic are stored as tables when the state space is
    discretized, and as feed-forward neural networks otherwise.
    `nhidden_q` and `nhidden_v` set the number of hidden units of the
    actor and critic networks (a value of 0 for `nhidden_q` yields a
    linear actor), `gamma` is the discount factor, `alpha` and `beta`
    are the critic and actor learning rates, and `explorer_type`
    ("gaussian" or "egreedy") together with `explore_rate` controls
    exploration.

    """

    @property
    def type(self):
        return super(Cacla, self).type

    def __init__(self, nhidden_q, nhidden_v, explorer_type=None, gamma=None, alpha=None, beta=None,
                 explore_rate=None, filename=None):
        super(Cacla, self).__init__(filename)

        self._g1 = 0.
        self._g2 = 0.
        self._stored_gauss = False

        self._action = None
        self._value = None
        self._v_target = None

        self._nhidden_q = nhidden_q
        self._nhidden_v = nhidden_v

        self._gamma = gamma if gamma is not None else .99
        self._alpha = alpha if alpha is not None else .01
        self._beta = beta if beta is not None else .01

        self._explorer_type = explorer_type if explorer_type is not None else "gaussian"
        self._explore_rate = explore_rate if explore_rate is not None else 5000

    def init(self):
        """Initialize the learner."""
        if MDPState.discretized:
            num_states = 1
            for states_per_dim in MDPState.states_per_dim:
                num_states *= states_per_dim

            self._action = np.zeros((num_states, MDPAction.nfeatures))
            self._value = np.zeros((num_states,))
        else:
            if self._nhidden_q == 0:
                self._action = np.random.random((MDPState.nfeatures + 1, MDPAction.nfeatures))
            else:
                self._action = NeuralNetwork([MDPState.nfeatures, self._nhidden_q, MDPAction.nfeatures])
            self._value = NeuralNetwork([MDPState.nfeatures, self._nhidden_v, 1])

        self._v_target = np.zeros((1,))
        self._stored_gauss = False

    def step(self, experience):
        """Execute learning specific updates.

        Learning specific updates are performed, e.g. model updates.

        Parameters
        ----------
        experience : Experience
            The actor's current experience consisting of the previous state, the action
            performed in that state, the current state, and the reward awarded.

        """
        if not MDPState.discretized and self._nhidden_q > 0:
            self._action.feed_forward(experience.next_state)

    def end(self, experience):
        """End the episode.

        Perform all end of episode tasks and save the state of the learner to file.

        Parameters
        ----------
        experience : Experience
            The agent's experience consisting of the previous state, the action performed
            in that state, the current state and the reward awarded.

        """
        if MDPState.discretized:
            vt = self._value[experience.state][0]
            self._value[experience.state] += self._alpha * (experience.reward - self._value[experience.state])
            if self._value[experience.state] > vt:
                self._action[experience.state] += self._beta * (
                    experience.action - self._action[experience.state])
        else:
            self._v_target[0] = experience.reward

            vt = self._value.feed_forward(experience.state)[0]
            self._value.back_propagate(experience.state, self._v_target, self._alpha)

            if self._v_target[0] > vt:
                if self._nhidden_q == 0:
                    st = np.ones((experience.state.get().shape[0] + 1,))
                    st[:-1] = experience.state
                    self._action += self._beta * np.outer(st,
                                                          (experience.action - self._get_action(experience.state)))
                else:
                    self._action.back_propagate(experience.state, experience.action, self._beta)

    def learn(self, experience):
        """Learn a policy from the experience.

        Perform the learning step to derive a new policy taking the latest experience
        into account.

        Parameters
        ----------
        experience : Experience
            The agent's experience consisting of the previous state, the action performed
            in that state, the current state and the reward awarded.

        """
        if MDPState.discretized:
            vt = self._value[experience.state][0]
            self._value[experience.state] += self._alpha * (
                experience.reward + self._gamma * self._value[experience.next_state] - self._value[experience.state])
            if self._value[experience.state] > vt:
                self._action[experience.state] += self._beta * (
                    experience.action - self._action[experience.state])
        else:
            vs = self._value.feed_forward(experience.next_state)[0]
            self._v_target[0] = experience.reward + self._gamma * vs

            vt = self._value.feed_forward(experience.state)[0]
            self._value.back_propagate(experience.state, self._v_target, self._alpha)

            if self._v_target[0] > vt:
                if self._nhidden_q == 0:
                    st = np.ones((experience.state.get().shape[0] + 1,))
                    st[:-1] = experience.state
                    self._action += self._beta * np.outer(st,
                                                          (experience.action - self._get_action(experience.state)))
                else:
                    self._action.back_propagate(experience.state, experience.action, self._beta)

    def choose_action(self, state):
        """Choose the next action.

        The next action is chosen according to the current policy and the
        selected exploration strategy.

        Parameters
        ----------
        state : MDPState
            The current state.

        Returns
        -------
        MDPAction :
            The chosen action.

        """
        if MDPState.discretized:
            action = self._action[state]
        else:
            if self._nhidden_q > 0:
                action = self._action.get_activations(-1)
            else:
                action = self._get_action(state)

        if self._explorer_type == "egreedy":
            # epsilon-greedy: with probability `explore_rate` replace the action
            # with a uniformly random action within the feature bounds
            if bernoulli.rvs(self._explore_rate):
                for i, (min_, max_) in enumerate(zip(MDPAction.min_features, MDPAction.max_features)):
                    action[i] = np.random.uniform(min_, max_)

        if self._explorer_type == "gaussian":
            # Gaussian exploration: add zero-mean noise scaled by `explore_rate`
            for i in range(MDPAction.nfeatures):
                action[i] += self._explore_rate * self._gaussian_random()

        return MDPAction(action)

    def _get_action(self, state):
        st = np.ones((state.get().shape[0] + 1,))
        st[:-1] = state

        action = np.asarray(np.dot(self._action.T, st))
        if action.ndim == 0:
            action = action[np.newaxis]
        return action

    def _gaussian_random(self):
        if self._stored_gauss:
            self._stored_gauss = False
            return self._g2

        x = y = 0.
        z = 1.
        while z >= 1.0:
            # x = np.random.uniform(-1., 1.)
            # y = np.random.uniform(-1., 1.)
            x = 2.0 * np.random.random() - 1.0
            y = 2.0 * np.random.random() - 1.0
            z = x * x + y * y

        z = np.sqrt(-2. * np.log(z) / z)
        self._g1 = x * z
        self._g2 = y * z

        self._stored_gauss = True
        return self._g1
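
For reference, _gaussian_random above is the Marsaglia polar method: it rejects points outside the unit disc and produces two independent standard-normal samples per accepted pair, caching the second one for the next call. It is functionally equivalent to drawing from numpy.random.normal(0.0, 1.0). The helper below is only an illustrative sanity check, not part of the module.

def check_gaussian_random(learner, n=100000):
    """Empirical mean and standard deviation of the learner's Gaussian sampler."""
    samples = np.array([learner._gaussian_random() for _ in range(n)])
    return samples.mean(), samples.std()    # expected to be close to (0.0, 1.0)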