Source code for mlpy.planners.explorers.discrete

from __future__ import division, print_function, absolute_import
# noinspection PyUnresolvedReferences
from six.moves import range
import random

import numpy as np

from ...stats import gibbs
from . import IExplorer

__all__ = ['DiscreteExplorer', 'EGreedyExplorer', 'SoftmaxExplorer']


class DiscreteExplorer(IExplorer):
    """The discrete explorer base class.

    The explorer class executes the exploration policy by choosing a next
    action based on the current q-values of the state-action pairs.

    Notes
    -----
    All discrete explorers should derive from this class.

    """
    def __init__(self):
        super(DiscreteExplorer, self).__init__()
    def choose_action(self, actions, qvalues):
        """Choose the next action according to the exploration strategy.

        Parameters
        ----------
        actions : list[Actions]
            The available actions.
        qvalues : list[float]
            The q-value for each action.

        Returns
        -------
        Action :
            The action with maximum q-value that can be taken from the
            given state.

        """
        return self._get_maxq_action(actions, qvalues)
    # noinspection PyMethodMayBeStatic
    def _get_maxq_action(self, actions, qvalues):
        """Find the highest valued action available for the given state.

        Parameters
        ----------
        actions : list[Actions]
            The available actions.
        qvalues : list[float]
            The q-value for each action.

        Returns
        -------
        Action :
            The action with maximum q-value that can be taken from the
            given state.

        """
        maxq = max(qvalues)
        count = qvalues.count(maxq)
        if count > 1:
            # Multiple actions share the maximum q-value: break the tie
            # uniformly at random among them.
            action = actions[np.random.choice([i for i in range(len(actions)) if qvalues[i] == maxq])]
        else:
            index = qvalues.index(maxq)
            action = actions[index]

        return action
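
# Illustrative sketch (not part of the mlpy module above): the greedy
# selection with random tie-breaking performed by
# ``DiscreteExplorer._get_maxq_action``, re-implemented standalone over plain
# integer action indices. The name ``_example_greedy_index`` is hypothetical
# and exists only for illustration.
def _example_greedy_index(qvalues):
    """Return the index of a maximum q-value, breaking ties uniformly."""
    maxq = max(qvalues)
    candidates = [i for i, q in enumerate(qvalues) if q == maxq]
    # With a unique maximum this is deterministic; with ties, each maximal
    # index is chosen with equal probability.
    return int(np.random.choice(candidates))
# e.g. _example_greedy_index([0.1, 0.7, 0.7]) returns 1 or 2, each about
# half of the time.
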
class EGreedyExplorer(DiscreteExplorer):
    """The :math:`\\epsilon`-greedy explorer.

    The :math:`\\epsilon`-greedy explorer policy chooses as next action the
    action with the highest q-value; however, with probability
    :math:`\\epsilon` a random action is chosen instead to drive the
    exploration of unknown states.

    Parameters
    ----------
    epsilon : float, optional
        The :math:`\\epsilon` probability. Default is 0.5.
    decay : float, optional
        The value by which :math:`\\epsilon` decays. This value should be
        between 0 and 1. The probability :math:`\\epsilon` decreases over
        time by a factor of `decay`. Set this value to 1 if
        :math:`\\epsilon` should remain the same throughout the experiment.
        Default is 1.

    """
    def __init__(self, epsilon=None, decay=None):
        super(EGreedyExplorer, self).__init__()

        self._epsilon = epsilon if epsilon is not None else 0.5

        self._decay = decay if decay is not None else 1
        self._decay = max(0, self._decay)
        self._decay = min(self._decay, 1)
    def choose_action(self, actions, qvalues):
        """Choose the next action.

        With probability :math:`\\epsilon` a random action is chosen,
        otherwise the action resulting in the highest q-value is selected.

        Parameters
        ----------
        actions : list[Actions]
            The available actions.
        qvalues : list[float]
            The q-value for each action.

        Returns
        -------
        Action :
            The action with maximum q-value that can be taken from the
            given state.

        """
        action = self._get_maxq_action(actions, qvalues)

        if self._is_active and np.random.random() < self._epsilon:
            self._epsilon *= self._decay
            action = random.choice(actions)

        return action
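
# Illustrative sketch (not part of the mlpy module above): the epsilon-greedy
# rule implemented by ``EGreedyExplorer.choose_action``, shown standalone with
# a decaying epsilon. The name ``_example_egreedy_index`` is hypothetical; it
# reuses ``_example_greedy_index`` from the sketch above.
def _example_egreedy_index(qvalues, epsilon=0.5, decay=1.0):
    """Return (index, new_epsilon) under an epsilon-greedy policy."""
    if np.random.random() < epsilon:
        # Explore: pick any action uniformly at random and decay epsilon,
        # mirroring the decay-on-exploration behaviour of the class above.
        return random.randrange(len(qvalues)), epsilon * decay
    # Exploit: fall back to the greedy choice with random tie-breaking.
    return _example_greedy_index(qvalues), epsilon
# e.g. _example_egreedy_index([0.1, 0.7, 0.2], epsilon=0.1, decay=0.99)
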
class SoftmaxExplorer(DiscreteExplorer):
    """The softmax explorer.

    The softmax explorer varies the action probability as a graded function
    of the estimated value. The greedy action is still given the highest
    selection probability, but all the others are ranked and weighted
    according to their value estimates.

    Parameters
    ----------
    tau : float, optional
        The temperature value. Default is 2.0.
    decay : float, optional
        The value by which :math:`\\tau` decays. This value should be
        between 0 and 1. The temperature :math:`\\tau` decreases over time
        by a factor of `decay`. Set this value to 1 if :math:`\\tau` should
        remain the same throughout the experiment. Default is 1.

    Notes
    -----
    The softmax function implemented uses the Gibbs distribution. It
    chooses action `a` on the `t`-th play with probability:

    .. math::

        \\frac{e^{Q_t(a)/\\tau}}{\\sum_{b=1}^n e^{Q_t(b)/\\tau}}

    where :math:`\\tau` is a positive parameter called the `temperature`.
    High temperatures cause all actions to be nearly equiprobable, while
    low temperatures cause a greater difference in the selection
    probabilities. For :math:`\\tau` close to zero, the action selection
    becomes the same as greedy selection.

    """
    def __init__(self, tau=None, decay=None):
        super(SoftmaxExplorer, self).__init__()

        self._tau = tau if tau is not None else 2.0

        self._decay = decay if decay is not None else 1
        self._decay = max(0, self._decay)
        self._decay = min(self._decay, 1)
    def choose_action(self, actions, qvalues):
        """Choose the next action.

        Choose the next action according to the Gibbs distribution.

        Parameters
        ----------
        actions : list[Actions]
            The available actions.
        qvalues : list[float]
            The q-value for each action.

        Returns
        -------
        Action :
            The action with maximum q-value that can be taken from the
            given state.

        """
        action = self._get_maxq_action(actions, qvalues)

        if self._is_active:
            self._tau *= self._decay
            pmf = gibbs.pmf(np.asarray(qvalues), self._tau)
            action = actions[np.random.choice(np.arange(len(qvalues)), p=pmf)]

        return action
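
# Illustrative sketch (not part of the mlpy module above): the
# Gibbs/Boltzmann action probabilities described in the ``SoftmaxExplorer``
# notes, computed directly with NumPy instead of ``mlpy.stats.gibbs``. The
# name ``_example_softmax_pmf`` is hypothetical; it assumes tau > 0.
def _example_softmax_pmf(qvalues, tau=2.0):
    """Return p(a) = exp(Q(a)/tau) / sum_b exp(Q(b)/tau) for each action."""
    q = np.asarray(qvalues, dtype=float) / tau
    q -= q.max()    # shift by the maximum for numerical stability
    expq = np.exp(q)
    return expq / expq.sum()
# e.g. np.random.choice(len(qvalues), p=_example_softmax_pmf(qvalues, tau))
# draws an action index; a high tau gives a nearly uniform distribution,
# a low tau a nearly greedy one.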