Source code for genrl.agents.bandits.multiarmed.epsgreedy

import numpy as np

from genrl.agents.bandits.multiarmed.base import MABAgent
from genrl.core.bandit import MultiArmedBandit


class EpsGreedyMABAgent(MABAgent):
    """
    Contextual Bandit Policy with Epsilon Greedy Action Selection Strategy.

    Refer to Section 2.3 of Reinforcement Learning: An Introduction.

    :param bandit: The bandit to solve
    :param eps: Probability with which a random action is selected
    :type bandit: MultiArmedBandit type object
    :type eps: float
    """

    def __init__(self, bandit: MultiArmedBandit, eps: float = 0.05):
        super(EpsGreedyMABAgent, self).__init__(bandit)
        self._eps = eps
        self._quality = np.zeros(shape=(bandit.bandits, bandit.arms))
        self._counts = np.zeros(shape=(bandit.bandits, bandit.arms))

    @property
    def eps(self) -> float:
        """float: Exploration constant"""
        return self._eps

    @property
    def quality(self) -> np.ndarray:
        """numpy.ndarray: Q values assigned by the policy to all actions"""
        return self._quality
    def select_action(self, context: int) -> int:
        """
        Select an action according to the epsilon greedy strategy

        A random action is selected with epsilon probability over the
        optimal action according to the current Q values to encourage
        exploration of the policy.

        :param context: the context to select an action for
        :type context: int
        :returns: Selected action
        :rtype: int
        """
        if np.random.random() < self.eps:
            action = np.random.randint(0, self._bandit.arms)
        else:
            action = np.argmax(self.quality[context])
        self.action_hist.append((context, action))
        return action
    def update_params(self, context: int, action: int, reward: float) -> None:
        """
        Update parameters for the policy

        Updates the regret as the difference between the max Q value and
        that of the action taken. Updates the Q values according to the
        reward received in this step.

        :param context: context for which action is taken
        :param action: action taken for the step
        :param reward: reward obtained for the step
        :type context: int
        :type action: int
        :type reward: float
        """
        self.reward_hist.append(reward)
        self._regret += max(self.quality[context]) - self.quality[context, action]
        self.regret_hist.append(self.regret)
        self.quality[context, action] += (reward - self.quality[context, action]) / (
            self.counts[context, action] + 1
        )
        self.counts[context, action] += 1
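To make the strategy concrete, below is a minimal, self-contained numpy sketch of the same epsilon greedy loop on a single Bernoulli bandit. The toy bandit and variable names here are illustrative stand-ins only; they do not use genrl's MultiArmedBandit API or the MABAgent histories and regret tracking.

# Minimal sketch of epsilon greedy on a toy Bernoulli bandit (illustration
# only; not genrl's MultiArmedBandit interface).
import numpy as np

rng = np.random.default_rng(0)
arms, eps, timesteps = 5, 0.05, 10_000
probs = rng.uniform(size=arms)   # hidden success probability of each arm
quality = np.zeros(arms)         # Q value estimate per arm
counts = np.zeros(arms)          # number of pulls per arm

for _ in range(timesteps):
    # Explore with probability eps, otherwise exploit the current Q values
    if rng.random() < eps:
        action = int(rng.integers(arms))
    else:
        action = int(np.argmax(quality))
    reward = float(rng.random() < probs[action])  # Bernoulli reward
    # Incremental sample-average update, as in update_params above
    quality[action] += (reward - quality[action]) / (counts[action] + 1)
    counts[action] += 1

print("true best arm:", probs.argmax(), "| estimated best arm:", quality.argmax())

Because each step adds (reward - Q) / (n + 1), the Q value for an arm always equals the running mean of the rewards observed for that arm, which is the sample-average method of Section 2.3 of Sutton and Barto.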