Source code for genrl.agents.bandits.multiarmed.gradient

import numpy as np

from genrl.agents.bandits.multiarmed.base import MABAgent
from genrl.core.bandit import MultiArmedBandit


class GradientMABAgent(MABAgent):
    """
    Multi-Armed Bandit Solver with Softmax Action Selection Strategy.

    Refer to Section 2.8 of Reinforcement Learning: An Introduction.

    :param bandit: The Bandit to solve
    :param alpha: The step size parameter for the gradient based update
    :param temp: Temperature for softmax distribution over Q values of actions
    :type bandit: MultiArmedBandit type object
    :type alpha: float
    :type temp: float
    """

    def __init__(
        self, bandit: MultiArmedBandit, alpha: float = 0.1, temp: float = 0.01
    ):
        super(GradientMABAgent, self).__init__(bandit)
        self._alpha = alpha
        self._temp = temp
        self._quality = np.zeros(shape=(bandit.bandits, bandit.arms))
        self._probability_hist = []

    @property
    def alpha(self) -> float:
        """float: Step size parameter for the gradient based update of the policy"""
        return self._alpha

    @property
    def temp(self) -> float:
        """float: Temperature for softmax distribution over Q values of actions"""
        return self._temp

    @property
    def quality(self) -> np.ndarray:
        """numpy.ndarray: Q values assigned by the policy to all actions"""
        return self._quality

    @property
    def probability_hist(self) -> np.ndarray:
        """numpy.ndarray: History of probability values assigned to each action at each timestep"""
        return self._probability_hist

    def _softmax(self, x: np.ndarray) -> np.ndarray:
        r"""
        Softmax with temperature

        :math:`\text{Softmax}(x_{i}) = \frac{\exp(x_i / temp)}{\sum_j \exp(x_j / temp)}`

        :param x: Set of values to compute softmax over
        :type x: numpy.ndarray
        :returns: Computed softmax over given values
        :rtype: numpy.ndarray
        """
        exp = np.exp(x / self.temp)
        total = np.sum(exp)
        p = exp / total
        return p
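
    # Illustrative comment (not part of the original source): with
    # temp=0.01, quality values [0.00, 0.05] map to exp([0, 5]) and hence
    # probabilities of roughly [0.007, 0.993], so a small temperature makes
    # the policy nearly greedy, while a large one approaches uniform.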

    def select_action(self, context: int) -> int:
        """
        Select an action according to the softmax action selection strategy

        The action is sampled from the softmax distribution computed over
        the Q values for all actions

        :param context: the context to select an action for
        :type context: int
        :returns: Selected action
        :rtype: int
        """
        probabilities = self._softmax(self.quality[context])
        action = np.random.choice(self._bandit.arms, 1, p=probabilities)[0]
        self.action_hist.append((context, action))
        self.probability_hist.append(probabilities)
        return action
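
    # The method below implements the gradient bandit update from
    # Sutton & Barto, Section 2.8 (notation adapted to this class; comment
    # added for exposition, not part of the original source):
    #   Q(a_t) <- Q(a_t) + alpha * (R_t - baseline) * (1 - pi_t(a_t))
    #   Q(a)   <- Q(a)   - alpha * (R_t - baseline) * pi_t(a)   for a != a_t
    # where baseline is the mean of all rewards observed before step t.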
    def update_params(self, context: int, action: int, reward: float) -> None:
        """
        Update parameters for the policy

        Updates the regret as the difference between the max Q value and
        that of the action taken. Updates the Q values through a gradient
        ascent step

        :param context: context for which action is taken
        :param action: action taken for the step
        :param reward: reward obtained for the step
        :type context: int
        :type action: int
        :type reward: float
        """
        self.reward_hist.append(reward)
        self._regret += max(self.quality[context]) - self.quality[context, action]
        self.regret_hist.append(self.regret)

        # compute the reward baseline as the mean of all rewards up to t-1
        if len(self.reward_hist) <= 1:
            reward_baseline = 0.0
        else:
            reward_baseline = np.mean(self.reward_hist[:-1])

        current_probabilities = self.probability_hist[-1]

        # update Q values for the action taken and those not taken separately
        self.quality[context, action] += (
            self.alpha
            * (reward - reward_baseline)
            * (1 - current_probabilities[action])
        )
        actions_not_taken = np.arange(self._bandit.arms) != action
        self.quality[context, actions_not_taken] -= (
            self.alpha
            * (reward - reward_baseline)
            * current_probabilities[actions_not_taken]
        )
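
A minimal usage sketch (not part of the original source). It assumes the base MABAgent works with any object exposing `bandits`, `arms` and a `step(action) -> reward` method; the GaussianTestbed below is a hypothetical stand-in for illustration, not a class from genrl:

import numpy as np

from genrl.agents.bandits.multiarmed.gradient import GradientMABAgent


class GaussianTestbed:
    # Hypothetical stand-in bandit: one context ("bandit"), `arms` arms,
    # each paying a unit-variance Gaussian reward around a fixed mean.
    def __init__(self, arms: int = 10):
        self.bandits = 1
        self.arms = arms
        self._means = np.random.normal(size=arms)

    def step(self, action: int) -> float:
        return float(np.random.normal(loc=self._means[action]))


bandit = GaussianTestbed(arms=10)
agent = GradientMABAgent(bandit, alpha=0.1, temp=0.1)

# interact for 1000 steps: sample an action from the softmax policy,
# observe a reward, then apply the gradient ascent update
for _ in range(1000):
    context = 0
    action = agent.select_action(context)
    reward = bandit.step(action)
    agent.update_params(context, action, reward)

print("Learned preferences:", agent.quality[0])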