import numpy as np
from genrl.agents.bandits.multiarmed.base import MABAgent
from genrl.core.bandit import MultiArmedBandit

class GradientMABAgent(MABAgent):
    """
    Multi-Armed Bandit Solver with Softmax Action Selection Strategy.

    Refer to Section 2.8 of Reinforcement Learning: An Introduction.

    :param bandit: The bandit to solve
    :param alpha: The step size parameter for the gradient-based update
    :param temp: Temperature for the softmax distribution over Q values of actions
    :type bandit: MultiArmedBandit type object
    :type alpha: float
    :type temp: float
    """

    def __init__(
        self, bandit: MultiArmedBandit, alpha: float = 0.1, temp: float = 0.01
    ):
        super(GradientMABAgent, self).__init__(bandit)
        self._alpha = alpha
        self._temp = temp
        self._quality = np.zeros(shape=(bandit.bandits, bandit.arms))
        self._probability_hist = []

    @property
    def alpha(self) -> float:
        """float: Step size parameter for the gradient-based update of the policy"""
        return self._alpha

    @property
    def temp(self) -> float:
        """float: Temperature for the softmax distribution over Q values of actions"""
        return self._temp

    @property
    def quality(self) -> np.ndarray:
        """numpy.ndarray: Q values assigned by the policy to all actions"""
        return self._quality

    @property
    def probability_hist(self) -> np.ndarray:
        """numpy.ndarray: History of probability values assigned to each action at each timestep"""
        return self._probability_hist

    def _softmax(self, x: np.ndarray) -> np.ndarray:
        r"""
        Softmax with temperature

        :math:`\text{Softmax}(x_{i}) = \frac{\exp(x_i / temp)}{\sum_j \exp(x_j / temp)}`

        :param x: Set of values to compute softmax over
        :type x: numpy.ndarray
        :returns: Computed softmax over given values
        :rtype: numpy.ndarray
        """
        # exponentiate the temperature-scaled values and normalise them into a
        # probability distribution
        exp = np.exp(x / self.temp)
        total = np.sum(exp)
        p = exp / total
        return p
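
    # A quick worked example of the temperature effect (illustrative values, not
    # part of the library source): low temperatures make the softmax nearly greedy,
    # while temp=1.0 keeps the distribution spread out.
    #
    #   q = np.array([0.1, 0.2, 0.3])
    #   np.exp(q / 1.0) / np.exp(q / 1.0).sum()    # roughly [0.30, 0.33, 0.37]
    #   np.exp(q / 0.01) / np.exp(q / 0.01).sum()  # roughly [0.00, 0.00, 1.00]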

    def select_action(self, context: int) -> int:
        """
        Select an action according to the softmax action selection strategy

        The action is sampled from the softmax distribution computed over
        the Q values for all actions

        :param context: the context to select an action for
        :type context: int
        :returns: Selected action
        :rtype: int
        """
        probabilities = self._softmax(self.quality[context])
        action = np.random.choice(self._bandit.arms, 1, p=probabilities)[0]
        self.action_hist.append((context, action))
        self.probability_hist.append(probabilities)
        return action
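
    # The update below follows the gradient bandit algorithm of Section 2.8 of
    # Sutton & Barto, treating the Q values as action preferences H:
    #
    #   H_{t+1}(A_t)      = H_t(A_t) + alpha * (R_t - baseline) * (1 - pi_t(A_t))
    #   H_{t+1}(a != A_t) = H_t(a)   - alpha * (R_t - baseline) * pi_t(a)
    #
    # where pi_t is the softmax over preferences and the baseline is the mean of
    # past rewards.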
    def update_params(self, context: int, action: int, reward: float) -> None:
        """
        Update parameters for the policy

        Updates the regret as the difference between the max Q value and that
        of the action taken. Updates the Q values through a gradient ascent step

        :param context: context for which the action is taken
        :param action: action taken for the step
        :param reward: reward obtained for the step
        :type context: int
        :type action: int
        :type reward: float
        """
        self.reward_hist.append(reward)
        self._regret += max(self.quality[context]) - self.quality[context, action]
        self.regret_hist.append(self.regret)

        # compute the reward baseline as the mean of all rewards up to t-1
        if len(self.reward_hist) <= 1:
            reward_baseline = 0.0
        else:
            reward_baseline = np.mean(self.reward_hist[:-1])

        current_probabilities = self.probability_hist[-1]

        # update Q values for the action taken and the actions not taken separately
        self.quality[context, action] += (
            self.alpha * (reward - reward_baseline) * (1 - current_probabilities[action])
        )
        actions_not_taken = np.arange(self._bandit.arms) != action
        self.quality[context, actions_not_taken] += (
            -1
            * self.alpha
            * (reward - reward_baseline)
            * current_probabilities[actions_not_taken]
        )
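
if __name__ == "__main__":
    # Minimal usage sketch, not part of the library source. ``BernoulliMAB``, its
    # import path, constructor arguments, and the ``step`` method are assumptions
    # about GenRL's bandit API; check the library documentation for the exact
    # names and signatures. The agent methods used below come from this module.
    from genrl.core.bandit import BernoulliMAB  # assumed import path

    bandit = BernoulliMAB(bandits=1, arms=5)  # assumed constructor
    agent = GradientMABAgent(bandit, alpha=0.1, temp=0.1)

    for _ in range(1000):
        context = 0  # single underlying bandit, so context 0 throughout
        action = agent.select_action(context)
        reward = bandit.step(action)  # assumed to return the reward for the pulled arm
        agent.update_params(context, action, reward)

    print("Cumulative regret:", agent.regret)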