Source code for genrl.agents.bandits.multiarmed.gaussian_mab

from typing import Tuple

import numpy as np

from genrl.core.bandit import MultiArmedBandit


class GaussianMAB(MultiArmedBandit):
    """
    Multi-armed bandit with categorical context and gaussian reward distribution

    Each (bandit, arm) pair has its own reward mean; rewards are drawn from a
    gaussian centred on that mean with unit standard deviation.

    :param bandits: Number of bandits
    :param arms: Number of arms in each bandit
    :param reward_means: Mean of gaussian distribution for each reward,
        indexed as ``[bandit, arm]``. Any array-like is accepted. If ``None``,
        means are drawn uniformly from [0, 1) with shape ``(bandits, arms)``.
    :param context_type: Forwarded unchanged to ``MultiArmedBandit.__init__``
        (NOTE(review): accepted values are defined by the base class - confirm
        there)
    :type bandits: int
    :type arms: int
    :type reward_means: numpy.ndarray
    :type context_type: str
    """

    def __init__(
        self,
        bandits: int = 10,
        arms: int = 5,
        reward_means: np.ndarray = None,
        context_type: str = "tensor",
    ):
        super().__init__(bandits, arms, context_type)
        if reward_means is not None:
            # np.asarray returns an ndarray input unchanged (no copy) and
            # converts nested lists/tuples so the ``[bandit, arm]`` tuple
            # indexing in ``_compute_reward`` works for any array-like.
            self.reward_means = np.asarray(reward_means)
        else:
            self.reward_means = np.random.random(size=(bandits, arms))

    def _compute_reward(self, action: int) -> Tuple[float, float]:
        """
        Takes an action in the bandit and returns the sampled reward

        The reward is sampled from a gaussian distribution centred on the
        mean of the chosen arm of the current bandit (``self.curr_bandit``),
        with the default unit standard deviation of ``numpy.random.normal``.

        :param action: The action to take
        :type action: int
        :returns: Reward sampled for the action taken and the largest reward
            mean among the arms of the current bandit
        :rtype: tuple of float
        """
        arm_means = self.reward_means[self.curr_bandit]
        # Only ``loc`` is given; ``scale`` keeps numpy's default of 1.0.
        reward = np.random.normal(loc=arm_means[action])
        # The second element is the best achievable *mean* for this bandit,
        # not a sampled value.
        return reward, np.max(arm_means)