Source code for genrl.agents.bandits.multiarmed.bernoulli_mab

from typing import Tuple

import numpy as np

from genrl.core.bandit import MultiArmedBandit


[docs]class BernoulliMAB(MultiArmedBandit):
    """
    Contextual Bandit with categorial context and bernoulli reward distribution

    :param bandits: Number of bandits
    :param arms: Number of arms in each bandit
    :param reward_probs: Probabilities of getting rewards
    :type bandits: int
    :type arms: int
    :type reward_probs: numpy.ndarray
    """

    def __init__(
        self,
        bandits: int = 1,
        arms: int = 5,
        reward_probs: np.ndarray = None,
        context_type: str = "tensor",
    ):
        super(BernoulliMAB, self).__init__(bandits, arms, context_type)
        if reward_probs is not None:
            self.reward_probs = reward_probs
        else:
            self.reward_probs = np.random.random(size=(bandits, arms))

    def _compute_reward(self, action: int) -> Tuple[int, int]:
        """
        Takes an action in the bandit and returns the sampled reward

        The reward is sampled from a bernoulli distribution

        :param action: The action to take
        :type action: int
        :returns: Reward sampled for the action taken and maximum reward
        :rtype: tuple of int
        """
        reward_prob = self.reward_probs[self.curr_bandit, action]
        reward = int(np.random.random() > reward_prob)
        return reward, 1