# Source code for genrl.agents.bandits.multiarmed.base

from typing import List, Tuple, Union

import numpy as np

from genrl.core.bandit import BanditAgent, MultiArmedBandit


class MABAgent(BanditAgent):
    """
    Base class for multi-armed bandit solving policies

    Sets up the bookkeeping shared by every policy: cumulative regret,
    per-step regret / action / reward histories and per-action counts.
    ``select_action`` and ``update_params`` are abstract here and must be
    implemented by the specific policy.

    :param bandit: The bandit to solve
    :type bandit: MultiArmedBandit type object
    :raises AssertionError: If ``bandit.context_type`` is not ``"int"``
    """

    def __init__(self, bandit: MultiArmedBandit):
        super().__init__()
        self._bandit = bandit
        # Kept as an assert (not a raised ValueError) so the exception type
        # seen by existing callers does not change.
        assert bandit.context_type == "int", (
            f"Context type of bandit should be int, "
            f"found {bandit.context_type}"
        )
        # This class only initialises the tracking state below; nothing in
        # it appends to the histories or increments the counts.
        # NOTE(review): presumably the concrete policies update these in
        # their own select_action/update_params -- confirm in subclasses.
        self._regret = 0.0
        self._regret_hist = []
        self._action_hist = []
        self._reward_hist = []
        # One zero-initialised counter per (bandit, arm) pair.
        self._counts = np.zeros(shape=(bandit.bandits, bandit.arms))

    @property
    def action_hist(self) -> List[Tuple[int, int]]:
        """
        Get the history of actions taken for contexts

        :returns: List of (context, action) pairs
        :rtype: list
        """
        return self._action_hist

    @property
    def regret(self) -> float:
        """
        Get the current regret

        :returns: The current regret
        :rtype: float
        """
        return self._regret

    @property
    def regret_hist(self) -> List[float]:
        """
        Get the history of regrets incurred for each step

        :returns: List of regrets
        :rtype: list
        """
        return self._regret_hist

    @property
    def reward_hist(self) -> List[float]:
        """
        Get the history of rewards received for each step

        :returns: List of rewards
        :rtype: list
        """
        return self._reward_hist

    @property
    def counts(self) -> np.ndarray:
        """
        Get the number of times each action has been taken

        :returns: Numpy array of shape (bandits, arms) with the count for
            each action
        :rtype: numpy.ndarray
        """
        return self._counts

    def select_action(self, context: int) -> int:
        """
        Select an action

        This method needs to be implemented in the specific policy.

        :param context: the context to select action for
        :type context: int
        :returns: Selected action
        :rtype: int
        :raises NotImplementedError: Always, in this base class
        """
        raise NotImplementedError

    def update_params(
        self, context: int, action: int, reward: Union[int, float]
    ) -> None:
        """
        Update parameters for the policy

        This method needs to be implemented in the specific policy.

        :param context: context for which action is taken
        :param action: action taken for the step
        :param reward: reward obtained for the step
        :type context: int
        :type action: int
        :type reward: int or float
        :raises NotImplementedError: Always, in this base class
        """
        raise NotImplementedError