Source code for genrl.agents.bandits.multiarmed.thompson

import numpy as np

from genrl.agents.bandits.multiarmed.base import MABAgent
from genrl.core.bandit import MultiArmedBandit


class ThompsonSamplingMABAgent(MABAgent):
    """
    Multi-Armed Bandit Solver with Thompson Sampling based Action Selection Strategy.

    :param bandit: The Bandit to solve
    :param alpha: alpha value for beta distribution
    :param beta: beta value for beta distribution
    :type bandit: MultiArmedBandit type object
    :type alpha: float
    :type beta: float
    """

    def __init__(
        self, bandit: MultiArmedBandit, alpha: float = 1.0, beta: float = 1.0
    ):
        super(ThompsonSamplingMABAgent, self).__init__(bandit)
        self._a = alpha * np.ones(shape=(bandit.bandits, bandit.arms))
        self._b = beta * np.ones(shape=(bandit.bandits, bandit.arms))

    @property
    def quality(self) -> np.ndarray:
        """numpy.ndarray: Q values (posterior means) for all actions, computed from alpha and beta"""
        return self.a / (self.a + self.b)

    @property
    def a(self) -> np.ndarray:
        """numpy.ndarray: alpha parameter of beta distribution associated with the policy"""
        return self._a

    @property
    def b(self) -> np.ndarray:
        """numpy.ndarray: beta parameter of beta distribution associated with the policy"""
        return self._b
    def select_action(self, context: int) -> int:
        """
        Select an action according to Thompson Sampling

        Samples are taken from beta distribution parameterized by
        alpha and beta for each action. The action with the highest
        sample is selected.

        :param context: the context to select action for
        :type context: int
        :returns: Selected action
        :rtype: int
        """
        sample = np.random.beta(self.a[context], self.b[context])
        action = np.argmax(sample)
        self.action_hist.append((context, action))
        return action
    def update_params(self, context: int, action: int, reward: float) -> None:
        """
        Update parameters for the policy

        Updates the regret as the difference between max Q value and
        that of the action. Updates the alpha value of the beta
        distribution by adding the reward, while the beta value is
        updated by adding 1 - reward. Updates the count of the action
        taken.

        :param context: context for which action is taken
        :param action: action taken for the step
        :param reward: reward obtained for the step
        :type context: int
        :type action: int
        :type reward: float
        """
        self.reward_hist.append(reward)
        self.a[context, action] += reward
        self.b[context, action] += 1 - reward
        self._regret += max(self.quality[context]) - self.quality[context, action]
        self.regret_hist.append(self.regret)
        self.counts[context, action] += 1
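
For reference, below is a minimal, self-contained sketch of the Beta-Bernoulli Thompson Sampling loop that this agent implements, using plain numpy rather than the genrl bandit classes. The reward probabilities in `true_probs` and the single-context setup are hypothetical stand-ins for a Bernoulli bandit; the agent above keeps the same alpha/beta state per (bandit, arm) pair.

import numpy as np

rng = np.random.default_rng(0)

# Hypothetical Bernoulli bandit: one context, three arms with unknown reward probabilities
true_probs = np.array([0.2, 0.5, 0.7])
n_arms = len(true_probs)

# Beta(1, 1) priors, mirroring alpha = beta = 1.0 in the agent above
a = np.ones(n_arms)
b = np.ones(n_arms)

for t in range(1000):
    # Sample one quality estimate per arm from its Beta posterior
    # and act greedily on the samples (as in select_action)
    sample = rng.beta(a, b)
    action = np.argmax(sample)

    # Observe a Bernoulli reward from the chosen arm
    reward = float(rng.random() < true_probs[action])

    # Posterior update (as in update_params): alpha accumulates
    # successes, beta accumulates failures
    a[action] += reward
    b[action] += 1 - reward

# The posterior mean a / (a + b) corresponds to the agent's `quality` property
print(a / (a + b))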