Source code for genrl.agents.deep.dqn.base

from copy import deepcopy
from typing import Any, Dict, List

import numpy as np
import torch
from torch import optim as opt

from genrl.agents import OffPolicyAgent
from genrl.utils import get_env_properties, get_model, safe_mean


class DQN(OffPolicyAgent):
    """Base DQN Class

    Paper: https://arxiv.org/abs/1312.5602

    Attributes:
        network (str): The network type of the Q-value function.
            Supported types: ["cnn", "mlp"]
        env (Environment): The environment that the agent is supposed to act on
        create_model (bool): Whether the model of the algo should be created when initialised
        batch_size (int): Mini batch size for loading experiences
        gamma (float): The discount factor for rewards
        value_layers (:obj:`tuple` of :obj:`int`): Layers in the Neural Network
            of the Q-value function
        lr_value (float): Learning rate for the Q-value function
        replay_size (int): Capacity of the Replay Buffer
        buffer_type (str): Choose the type of Buffer: ["push", "prioritized"]
        max_epsilon (float): Maximum epsilon for exploration
        min_epsilon (float): Minimum epsilon for exploration
        epsilon_decay (int): Rate of decay of epsilon (in order to decrease
            exploration with time)
        seed (int): Seed for randomness
        render (bool): Should the env be rendered during training?
        device (str): Hardware being used for training. Options:
            ["cuda" -> GPU, "cpu" -> CPU]
    """

    def __init__(
        self,
        *args,
        max_epsilon: float = 1.0,
        min_epsilon: float = 0.01,
        epsilon_decay: int = 1000,
        **kwargs
    ):
        super(DQN, self).__init__(*args, **kwargs)
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.dqn_type = ""
        self.noisy = False

        self.empty_logs()
        if self.create_model:
            self._create_model()

    def _create_model(self, *args, **kwargs) -> None:
        """Function to initialize Q-value model

        This will create the Q-value function of the agent.
        """
        state_dim, action_dim, discrete, _ = get_env_properties(self.env, self.network)
        if not discrete:
            raise Exception("Only Discrete Environments are supported for DQN")

        if isinstance(self.network, str):
            self.model = get_model("v", self.network + self.dqn_type)(
                state_dim, action_dim, "Qs", self.value_layers, **kwargs
            )
        else:
            self.model = self.network

        self.target_model = deepcopy(self.model)
        self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr_value)

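    # Illustrative note (added, not part of the original source): per the isinstance
    # check in _create_model, ``network`` may be either a registered string key
    # ("mlp"/"cnn") or a prebuilt Q-network module. A minimal, assumed construction
    # pattern, where ``env`` is an environment already wrapped the way genrl expects:
    #
    #     agent = DQN("mlp", env, max_epsilon=1.0, min_epsilon=0.01, epsilon_decay=500)
    #     agent = DQN(my_q_network, env)  # hypothetical prebuilt torch.nn.Module
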
    def update_target_model(self) -> None:
        """Function to update the target Q model

        Updates the target model with the training model's weights when called
        """
        self.target_model.load_state_dict(self.model.state_dict())

    def update_params_before_select_action(self, timestep: int) -> None:
        """Update necessary parameters before selecting an action

        This updates the epsilon (exploration rate) of the agent every timestep

        Args:
            timestep (int): Timestep of training
        """
        self.timestep = timestep
        self.epsilon = self.calculate_epsilon_by_frame()
        self.logs["epsilon"].append(self.epsilon)

    def get_greedy_action(self, state: torch.Tensor) -> np.ndarray:
        """Greedy action selection

        Args:
            state (:obj:`torch.Tensor`): Current state of the environment

        Returns:
            action (:obj:`np.ndarray`): Action taken by the agent
        """
        q_values = self.model(state.unsqueeze(0)).detach().numpy()
        action = np.argmax(q_values, axis=-1).squeeze(0)
        return action

    def select_action(
        self, state: np.ndarray, deterministic: bool = False
    ) -> np.ndarray:
        """Select action given state

        Epsilon-greedy action-selection

        Args:
            state (:obj:`np.ndarray`): Current state of the environment
            deterministic (bool): Should the policy be deterministic or stochastic

        Returns:
            action (:obj:`np.ndarray`): Action taken by the agent
        """
        state = torch.as_tensor(state).float()
        action = self.get_greedy_action(state)
        if not deterministic:
            if np.random.rand() < self.epsilon:
                action = np.asarray(self.env.sample())
        return action

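    # Illustrative note (added): ``select_action`` reads ``self.epsilon``, which is only
    # set by ``update_params_before_select_action``. A minimal sketch of the assumed
    # call order inside a collection loop:
    #
    #     agent.update_params_before_select_action(timestep)
    #     action = agent.select_action(state)                      # epsilon-greedy
    #     action = agent.select_action(state, deterministic=True)  # pure greedy
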
    def _reshape_batch(self, batch: List):
        """Function to reshape experiences for DQN

        Most of the DQN experiences need to be reshaped before sending to the
        Neural Networks
        """
        states = batch[0]
        actions = batch[1].unsqueeze(-1).long()
        rewards = batch[2]
        next_states = batch[3]
        dones = batch[4]

        return states, actions, rewards, next_states, dones

    def get_q_values(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
        """Get Q values corresponding to specific states and actions

        Args:
            states (:obj:`torch.Tensor`): States for which Q-values need to be found
            actions (:obj:`torch.Tensor`): Actions taken at respective states

        Returns:
            q_values (:obj:`torch.Tensor`): Q values for the given states and actions
        """
        q_values = self.model(states)
        q_values = q_values.gather(2, actions)
        return q_values

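    # Illustrative note (added): the gather along dim 2 assumes Q-values shaped
    # (batch_size, n_envs, action_dim) and actions shaped (batch_size, n_envs, 1)
    # after _reshape_batch, giving gathered Q-values of shape (batch_size, n_envs, 1).
    # These shapes are inferred from the code above, not stated in the original source.
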
    def get_target_q_values(
        self, next_states: torch.Tensor, rewards: List[float], dones: List[bool]
    ) -> torch.Tensor:
        """Get target Q values for the DQN

        Args:
            next_states (:obj:`torch.Tensor`): Next states for which target Q-values
                need to be found
            rewards (:obj:`list`): Rewards at each timestep for each environment
            dones (:obj:`list`): Game over status for each environment

        Returns:
            target_q_values (:obj:`torch.Tensor`): Target Q values for the DQN
        """
        # Next Q-values according to target model
        next_q_target_values = self.target_model(next_states)

        # Maximum of next q_target values
        max_next_q_target_values = next_q_target_values.max(2)[0]

        # Expected Target Q values
        target_q_values = rewards + self.gamma * torch.mul(
            max_next_q_target_values, (1 - dones)
        )

        # Needs to be unsqueezed to match dimension of q_values
        return target_q_values.unsqueeze(-1)

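    # Illustrative note (added): the value computed above is the standard one-step
    # Bellman target, y = r + gamma * (1 - done) * max_a' Q_target(s', a'), evaluated
    # per environment and unsqueezed so it lines up with get_q_values' output.
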
    def update_params(self, update_interval: int) -> None:
        """Update parameters of the model

        Args:
            update_interval (int): Interval between successive updates of the target model
        """
        self.update_target_model()

        for timestep in range(update_interval):
            batch = self.sample_from_buffer()
            loss = self.get_q_loss(batch)
            self.logs["value_loss"].append(loss.item())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # In case the model uses Noisy layers, we must reset the noise every timestep
            if self.noisy:
                self.model.reset_noise()
                self.target_model.reset_noise()

    def calculate_epsilon_by_frame(self) -> float:
        """Helper function to calculate epsilon after every timestep

        Exponentially decays exploration rate from max epsilon to min epsilon
        The greater the value of epsilon_decay, the slower the decrease in epsilon
        """
        return self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(
            -1.0 * self.timestep / self.epsilon_decay
        )

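    # Illustrative note (added): the schedule is
    # epsilon(t) = min_epsilon + (max_epsilon - min_epsilon) * exp(-t / epsilon_decay).
    # With the defaults (1.0, 0.01, 1000): epsilon is 1.0 at t=0, roughly 0.37 at
    # t=1000, and tends to 0.01 as t grows.
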
    def get_hyperparams(self) -> Dict[str, Any]:
        """Get relevant hyperparameters to save

        Returns:
            hyperparams (:obj:`dict`): Hyperparameters to be saved
        """
        hyperparams = {
            "gamma": self.gamma,
            "batch_size": self.batch_size,
            "lr": self.lr_value,
            "replay_size": self.replay_size,
            "weights": self.model.state_dict(),
            "timestep": self.timestep,
        }
        return hyperparams

    def load_weights(self, weights) -> None:
        """Load weights for the agent from pretrained model

        Args:
            weights (:obj:`Dict`): Dictionary of different neural net weights
        """
        self.model.load_state_dict(weights["weights"])

    def get_logging_params(self) -> Dict[str, Any]:
        """Gets relevant parameters for logging

        Returns:
            logs (:obj:`dict`): Logging parameters for monitoring training
        """
        logs = {
            "value_loss": safe_mean(self.logs["value_loss"]),
            "epsilon": safe_mean(self.logs["epsilon"]),
        }
        self.empty_logs()
        return logs

    def empty_logs(self) -> None:
        """Empties logs"""
        self.logs = {}
        self.logs["value_loss"] = []
        self.logs["epsilon"] = []
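
# Illustrative usage sketch (added, not part of the original module). The wrapper and
# trainer import paths below are assumptions about the genrl API and may differ across
# versions; the DQN arguments themselves are the ones defined in this file.
if __name__ == "__main__":
    from genrl.environments import VectorEnv  # assumed location of the env wrapper
    from genrl.trainers import OffPolicyTrainer  # assumed location of the trainer

    env = VectorEnv("CartPole-v0")  # discrete actions, as required by _create_model
    agent = DQN("mlp", env, max_epsilon=1.0, min_epsilon=0.01, epsilon_decay=1000)
    trainer = OffPolicyTrainer(agent, env)
    trainer.train()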