Source code for genrl.agents.deep.vpg.vpg

from typing import Any, Dict

import gym
import numpy as np
import torch
from torch import optim as opt

from genrl.agents import OnPolicyAgent
from genrl.utils import get_env_properties, get_model, safe_mean


class VPG(OnPolicyAgent):
    """Vanilla Policy Gradient algorithm

    Paper: https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf

    Attributes:
        network (str): The network type of the policy. Supported types: ["cnn", "mlp"]
        env (Environment): The environment that the agent is supposed to act on
        create_model (bool): Whether the model of the algo should be created when initialised
        batch_size (int): Mini batch size for loading experiences
        gamma (float): The discount factor for rewards
        layers (:obj:`tuple` of :obj:`int`): Layers in the Neural Network of the policy
        lr_policy (float): Learning rate for the policy/actor
        lr_value (float): Learning rate for the value function (unused here, since VPG has no critic)
        rollout_size (int): Capacity of the Rollout Buffer
        buffer_type (str): Choose the type of Buffer: ["rollout"]
        seed (int): Seed for randomness
        render (bool): Should the env be rendered during training?
        device (str): Hardware being used for training. Options: ["cuda" -> GPU, "cpu" -> CPU]
    """

    def __init__(self, *args, **kwargs):
        super(VPG, self).__init__(*args, **kwargs)

        self.empty_logs()
        if self.create_model:
            self._create_model()

    def _create_model(self):
        """Initialize policy network"""
        state_dim, action_dim, discrete, action_lim = get_env_properties(
            self.env, self.network
        )

        if isinstance(self.network, str):
            # Instantiate networks and optimizers
            self.actor = get_model("p", self.network)(
                state_dim,
                action_dim,
                self.policy_layers,
                "V",
                discrete,
                action_lim=action_lim,
            ).to(self.device)
        else:
            self.actor = self.network.to(self.device)

        self.optimizer_policy = opt.Adam(self.actor.parameters(), lr=self.lr_policy)
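    # --- Usage sketch (illustrative comments only, not part of the class) ---
    # VPG expects a vectorized environment exposing `n_envs` (see `select_action`
    # below). `VectorEnv` and the hyperparameter values here are assumptions made
    # for illustration; a minimal construction might look like:
    #
    #     env = VectorEnv("CartPole-v0")
    #     agent = VPG("mlp", env, rollout_size=2048, lr_policy=1e-3)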
    def select_action(
        self, state: np.ndarray, deterministic: bool = False
    ) -> np.ndarray:
        """Select action given state

        Action Selection for Vanilla Policy Gradient

        Args:
            state (:obj:`np.ndarray`): Current state of the environment
            deterministic (bool): Should the policy be deterministic or stochastic

        Returns:
            action (:obj:`np.ndarray`): Action taken by the agent
            value (:obj:`torch.Tensor`): Value of given state. In VPG, there is no
                critic to find the value so we set this to a default 0 for convenience
            log_prob (:obj:`torch.Tensor`): Log probability of selected action
        """
        state = torch.as_tensor(state).float().to(self.device)

        # create distribution based on policy_fn output
        action, dist = self.actor.get_action(state, deterministic=deterministic)

        return (
            action.detach().cpu().numpy(),
            torch.zeros((1, self.env.n_envs)),
            dist.log_prob(action).cpu(),
        )
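    # Example (illustrative comments, assuming `agent` and `env` from the sketch
    # above): one action-selection step on the current batched state.
    #
    #     state = env.reset()
    #     action, value, log_prob = agent.select_action(state)
    #     # `value` is a zero tensor of shape (1, n_envs), since VPG has no critic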
    def get_log_probs(self, states: torch.Tensor, actions: torch.Tensor):
        """Get log probabilities of action values

        Actions taken by actor and their respective states are analysed to get
        log probabilities

        Args:
            states (:obj:`torch.Tensor`): States encountered in rollout
            actions (:obj:`torch.Tensor`): Actions taken in response to respective states

        Returns:
            log_probs (:obj:`torch.Tensor`): Log of action probabilities given a state
        """
        states, actions = states.to(self.device), actions.to(self.device)
        _, dist = self.actor.get_action(states, deterministic=False)
        return dist.log_prob(actions).cpu()
    def get_traj_loss(self, values, dones):
        """Get loss from trajectory traversed by agent during rollouts

        Computes the returns and advantages needed for calculating loss

        Args:
            values (:obj:`torch.Tensor`): Values of states encountered during the rollout
            dones (:obj:`list` of bool): Game over statuses of each environment
        """
        self.rollout.compute_returns_and_advantage(
            values.detach().cpu().numpy(), dones
        )
    def update_params(self) -> None:
        """Updates the policy network

        Function to update the Vanilla Policy Gradient architecture
        """
        for rollout in self.rollout.get(self.batch_size):
            actions = rollout.actions

            if isinstance(self.env.action_space, gym.spaces.Discrete):
                actions = actions.long().flatten()

            log_prob = self.get_log_probs(rollout.observations, actions)

            # Policy gradient loss: negative mean of return-weighted log-probabilities
            loss = rollout.returns * log_prob
            loss = -torch.mean(loss)
            self.logs["loss"].append(loss.item())

            self.optimizer_policy.zero_grad()
            loss.backward()
            # Clip gradient norm to stabilise updates
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 0.5)
            self.optimizer_policy.step()
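    # The loop above implements the vanilla policy gradient (REINFORCE) loss
    #     L(theta) = -E_t[ R_t * log pi_theta(a_t | s_t) ]
    # with R_t the returns computed via `get_traj_loss`, so minimising L ascends
    # the expected return. A minimal standalone PyTorch equivalent, with
    # `returns` and `log_probs` as hypothetical stand-ins for the rollout data:
    #
    #     loss = -(returns * log_probs).mean()
    #     loss.backward()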
    def get_hyperparams(self) -> Dict[str, Any]:
        """Get relevant hyperparameters to save

        Returns:
            hyperparams (:obj:`dict`): Hyperparameters to be saved
        """
        hyperparams = {
            "network": self.network,
            "batch_size": self.batch_size,
            "gamma": self.gamma,
            "lr_policy": self.lr_policy,
            "rollout_size": self.rollout_size,
            "weights": self.actor.state_dict(),
        }
        return hyperparams
    def load_weights(self, weights) -> None:
        """Load weights for the agent from pretrained model

        Args:
            weights (:obj:`dict`): Dictionary of different neural net weights
        """
        self.actor.load_state_dict(weights["weights"])
    def get_logging_params(self) -> Dict[str, Any]:
        """Gets relevant parameters for logging

        Returns:
            logs (:obj:`dict`): Logging parameters for monitoring training
        """
        logs = {
            "loss": safe_mean(self.logs["loss"]),
            "mean_reward": safe_mean(self.rewards),
        }
        self.empty_logs()
        return logs
    def empty_logs(self):
        """Empties logs"""
        self.logs = {}
        self.logs["loss"] = []
        self.rewards = []
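
# Example training script (a minimal sketch, not part of the module itself).
# `VectorEnv` and `OnPolicyTrainer` are assumed to be importable from
# `genrl.environments` and `genrl.trainers`, as elsewhere in the library;
# constructor keyword arguments may differ between versions.
if __name__ == "__main__":
    from genrl.environments import VectorEnv
    from genrl.trainers import OnPolicyTrainer

    env = VectorEnv("CartPole-v0")  # vectorized CartPole environment
    agent = VPG("mlp", env)  # MLP policy with default hyperparameters
    trainer = OnPolicyTrainer(agent, env, log_mode=["stdout"])
    trainer.train()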