# Copyright (c) 2020-2022 RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of L2RPN Baselines, L2RPN Baselines a repository to host baselines for l2rpn competitions.
from abc import abstractmethod
import copy
from typing import List, Optional
from grid2op.Agent import BaseAgent
from grid2op.Observation import BaseObservation
from grid2op.Action import BaseAction
from l2rpn_baselines.utils.gymenv_custom import GymEnvWithHeuristics
class GymAgent(BaseAgent):
    """
    This class maps a neural network (trained using ray / rllib or stable baselines for example).

    It can then be used as a "regular" grid2op agent, in a runner, grid2viz, grid2game etc.

    It is also compatible with the "l2rpn baselines" interface.

    Use it only with a trained agent. It does not provide the "save" method and
    is not suitable for training.

    .. note::
        To load a previously saved agent the function `GymAgent.load` will be called
        and you must provide the `nn_path` keyword argument.

        To build a new agent, the function `GymAgent.build` is called and
        you must provide the `nn_kwargs` keyword argument.

    Examples
    ---------
    Some examples of such agents are provided in the classes:

    - :class:`l2rpn_baselines.PPO_SB3.PPO_SB3` that implements such an agent with the "stable baselines3" RL framework
    - :class:`l2rpn_baselines.PPO_RLLIB.PPO_RLLIB` that implements such an agent with the "ray / rllib" RL framework

    Both can benefit from the features of this class, most notably the possibility to include "heuristics" (such as:
    "if a powerline can be reconnected, do it" or "do not act if the grid is not in danger").

    Notes
    -----
    The main goal of this class is to be able to use "heuristics" (both for training and at inference time) quite
    simply and with out of the box support of external libraries.

    All top performers in all l2rpn competitions (as of writing) used some kind of heuristics in their agent (such as:
    "if a powerline can be reconnected, do it" or "do not act if the grid is not in danger"). This is why we made some
    effort to develop a generic class that allows to train agents directly using these "heuristics".

    This feature is split in two parts:

    - At training time, the "*heuristics*" are part of the environment. The agent will see only observations that are
      relevant to it (and not the state handled by the heuristics).
    - At inference time, the "*heuristics*" of the environment used to train the agent are included in the "agent.act"
      function. If a heuristic has been used at training time, the agent will first "ask" the environment if a
      heuristic should be performed on the grid (in this case it will do it), otherwise it will ask the underlying
      neural network what to do.

    Some examples are provided in the "examples" code (under the "examples/ppo_stable_baselines") repository that
    demonstrates the use of :class:`l2rpn_baselines.utils.GymEnvWithRecoWithDN`.
    """

    def __init__(self,
                 g2op_action_space,
                 gym_act_space,
                 gym_obs_space,
                 *,  # to prevent positional arguments
                 nn_path=None,
                 nn_kwargs=None,
                 gymenv=None,
                 _check_both_set=True,
                 _check_none_set=True):
        """
        Build the agent.

        Exactly one of `nn_path` (load a trained model from disk) or `nn_kwargs`
        (build a fresh model from these parameters) must be provided, unless the
        private `_check_*` flags are disabled by a subclass.

        Parameters
        ----------
        g2op_action_space:
            The grid2op action space (forwarded to :class:`grid2op.Agent.BaseAgent`).
        gym_act_space:
            The gym action space used to convert NN outputs back to grid2op actions.
        gym_obs_space:
            The gym observation space used to convert grid2op observations for the NN.
        nn_path: ``str``, optional
            Path from which to load the neural network (triggers :func:`GymAgent.load`).
        nn_kwargs: ``dict``, optional
            Parameters used to build the neural network (triggers :func:`GymAgent.build`).
            Deep-copied so later mutations by the caller cannot affect this agent.
        gymenv: :class:`GymEnvWithHeuristics`, optional
            The (possibly heuristic-aware) gym environment used at training time.
            If it is a :class:`GymEnvWithHeuristics`, its heuristics are replayed in
            :func:`GymAgent.act`.

        Raises
        ------
        RuntimeError
            If neither or both of `nn_path` / `nn_kwargs` are provided (subject to
            the `_check_none_set` / `_check_both_set` flags).
        """
        super().__init__(g2op_action_space)
        self._gym_act_space = gym_act_space
        self._gym_obs_space = gym_obs_space

        # heuristic support: only active when a heuristic-capable gymenv is given
        self._has_heuristic: bool = False
        self.gymenv: Optional[GymEnvWithHeuristics] = gymenv
        # pending heuristic actions to replay before asking the NN (None if no heuristic)
        self._action_list: Optional[List] = None
        if self.gymenv is not None and isinstance(self.gymenv, GymEnvWithHeuristics):
            self._has_heuristic = True
            self._action_list = []

        if _check_none_set and (nn_path is None and nn_kwargs is None):
            raise RuntimeError("Impossible to build a GymAgent without providing at "
                               "least one of `nn_path` (to load the agent from disk) "
                               "or `nn_kwargs` (to create the underlying agent).")
        if _check_both_set and (nn_path is not None and nn_kwargs is not None):
            raise RuntimeError("Impossible to build a GymAgent by providing both "
                               "`nn_path` (*ie* you want load the agent from disk) "
                               "and `nn_kwargs` (*ie* you want to create the underlying agent from these "
                               "parameters).")

        self._nn_path = nn_path if nn_path is not None else None
        # deep copy so the agent owns its building parameters
        self._nn_kwargs = copy.deepcopy(nn_kwargs) if nn_kwargs is not None else None

        self.nn_model = None
        if nn_path is not None:
            self.load()
        else:
            self.build()

    @abstractmethod
    def get_act(self, gym_obs, reward, done):
        """
        Retrieve the action from the NN model, given a gym observation.
        """
        pass

    @abstractmethod
    def load(self):
        """
        Load the NN model from `self._nn_path`.

        .. info::
            Only called if the agent has been built with `nn_path` not None and `nn_kwargs=None`.
        """
        pass

    @abstractmethod
    def build(self):
        """
        Build the NN model from `self._nn_kwargs`.

        .. info::
            Only called if the agent has been built with `nn_path=None` and `nn_kwargs` not None.
        """
        pass

    def clean_heuristic_actions(self, observation: BaseObservation, reward: float, done: bool) -> None:
        """This function allows to cure the heuristic actions.

        It is called at each step, just after the heuristic actions are computed
        (but before they are selected). It can be used, for example, to reorder
        the `self._action_list`.

        It is not used during training.

        Args:
            observation (BaseObservation): The current observation
            reward (float): the current reward
            done (bool): the current flag "done"
        """
        pass

    def act(self, observation: BaseObservation, reward: float, done: bool) -> BaseAction:
        """This function is called to "map" the grid2op world
        into a usable format by a neural network (for example in a format
        usable by stable baselines or ray/rllib).

        Parameters
        ----------
        observation : BaseObservation
            The grid2op observation
        reward : ``float``
            The reward
        done : ``bool``
            the flag "done" by open ai gym.

        Returns
        -------
        BaseAction
            The action taken by the agent, in a form of a grid2op BaseAction.

        Notes
        -------
        In case your "real agent" wants to implement some "non learned" heuristic,
        you can also put them here.

        In this case the "gym agent" will only be used in particular settings.
        """
        grid2op_act = None

        # heuristic part: replay the training-time heuristics first, if any
        if self._has_heuristic:
            if not self._action_list:
                # the list of actions is empty, I query the heuristic to see if there's something I can do
                self._action_list = self.gymenv.heuristic_actions(observation, reward, done, {})
            self.clean_heuristic_actions(observation, reward, done)
            if self._action_list:
                # some heuristic actions have been selected, I select the first one
                grid2op_act = self._action_list.pop(0)

        # the heuristic did not select any action, so ask the NN what to do
        if grid2op_act is None:
            gym_obs = self._gym_obs_space.to_gym(observation)
            gym_act = self.get_act(gym_obs, reward, done)
            grid2op_act = self._gym_act_space.from_gym(gym_act)

        # fix the action if needed (for example by limiting curtailment and storage)
        if self._has_heuristic:
            grid2op_act = self.gymenv.fix_action(grid2op_act, observation)
        return grid2op_act