Source code for l2rpn_baselines.utils.gymenv_custom

# Copyright (c) 2020-2022 RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of L2RPN Baselines, a repository to host baselines for l2rpn competitions.

from abc import abstractmethod
from typing import Tuple, Dict, List
import numpy as np

from grid2op.Observation import BaseObservation
from grid2op.Action import BaseAction
from grid2op.gym_compat import GymEnv


class GymEnvWithHeuristics(GymEnv):
    """This abstract class is used to perform some actions, independently of a RL agent,
    on a grid2op environment.

    It can be used, for example, to train an agent (for example a deep-rl agent)
    if you want to use some heuristics at inference time (for example you reconnect
    every powerline that you can).

    The heuristic you want to implement should be implemented in
    :func:`GymEnvWithHeuristics.heuristic_actions`.

    Examples
    --------
    Let's imagine, for example, that you want to implement an RL agent that performs
    actions on the grid. But you noticed that your agent performs better if all the
    powerlines are reconnected (which is often the case by the way).

    To that end, you want to force the reconnection of powerlines each time it's possible.
    When it's not possible, you want to let the neural network do what is best for the
    environment.

    Training an agent in such a setting might be difficult and would require recoding some
    (deep) parts of the training framework (*eg* stable-baselines). Unless... you use a
    dedicated "environment". This environment (compatible with, because it inherits from,
    the base class `gym.Env`) will handle all the "heuristic" part and only show the agent
    the states where it should act.

    Basically a "step" happens like this:

    #. the agent issues an action (gym format)
    #. the action (gym format) is decoded to a grid2op compatible action (thanks to the
       action_space)
    #. this grid2op action is implemented on the grid (thanks to the underlying grid2op
       environment) and the corresponding grid2op observation is generated
    #. this observation is processed by :func:`GymEnvWithHeuristics.apply_heuristics_actions`:
       `grid2op_env.step` is called until the NN agent is required to take a decision
       (or the flag `done=True` is set)
    #. the observation (corresponding to the last step above) is then converted to a gym
       observation (thanks to the observation_space) which is forwarded to the agent

    The agent then only "sees" what is not processed by the heuristic. It is trained only
    on the relevant "states".
    """
    POSSIBLE_REWARD_CUMUL = ["init", "last", "sum", "max"]

    def __init__(self, env_init, *args, reward_cumul="last", **kwargs):
        super().__init__(env_init, *args, **kwargs)
        self._reward_cumul = reward_cumul
        if not self._reward_cumul in type(self).POSSIBLE_REWARD_CUMUL:
            raise RuntimeError("Wrong argument for the reward_cumul parameter. "
                               f"You provided \"{self._reward_cumul}\" (possible "
                               f"values are {type(self).POSSIBLE_REWARD_CUMUL}).")
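
    # Example (illustrative sketch): this class is abstract (heuristic_actions has to be
    # overridden), so it is used through a concrete subclass such as GymEnvWithRecoWithDN
    # defined further below. Assuming `g2op_env` is an already created grid2op environment,
    # one could write:
    #
    #     gym_env = GymEnvWithRecoWithDN(g2op_env, reward_cumul="sum")
    #
    # `reward_cumul` controls how the rewards collected during the heuristic steps are
    # folded into the single reward returned to the agent (see the worked example after
    # apply_heuristics_actions below).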

    @abstractmethod
    def heuristic_actions(self,
                          g2op_obs: BaseObservation,
                          reward: float,
                          done: bool,
                          info: Dict) -> List[BaseAction]:
        """This function has the same signature as the "agent.act" function. It allows to
        implement a heuristic.

        It can be called multiple times per "gymenv step" and is expected to return a list of
        grid2op actions (in the correct order) to be performed on the underlying grid2op
        environment.

        An implementation of such a function can be found, for example, at
        :func:`GymEnvWithReco.heuristic_actions` or :func:`GymEnvWithRecoWithDN.heuristic_actions`.

        This function can return a list of actions that will be executed "in turn" on the grid.
        It is only after each and every action returned has been performed that this function
        is called again.

        .. note::
            You MUST return "[do_nothing]" if your heuristic chose to do nothing at a certain
            step. Otherwise (if the returned list is empty "[]") the agent is asked to perform
            an action.

        .. note::
            We remind that inside a "gym env" step, a lot of "grid2op env" steps might be
            happening. As long as a heuristic action is selected (ie as long as this function
            does not return the empty list) this action is performed on the grid2op environment.

        Parameters
        ----------
        g2op_obs : BaseObservation
            The current grid2op observation.
        reward : float
            The last reward the agent (or the heuristic) had. This is the `reward` part of the
            last call to `obs, reward, done, info = grid2op_env.step(grid2op_act)`
        done : bool
            Whether the environment is "done" or not. It should be "False" in most cases. This
            is the `done` part of the last call to
            `obs, reward, done, info = grid2op_env.step(grid2op_act)`
        info : Dict
            `info` part of the last call to
            `obs, reward, done, info = grid2op_env.step(grid2op_act)`

        Returns
        -------
        List[BaseAction]
            The ordered list of actions to implement, selected by the "heuristic" /
            "expert knowledge" / "automatic action".
        """
        return []
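
    # Example (illustrative sketch): a concrete subclass could implement this contract as
    # follows (the 0.9 threshold is purely illustrative). Note the difference between
    # returning "[do_nothing]" (the heuristic explicitly decides to do nothing) and
    # returning "[]" (the trained agent is asked to act):
    #
    #     def heuristic_actions(self, g2op_obs, reward, done, info):
    #         if g2op_obs.rho.max() <= 0.9:
    #             return [self.init_env.action_space()]  # explicit "do nothing"
    #         return []  # hand control back to the trained agent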

    def apply_heuristics_actions(self,
                                 g2op_obs: BaseObservation,
                                 reward: float,
                                 done: bool,
                                 info: Dict) -> Tuple[BaseObservation, float, bool, Dict]:
        """This function implements the "logic" behind the heuristic part. Unless you have a
        particular reason to, you probably should not modify this function.

        If you modify it, you should also modify the way the agent implements it (remember:
        this function is used at training time, the "GymAgent" part is used at inference time.
        Both behaviours should match for the best performance).

        While there are "heuristics" / "expert rules" / etc. to apply, this function performs
        steps in the underlying grid2op environment. It is expected to return when:

        - either the flag `done` is ``True``
        - or the neural network agent is asked to perform an action on the grid

        The neural network agent will receive the output of this function.

        Parameters
        ----------
        g2op_obs : BaseObservation
            The grid2op observation.
        reward : ``float``
            The reward.
        done : ``bool``
            The flag that indicates whether the environment is over or not.
        info : Dict
            Other information flags.

        Returns
        -------
        Tuple[BaseObservation, float, bool, Dict]
            It should return `obs, reward, done, info` (same as a single call to
            `grid2op_env.step(grid2op_act)`).

            This is then transmitted to the neural network agent (but before that, the
            observation is transformed into a gym observation thanks to the observation space).
        """
        need_action = True
        res_reward = reward
        tmp_reward = reward
        tmp_info = info
        while need_action:
            need_action = False
            g2op_actions = self.heuristic_actions(g2op_obs, tmp_reward, done, tmp_info)
            for g2op_act in g2op_actions:
                need_action = True
                tmp_obs, tmp_reward, tmp_done, tmp_info = self.init_env.step(g2op_act)
                g2op_obs = tmp_obs
                done = tmp_done
                if self._reward_cumul == "max":
                    res_reward = max(tmp_reward, res_reward)
                elif self._reward_cumul == "sum":
                    res_reward += tmp_reward
                elif self._reward_cumul == "last":
                    res_reward = tmp_reward
                if tmp_done:
                    break
            if done:
                break
        return g2op_obs, res_reward, done, info
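
    # Worked example (illustrative): suppose the step that triggered the heuristics yielded a
    # reward of 1.0 and the heuristic then plays two grid2op steps with rewards 2.0 and 0.5.
    # The reward finally returned is then:
    #   - "init": 1.0 (the reward obtained before the heuristic steps is kept)
    #   - "last": 0.5 (the reward of the last heuristic step)
    #   - "sum" : 3.5 (1.0 + 2.0 + 0.5)
    #   - "max" : 2.0 (the largest reward seen)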

    def fix_action(self, grid2op_action, g2op_obs):
        """This function can be used to "fix" / "modify" / "cut" / "change" a grid2op action
        just before it is applied to the underlying "env.step(...)".

        It can be used, for example, to "limit the curtailment or storage" of the action in
        case this one is too strong and would lead to a game over.

        By default it does nothing.

        Parameters
        ----------
        grid2op_action : BaseAction
            The grid2op action (decoded from the gym action) that is about to be applied on
            the grid.
        g2op_obs : BaseObservation
            The last grid2op observation seen by the environment.
        """
        return grid2op_action
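
    # Example (illustrative sketch): a subclass could override `fix_action` to "soften" the
    # agent's action before it reaches the grid. The rule below is hypothetical and only uses
    # quantities already manipulated in this module (`obs.rho` and the "do nothing" action);
    # limiting curtailment or storage, as suggested in the docstring, would depend on the
    # grid2op API available in your version:
    #
    #     class MyGymEnv(GymEnvWithRecoWithDN):  # GymEnvWithRecoWithDN is defined below
    #         def fix_action(self, grid2op_action, g2op_obs):
    #             # hypothetical rule: drop the proposed action when the grid is already
    #             # overloaded and fall back to "do nothing" instead
    #             if g2op_obs.rho.max() > 1.0:
    #                 return self.init_env.action_space()
    #             return grid2op_action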

    def step(self, gym_action):
        """This function implements the special case of the "step" function (as seen by the
        "gym environment") that might call multiple times the "step" function of the
        underlying "grid2op environment" depending on the heuristic.

        It takes a gym action, converts it to a grid2op action (thanks to the action space),
        then processes the heuristics / expert rules / forced actions / etc. and returns the
        next gym observation that will be processed by the agent.

        The number of "grid2op steps" can vary between different calls to the
        "gym environment" "step" function.

        It has the same signature as the `gym.Env` "step" function, of course.

        Parameters
        ----------
        gym_action :
            the action (represented as a gym one) that the agent wants to perform.

        Returns
        -------
        gym_obs:
            The gym observation that will be processed by the agent
        reward: ``float``
            The reward of the agent (possibly aggregated over the heuristic steps, depending
            on `reward_cumul`)
        done: ``bool``
            Whether the episode is over or not
        info: Dict
            Other kinds of information
        """
        g2op_act_tmp = self.action_space.from_gym(gym_action)
        g2op_act = self.fix_action(g2op_act_tmp, self._previous_act)
        g2op_obs, reward, done, info = self.init_env.step(g2op_act)
        if not done:
            g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, done, info)
        self._previous_act = g2op_obs
        gym_obs = self.observation_space.to_gym(g2op_obs)
        if hasattr(type(self), "_gymnasium") and type(self)._gymnasium:
            truncated = False
            return gym_obs, float(reward), done, truncated, info
        else:
            return gym_obs, float(reward), done, info
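
    # Example (illustrative sketch): depending on whether the gym or gymnasium flavour of
    # grid2op's GymEnv is used, `step` returns 4 or 5 elements. A training loop that does not
    # rely on a framework handling this automatically could unpack the result as:
    #
    #     step_res = gym_env.step(gym_act)
    #     if len(step_res) == 5:      # gymnasium-style API
    #         gym_obs, reward, done, truncated, info = step_res
    #     else:                       # legacy gym-style API
    #         gym_obs, reward, done, info = step_res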

    def reset(self, *, seed=None, return_info=False, options=None):
        """This function implements the "reset" function. It is called at the end of every
        episode and marks the beginning of a new one.

        Again, before the agent sees any observation from the environment, it is processed by
        the "heuristics" / "expert rules".

        .. note::
            The first observation seen by the agent is not necessarily the first observation
            of the grid2op environment.

        Returns
        -------
        gym_obs:
            The first open ai gym observation received by the agent
        """
        if hasattr(type(self), "_gymnasium") and type(self)._gymnasium:
            return_info = True

        done = True
        info = {}  # no extra information provided !
        while done:
            super()._aux_reset(seed, return_info, options)  # reset the scenario
            g2op_obs = self.init_env.get_obs()  # retrieve the observation
            reward = self.init_env.reward_range[0]  # the reward at first step is always minimal

            # perform the "heuristics" steps
            g2op_obs, reward, done, info = self.apply_heuristics_actions(g2op_obs, reward, False, info)

            # convert back the observation to gym
            if not done:
                self._previous_act = g2op_obs
                gym_obs = self.observation_space.to_gym(g2op_obs)
                break

        if return_info:
            return gym_obs, info
        else:
            return gym_obs
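

# Example (illustrative sketch, assuming the "l2rpn_case14_sandbox" dataset is available
# locally): a minimal random rollout showing how a concrete subclass (here
# GymEnvWithRecoWithDN, defined below in this module) is typically driven. A real use case
# would replace the random action by the output of a trained agent.
def _example_rollout_with_heuristics(env_name="l2rpn_case14_sandbox", nb_steps=10):
    import grid2op  # imported here because this sketch is not needed by the classes above

    gym_env = GymEnvWithRecoWithDN(grid2op.make(env_name), safe_max_rho=0.9)

    reset_res = gym_env.reset()
    # gymnasium-based versions return (obs, info) while gym-based ones return only obs
    gym_obs = reset_res[0] if isinstance(reset_res, tuple) else reset_res

    for _ in range(nb_steps):
        gym_act = gym_env.action_space.sample()  # a trained agent would act here instead
        step_res = gym_env.step(gym_act)         # 4 elements with gym, 5 with gymnasium
        gym_obs = step_res[0]
        done = step_res[2]  # `done` is at index 2 in both cases
        if done:
            reset_res = gym_env.reset()
            gym_obs = reset_res[0] if isinstance(reset_res, tuple) else reset_res
    return gym_obs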


class GymEnvWithReco(GymEnvWithHeuristics):
    """This specific type of environment with "heuristics" / "expert rules" / "expert actions"
    is an example to illustrate how to perform an automatic powerline reconnection.

    For this type of environment the only heuristic implemented is the following: "each time
    I can reconnect a powerline, I don't ask the agent: I reconnect it and send it the state
    after the powerline has been reconnected".

    With the proposed class, implementing it is fairly easy, as shown in the function
    :func:`GymEnvWithReco.heuristic_actions`.
    """

    def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]:
        """The heuristic is pretty simple: each time there is a disconnected powerline with a
        cooldown at 0, the heuristic reconnects it.

        Parameters
        ----------
        See parameters of :func:`GymEnvWithHeuristics.heuristic_actions`

        Returns
        -------
        See return values of :func:`GymEnvWithHeuristics.heuristic_actions`
        """
        # compute which powerlines can be reconnected
        to_reco = (g2op_obs.time_before_cooldown_line == 0) & (~g2op_obs.line_status)
        res = []
        if np.any(to_reco):
            # if I can reconnect any, I do it
            reco_id = np.where(to_reco)[0]
            for line_id in reco_id:
                g2op_act = self.init_env.action_space({"set_line_status": [(line_id, +1)]})
                res.append(g2op_act)
        return res
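

# Example (illustrative sketch): wrapping a grid2op environment so that powerline
# reconnections are handled automatically. It assumes grid2op is installed and the
# "l2rpn_case14_sandbox" dataset is available; any other grid2op environment name would
# work the same way.
#
#     import grid2op
#     gym_env = GymEnvWithReco(grid2op.make("l2rpn_case14_sandbox"))
#     gym_obs = gym_env.reset()  # pending reconnections (if any) are applied before this returns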


class GymEnvWithRecoWithDN(GymEnvWithHeuristics):
    """This environment is slightly more complex than the other one. It consists in 2 things:

    #. reconnecting the powerlines if possible
    #. doing nothing if the state of the grid is "safe" (for this class, the notion of
       "safety" is pretty simple: if all flows are below 90% (by default) of the thermal
       limit, then it is safe)

    If, for a given step, none of these things is applicable, the underlying trained agent
    is asked to perform an action.

    .. warning::
        When using this environment, we highly recommend to adapt the parameter
        `safe_max_rho` to suit your needs. Sometimes, 90% of the thermal limit is too high,
        sometimes it is too low.

    """
    def __init__(self, env_init, *args, reward_cumul="init", safe_max_rho=0.9, **kwargs):
        super().__init__(env_init, reward_cumul=reward_cumul, *args, **kwargs)
        self._safe_max_rho = safe_max_rho

    def heuristic_actions(self, g2op_obs, reward, done, info) -> List[BaseAction]:
        """To match the description of the environment, this heuristic will:

        - return the list of all the powerlines that can be reconnected, if any
        - return the list "[do nothing]" if the grid is safe
        - return the empty list (signaling the agent should take control over the heuristics)
          otherwise

        Parameters
        ----------
        See parameters of :func:`GymEnvWithHeuristics.heuristic_actions`

        Returns
        -------
        See return values of :func:`GymEnvWithHeuristics.heuristic_actions`
        """
        to_reco = (g2op_obs.time_before_cooldown_line == 0) & (~g2op_obs.line_status)
        res = []
        if np.any(to_reco):
            # reconnect something if it can be done
            reco_id = np.where(to_reco)[0]
            for line_id in reco_id:
                g2op_act = self.init_env.action_space({"set_line_status": [(line_id, +1)]})
                res.append(g2op_act)
        elif g2op_obs.rho.max() <= self._safe_max_rho:
            # play do nothing if there is "no problem" according to the "rule of thumb"
            res = [self.init_env.action_space()]
        return res
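

# Example (illustrative sketch): the `safe_max_rho` threshold is worth tuning per
# environment, as the warning in the class docstring points out. A more conservative
# wrapper could be created with, for instance:
#
#     import grid2op
#     gym_env = GymEnvWithRecoWithDN(grid2op.make("l2rpn_case14_sandbox"),
#                                    safe_max_rho=0.95,
#                                    reward_cumul="sum")
#     gym_obs = gym_env.reset()
#     # several grid2op steps may already have been played by the heuristic at this point;
#     # the underlying grid2op environment remains accessible through `gym_env.init_env`
#     # if you need to inspect it.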