PPO: with ray/rllib

Description

This “baseline” aims at providing a code example showing how to use an agent from the ray/rllib framework (see https://docs.ray.io/en/master/rllib/) with grid2op.

It also serves a second goal: to show how to train a PPO agent to perform continuous actions on the powergrid (e.g. adjusting the generator setpoints, either through redispatching actions on controllable generators, through curtailment of generators using renewable energy sources - solar and wind - or even by controlling the state of the storage units).
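
As an illustration of these continuous actions (a sketch only; the attributes actually available depend on the environment you use), the grid2op "gym_compat" module exposes them as the "redispatch", "curtail" and "set_storage" attributes of a BoxGymActSpace:

import grid2op
from grid2op.gym_compat import BoxGymActSpace
from lightsim2grid import LightSimBackend

env = grid2op.make("l2rpn_case14_sandbox", backend=LightSimBackend())
# the three families of continuous actions map to these action attributes
# ("set_storage" is only meaningful on environments that have storage units)
gym_action_space = BoxGymActSpace(env.action_space,
                                  attr_to_keep=["redispatch", "curtail"])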

It is pretty much the same as l2rpn_baselines.PPO_SB3 but uses rllib instead of stable baselines3.

Exported class

You can use this class with:

from l2rpn_baselines.PPO_RLLIB import train, evaluate, PPO_RLLIB

Use a trained agent

You first need to train it:

import re
import grid2op
from grid2op.Reward import LinesCapacityReward  # or any other rewards
from grid2op.Chronics import MultifolderWithCache  # highly recommended
from lightsim2grid import LightSimBackend  # highly recommended for training !
import ray
from l2rpn_baselines.PPO_RLLIB import train


env_name = "l2rpn_case14_sandbox"
env = grid2op.make(env_name,
                   backend=LightSimBackend())

ray.init()
try:
    trained_agent = train(
          env,
          iterations=10,  # any number of iterations you want
          save_path="./saved_model",  # where the NN weights will be saved
          name="test",  # name of the baseline
          net_arch=[100, 100, 100],  # architecture of the NN
          save_every_xxx_steps=2,  # save the NN every 2 training steps
          env_kwargs={"reward_class": LinesCapacityReward,
                      "chronics_class": MultifolderWithCache,  # highly recommended
                      "data_feeding_kwargs": {
                          'filter_func': lambda x: re.match(".*00$", x) is not None  # use 1 chronic out of 100 to train (for speed)
                          }
          },
          verbose=True
          )
finally:
    env.close()
    ray.shutdown()

Then you can load it:

import grid2op
from grid2op.Reward import LinesCapacityReward  # or any other rewards
from grid2op.Runner import Runner  # to compare with the do nothing agent
from lightsim2grid import LightSimBackend  # highly recommended !
from l2rpn_baselines.PPO_RLLIB import evaluate

nb_episode = 7
nb_process = 1
verbose = True

env_name = "l2rpn_case14_sandbox"
env = grid2op.make(env_name,
                    reward_class=LinesCapacityReward,
                    backend=LightSimBackend()
                    )

try:
    trained_agent = evaluate(
             env,
             nb_episode=nb_episode,
             load_path="./saved_model",  # should be the same as what has been called in the train function !
             name="test3",  # should be the same as what has been called in the train function !
             nb_process=1,
             verbose=verbose,
             )

    # you can also compare your agent with the do nothing agent relatively
    # easily
    runner_params = env.get_params_for_runner()
    runner = Runner(**runner_params)

    res = runner.run(nb_episode=nb_episode,
                    nb_process=nb_process
                    )

    # Print summary
    if verbose:
        print("Evaluation summary for DN:")
        for _, chron_name, cum_reward, nb_time_step, max_ts in res:
            msg_tmp = "chronics at: {}".format(chron_name)
            msg_tmp += "\ttotal score: {:.6f}".format(cum_reward)
            msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts)
            print(msg_tmp)
finally:
    env.close()

Create an agent from scratch

For example, here is how to create an agent from scratch, with some parameters:

import grid2op
from grid2op.gym_compat import BoxGymObsSpace, BoxGymActSpace
from lightsim2grid import LightSimBackend
from l2rpn_baselines.PPO_RLLIB import PPO_RLLIB

env_name = "l2rpn_case14_sandbox"  # or any other name
obs_attr_to_keep = ["day_of_week", "hour_of_day", "minute_of_hour", "prod_p", "prod_v", "load_p", "load_q",
                    "actual_dispatch", "target_dispatch", "topo_vect", "time_before_cooldown_line",
                    "time_before_cooldown_sub", "rho", "timestep_overflow", "line_status",
                    "storage_power", "storage_charge"]
act_attr_to_keep = ["redispatch"]

# create the grid2op environment
env = grid2op.make(env_name, backend=LightSimBackend())

# define the action space and observation space that your agent
# will be able to use
gym_observation_space = BoxGymObsSpace(env.observation_space, attr_to_keep=obs_attr_to_keep)
gym_action_space = BoxGymActSpace(env.action_space, attr_to_keep=act_attr_to_keep)

# define the configuration for the environment
env_config = {"env_name": env.env_name,
              "backend_class": LightSimBackend,
              "obs_attr_to_keep": obs_attr_to_keep,
              "act_attr_to_keep": act_attr_to_keep,
                # other type of parameters used in the "grid2op.make"
                # function eg:
                # "param": ...
                # "reward_class": ...
                # "other_reward": ...
                # "difficulty": ...
                }

# now define the configuration for the PPOTrainer
env_config_ppo = {
    # config to pass to env class
    "env_config": env_config,
    #neural network config
    "lr": 1e-4, # learning_rate
    "model": {
        "fcnet_hiddens": [100, 100, 100],  # neural net architecture
    },
    # other keyword arguments
}

# create a grid2op agent based on that (nn_path=None, so no saved weights are loaded)
grid2op_agent = PPO_RLLIB(env.action_space,
                          gym_action_space,
                          gym_observation_space,
                          nn_config=env_config_ppo,
                          nn_path=None  # don't load it from anywhere
                          )

# use it
obs = env.reset()
reward = env.reward_range[0]
done = False
grid2op_act = grid2op_agent.act(obs, reward, done)
obs, reward, done, info = env.step(grid2op_act)
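
To run this (untrained) agent over a complete episode rather than a single step, you can simply loop until the environment is done (a minimal sketch using the grid2op API shown above):

# run one full episode with the agent defined above
obs = env.reset()
reward = env.reward_range[0]
done = False
while not done:
    grid2op_act = grid2op_agent.act(obs, reward, done)
    obs, reward, done, info = env.step(grid2op_act)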

Note

The agent above is NOT trained. So it will basically output “random” actions.

You should probably train it beforehand (see the train function).

Detailed documentation

Classes:

Env_RLLIB(env_config)

This class represents the Environment usable from rllib, mapping a grid2op environment.

PPO_RLLIB

alias of RLLIBAgent

Functions:

evaluate(env[, name, load_path, logs_path, ...])

This function will use the rllib package to evaluate a previously trained PPO agent (trained with rllib) on a grid2op environment "env".

train(env[, name, iterations, save_path, ...])

This function will use rllib to train a PPO agent on a grid2op environment "env".

class l2rpn_baselines.PPO_RLLIB.Env_RLLIB(env_config)[source]

This class represents the Environment usable from rllib, mapping a grid2op environment.

It is primarily made to serve as an example of what is possible to achieve. You will probably want to customize this environment to your specific needs.

This agent uses the rllib framework for its neural network.

Warning

A grid2op environment is created when this agent is made. We found out rllib worked better this way.

To be built, it requires the env_config parameter. This parameter is a dictionary with the following keys (a minimal example is given after the list):

  • “env_name”: the name of the environment you want to make

  • “obs_attr_to_keep”: the attributes of the observation you want to use in the gym observation space (gym observation space is converted to a Box)

  • “act_attr_to_keep” : the attributes of the action you want to use in the gym action space (gym action space is also converted to a Box)

  • “backend_class”: the type of backend to use

  • “backend_kwargs”: the extra keyword arguments used when creating the backend

  • all other arguments are passed to the grid2op.make(…) function
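
Putting these keys together, a minimal env_config could look like the following (a sketch only; "reward_class" is given here as an example of an extra argument forwarded to grid2op.make):

from grid2op.Reward import LinesCapacityReward
from lightsim2grid import LightSimBackend
from l2rpn_baselines.PPO_RLLIB import Env_RLLIB

env_config = {
    "env_name": "l2rpn_case14_sandbox",
    "backend_class": LightSimBackend,
    "obs_attr_to_keep": ["rho", "prod_p", "load_p"],  # small subset, for the example
    "act_attr_to_keep": ["redispatch"],
    # any other key is forwarded to grid2op.make(...)
    "reward_class": LinesCapacityReward,
}
rllib_env = Env_RLLIB(env_config)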

Methods:

reset(*[, seed, options])

Resets the environment to an initial internal state, returning an initial observation and info.

step(action)

Run one timestep of the environment's dynamics using the agent actions.

reset(*, seed=None, options=None)[source]

Resets the environment to an initial internal state, returning an initial observation and info.

This method generates a new starting state, often with some randomness, to ensure that the agent explores the state space and learns a generalised policy about the environment. This randomness can be controlled with the seed parameter; otherwise, if the environment already has a random number generator and reset() is called with seed=None, the RNG is not reset.

Therefore, reset() should (in the typical use case) be called with a seed right after initialization and then never again.

For Custom environments, the first line of reset() should be super().reset(seed=seed) which implements the seeding correctly.

Changed in version v0.25: The return_info parameter was removed and now info is expected to be returned.
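
For Env_RLLIB, this seeding paradigm looks like the following (a minimal sketch, assuming env_config is a valid configuration dictionary as described earlier):

env = Env_RLLIB(env_config)     # env_config: see the keys listed above
obs, info = env.reset(seed=42)  # seed the PRNG once, right after creation
# ... interact with the environment ...
obs, info = env.reset()         # later resets reuse the existing PRNG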

Parameters:
  • seed (optional int) – The seed that is used to initialize the environment’s PRNG (np_random). If the environment does not already have a PRNG and seed=None (the default option) is passed, a seed will be chosen from some source of entropy (e.g. timestamp or /dev/urandom). However, if the environment already has a PRNG and seed=None is passed, the PRNG will not be reset. If you pass an integer, the PRNG will be reset even if it already exists. Usually, you want to pass an integer right after the environment has been initialized and then never again. Please refer to the minimal example above to see this paradigm in action.

  • options (optional dict) – Additional information to specify how the environment is reset (optional, depending on the specific environment)

Returns:
  • observation (ObsType) – Observation of the initial state. This will be an element of observation_space (typically a numpy array) and is analogous to the observation returned by step().

  • info (dict) – This dictionary contains auxiliary information complementing observation. It should be analogous to the info returned by step().

step(action)[source]

Run one timestep of the environment’s dynamics using the agent actions.

When the end of an episode is reached (terminated or truncated), it is necessary to call reset() to reset this environment’s state for the next episode.

Changed in version 0.26: The Step API was changed removing done in favor of terminated and truncated to make it clearer to users when the environment had terminated or truncated which is critical for reinforcement learning bootstrapping algorithms.
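
In practice, the interaction loop therefore follows the gymnasium 5-tuple convention (a sketch, assuming env is an Env_RLLIB instance such as the one built in the reset() example above):

obs, info = env.reset(seed=0)
terminated, truncated = False, False
while not (terminated or truncated):
    action = env.action_space.sample()  # replace with your trained policy
    obs, reward, terminated, truncated, info = env.step(action)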

Parameters:

action (ActType) – an action provided by the agent to update the environment state.

Returns:
  • observation (ObsType) – An element of the environment’s observation_space as the next observation due to the agent actions. An example is a numpy array containing the positions and velocities of the pole in CartPole.

  • reward (SupportsFloat) – The reward as a result of taking the action.

  • terminated (bool) – Whether the agent reaches the terminal state (as defined under the MDP of the task), which can be positive or negative. An example is reaching the goal state or moving into the lava from the Sutton and Barto Gridworld. If true, the user needs to call reset().

  • truncated (bool) – Whether the truncation condition outside the scope of the MDP is satisfied. Typically, this is a time limit, but could also be used to indicate an agent physically going out of bounds. Can be used to end the episode prematurely before a terminal state is reached. If true, the user needs to call reset().

  • info (dict) – Contains auxiliary diagnostic information (helpful for debugging, learning, and logging). This might, for instance, contain: metrics that describe the agent’s performance state, variables that are hidden from observations, or individual reward terms that are combined to produce the total reward. In OpenAI Gym <v26, it contains “TimeLimit.truncated” to distinguish truncation and termination, but this is deprecated in favour of returning terminated and truncated variables.

  • done (bool) – (Deprecated) A boolean value for whether the episode has ended, in which case further step() calls will return undefined results. This was removed in OpenAI Gym v26 in favor of the terminated and truncated attributes. A done signal may be emitted for different reasons: maybe the task underlying the environment was solved successfully, a certain time limit was exceeded, or the physics simulation has entered an invalid state.

l2rpn_baselines.PPO_RLLIB.PPO_RLLIB

alias of RLLIBAgent

Methods:

build()

Create the underlying NN model from scratch.

get_act(gym_obs, reward, done)

Retrieve the gym action from the gym observation and the reward.

load()

Load the NN model.

l2rpn_baselines.PPO_RLLIB.evaluate(env, name='ppo_rllib', load_path='.', logs_path=None, nb_episode=1, nb_process=1, max_steps=-1, verbose=False, save_gif=False, **kwargs)[source]

This function will use the rllib package to evaluate a previously trained PPO agent (trained with rllib) on a grid2op environment “env”.

It will use the grid2op “gym_compat” module to convert the action space to a BoxActionSpace and the observation to a BoxObservationSpace.

It is suited for studying the impact of continuous actions:

  • on storage units

  • on dispatchable generators

  • on generators with renewable energy sources

Parameters:
  • env (grid2op.Environment) – The environment on which you need to evaluate your agent.

  • name (str) – The name of your agent.

  • load_path (str) – If you want to reload your baseline, specify the path where it is located. NB: if a baseline is reloaded, some of the arguments provided to this function will not be used.

  • logs_path (str) – Where to store the logs generated during the evaluation. None if you don’t want to log them.

  • nb_episode (int) – How many episodes to run during the assessment of the performance

  • nb_process (int) – On how many processes the assessment will be made. (setting this > 1 can lead to some speed ups but can be unstable on some platforms)

  • max_steps (int) – How many steps at maximum your agent will be assessed

  • verbose (bool) – Currently unused

  • save_gif (bool) – Whether or not you want to save, as a gif, the performance of your agent. It might cause memory issues (might take a lot of ram) and drastically increase computation time.

  • kwargs – extra parameters passed to the PPO from rllib

Returns:

The loaded baseline.

Return type:

baseline

Examples

Here is an example on how to evaluate a PPO agent (trained using RLLIB):

import grid2op
from grid2op.Reward import LinesCapacityReward  # or any other rewards
from grid2op.Runner import Runner  # to compare with the do nothing agent
from lightsim2grid import LightSimBackend  # highly recommended !
from l2rpn_baselines.PPO_RLLIB import evaluate

nb_episode = 7
nb_process = 1
verbose = True

env_name = "l2rpn_case14_sandbox"
env = grid2op.make(env_name,
                   reward_class=LinesCapacityReward,
                   backend=LightSimBackend()
                   )

try:
    evaluate(env,
            nb_episode=nb_episode,
            load_path="./saved_model",  # should be the same as what has been called in the train function !
            name="test",  # should be the same as what has been called in the train function !
            nb_process=1,
            verbose=verbose,
            )

    # you can also compare your agent with the do nothing agent relatively
    # easily
    runner_params = env.get_params_for_runner()
    runner = Runner(**runner_params)

    res = runner.run(nb_episode=nb_episode,
                    nb_process=nb_process
                    )

    # Print summary
    if verbose:
        print("Evaluation summary for DN:")
        for _, chron_name, cum_reward, nb_time_step, max_ts in res:
            msg_tmp = "chronics at: {}".format(chron_name)
            msg_tmp += "        total score: {:.6f}".format(cum_reward)
            msg_tmp += "        time steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts)
            print(msg_tmp)
finally:
    env.close()
l2rpn_baselines.PPO_RLLIB.train(env, name='ppo_rllib', iterations=1, save_path=None, load_path=None, net_arch=None, learning_rate=0.0003, verbose=False, save_every_xxx_steps=None, obs_attr_to_keep=['day_of_week', 'hour_of_day', 'minute_of_hour', 'prod_p', 'prod_v', 'load_p', 'load_q', 'actual_dispatch', 'target_dispatch', 'topo_vect', 'time_before_cooldown_line', 'time_before_cooldown_sub', 'rho', 'timestep_overflow', 'line_status', 'storage_power', 'storage_charge'], act_attr_to_keep=['redispatch', 'curtail', 'set_storage'], env_kwargs=None, **kwargs)[source]

This function will use rllib to train a PPO agent on a grid2op environment “env”.

It will use the grid2op “gym_compat” module to convert the action space to a BoxActionSpace and the observation to a BoxObservationSpace.

It is suited for studying the impact of continuous actions:

  • on storage units

  • on dispatchable generators

  • on generators with renewable energy sources

Warning

The environment used by RLLIB is copied and remade. This does not work if you over-specialize the environment! For example, the opponent is not taken into account (yet), nor the chronics class, etc.

If you want such a level of control, please use the env_kwargs parameter!

Parameters:
  • env (grid2op.Environment) –

    The environment on which you need to train your agent.

    Only the name of the environment and its backend are used. The rest will be created by rllib.

  • name (str) – The name of your agent.

  • iterations (int) – For how many iterations you want to train the model. These are NOT environment steps, but ray's internal number of training iterations.

  • save_path (str) – Where do you want to save your baseline.

  • load_path (str) – If you want to reload your baseline, specify the path where it is located. NB if a baseline is reloaded some of the argument provided to this function will not be used.

  • net_arch – The neural network architecture, used to create the neural network of the PPO (see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html)

  • learning_rate (float) – The learning rate, see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html

  • save_every_xxx_steps (int) – If set (by default it’s None) the model will be saved to the hard drive every save_every_xxx_steps steps performed in the environment.

  • obs_attr_to_keep (list of string) – Grid2op attribute to use to build the BoxObservationSpace. It is passed as the “attr_to_keep” value of the BoxObservation space (see https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymObsSpace)

  • act_attr_to_keep (list of string) – Grid2op attribute to use to build the BoxGymActSpace. It is passed as the “attr_to_keep” value of the BoxAction space (see https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.BoxGymActSpace)

  • verbose (bool) – If you want something to be printed on the terminal (a better logging strategy will be put at some point)

  • env_kwargs (Optional[dict]) – Extra key word arguments passed to the building of the grid2op environment.

  • kwargs – extra parameters passed to the trainer from rllib

Returns:

The trained baseline.

Return type:

baseline

Examples

Here is an example on how to train a PPO agent with rllib.

First define a python script, for example:

import re
import grid2op
import ray
from grid2op.Reward import LinesCapacityReward  # or any other rewards
from grid2op.Chronics import MultifolderWithCache  # highly recommended
from lightsim2grid import LightSimBackend  # highly recommended for training !
from l2rpn_baselines.PPO_RLLIB import train

env_name = "l2rpn_case14_sandbox"
env = grid2op.make(env_name,
                   backend=LightSimBackend())

ray.init()  # if needed (you might have it already working somewhere)
try:
    train(env,
          iterations=10,  # any number of iterations you want
          save_path="./saved_model",  # where the NN weights will be saved
          name="test",  # name of the baseline
          net_arch=[100, 100, 100],  # architecture of the NN
          save_every_xxx_steps=2,  # save the NN every 2 training steps
          env_kwargs={"reward_class": LinesCapacityReward,
                      "chronics_class": MultifolderWithCache,  # highly recommended
                      "data_feeding_kwargs": {
                          'filter_func': lambda x: re.match(".*00$", x) is not None  # use 1 chronic out of 100 to train (for speed)
                          }
          },
          verbose=True
          )
finally:
    env.close()
    ray.shutdown()  # if needed (you might have it already working somewhere)