Source code for l2rpn_baselines.utils.trainingParam

# Copyright (c) 2020, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of L2RPN Baselines, a repository to host baselines for l2rpn competitions.
import os
import json
import numpy as np


class TrainingParam(object):
    """
    A class to store the training parameters of the models. It was hard coded in the getting_started/notebook 3
    of grid2op and put in this repository instead.

    .. warning::
        This baseline recodes the entire RL training procedure. You can use it if you want to have a deeper
        look at the Deep Q Learning algorithm and a possible (non optimized, slow, etc.) implementation.

        For a much better implementation, you can reuse the code of the "PPO_RLLIB" or the "PPO_SB3" baseline.

        Prefer to use the :class:`GymAgent` class and the :class:`GymEnvWithHeuristics` classes to train agents
        that interact with grid2op and are fully compatible with the gym framework.

    Attributes
    ----------
    buffer_size: ``int``
        Size of the replay buffer

    minibatch_size: ``int``
        Size of the training minibatch

    update_freq: ``int``
        Frequency at which the model is trained. The model is trained once every `update_freq` steps, using
        `minibatch_size` samples drawn from an experience replay buffer.

    final_epsilon: ``float``
        Value of the final epsilon (for the epsilon-greedy exploration)

    initial_epsilon: ``float``
        Value of the initial epsilon (for the epsilon-greedy exploration)

    step_for_final_epsilon: ``int``
        Number of steps at which the final epsilon (for the epsilon-greedy exploration) will be reached

    min_observation: ``int``
        Number of observations gathered before starting to train the neural networks. Before this number of
        iterations, the agent simply interacts with the environment.

    lr: ``float``
        The initial learning rate

    lr_decay_steps: ``int``
        The learning rate decay step

    lr_decay_rate: ``float``
        The learning rate decay rate

    num_frames: ``int``
        Currently not used

    discount_factor: ``float``
        The discount factor (a high discount factor favors longer episodes, a small one does not). This is
        often called "gamma" in RL papers. It is the gamma in: "RL wants to maximize the sum of the discounted
        rewards, which is sum_{t >= t_0} gamma^{t - t_0} * r_t".

    tau: ``float``
        Rate at which the target model is updated. The target model is updated according to
        `target_model_weights[i] = tau * model_weights[i] + (1 - tau) * target_model_weights[i]`

    min_iter: ``int``
        It is possible in the training schedule to limit the number of time steps an episode can last. This is
        mainly useful at the beginning of training, to avoid reaching a state where the grid has been modified
        so much that the agent will never encounter a resembling state ever again. Stopping the episode before
        this happens can help the learning.

    max_iter: ``int``
        Just like "min_iter", but instead of being the minimum number of iterations, it is the maximum.

    update_nb_iter: ``int``
        If max_iter_fun is the default one, this number gives the number of times a scenario needs to be
        completed successfully before the maximum number of time steps allowed per episode is increased.

    step_increase_nb_iter: ``int`` or ``None``
        By how many time steps the maximum number of time steps allowed per episode is increased. Set it to 0
        to deactivate this.

    max_iter_fun: ``function``
        A function that returns the maximum number of steps an episode can count as for the current epoch. For
        example it can be `max_iter_fun = lambda epoch_num: np.sqrt(50 * epoch_num)`
        [default: ``TrainingParam.default_max_iter_fun``]

    oversampling_rate: ``float`` or ``None``
        Set it to ``None`` to deactivate the oversampling of hard scenarios. Otherwise, this oversampling is
        done with something like `proba = 1. / (time_step_lived**oversampling_rate + 1)`, where `proba` is the
        probability for a scenario to be selected at the next call to "reset" and `time_step_lived` is the
        number of time steps survived on this scenario.

    random_sample_datetime_start: ``int`` or ``None``
        If ``None``, during training the chronics will always start at the datetime at which the chronics
        start. Otherwise, the training scheme will skip a number of time steps between 0 and
        `random_sample_datetime_start` when loading the next chronics. This is particularly useful when you
        want your agent to learn to operate the grid regardless of the hour of the day or the day of the week.

    update_tensorboard_freq: ``int``
        Frequency at which tensorboard is refreshed (tensorboard summaries are saved every
        `update_tensorboard_freq` steps)

    save_model_each: ``int``
        Frequency at which the model is saved (it is saved every `save_model_each` steps)

    max_global_norm_grad: ``float``
        Maximum gradient norm allowed (this can make the training more stable). Defaults to ``None``
        (deactivated). Not all baselines are compatible.

    max_value_grad: ``float``
        Maximum value the gradient can take. Assign it to ``None`` to deactivate it. This can make the training
        more stable in some cases, but can also slow down the training process. Not all baselines are
        compatible.

    max_loss: ``float``
        Clip the value of the loss function. Set it to ``None`` to deactivate it. Again, this can make the
        training more stable but possibly slower. Not all baselines are compatible.
    """
    _tol_float_equal = float(1e-8)
    _int_attr = ["buffer_size", "minibatch_size", "step_for_final_epsilon", "min_observation",
                 "last_step", "num_frames", "update_freq", "min_iter", "max_iter",
                 "update_tensorboard_freq", "save_model_each", "_update_nb_iter",
                 "step_increase_nb_iter", "min_observe", "sample_one_random_action_begin"]
    _float_attr = ["_final_epsilon", "_initial_epsilon", "lr", "lr_decay_steps", "lr_decay_rate",
                   "discount_factor", "tau", "oversampling_rate", "max_global_norm_grad",
                   "max_value_grad", "max_loss"]

    def __init__(self,
                 buffer_size=40000,
                 minibatch_size=64,
                 step_for_final_epsilon=100000,  # step at which min_epsilon is reached
                 min_observation=5000,  # 5000
                 final_epsilon=1./(7*288.),  # have on average 1 random action per week of approx 7*288 time steps
                 initial_epsilon=0.4,
                 lr=1e-4,
                 lr_decay_steps=10000,
                 lr_decay_rate=0.999,
                 num_frames=1,
                 discount_factor=0.99,
                 tau=0.01,
                 update_freq=256,
                 min_iter=50,
                 max_iter=8064,  # 1 month
                 update_nb_iter=10,
                 step_increase_nb_iter=0,  # by default no oversampling / under sampling based on difficulty
                 update_tensorboard_freq=1000,  # update tensorboard every "update_tensorboard_freq" steps
                 save_model_each=10000,  # save the model every "save_model_each" steps
                 random_sample_datetime_start=None,
                 oversampling_rate=None,
                 max_global_norm_grad=None,
                 max_value_grad=None,
                 max_loss=None,
                 # observer: let the neural network "observe" for a given amount of time,
                 # during which all actions are replaced by a do nothing
                 min_observe=None,
                 # do a random action at the beginning of an episode until a certain number of steps
                 # has been made
                 # (it is recommended to have "min_observe" larger than this; this is an int)
                 sample_one_random_action_begin=None,
                 ):
        self.random_sample_datetime_start = random_sample_datetime_start
        self.buffer_size = int(buffer_size)
        self.minibatch_size = int(minibatch_size)
        self.min_observation = int(min_observation)
        # have on average 1 random action per day of approx 288 time steps at the end
        # (never kill completely the exploration)
        self._final_epsilon = float(final_epsilon)
        self._initial_epsilon = float(initial_epsilon)
        self.step_for_final_epsilon = float(step_for_final_epsilon)
        self.lr = float(lr)
        self.lr_decay_steps = float(lr_decay_steps)
        self.lr_decay_rate = float(lr_decay_rate)

        # gradient clipping (if supported)
        self.max_global_norm_grad = max_global_norm_grad
        self.max_value_grad = max_value_grad
        self.max_loss = max_loss

        # observer
        self.min_observe = min_observe
        self.sample_one_random_action_begin = sample_one_random_action_begin

        self.last_step = int(0)
        self.num_frames = int(num_frames)
        self.discount_factor = float(discount_factor)
        self.tau = float(tau)
        self.update_freq = int(update_freq)
        self.min_iter = int(min_iter)
        self.max_iter = int(max_iter)
        self._1_update_nb_iter = None
        self.update_nb_iter = int(update_nb_iter)  # the setter also computes self._1_update_nb_iter
        if step_increase_nb_iter is None:
            # 0 and None have the same effect: they disable the feature
            step_increase_nb_iter = 0
        self.step_increase_nb_iter = step_increase_nb_iter

        if oversampling_rate is not None:
            self.oversampling_rate = float(oversampling_rate)
        else:
            self.oversampling_rate = None

        self.update_tensorboard_freq = update_tensorboard_freq
        self.save_model_each = save_model_each
        self.max_iter_fun = self.default_max_iter_fun
        self._compute_exp_facto()

    @property
    def final_epsilon(self):
        return self._final_epsilon

    @final_epsilon.setter
    def final_epsilon(self, final_epsilon):
        self._final_epsilon = final_epsilon
        self._compute_exp_facto()

    @property
    def initial_epsilon(self):
        return self._initial_epsilon

    @initial_epsilon.setter
    def initial_epsilon(self, initial_epsilon):
        self._initial_epsilon = initial_epsilon
        self._compute_exp_facto()

    @property
    def update_nb_iter(self):
        return self._update_nb_iter

    @update_nb_iter.setter
    def update_nb_iter(self, update_nb_iter):
        self._update_nb_iter = update_nb_iter
        if self._update_nb_iter is not None and self._update_nb_iter > 0:
            self._1_update_nb_iter = 1.0 / self._update_nb_iter
        else:
            self._1_update_nb_iter = 1.0

    def _compute_exp_facto(self):
        if self.final_epsilon is not None and self.initial_epsilon is not None and self.final_epsilon > 0:
            self._exp_facto = np.log(self.initial_epsilon / self.final_epsilon)
        else:
            # TODO
            self._exp_facto = 1
    def default_max_iter_fun(self, nb_success):
        """The default max-iteration function used."""
        return self.step_increase_nb_iter * int(nb_success * self._1_update_nb_iter)
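    # Worked example (illustrative values, not defaults): with update_nb_iter=10 and
    # step_increase_nb_iter=100, `default_max_iter_fun(25)` returns
    # 100 * int(25 * 1/10) = 100 * 2 = 200.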
    def tell_step(self, current_step):
        """Tell this instance the number of training steps that have been made."""
        self.last_step = current_step
    def get_next_epsilon(self, current_step):
        """Get the next epsilon for the epsilon-greedy exploration."""
        self.tell_step(current_step)
        if self.step_for_final_epsilon is None or self.initial_epsilon is None \
                or self._exp_facto is None or self.final_epsilon is None:
            res = 0.
        else:
            if current_step > self.step_for_final_epsilon:
                res = self.final_epsilon
            else:
                # exponential decrease
                res = self.initial_epsilon * np.exp(- (current_step / self.step_for_final_epsilon) * self._exp_facto)
        return res
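    # Worked example of the schedule computed by `get_next_epsilon` (illustrative values,
    # not defaults): with initial_epsilon=0.4, final_epsilon=0.04 and
    # step_for_final_epsilon=100000, _exp_facto = log(0.4 / 0.04) = log(10), so
    #   epsilon(0)      = 0.4
    #   epsilon(50000)  = 0.4 * exp(-0.5 * log(10)) ~ 0.1265
    #   epsilon(100000) = 0.04
    # and epsilon stays at final_epsilon for any larger step.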
    def to_dict(self):
        """Serialize this instance to a dictionary."""
        res = {}
        for attr_nm in self._int_attr:
            tmp = getattr(self, attr_nm)
            if tmp is not None:
                res[attr_nm] = int(tmp)
            else:
                res[attr_nm] = None
        for attr_nm in self._float_attr:
            tmp = getattr(self, attr_nm)
            if tmp is not None:
                res[attr_nm] = float(tmp)
            else:
                res[attr_nm] = None
        return res
    @staticmethod
    def from_dict(tmp):
        """Initialize this instance from a dictionary."""
        if not isinstance(tmp, dict):
            raise RuntimeError("TrainingParam from dict must be called with a dictionary, and not {}".format(tmp))
        res = TrainingParam()
        for attr_nm in TrainingParam._int_attr:
            if attr_nm in tmp:
                tmp_ = tmp[attr_nm]
                if tmp_ is not None:
                    setattr(res, attr_nm, int(tmp_))
                else:
                    setattr(res, attr_nm, None)
        for attr_nm in TrainingParam._float_attr:
            if attr_nm in tmp:
                tmp_ = tmp[attr_nm]
                if tmp_ is not None:
                    setattr(res, attr_nm, float(tmp_))
                else:
                    setattr(res, attr_nm, None)
        res.update_nb_iter = res._update_nb_iter
        res.initial_epsilon = res._initial_epsilon
        res._compute_exp_facto()
        return res
    @staticmethod
    def from_json(json_path):
        """Initialize this instance from a json file."""
        if not os.path.exists(json_path):
            raise FileNotFoundError("No file is located at \"{}\"".format(json_path))
        with open(json_path, "r") as f:
            dict_ = json.load(f)
        return TrainingParam.from_dict(dict_)
    def save_as_json(self, path, name=None):
        """Save this instance as a json file."""
        res = self.to_dict()
        if name is None:
            name = "training_parameters.json"
        if not os.path.exists(path):
            raise RuntimeError("Directory \"{}\" not found to save the training parameters".format(path))
        if not os.path.isdir(path):
            raise NotADirectoryError("\"{}\" should be a directory".format(path))
        path_out = os.path.join(path, name)
        with open(path_out, "w", encoding="utf-8") as f:
            json.dump(res, fp=f, indent=4, sort_keys=True)
    def do_train(self):
        """Return whether or not the model should be trained at this time step."""
        return self.last_step % self.update_freq == 0
    def __eq__(self, other):
        res = True
        for el in self._int_attr:
            me_ = getattr(self, el)
            oth_ = getattr(other, el)
            if me_ is None and oth_ is not None:
                res = False
                break
            if oth_ is None and me_ is not None:
                res = False
                break
            if me_ is None and oth_ is None:
                continue
            if int(me_) != int(oth_):
                res = False
                break
        if res:
            for el in self._float_attr:
                me_ = getattr(self, el)
                oth_ = getattr(other, el)
                if me_ is None and oth_ is not None:
                    res = False
                    break
                if oth_ is None and me_ is not None:
                    res = False
                    break
                if me_ is None and oth_ is None:
                    continue
                if abs(float(me_) - float(oth_)) > self._tol_float_equal:
                    res = False
                    break
        return res
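# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): it shows how a
# TrainingParam instance can be created, how the epsilon-greedy schedule is
# queried with `get_next_epsilon`, and how the parameters round-trip through
# JSON with `save_as_json` / `from_json`. The temporary directory is only
# needed for this example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import tempfile

    tp = TrainingParam(buffer_size=20000,
                       minibatch_size=32,
                       initial_epsilon=0.4,
                       final_epsilon=1. / (7 * 288.),
                       step_for_final_epsilon=100000)

    # epsilon decays exponentially from initial_epsilon down to final_epsilon
    for step in (0, 1000, 50000, 100000, 200000):
        print("step {:>6d}: epsilon = {:.5f}".format(step, tp.get_next_epsilon(step)))

    # serialize to json and reload: both instances compare equal
    with tempfile.TemporaryDirectory() as tmp_dir:
        tp.save_as_json(tmp_dir, name="training_parameters.json")
        tp2 = TrainingParam.from_json(os.path.join(tmp_dir, "training_parameters.json"))
    assert tp == tp2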