Source code for l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN

# Copyright (c) 2020, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of L2RPN Baselines, a repository to host baselines for l2rpn competitions.

import os
import json
import math
import numpy as np
try:
    import tensorflow as tf
    _CAN_USE_TENSORFLOW = True
except ImportError:
    _CAN_USE_TENSORFLOW = False
    
from grid2op.Agent import AgentWithConverter
from grid2op.Converter import IdToAct

from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQNConfig import DoubleDuelingDQNConfig as cfg
from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN_NN import DoubleDuelingDQN_NN
from l2rpn_baselines.DoubleDuelingDQN.prioritized_replay_buffer import PrioritizedReplayBuffer


class DoubleDuelingDQN(AgentWithConverter):
    def __init__(self,
                 observation_space,
                 action_space,
                 name=__name__,
                 is_training=False):
        if not _CAN_USE_TENSORFLOW:
            raise RuntimeError("Cannot import tensorflow, this function cannot be used.")

        # Call parent constructor
        AgentWithConverter.__init__(self, action_space,
                                    action_space_converter=IdToAct)
        self.obs_space = observation_space

        # Filter
        #print("Actions filtering...")
        self.action_space.filter_action(self._filter_action)
        #print("..Done")

        # Store constructor params
        self.name = name
        self.num_frames = cfg.N_FRAMES
        self.is_training = is_training
        self.batch_size = cfg.BATCH_SIZE
        self.lr = cfg.LR

        # Declare required vars
        self.Qmain = None
        self.obs = None
        self.state = []
        self.frames = []

        # Declare training vars
        self.per_buffer = None
        self.done = False
        self.frames2 = None
        self.epoch_rewards = None
        self.epoch_alive = None
        self.Qtarget = None
        self.epsilon = 0.0

        # Compute dimensions from initial spaces
        self.observation_size = self.obs_space.size_obs()
        self.action_size = self.action_space.size()

        # Load network graph
        self.Qmain = DoubleDuelingDQN_NN(self.action_size,
                                         self.observation_size,
                                         num_frames=self.num_frames,
                                         learning_rate=self.lr,
                                         learning_rate_decay_steps=cfg.LR_DECAY_STEPS,
                                         learning_rate_decay_rate=cfg.LR_DECAY_RATE)
        # Setup training vars if needed
        if self.is_training:
            self._init_training()

    def _filter_action(self, action):
        MAX_ELEM = 2
        act_dict = action.impact_on_objects()
        elem = 0
        elem += act_dict["force_line"]["reconnections"]["count"]
        elem += act_dict["force_line"]["disconnections"]["count"]
        elem += act_dict["switch_line"]["count"]
        elem += len(act_dict["topology"]["bus_switch"])
        elem += len(act_dict["topology"]["assigned_bus"])
        elem += len(act_dict["topology"]["disconnect_bus"])
        elem += len(act_dict["redispatch"]["generators"])

        if elem <= MAX_ELEM:
            return True
        return False

    def _init_training(self):
        self.epsilon = cfg.INITIAL_EPSILON
        self.frames2 = []
        self.epoch_rewards = []
        self.epoch_alive = []
        self.per_buffer = PrioritizedReplayBuffer(cfg.PER_CAPACITY, cfg.PER_ALPHA)
        self.Qtarget = DoubleDuelingDQN_NN(self.action_size,
                                           self.observation_size,
                                           num_frames=self.num_frames)

    def _reset_state(self, current_obs):
        # Initial state
        self.obs = current_obs
        self.state = self.convert_obs(self.obs)
        self.done = False

    def _reset_frame_buffer(self):
        # Reset frame buffers
        self.frames = []
        if self.is_training:
            self.frames2 = []

    def _save_current_frame(self, state):
        self.frames.append(state.copy())
        if len(self.frames) > self.num_frames:
            self.frames.pop(0)

    def _save_next_frame(self, next_state):
        self.frames2.append(next_state.copy())
        if len(self.frames2) > self.num_frames:
            self.frames2.pop(0)

    def _adaptive_epsilon_decay(self, step):
        ada_div = cfg.DECAY_EPSILON / 10.0
        step_off = step + ada_div
        ada_eps = cfg.INITIAL_EPSILON * -math.log10((step_off + 1) / (cfg.DECAY_EPSILON + ada_div))
        ada_eps_up_clip = min(cfg.INITIAL_EPSILON, ada_eps)
        ada_eps_low_clip = max(cfg.FINAL_EPSILON, ada_eps_up_clip)
        return ada_eps_low_clip

    def _save_hyperparameters(self, logpath, env, steps):
        try:
            # change of name in grid2op >= 1.2.3
            r_instance = env._reward_helper.template_reward
        except AttributeError as nm_exc_:
            r_instance = env.reward_helper.template_reward
        hp = {
            "lr": cfg.LR,
            "lr_decay_steps": cfg.LR_DECAY_STEPS,
            "lr_decay_rate": cfg.LR_DECAY_RATE,
            "batch_size": cfg.BATCH_SIZE,
            "stack_frames": cfg.N_FRAMES,
            "iter": steps,
            "e_start": cfg.INITIAL_EPSILON,
            "e_end": cfg.FINAL_EPSILON,
            "e_decay": cfg.DECAY_EPSILON,
            "discount": cfg.DISCOUNT_FACTOR,
            "per_alpha": cfg.PER_ALPHA,
"per_beta": cfg.PER_BETA, "per_capacity": cfg.PER_CAPACITY, "update_freq": cfg.UPDATE_FREQ, "update_hard": cfg.UPDATE_TARGET_HARD_FREQ, "update_soft": cfg.UPDATE_TARGET_SOFT_TAU, "reward": dict(r_instance) } hp_filename = "{}-hypers.json".format(self.name) hp_path = os.path.join(logpath, hp_filename) with open(hp_path, 'w') as fp: json.dump(hp, fp=fp, indent=2) ## Agent Interface
    ## Agent Interface
    def convert_obs(self, observation):
        li_vect = []
        for el in observation.attr_list_vect:
            v = observation._get_array_from_attr_name(el).astype(np.float32)
            v_fix = np.nan_to_num(v)
            v_norm = np.linalg.norm(v_fix)
            if v_norm > 1e6:
                v_res = (v_fix / v_norm) * 10.0
            else:
                v_res = v_fix
            li_vect.append(v_res)
        return np.concatenate(li_vect)
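    # Descriptive comment, not in the original source: convert_obs flattens every
    # observation attribute into a single float32 vector. NaN entries are zeroed with
    # np.nan_to_num, and any attribute whose L2 norm exceeds 1e6 is rescaled to norm
    # 10.0 so that very large raw values do not dominate the network input.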
    def convert_act(self, action):
        return super().convert_act(action)
    ## Baseline Interface
    def reset(self, observation):
        self._reset_state(observation)
        self._reset_frame_buffer()
    def my_act(self, state, reward, done=False):
        # Register current state to stacking buffer
        self._save_current_frame(state)
        # We need at least num frames to predict
        if len(self.frames) < self.num_frames:
            return 0 # Do nothing
        # Infer with the last num_frames states
        a, _ = self.Qmain.predict_move(np.array(self.frames))
        return a
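    # Descriptive comment, not in the original source: my_act returns an integer index
    # into the IdToAct converter; AgentWithConverter.act() then passes it through
    # convert_act to build the actual grid2op action. Until num_frames observations
    # have been stacked, index 0 ("do nothing") is returned.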
    def load(self, path):
        self.Qmain.load_network(path)
        if self.is_training:
            self.Qmain.update_target_hard(self.Qtarget.model)

    def save(self, path):
        self.Qmain.save_network(path)

    ## Training Procedure
    def train(self, env,
              iterations,
              save_path,
              num_pre_training_steps=0,
              logdir="logs-train"):
        # Make sure we can fill the experience buffer
        if num_pre_training_steps < self.batch_size * self.num_frames:
            num_pre_training_steps = self.batch_size * self.num_frames

        # Loop vars
        num_training_steps = iterations
        num_steps = num_pre_training_steps + num_training_steps
        step = 0
        self.epsilon = cfg.INITIAL_EPSILON
        alive_steps = 0
        total_reward = 0
        self.done = True

        # Create file system related vars
        logpath = os.path.join(logdir, self.name)
        os.makedirs(save_path, exist_ok=True)
        modelpath = os.path.join(save_path, self.name + ".h5")
        self.tf_writer = tf.summary.create_file_writer(logpath, name=self.name)
        self._save_hyperparameters(save_path, env, num_steps)

        # Training loop
        while step < num_steps:
            # Init first time or new episode
            if self.done:
                new_obs = env.reset() # This shouldn't raise
                self.reset(new_obs)
            if cfg.VERBOSE and step % 1000 == 0:
                print("Step [{}] -- Random [{}]".format(step, self.epsilon))

            # Save current observation to stacking buffer
            self._save_current_frame(self.state)

            # Choose an action
            if step <= num_pre_training_steps:
                a = self.Qmain.random_move()
            elif np.random.rand(1) < self.epsilon:
                a = self.Qmain.random_move()
            elif len(self.frames) < self.num_frames:
                a = 0 # Do nothing
            else:
                a, _ = self.Qmain.predict_move(np.array(self.frames))

            # Convert it to a valid action
            act = self.convert_act(a)
            # Execute action
            new_obs, reward, self.done, info = env.step(act)
            new_state = self.convert_obs(new_obs)
            if info["is_illegal"] or info["is_ambiguous"] or \
               info["is_dispatching_illegal"] or info["is_illegal_reco"]:
                if cfg.VERBOSE:
                    print(a, info)

            # Save new observation to stacking buffer
            self._save_next_frame(new_state)

            # Save to experience buffer
            if len(self.frames2) == self.num_frames:
                self.per_buffer.add(np.array(self.frames),
                                    a, reward,
                                    np.array(self.frames2),
                                    self.done)

            # Perform training when we have enough experience in buffer
            if step >= num_pre_training_steps:
                training_step = step - num_pre_training_steps
                # Decay chance of random action
                self.epsilon = self._adaptive_epsilon_decay(training_step)
                # Perform training at given frequency
                if step % cfg.UPDATE_FREQ == 0 and \
                   len(self.per_buffer) >= self.batch_size:
                    # Perform training
                    self._batch_train(training_step, step)

                    if cfg.UPDATE_TARGET_SOFT_TAU > 0.0:
                        tau = cfg.UPDATE_TARGET_SOFT_TAU
                        # Update target network towards primary network
                        self.Qmain.update_target_soft(self.Qtarget.model, tau)

                    # Every UPDATE_TARGET_HARD_FREQ trainings, update target completely
                    if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \
                       step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0:
                        self.Qmain.update_target_hard(self.Qtarget.model)

            total_reward += reward
            if self.done:
                self.epoch_rewards.append(total_reward)
                self.epoch_alive.append(alive_steps)
                if cfg.VERBOSE:
                    print("Survived [{}] steps".format(alive_steps))
                    print("Total reward [{}]".format(total_reward))
                alive_steps = 0
                total_reward = 0
            else:
                alive_steps += 1

            # Save the network every 1000 iterations
            if step > 0 and step % 1000 == 0:
                self.save(modelpath)

            # Iterate to next loop
            step += 1
            # Make new obs the current obs
            self.obs = new_obs
            self.state = new_state

        # Save model after all steps
        self.save(modelpath)

    def _batch_train(self, training_step, step):
        """Trains network to fit given parameters"""
        # Sample from experience buffer
        sample_batch = self.per_buffer.sample(self.batch_size, cfg.PER_BETA)
        s_batch = sample_batch[0]
        a_batch = sample_batch[1]
        r_batch = sample_batch[2]
        s2_batch = sample_batch[3]
        d_batch = sample_batch[4]
        w_batch = sample_batch[5]
        idx_batch = sample_batch[6]

        Q = np.zeros((self.batch_size, self.action_size))

        # Reshape frames to 1D
        input_size = self.observation_size * self.num_frames
        input_t = np.reshape(s_batch, (self.batch_size, input_size))
        input_t_1 = np.reshape(s2_batch, (self.batch_size, input_size))

        # Save the graph just the first time
        if training_step == 0:
            tf.summary.trace_on()

        # T Batch predict
        Q = self.Qmain.model.predict(input_t, batch_size=self.batch_size)

        ## Log graph once and disable graph logging
        if training_step == 0:
            with self.tf_writer.as_default():
                tf.summary.trace_export(self.name + "-graph", step)

        # T+1 batch predict
        Q1 = self.Qmain.model.predict(input_t_1, batch_size=self.batch_size)
        Q2 = self.Qtarget.model.predict(input_t_1, batch_size=self.batch_size)

        # Compute batch Qtarget using Double DQN
        for i in range(self.batch_size):
            doubleQ = Q2[i, np.argmax(Q1[i])]
            Q[i, a_batch[i]] = r_batch[i]
            if d_batch[i] == False:
                Q[i, a_batch[i]] += cfg.DISCOUNT_FACTOR * doubleQ

        # Batch train
        loss = self.Qmain.train_on_batch(input_t, Q, w_batch)

        # Update PER buffer
        priorities = self.Qmain.batch_sq_error
        # Can't be zero, no upper limit
        priorities = np.clip(priorities, a_min=1e-8, a_max=None)
        self.per_buffer.update_priorities(idx_batch, priorities)

        # Log some useful metrics every even updates
        if step % (cfg.UPDATE_FREQ * 2) == 0:
            with self.tf_writer.as_default():
                mean_reward = np.mean(self.epoch_rewards)
                mean_alive = np.mean(self.epoch_alive)
                if len(self.epoch_rewards) >= 100:
                    mean_reward_100 = np.mean(self.epoch_rewards[-100:])
                    mean_alive_100 = np.mean(self.epoch_alive[-100:])
                else:
                    mean_reward_100 = mean_reward
                    mean_alive_100 = mean_alive
                tf.summary.scalar("mean_reward", mean_reward, step)
                tf.summary.scalar("mean_alive", mean_alive, step)
                tf.summary.scalar("mean_reward_100", mean_reward_100, step)
                tf.summary.scalar("mean_alive_100", mean_alive_100, step)
                tf.summary.scalar("loss", loss, step)
                tf.summary.scalar("lr", self.Qmain.train_lr, step)
        if cfg.VERBOSE:
            print("loss =", loss)
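# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original module. It assumes grid2op
# is installed and that the "l2rpn_case14_sandbox" environment has been
# downloaded; any other grid2op environment works the same way. The constructor
# and train() arguments mirror the signatures defined above.
#
#   import grid2op
#   from l2rpn_baselines.DoubleDuelingDQN.doubleDuelingDQN import DoubleDuelingDQN
#
#   env = grid2op.make("l2rpn_case14_sandbox")
#   agent = DoubleDuelingDQN(env.observation_space,
#                            env.action_space,
#                            name="DDDQN_example",
#                            is_training=True)
#   agent.train(env,
#               iterations=10000,
#               save_path="./models",
#               num_pre_training_steps=256,
#               logdir="./logs-train")
# ---------------------------------------------------------------------------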