Source code for l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN

# Copyright (c) 2020, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of L2RPN Baselines, a repository to host baselines for l2rpn competitions.

import os
import json
import copy
import numpy as np
try:
    import tensorflow as tf
    _CAN_USE_TENSORFLOW = True
except ImportError:
    _CAN_USE_TENSORFLOW = False


from grid2op.Agent import AgentWithConverter
from grid2op.Converter import IdToAct

from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQNConfig import DoubleDuelingRDQNConfig as cfg
from l2rpn_baselines.DoubleDuelingRDQN.experienceBuffer import ExperienceBuffer
from l2rpn_baselines.DoubleDuelingRDQN.doubleDuelingRDQN_NN import DoubleDuelingRDQN_NN

class DoubleDuelingRDQN(AgentWithConverter):
    """
    .. warning::
        This baseline recodes the entire RL training procedure. You can use it if you
        want to have a deeper look at the Deep Q Learning algorithm and a possible
        (non optimized, slow, etc.) implementation.

        For a much better implementation, you can reuse the code of the "PPO_RLLIB"
        or the "PPO_SB3" baseline.
    """
    def __init__(self,
                 observation_space,
                 action_space,
                 name=__name__,
                 is_training=False):
        if not _CAN_USE_TENSORFLOW:
            raise RuntimeError("Cannot import tensorflow, this function cannot be used.")

        # Call parent constructor
        AgentWithConverter.__init__(self, action_space,
                                    action_space_converter=IdToAct)

        # Store constructor params
        self.observation_space = observation_space
        self.name = name
        self.trace_length = cfg.TRACE_LENGTH
        self.batch_size = cfg.BATCH_SIZE
        self.is_training = is_training
        self.lr = cfg.LR

        # Declare required vars
        self.Qmain = None
        self.obs = None
        self.state = []
        self.mem_state = None
        self.carry_state = None

        # Declare training vars
        self.exp_buffer = None
        self.done = False
        self.epoch_rewards = None
        self.epoch_alive = None
        self.Qtarget = None

        # Compute dimensions from initial state
        self.observation_size = self.observation_space.size_obs()
        self.action_size = self.action_space.size()

        # Load network graph
        self.Qmain = DoubleDuelingRDQN_NN(self.action_size,
                                          self.observation_size,
                                          learning_rate=self.lr)

        # Setup training vars if needed
        if self.is_training:
            self._init_training()

    def _init_training(self):
        self.exp_buffer = ExperienceBuffer(cfg.REPLAY_BUFFER_SIZE,
                                           self.batch_size,
                                           self.trace_length)
        self.done = True
        self.epoch_rewards = []
        self.epoch_alive = []
        self.Qtarget = DoubleDuelingRDQN_NN(self.action_size,
                                            self.observation_size,
                                            learning_rate=self.lr)

    def _reset_state(self, current_obs):
        # Initial state
        self.obs = current_obs
        self.state = self.convert_obs(self.obs)
        self.done = False
        self.mem_state = np.zeros(self.Qmain.h_size)
        self.carry_state = np.zeros(self.Qmain.h_size)

    def _register_experience(self, episode_exp, episode):
        missing_obs = self.trace_length - len(episode_exp)

        if missing_obs > 0:  # We are missing exp to make a trace
            exp = episode_exp[0]  # Use initial state to fill out
            for missing in range(missing_obs):
                # Use do_nothing action at index 0
                self.exp_buffer.add(exp[0], 0, exp[2], exp[3], exp[4], episode)

        # Register the actual experience
        for exp in episode_exp:
            self.exp_buffer.add(exp[0], exp[1], exp[2], exp[3], exp[4], episode)

    def _save_hyperparameters(self, logpath, env, steps):
        try:
            # change of name in grid2op >= 1.2.3
            r_instance = env._reward_helper.template_reward
        except AttributeError as nm_exc_:
            r_instance = env.reward_helper.template_reward
        hp = {
            "lr": cfg.LR,
            "batch_size": cfg.BATCH_SIZE,
            "trace_len": cfg.TRACE_LENGTH,
            "e_start": cfg.INITIAL_EPSILON,
            "e_end": cfg.FINAL_EPSILON,
            "e_decay": cfg.DECAY_EPSILON,
            "discount": cfg.DISCOUNT_FACTOR,
            "buffer_size": cfg.REPLAY_BUFFER_SIZE,
            "update_freq": cfg.UPDATE_FREQ,
            "update_hard": cfg.UPDATE_TARGET_HARD_FREQ,
            "update_soft": cfg.UPDATE_TARGET_SOFT_TAU,
            "reward": dict(r_instance)
        }
        hp_filename = "{}-hypers.json".format(self.name)
        hp_path = os.path.join(logpath, hp_filename)
        with open(hp_path, 'w') as fp:
            json.dump(hp, fp=fp, indent=2)

    ## Agent Interface
    def convert_obs(self, observation):
        # Made a custom version to normalize per attribute
        # return observation.to_vect()
        li_vect = []
        for el in observation.attr_list_vect:
            # np.float was removed in recent NumPy versions; plain float is equivalent
            v = observation._get_array_from_attr_name(el).astype(float)
            v_fix = np.nan_to_num(v)
            v_norm = np.linalg.norm(v_fix)
            if v_norm > 1e6:
                v_res = (v_fix / v_norm) * 10.0
            else:
                v_res = v_fix
            li_vect.append(v_res)
        return np.concatenate(li_vect)
    def convert_act(self, action):
        return super().convert_act(action)
    def reset(self, observation):
        self._reset_state(observation)
    def my_act(self, state, reward, done=False):
        data_input = np.array(state)
        data_input.reshape(1, 1, self.observation_size)
        a, _, m, c = self.Qmain.predict_move(data_input,
                                             self.mem_state,
                                             self.carry_state)
        self.mem_state = m
        self.carry_state = c

        return a
    def load(self, path):
        self.Qmain.load_network(path)
        if self.is_training:
            self.Qmain.update_target_hard(self.Qtarget.model)

    def save(self, path):
        self.Qmain.save_network(path)

    ## Training Procedure
    def train(self, env,
              iterations,
              save_path,
              num_pre_training_steps=0,
              logdir="logs"):
        # Loop vars
        num_training_steps = iterations
        num_steps = num_pre_training_steps + num_training_steps
        step = 0
        epsilon = cfg.INITIAL_EPSILON
        alive_steps = 0
        total_reward = 0
        episode = 0
        episode_exp = []

        # Create file system related vars
        logpath = os.path.join(logdir, self.name)
        os.makedirs(save_path, exist_ok=True)
        modelpath = os.path.join(save_path, self.name + ".tf")
        self.tf_writer = tf.summary.create_file_writer(logpath, name=self.name)
        self._save_hyperparameters(save_path, env, num_steps)

        # Training loop
        self._reset_state(env.current_obs)
        while step < num_steps:
            # New episode
            if self.done:
                new_obs = env.reset()  # This shouldn't raise
                self._reset_state(new_obs)
                # Push current episode experience to experience buffer
                self._register_experience(episode_exp, episode)
                # Reset current episode experience
                episode += 1
                episode_exp = []

            if cfg.VERBOSE and step % 1000 == 0:
                print("Step [{}] -- Dropout [{}]".format(step, epsilon))

            # Choose an action
            if step <= num_pre_training_steps:
                a, m, c = self.Qmain.random_move(self.state, self.mem_state, self.carry_state)
            elif len(episode_exp) < self.trace_length:
                a, m, c = self.Qmain.random_move(self.state, self.mem_state, self.carry_state)
                a = 0  # Do Nothing
            else:
                a, _, m, c = self.Qmain.bayesian_move(self.state, self.mem_state, self.carry_state, epsilon)

            # Update LSTM state
            self.mem_state = m
            self.carry_state = c

            # Convert it to a valid action
            act = self.convert_act(a)
            # Execute action
            new_obs, reward, self.done, info = env.step(act)
            new_state = self.convert_obs(new_obs)

            # Save to current episode experience
            episode_exp.append((self.state, a, reward, self.done, new_state))

            # Train when pre-training is over
            if step >= num_pre_training_steps:
                training_step = step - num_pre_training_steps
                # Slowly decay dropout rate
                if epsilon > cfg.FINAL_EPSILON:
                    epsilon -= cfg.STEP_EPSILON
                if epsilon < cfg.FINAL_EPSILON:
                    epsilon = cfg.FINAL_EPSILON

                # Perform training at given frequency
                if step % cfg.UPDATE_FREQ == 0 and \
                   self.exp_buffer.can_sample():
                    # Sample from experience buffer
                    batch = self.exp_buffer.sample()
                    # Perform training
                    self._batch_train(batch, step, training_step)
                    # Update target network towards primary network
                    if cfg.UPDATE_TARGET_SOFT_TAU > 0:
                        tau = cfg.UPDATE_TARGET_SOFT_TAU
                        self.Qmain.update_target_soft(self.Qtarget.model, tau)

                # Every UPDATE_TARGET_HARD_FREQ trainings,
                # update target completely
                if cfg.UPDATE_TARGET_HARD_FREQ > 0 and \
                   step % (cfg.UPDATE_FREQ * cfg.UPDATE_TARGET_HARD_FREQ) == 0:
                    self.Qmain.update_target_hard(self.Qtarget.model)

            total_reward += reward
            if self.done:
                self.epoch_rewards.append(total_reward)
                self.epoch_alive.append(alive_steps)
                if cfg.VERBOSE:
                    print("Survived [{}] steps".format(alive_steps))
                    print("Total reward [{}]".format(total_reward))
                alive_steps = 0
                total_reward = 0
            else:
                alive_steps += 1

            # Save the network every 1000 iterations
            if step > 0 and step % 1000 == 0:
                self.save(modelpath)

            # Iterate to next loop
            step += 1
            self.obs = new_obs
            self.state = new_state

        # Save model after all steps
        self.save(modelpath)

    def _batch_train(self, batch, step, training_step):
        """Trains network to fit given parameters"""
        Q = np.zeros((self.batch_size, self.action_size))
        batch_mem = np.zeros((self.batch_size, self.Qmain.h_size))
        batch_carry = np.zeros((self.batch_size, self.Qmain.h_size))

        input_size = self.observation_size
        m_data = np.vstack(batch[:, 0])
        m_data = m_data.reshape(self.batch_size, self.trace_length, input_size)
        t_data = np.vstack(batch[:, 4])
        t_data = t_data.reshape(self.batch_size, self.trace_length, input_size)
        q_input = [
            copy.deepcopy(batch_mem),
            copy.deepcopy(batch_carry),
            copy.deepcopy(m_data)
        ]
        q1_input = [
            copy.deepcopy(batch_mem),
            copy.deepcopy(batch_carry),
            copy.deepcopy(t_data)
        ]
        q2_input = [
            copy.deepcopy(batch_mem),
            copy.deepcopy(batch_carry),
            copy.deepcopy(t_data)
        ]

        # Batch predict
        self.Qmain.trace_length.assign(self.trace_length)
        self.Qmain.dropout_rate.assign(0.0)
        self.Qtarget.trace_length.assign(self.trace_length)
        self.Qtarget.dropout_rate.assign(0.0)

        # Save the graph just the first time
        if training_step == 0:
            tf.summary.trace_on()

        # T Batch predict
        Q, _, _ = self.Qmain.model.predict(q_input, batch_size=self.batch_size)

        ## Log graph once and disable graph logging
        if training_step == 0:
            with self.tf_writer.as_default():
                tf.summary.trace_export(self.name + "-graph", step)

        # T+1 batch predict
        Q1, _, _ = self.Qmain.model.predict(q1_input, batch_size=self.batch_size)
        Q2, _, _ = self.Qtarget.model.predict(q2_input, batch_size=self.batch_size)

        # Compute batch Double Q update to Qtarget
        for i in range(self.batch_size):
            idx = i * (self.trace_length - 1)
            doubleQ = Q2[i, np.argmax(Q1[i])]
            a = batch[idx][1]
            r = batch[idx][2]
            d = batch[idx][3]
            Q[i, a] = r
            if d == False:
                Q[i, a] += cfg.DISCOUNT_FACTOR * doubleQ

        # Batch train
        batch_x = [batch_mem, batch_carry, m_data]
        batch_y = [Q, batch_mem, batch_carry]
        loss = self.Qmain.model.train_on_batch(batch_x, batch_y)
        loss = loss[0]

        # Log some useful metrics
        if step % (cfg.UPDATE_FREQ * 2) == 0:
            if cfg.VERBOSE:
                print("loss =", loss)
            with self.tf_writer.as_default():
                mean_reward = np.mean(self.epoch_rewards)
                mean_alive = np.mean(self.epoch_alive)
                if len(self.epoch_rewards) >= 100:
                    mean_reward_100 = np.mean(self.epoch_rewards[-100:])
                    mean_alive_100 = np.mean(self.epoch_alive[-100:])
                else:
                    mean_reward_100 = mean_reward
                    mean_alive_100 = mean_alive
                tf.summary.scalar("mean_reward", mean_reward, step)
                tf.summary.scalar("mean_alive", mean_alive, step)
                tf.summary.scalar("mean_reward_100", mean_reward_100, step)
                tf.summary.scalar("mean_alive_100", mean_alive_100, step)
                tf.summary.scalar("loss", loss, step)
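

# ----------------------------------------------------------------------
# Usage sketch (not part of the original module): it illustrates how this
# baseline could be instantiated, trained, then re-loaded for evaluation
# against a grid2op environment. The environment name
# "l2rpn_case14_sandbox", the directories and the iteration counts are
# illustrative assumptions; the constructor, train(), load(), reset() and
# act() calls follow the interface defined above.
if __name__ == "__main__":
    import grid2op

    env = grid2op.make("l2rpn_case14_sandbox")
    env.reset()  # make sure an initial observation is available for train()

    # Train a new agent (save_path and logdir are placeholder directories)
    agent = DoubleDuelingRDQN(env.observation_space,
                              env.action_space,
                              name="rdqn_example",
                              is_training=True)
    agent.train(env,
                iterations=10000,
                save_path="./saved_models",
                num_pre_training_steps=256,
                logdir="./logs")

    # Reload the saved weights and run one evaluation episode
    eval_agent = DoubleDuelingRDQN(env.observation_space,
                                   env.action_space,
                                   name="rdqn_example")
    eval_agent.load("./saved_models/rdqn_example.tf")
    obs = env.reset()
    eval_agent.reset(obs)
    reward, done = 0.0, False
    while not done:
        act = eval_agent.act(obs, reward, done)
        obs, reward, done, info = env.step(act)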