# Copyright (c) 2020, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of L2RPN Baselines, a repository to host baselines for l2rpn competitions.
import numpy as np
import os
# tf2.0 friendly
import warnings
try:
    import tensorflow as tf
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        from tensorflow.keras.models import Sequential, Model
        from tensorflow.keras.layers import Activation, Dense
        from tensorflow.keras.layers import Input, Concatenate
    _CAN_USE_TENSORFLOW = True
except ImportError:
    _CAN_USE_TENSORFLOW = False
from l2rpn_baselines.utils import BaseDeepQ, TrainingParam


# This class implements the "Soft Actor Critic" model.
# It is a custom implementation, courtesy of Clement Goubet
# The original paper is: https://arxiv.org/abs/1801.01290
class SACOld_NN(BaseDeepQ):
"""
Constructs the desired soft actor critic network.
.. warning::
This baseline recodes entire the RL training procedure. You can use it if you
want to have a deeper look at Deep Q Learning algorithm and a possible (non
optimized, slow, etc. implementation ).
For a much better implementation, you can reuse the code of "PPO_RLLIB"
or the "PPO_SB3" baseline.
Compared to other baselines shown elsewhere (*eg* :class:`l2rpn_baselines.DeepQSimple` or
:class:`l2rpn_baselines.DeepQSimple`) the implementation of the SAC is a bit more tricky
(and was most likely NOT done properly in this class). For a more correct implementation
of SAC please look at the :class:`l2rpn_baselines.SAC.SAC` instead. This class is only
present for backward compatibility.
However, we demonstrate here that the use of :class:`l2rpn_baselines.utils.BaseDeepQ` with custom
parameters class (in this case :class:`SACOld_NNParam` is flexible enough to meet our needs.
References
-----------
Original paper:
https://arxiv.org/abs/1801.01290
modified for discrete action space:
https://arxiv.org/abs/1910.07207
"""
    def __init__(self,
                 nn_params,
                 training_param=None,
                 verbose=False):
        if not _CAN_USE_TENSORFLOW:
            raise RuntimeError("Cannot import tensorflow, this function cannot be used.")
        if training_param is None:
            training_param = TrainingParam()
        BaseDeepQ.__init__(self,
                           nn_params,
                           training_param,
                           verbose=verbose)

        # TODO add as meta param the number of "Q" you want to use (here 2)
        # TODO add as meta param size and types of the networks
        self.average_reward = 0
        self.life_spent = 1
        self.qvalue_evolution = np.zeros((0,))
        self.Is_nan = False

        self.model_value_target = None
        self.model_value = None
        self.model_Q = None
        self.model_Q2 = None
        self.model_policy = None

        self.previous_size = 0
        self.previous_eyes = None
        self.previous_arange = None
        self.previous_size_train = 0
        self.previous_eyes_train = None

        # optimizers and learning rate
        self.schedule_lr_policy = None
        self.optimizer_policy = None
        self.schedule_lr_Q = None
        self.optimizer_Q = None
        self.schedule_lr_Q2 = None
        self.optimizer_Q2 = None
        self.schedule_lr_value = None
        self.optimizer_value = None

        self.construct_q_network()
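
    # Builds the state-action value network Q(s, a): the observation and an action
    # encoded as a vector of size self._action_size are concatenated, fed through the
    # dense layers described by self._nn_archi.sizes / activs, and reduced to a single
    # linear output (the Q value).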
    def _build_q_NN(self):
        input_states = Input(shape=(self._observation_size,))
        input_action = Input(shape=(self._action_size,))
        input_layer = Concatenate()([input_states, input_action])

        lay = input_layer
        for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes, self._nn_archi.activs)):
            lay = Dense(size, name="layer_{}".format(lay_num))(lay)  # put at self.action_size
            lay = Activation(act)(lay)

        advantage = Dense(1, activation='linear')(lay)
        model = Model(inputs=[input_states, input_action], outputs=[advantage])
        return model
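
    # Builds the state value network V(s): dense layers described by
    # self._nn_archi.sizes_value / activs_value, followed by an intermediate relu
    # layer of size self._action_size and a single linear output ("state_value").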
    def _build_model_value(self):
        input_states = Input(shape=(self._observation_size,))

        lay = input_states
        for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_value, self._nn_archi.activs_value)):
            lay = Dense(size)(lay)
            lay = Activation(act)(lay)

        advantage = Dense(self._action_size, activation='relu')(lay)
        state_value = Dense(1, activation='linear', name="state_value")(advantage)
        model = Model(inputs=[input_states], outputs=[state_value])
        return model
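
    # construct_q_network instantiates the five networks used by this agent:
    # two Q networks (model_Q, model_Q2), a value network and its target copy
    # (model_value, model_value_target) and the softmax policy (model_policy),
    # each compiled with its own optimizer / learning rate schedule.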
    def construct_q_network(self):
        """
        This constructs all the networks needed for the SAC agent.
        """
        self.model_Q = self._build_q_NN()
        self.schedule_lr_Q, self.optimizer_Q = self.make_optimiser()
        self.model_Q.compile(loss='mse', optimizer=self.optimizer_Q)

        self.model_Q2 = self._build_q_NN()
        self.schedule_lr_Q2, self.optimizer_Q2 = self.make_optimiser()
        self.model_Q2.compile(loss='mse', optimizer=self.optimizer_Q2)

        # state value function approximation
        self.model_value = self._build_model_value()
        self.schedule_lr_value, self.optimizer_value = self.make_optimiser()
        self._optimizer_model = self.optimizer_value
        self.model_value.compile(loss='mse', optimizer=self.optimizer_value)

        self.model_value_target = self._build_model_value()
        self.model_value_target.set_weights(self.model_value.get_weights())

        # policy function approximation
        self.model_policy = Sequential()
        # proba of choosing action a depending on policy pi
        input_states = Input(shape=(self._observation_size,))
        lay = input_states
        for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_policy, self._nn_archi.activs_policy)):
            lay = Dense(size)(lay)
            lay = Activation(act)(lay)
        soft_proba = Dense(self._action_size, activation="softmax",
                           kernel_initializer='uniform', name="soft_proba")(lay)
        self.model_policy = Model(inputs=[input_states], outputs=[soft_proba])
        self.schedule_lr_policy, self.optimizer_policy = self.make_optimiser()
        self.model_policy.compile(loss='categorical_crossentropy', optimizer=self.optimizer_policy)
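
    # Caches, per batch size, a zero "action" placeholder of shape
    # (batch_size, action_size) and the matching np.arange(batch_size),
    # so they are not reallocated at every call.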
    def _get_eye_pm(self, batch_size):
        if batch_size != self.previous_size:
            tmp = np.zeros((batch_size, self._action_size), dtype=np.float32)
            self.previous_eyes = tmp
            self.previous_arange = np.arange(batch_size)
            self.previous_size = batch_size
        return self.previous_eyes, self.previous_arange
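
    # Actions are chosen greedily from the policy probabilities, except that with
    # probability `epsilon` a random action is drawn instead (epsilon-greedy exploration).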
    def predict_movement(self, data, epsilon, batch_size=None, training=False):
        """
        predict the next movements in a vectorized fashion
        """
        if batch_size is None:
            batch_size = data.shape[0]
        rand_val = np.random.random(data.shape[0])
        p_actions = self.model_policy(data, training=training).numpy()
        opt_policy_orig = np.argmax(np.abs(p_actions), axis=-1)
        opt_policy = 1.0 * opt_policy_orig
        opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon)))
        opt_policy = opt_policy.astype(int)
        return opt_policy, p_actions[:, opt_policy], p_actions
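
    # Caches a (batch_size * action_size, action_size) tensor in which every one-hot
    # action is repeated `batch_size` times; combined with the tiled observation batch
    # in `train`, it lets the Q networks be evaluated for every possible action at once.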
    def _get_eye_train(self, batch_size):
        if batch_size != self.previous_size_train:
            self.previous_eyes_train = np.repeat(np.eye(self._action_size),
                                                 batch_size * np.ones(self._action_size, dtype=int),
                                                 axis=0)
            self.previous_eyes_train = tf.convert_to_tensor(self.previous_eyes_train, dtype=tf.float32)
            self.previous_size_train = batch_size
        return self.previous_eyes_train
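
    # One training step, as implemented below:
    #   * both Q networks are regressed towards
    #       r + (1 - done) * discount_factor * V_target(s')
    #   * the policy is regressed (cross-entropy) towards softmax(Q1(s, .) / temp),
    #     with a temperature temp = 1 / (2 * ln(life_spent)) that decays over time
    #   * the value network is regressed towards min(Q1, Q2) for the taken action
    #     minus the term sum(pi * log(pi)) (the policy entropy bonus)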
    def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None):
        """Trains networks to fit given parameters"""
        if batch_size is None:
            batch_size = s_batch.shape[0]
        target = np.zeros((batch_size, 1))

        # training of the action state value networks
        last_action = np.zeros((batch_size, self._action_size))

        # Save the graph just the first time
        if tf_writer is not None:
            tf.summary.trace_on()
        # TODO is it s2 or s ? For me it should be s...
        fut_action = self.model_value_target(s2_batch, training=True).numpy().reshape(-1)
        # TODO ***_target should be for the Q function instead imho
        if tf_writer is not None:
            with tf_writer.as_default():
                tf.summary.trace_export("model_value_target-graph", 0)
            tf.summary.trace_off()

        # TODO is it rather `targets[:, a_batch]`
        target[:, 0] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action
        # target[:, a_batch] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action
        loss = self.model_Q.train_on_batch([s_batch, last_action], target)
        loss_2 = self.model_Q2.train_on_batch([s_batch, last_action], target)

        self.life_spent += 1
        temp = 1 / np.log(self.life_spent) / 2
        tiled_batch = np.tile(s_batch, (self._action_size, 1))
        tiled_batch_ts = tf.convert_to_tensor(tiled_batch)
        # tiled_batch: output something like: batch, batch, batch
        # TODO save that somewhere not to compute it each time, you can even save this in the
        # TODO tensorflow graph!
        tmp = self._get_eye_train(batch_size)

        action_v1_orig = self.model_Q.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1)
        action_v2_orig = self.model_Q2.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1)
        action_v1 = action_v1_orig - np.amax(action_v1_orig, axis=-1).reshape(batch_size, 1)
        new_proba = np.exp(action_v1 / temp) / np.sum(np.exp(action_v1 / temp), axis=-1).reshape(batch_size, 1)
        new_proba_ts = tf.convert_to_tensor(new_proba)
        loss_policy = self.model_policy.train_on_batch(s_batch, new_proba_ts)

        target_pi = self.model_policy.predict(s_batch, batch_size=batch_size)
        value_target = np.fmin(action_v1_orig[0, a_batch], action_v2_orig[0, a_batch]) - np.sum(
            target_pi * np.log(target_pi + 1e-6))
        value_target_ts = tf.convert_to_tensor(value_target.reshape(-1, 1))
        loss_value = self.model_value.train_on_batch(s_batch, value_target_ts)

        self.Is_nan = np.isnan(loss) + np.isnan(loss_2) + np.isnan(loss_policy) + np.isnan(loss_value)
        return np.all(np.isfinite(loss)) & np.all(np.isfinite(loss_2)) & np.all(np.isfinite(loss_policy)) & \
               np.all(np.isfinite(loss_value))
    @staticmethod
    def _get_path_model(path, name=None):
        if name is None:
            path_model = path
        else:
            path_model = os.path.join(path, name)
        path_target_model = "{}_target".format(path_model)
        path_modelQ = "{}_Q".format(path_model)
        path_modelQ2 = "{}_Q2".format(path_model)
        path_policy = "{}_policy".format(path_model)
        return path_model, path_target_model, path_modelQ, path_modelQ2, path_policy
    def save_network(self, path, name=None, ext="h5"):
        """
        Saves all the models with unique names
        """
        path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name)
        self.model_value.save('{}.{}'.format(path_model, ext))
        self.model_value_target.save('{}.{}'.format(path_target_model, ext))
        self.model_Q.save('{}.{}'.format(path_modelQ, ext))
        self.model_Q2.save('{}.{}'.format(path_modelQ2, ext))
        self.model_policy.save('{}.{}'.format(path_policy, ext))
    def load_network(self, path, name=None, ext="h5"):
        """
        We reload the weights of all the models using the keras "load_weights" function.
        """
        path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name)
        self.construct_q_network()
        self.model_value.load_weights('{}.{}'.format(path_model, ext))
        self.model_value_target.load_weights('{}.{}'.format(path_target_model, ext))
        self.model_Q.load_weights('{}.{}'.format(path_modelQ, ext))
        self.model_Q2.load_weights('{}.{}'.format(path_modelQ2, ext))
        self.model_policy.load_weights('{}.{}'.format(path_policy, ext))
        if self.verbose:
            print("Successfully loaded network.")
    def target_train(self):
        """
        This updates the target model (soft update controlled by the tau parameter).
        """
        model_weights = self.model_value.get_weights()
        target_model_weights = self.model_value_target.get_weights()
        for i in range(len(model_weights)):
            target_model_weights[i] = self._training_param.tau * model_weights[i] + \
                                      (1 - self._training_param.tau) * target_model_weights[i]
        self.model_value_target.set_weights(target_model_weights)