Source code for l2rpn_baselines.SACOld.sacOld_NN

# Copyright (c) 2020, RTE (https://www.rte-france.com)
# See AUTHORS.txt
# This Source Code Form is subject to the terms of the Mozilla Public License, version 2.0.
# If a copy of the Mozilla Public License, version 2.0 was not distributed with this file,
# you can obtain one at http://mozilla.org/MPL/2.0/.
# SPDX-License-Identifier: MPL-2.0
# This file is part of L2RPN Baselines, a repository to host baselines for l2rpn competitions.

import numpy as np
import os

# tf2.0 friendly
import warnings
try:
    import tensorflow as tf
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        from tensorflow.keras.models import Sequential, Model
        from tensorflow.keras.layers import Activation, Dense
        from tensorflow.keras.layers import Input, Concatenate
    _CAN_USE_TENSORFLOW = True
except ImportError:
    _CAN_USE_TENSORFLOW = False

from l2rpn_baselines.utils import BaseDeepQ, TrainingParam


# This class implements the "Soft Actor Critic" model.
# It is a custom implementation, courtesy of Clement Goubet.
# The original paper is: https://arxiv.org/abs/1801.01290
class SACOld_NN(BaseDeepQ):
    """
    Constructs the desired soft actor critic network.

    .. warning::
        This baseline recodes the entire RL training procedure. You can use it if you
        want to have a deeper look at the Deep Q Learning algorithm and at a possible
        (non optimized, slow, etc.) implementation.

        For a much better implementation, you can reuse the code of the "PPO_RLLIB"
        or the "PPO_SB3" baselines.

    Compared to other baselines shown elsewhere (*eg* :class:`l2rpn_baselines.DeepQSimple`)
    the implementation of SAC is a bit more tricky (and was most likely NOT done properly
    in this class). For a more correct implementation of SAC please look at
    :class:`l2rpn_baselines.SAC.SAC` instead. This class is only present for backward
    compatibility.

    However, we demonstrate here that the use of :class:`l2rpn_baselines.utils.BaseDeepQ`
    with a custom parameters class (in this case :class:`SACOld_NNParam`) is flexible
    enough to meet our needs.

    References
    -----------
    Original paper: https://arxiv.org/abs/1801.01290

    Modified for discrete action spaces: https://arxiv.org/abs/1910.07207

    """
    def __init__(self,
                 nn_params,
                 training_param=None,
                 verbose=False):
        if not _CAN_USE_TENSORFLOW:
            raise RuntimeError("Cannot import tensorflow, this function cannot be used.")

        if training_param is None:
            training_param = TrainingParam()
        BaseDeepQ.__init__(self,
                           nn_params,
                           training_param,
                           verbose=verbose)

        # TODO add as meta param the number of "Q" you want to use (here 2)
        # TODO add as meta param size and types of the networks
        self.average_reward = 0
        self.life_spent = 1
        self.qvalue_evolution = np.zeros((0,))
        self.Is_nan = False

        self.model_value_target = None
        self.model_value = None
        self.model_Q = None
        self.model_Q2 = None
        self.model_policy = None

        self.previous_size = 0
        self.previous_eyes = None
        self.previous_arange = None
        self.previous_size_train = 0
        self.previous_eyes_train = None

        # optimizers and learning rate schedules
        self.schedule_lr_policy = None
        self.optimizer_policy = None
        self.schedule_lr_Q = None
        self.optimizer_Q = None
        self.schedule_lr_Q2 = None
        self.optimizer_Q2 = None
        self.schedule_lr_value = None
        self.optimizer_value = None

        self.construct_q_network()

    def _build_q_NN(self):
        # Q(s, a): the observation and the one-hot encoded action are concatenated
        # and mapped to a single scalar value.
        input_states = Input(shape=(self._observation_size,))
        input_action = Input(shape=(self._action_size,))

        input_layer = Concatenate()([input_states, input_action])
        lay = input_layer
        for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes, self._nn_archi.activs)):
            lay = Dense(size, name="layer_{}".format(lay_num))(lay)  # put at self.action_size
            lay = Activation(act)(lay)

        advantage = Dense(1, activation='linear')(lay)

        model = Model(inputs=[input_states, input_action], outputs=[advantage])
        return model

    def _build_model_value(self):
        # V(s): fully connected network from the observation to a single scalar state value.
        input_states = Input(shape=(self._observation_size,))

        lay = input_states
        for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_value, self._nn_archi.activs_value)):
            lay = Dense(size)(lay)
            lay = Activation(act)(lay)

        advantage = Dense(self._action_size, activation='relu')(lay)
        state_value = Dense(1, activation='linear', name="state_value")(advantage)

        model = Model(inputs=[input_states], outputs=[state_value])
        return model

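    # Editor's note: the function below is an illustration only (it is not part of the
    # original baseline and is never called by it). It rebuilds, with hypothetical fixed
    # sizes, the same Keras functional pattern used by `_build_q_NN` above: the observation
    # and the one-hot encoded action are concatenated and mapped to a single scalar Q value.
    def _sketch_q_network_example(observation_size=100, action_size=50,
                                  sizes=(128, 64), activs=("relu", "relu")):
        input_states = Input(shape=(observation_size,))
        input_action = Input(shape=(action_size,))
        lay = Concatenate()([input_states, input_action])
        for size, act in zip(sizes, activs):
            lay = Dense(size)(lay)
            lay = Activation(act)(lay)
        q_value = Dense(1, activation='linear')(lay)
        return Model(inputs=[input_states, input_action], outputs=[q_value])
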
    def construct_q_network(self):
        """
        This constructs all the networks needed for the SAC agent:
        two "Q" networks, a state value network (with its target copy) and a policy network.
        """
        self.model_Q = self._build_q_NN()
        self.schedule_lr_Q, self.optimizer_Q = self.make_optimiser()
        self.model_Q.compile(loss='mse', optimizer=self.optimizer_Q)

        self.model_Q2 = self._build_q_NN()
        self.schedule_lr_Q2, self.optimizer_Q2 = self.make_optimiser()
        self.model_Q2.compile(loss='mse', optimizer=self.optimizer_Q2)

        # state value function approximation
        self.model_value = self._build_model_value()
        self.schedule_lr_value, self.optimizer_value = self.make_optimiser()
        self._optimizer_model = self.optimizer_value
        self.model_value.compile(loss='mse', optimizer=self.optimizer_value)

        self.model_value_target = self._build_model_value()
        self.model_value_target.set_weights(self.model_value.get_weights())

        # policy function approximation: probability of choosing action a under policy pi
        input_states = Input(shape=(self._observation_size,))
        lay = input_states
        for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes_policy, self._nn_archi.activs_policy)):
            lay = Dense(size)(lay)
            lay = Activation(act)(lay)

        soft_proba = Dense(self._action_size,
                           activation="softmax",
                           kernel_initializer='uniform',
                           name="soft_proba")(lay)
        self.model_policy = Model(inputs=[input_states], outputs=[soft_proba])
        self.schedule_lr_policy, self.optimizer_policy = self.make_optimiser()
        self.model_policy.compile(loss='categorical_crossentropy', optimizer=self.optimizer_policy)

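    # Editor's note (summary of the networks built above, inferred from the code of this class):
    #   - model_Q / model_Q2: two critics Q(s, a), each trained towards
    #     r + discount * (1 - done) * V_target(s')   (see `train`)
    #   - model_value / model_value_target: state value V(s) and a slowly updated copy
    #     used as the bootstrap target (see `target_train`)
    #   - model_policy: softmax policy pi(a|s), trained towards a temperature-softmax
    #     (Boltzmann) distribution over the Q values (see `train`)
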
    def _get_eye_pm(self, batch_size):
        # cache (per batch size) an all-zero "one hot" action encoding and the matching arange
        if batch_size != self.previous_size:
            tmp = np.zeros((batch_size, self._action_size), dtype=np.float32)
            self.previous_eyes = tmp
            self.previous_arange = np.arange(batch_size)
            self.previous_size = batch_size
        return self.previous_eyes, self.previous_arange

    def predict_movement(self, data, epsilon, batch_size=None, training=False):
        """
        Predict the next movements (actions) in a vectorized fashion.
        """
        if batch_size is None:
            batch_size = data.shape[0]
        rand_val = np.random.random(data.shape[0])
        p_actions = self.model_policy(data, training=training).numpy()
        opt_policy_orig = np.argmax(np.abs(p_actions), axis=-1)
        opt_policy = 1.0 * opt_policy_orig
        # epsilon greedy exploration: with probability epsilon, replace the greedy action
        # by an action sampled uniformly at random
        opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon)))
        opt_policy = opt_policy.astype(int)  # `np.int` is removed in recent numpy versions
        return opt_policy, p_actions[:, opt_policy], p_actions

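    # Editor's note: a minimal, self-contained sketch of the epsilon-greedy selection
    # performed in `predict_movement` above. It is an illustration only (never called by
    # the baseline) and its argument names are made up.
    def _sketch_epsilon_greedy(p_actions, epsilon, rng=None):
        # p_actions: (batch, n_actions) policy probabilities; returns one action index per row
        rng = np.random.default_rng() if rng is None else rng
        batch_size, n_actions = p_actions.shape
        actions = np.argmax(p_actions, axis=-1)
        explore = rng.random(batch_size) < epsilon
        actions[explore] = rng.integers(0, n_actions, size=int(explore.sum()))
        return actions
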
    def _get_eye_train(self, batch_size):
        # cache (per batch size) the "one hot" encoding of every possible action,
        # each repeated `batch_size` times (used to query the Q networks for all actions at once)
        if batch_size != self.previous_size_train:
            self.previous_eyes_train = np.repeat(np.eye(self._action_size),
                                                 batch_size * np.ones(self._action_size, dtype=int),
                                                 axis=0)
            self.previous_eyes_train = tf.convert_to_tensor(self.previous_eyes_train, dtype=tf.float32)
            self.previous_size_train = batch_size
        return self.previous_eyes_train

    def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None):
        """Train the networks on the given batch of transitions."""
        if batch_size is None:
            batch_size = s_batch.shape[0]
        target = np.zeros((batch_size, 1))

        # training of the action state value networks
        last_action = np.zeros((batch_size, self._action_size))

        # save the graph just the first time
        if tf_writer is not None:
            tf.summary.trace_on()
        # TODO is it s2 or s ? For me it should be s...
        fut_action = self.model_value_target(s2_batch, training=True).numpy().reshape(-1)
        # TODO ***_target should be for the Q function instead imho

        if tf_writer is not None:
            with tf_writer.as_default():
                tf.summary.trace_export("model_value_target-graph", 0)
            tf.summary.trace_off()

        # TODO is it rather `target[:, a_batch]` ?
        target[:, 0] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action
        # target[:, a_batch] = r_batch + (1 - d_batch) * self._training_param.discount_factor * fut_action
        loss = self.model_Q.train_on_batch([s_batch, last_action], target)
        loss_2 = self.model_Q2.train_on_batch([s_batch, last_action], target)

        self.life_spent += 1
        temp = 1 / np.log(self.life_spent) / 2
        tiled_batch = np.tile(s_batch, (self._action_size, 1))
        tiled_batch_ts = tf.convert_to_tensor(tiled_batch)
        # tiled_batch: output something like: batch, batch, batch
        # TODO save that somewhere not to compute it each time, you can even save this in the
        # TODO tensorflow graph!
        tmp = self._get_eye_train(batch_size)

        action_v1_orig = self.model_Q.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1)
        action_v2_orig = self.model_Q2.predict([tiled_batch_ts, tmp], batch_size=batch_size).reshape(batch_size, -1)
        action_v1 = action_v1_orig - np.amax(action_v1_orig, axis=-1).reshape(batch_size, 1)
        new_proba = np.exp(action_v1 / temp) / np.sum(np.exp(action_v1 / temp), axis=-1).reshape(batch_size, 1)
        new_proba_ts = tf.convert_to_tensor(new_proba)
        loss_policy = self.model_policy.train_on_batch(s_batch, new_proba_ts)

        target_pi = self.model_policy.predict(s_batch, batch_size=batch_size)
        value_target = np.fmin(action_v1_orig[0, a_batch], action_v2_orig[0, a_batch]) - np.sum(
            target_pi * np.log(target_pi + 1e-6))
        value_target_ts = tf.convert_to_tensor(value_target.reshape(-1, 1))
        loss_value = self.model_value.train_on_batch(s_batch, value_target_ts)

        self.Is_nan = np.isnan(loss) + np.isnan(loss_2) + np.isnan(loss_policy) + np.isnan(loss_value)
        return np.all(np.isfinite(loss)) & np.all(np.isfinite(loss_2)) & np.all(np.isfinite(loss_policy)) & \
            np.all(np.isfinite(loss_value))

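    # Editor's note: the function below is an illustration of the two targets computed in
    # `train` above (it is not part of the baseline and is never called). Given the Q values
    # of both critics for every action and the current policy probabilities, it builds the
    # temperature-softmax policy target and the "textbook" discrete-SAC value target
    # E_pi[min(Q1, Q2)] plus an entropy bonus; note that `train` above instead uses the Q
    # values of the sampled actions `a_batch`.
    def _sketch_sac_targets(q1, q2, pi, temperature):
        # q1, q2, pi: arrays of shape (batch, n_actions); temperature > 0
        shifted = q1 - q1.max(axis=-1, keepdims=True)               # for numerical stability
        exp_q = np.exp(shifted / temperature)
        policy_target = exp_q / exp_q.sum(axis=-1, keepdims=True)   # Boltzmann distribution over Q1
        entropy = -np.sum(pi * np.log(pi + 1e-6), axis=-1)          # entropy of the current policy
        value_target = np.sum(pi * np.minimum(q1, q2), axis=-1) + entropy
        return policy_target, value_target
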
    @staticmethod
    def _get_path_model(path, name=None):
        if name is None:
            path_model = path
        else:
            path_model = os.path.join(path, name)
        path_target_model = "{}_target".format(path_model)
        path_modelQ = "{}_Q".format(path_model)
        path_modelQ2 = "{}_Q2".format(path_model)
        path_policy = "{}_policy".format(path_model)
        return path_model, path_target_model, path_modelQ, path_modelQ2, path_policy

    def save_network(self, path, name=None, ext="h5"):
        """
        Saves all the models with unique names.
        """
        path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name)
        self.model_value.save('{}.{}'.format(path_model, ext))
        self.model_value_target.save('{}.{}'.format(path_target_model, ext))
        self.model_Q.save('{}.{}'.format(path_modelQ, ext))
        self.model_Q2.save('{}.{}'.format(path_modelQ2, ext))
        self.model_policy.save('{}.{}'.format(path_policy, ext))

    def load_network(self, path, name=None, ext="h5"):
        """
        Load all the models: the networks are rebuilt first, then the saved weights are restored.
        """
        path_model, path_target_model, path_modelQ, path_modelQ2, path_policy = self._get_path_model(path, name)
        self.construct_q_network()
        self.model_value.load_weights('{}.{}'.format(path_model, ext))
        self.model_value_target.load_weights('{}.{}'.format(path_target_model, ext))
        self.model_Q.load_weights('{}.{}'.format(path_modelQ, ext))
        self.model_Q2.load_weights('{}.{}'.format(path_modelQ2, ext))
        self.model_policy.load_weights('{}.{}'.format(path_policy, ext))
        if self.verbose:
            print("Successfully loaded network.")

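    # Editor's note: hypothetical usage sketch of the two methods above
    # (the directory, file name and variable name are made up):
    #
    #     agent_nn.save_network("saved_models", name="sac_old")
    #     # -> writes sac_old.h5, sac_old_target.h5, sac_old_Q.h5, sac_old_Q2.h5, sac_old_policy.h5
    #     agent_nn.load_network("saved_models", name="sac_old")
    #     # -> rebuilds the networks, then restores the saved weights
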
    def target_train(self):
        """
        This updates the target value network with a soft ("Polyak") update of rate tau.
        """
        model_weights = self.model_value.get_weights()
        target_model_weights = self.model_value_target.get_weights()
        for i in range(len(model_weights)):
            target_model_weights[i] = self._training_param.tau * model_weights[i] + \
                                      (1 - self._training_param.tau) * target_model_weights[i]
        # bug fix: assign the blended weights (the original code assigned `model_weights`,
        # which made the target an exact copy of the value network and ignored tau)
        self.model_value_target.set_weights(target_model_weights)
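
# Editor's note: the function below is an illustrative, self-contained sketch of the
# soft ("Polyak") update performed by `SACOld_NN.target_train` above; it is not part of
# the original baseline and is never imported by it.
def _sketch_soft_update(weights, target_weights, tau):
    """Blend each weight tensor: tau * online + (1 - tau) * target (pure numpy)."""
    return [tau * w + (1.0 - tau) * w_t for w, w_t in zip(weights, target_weights)]

# Hypothetical usage (variable names are made up), e.g. with tau = 0.01:
#     new_target = _sketch_soft_update(model_value.get_weights(),
#                                      model_value_target.get_weights(),
#                                      tau=0.01)
#     model_value_target.set_weights(new_target)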