Module TeachMyAgent.students.openai_baselines.ppo2.synchronous_runner

import numpy as np
from TeachMyAgent.students.openai_baselines.common.runners import AbstractEnvRunner

class SynchronousRunner():
    """
    Synchronous runner to interact with environment and sample trajectories (created for TeachMyAgent).

    We use this object to make a mini batch of experiences
    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam, max_ep_len, Teacher):
        self.env = env
        self.model = model
        self.nsteps = nsteps
        self.nb_total_steps = 0
        self.batch_ob_shape = (1 * nsteps,) + env.observation_space.shape
        self.obs = np.zeros((1,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)

        if Teacher:
            Teacher.set_env_params(env.get_raw_env())
        unscaled_o, o = env.reset()
        if Teacher:
            Teacher.record_train_task_initial_state(unscaled_o[0])
        self.prev_unscaled_obs = unscaled_o[0]
        self.obs = np.array(o)
        self.states = model.initial_state
        self.dones = [False]

        # Lambda used in GAE (Generalized Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma

        # ACL utils
        self.max_ep_len = max_ep_len
        self.ep_len = 0
        self.ep_ret = 0
        self.raw_ep_ret = 0

    def run(self, Teacher):
        # Here, we init the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
        mb_states = self.states
        epinfos = []
        # Collect nsteps transitions
        for _ in range(self.nsteps):
            # Given the current observations, get actions, values and neglogpacs from the policy
            # self.obs is already set: __init__ calls env.reset()
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            self.nb_total_steps += 1

            # Take actions in the env and observe the results
            # The info dict contains a lot of useful information
            a = actions
            o, r, d, i = self.env.step(a)
            unscaled_reward = i[0]["original_reward"][0]
            unscaled_o = i[0]["original_obs"][0]
            self.ep_len += 1
            self.ep_ret += r[0]
            self.raw_ep_ret += unscaled_reward
            Teacher.record_train_step(self.prev_unscaled_obs, a[0], unscaled_reward, unscaled_o, d[0])

            if d or self.ep_len == self.max_ep_len:
                if Teacher:
                    success = i[0].get("success", False)  # info is a list with one dict per env
                    Teacher.record_train_episode(self.raw_ep_ret, self.ep_len, success)
                    Teacher.set_env_params(self.env.get_raw_env())
                unscaled_o, o = self.env.reset()
                unscaled_o = unscaled_o[0]
                Teacher.record_train_task_initial_state(unscaled_o)
                epinfos.append({
                    "r": self.raw_ep_ret,
                    "l": self.ep_len
                })
                self.ep_len = 0
                self.ep_ret = 0
                self.raw_ep_ret = 0

            self.obs, self.prev_unscaled_obs, rewards, self.dones, infos = o, unscaled_o, r, d, i
            mb_rewards.append(rewards)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t+1]
                nextvalues = mb_values[t+1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
            mb_states, epinfos)
# obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(Teacher)
def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
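
For reference, here is a minimal, hedged sketch of how the runner can be driven end to end. StubEnv, StubModel and StubTeacher are hypothetical stand-ins (not part of TeachMyAgent) that only implement the interfaces the source above actually calls: env.reset() returning an (unscaled, scaled) observation pair, env.step() returning info dicts with "original_reward" and "original_obs", model.step()/model.value(), and the Teacher recording hooks. Running it assumes TeachMyAgent is installed.

import numpy as np
from types import SimpleNamespace
from TeachMyAgent.students.openai_baselines.ppo2.synchronous_runner import SynchronousRunner

OBS_DIM, ACT_DIM = 4, 2

class StubEnv:
    # Illustrative stand-in exposing only what SynchronousRunner touches.
    observation_space = SimpleNamespace(shape=(OBS_DIM,), dtype=np.dtype("float32"))

    def get_raw_env(self):
        return self  # the Teacher only needs a handle on the underlying env

    def reset(self):
        obs = np.zeros((1, OBS_DIM), dtype=np.float32)
        return obs.copy(), obs  # (unscaled observation, scaled observation)

    def step(self, action):
        obs = np.random.randn(1, OBS_DIM).astype(np.float32)
        reward = np.array([1.0], dtype=np.float32)
        done = np.array([False])
        info = [{"original_reward": reward.copy(), "original_obs": obs.copy()}]
        return obs, reward, done, info

class StubModel:
    # Illustrative stand-in for the PPO policy/value model.
    initial_state = None  # no recurrent state (e.g. an MLP policy)

    def step(self, obs, S=None, M=None):
        actions = np.zeros((1, ACT_DIM), dtype=np.float32)
        values = np.zeros(1, dtype=np.float32)
        neglogpacs = np.zeros(1, dtype=np.float32)
        return actions, values, S, neglogpacs

    def value(self, obs, S=None, M=None):
        return np.zeros(1, dtype=np.float32)

class StubTeacher:
    # Illustrative stand-in for the ACL teacher recording hooks.
    def set_env_params(self, raw_env): pass
    def record_train_task_initial_state(self, obs): pass
    def record_train_step(self, obs, action, reward, next_obs, done): pass
    def record_train_episode(self, ep_return, ep_len, success): pass

teacher = StubTeacher()
runner = SynchronousRunner(env=StubEnv(), model=StubModel(), nsteps=8,
                           gamma=0.99, lam=0.95, max_ep_len=2000, Teacher=teacher)
obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(teacher)
print(obs.shape, returns.shape, actions.shape)  # (8, 4) (8,) (8, 2)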

Functions

def sf01(arr)

swap and then flatten axes 0 and 1

def sf01(arr):
    """
    swap and then flatten axes 0 and 1
    """
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
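
As a concrete illustration (assuming the module is importable), sf01 turns a rollout stacked as (nsteps, nenv, ...) into a flat batch of shape (nsteps * nenv, ...); swapping the axes first keeps each environment's trajectory contiguous in the flattened result.

import numpy as np
from TeachMyAgent.students.openai_baselines.ppo2.synchronous_runner import sf01

# Hypothetical rollout: 4 steps, 1 environment, 3-dimensional observations.
rollout = np.arange(12, dtype=np.float32).reshape(4, 1, 3)
flat = sf01(rollout)
print(flat.shape)  # (4, 3): step and environment axes merged into one batch axis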

Classes

class SynchronousRunner (*, env, model, nsteps, gamma, lam, max_ep_len, Teacher)

Synchronous runner to interact with environment and sample trajectories (created for TeachMyAgent).

We use this object to make a mini-batch of experiences.

__init__: initialize the runner.

run(): make a mini-batch.

class SynchronousRunner():
    """
    Synchronous runner to interact with environment and sample trajectories (created for TeachMyAgent).

    We use this object to make a mini batch of experiences
    __init__:
    - Initialize the runner

    run():
    - Make a mini batch
    """
    def __init__(self, *, env, model, nsteps, gamma, lam, max_ep_len, Teacher):
        self.env = env
        self.model = model
        self.nsteps = nsteps
        self.nb_total_steps = 0
        self.batch_ob_shape = (1 * nsteps,) + env.observation_space.shape
        self.obs = np.zeros((1,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)

        if Teacher:
            Teacher.set_env_params(env.get_raw_env())
        unscaled_o, o = env.reset()
        if Teacher:
            Teacher.record_train_task_initial_state(unscaled_o[0])
        self.prev_unscaled_obs = unscaled_o[0]
        self.obs = np.array(o)
        self.states = model.initial_state
        self.dones = [False]

        # Lambda used in GAE (Generalized Advantage Estimation)
        self.lam = lam
        # Discount rate
        self.gamma = gamma

        # ACL utils
        self.max_ep_len = max_ep_len
        self.ep_len = 0
        self.ep_ret = 0
        self.raw_ep_ret = 0

    def run(self, Teacher):
        # Here, we init the lists that will contain the mb of experiences
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
        mb_states = self.states
        epinfos = []
        # Collect nsteps transitions
        for _ in range(self.nsteps):
            # Given the current observations, get actions, values and neglogpacs from the policy
            # self.obs is already set: __init__ calls env.reset()
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
            self.nb_total_steps += 1

            # Take actions in the env and observe the results
            # The info dict contains a lot of useful information
            a = actions
            o, r, d, i = self.env.step(a)
            unscaled_reward = i[0]["original_reward"][0]
            unscaled_o = i[0]["original_obs"][0]
            self.ep_len += 1
            self.ep_ret += r[0]
            self.raw_ep_ret += unscaled_reward
            Teacher.record_train_step(self.prev_unscaled_obs, a[0], unscaled_reward, unscaled_o, d[0])

            if d or self.ep_len == self.max_ep_len:
                if Teacher:
                    success = i[0].get("success", False)  # info is a list with one dict per env
                    Teacher.record_train_episode(self.raw_ep_ret, self.ep_len, success)
                    Teacher.set_env_params(self.env.get_raw_env())
                unscaled_o, o = self.env.reset()
                unscaled_o = unscaled_o[0]
                Teacher.record_train_task_initial_state(unscaled_o)
                epinfos.append({
                    "r": self.raw_ep_ret,
                    "l": self.ep_len
                })
                self.ep_len = 0
                self.ep_ret = 0
                self.raw_ep_ret = 0

            self.obs, self.prev_unscaled_obs, rewards, self.dones, infos = o, unscaled_o, r, d, i
            mb_rewards.append(rewards)
        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        last_values = self.model.value(self.obs, S=self.states, M=self.dones)

        # discount/bootstrap off value fn
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.nsteps)):
            if t == self.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t+1]
                nextvalues = mb_values[t+1]
            delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
            mb_states, epinfos)
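
The six arrays returned first are flattened by sf01 precisely so the PPO update can index them uniformly. Below is a hedged sketch of the kind of minibatch slicing this layout enables; the constants and array names are placeholders, not values taken from the module.

import numpy as np

# Hypothetical sizes: nbatch corresponds to nsteps * nenvs (nenvs = 1 for this runner).
nbatch, nminibatches = 2048, 4
minibatch_size = nbatch // nminibatches

inds = np.arange(nbatch)
np.random.shuffle(inds)  # shuffle once per epoch over the rollout
for start in range(0, nbatch, minibatch_size):
    mb_inds = inds[start:start + minibatch_size]
    # obs[mb_inds], returns[mb_inds], actions[mb_inds], values[mb_inds],
    # neglogpacs[mb_inds] would feed a single PPO gradient step.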

Methods

def run(self, Teacher)
def run(self, Teacher):
    # Here, we init the lists that will contain the mb of experiences
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
    mb_states = self.states
    epinfos = []
    # Collect nsteps transitions
    for _ in range(self.nsteps):
        # Given the current observations, get actions, values and neglogpacs from the policy
        # self.obs is already set: __init__ calls env.reset()
        actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
        mb_obs.append(self.obs.copy())
        mb_actions.append(actions)
        mb_values.append(values)
        mb_neglogpacs.append(neglogpacs)
        mb_dones.append(self.dones)
        self.nb_total_steps += 1

        # Take actions in the env and observe the results
        # The info dict contains a lot of useful information
        a = actions
        o, r, d, i = self.env.step(a)
        unscaled_reward = i[0]["original_reward"][0]
        unscaled_o = i[0]["original_obs"][0]
        self.ep_len += 1
        self.ep_ret += r[0]
        self.raw_ep_ret += unscaled_reward
        Teacher.record_train_step(self.prev_unscaled_obs, a[0], unscaled_reward, unscaled_o, d[0])

        if d or self.ep_len == self.max_ep_len:
            if Teacher:
                success = i[0].get("success", False)  # info is a list with one dict per env
                Teacher.record_train_episode(self.raw_ep_ret, self.ep_len, success)
                Teacher.set_env_params(self.env.get_raw_env())
            unscaled_o, o = self.env.reset()
            unscaled_o = unscaled_o[0]
            Teacher.record_train_task_initial_state(unscaled_o)
            epinfos.append({
                "r": self.raw_ep_ret,
                "l": self.ep_len
            })
            self.ep_len = 0
            self.ep_ret = 0
            self.raw_ep_ret = 0

        self.obs, self.prev_unscaled_obs, rewards, self.dones, infos = o, unscaled_o, r, d, i
        mb_rewards.append(rewards)
    #batch of steps to batch of rollouts
    mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
    mb_actions = np.asarray(mb_actions)
    mb_values = np.asarray(mb_values, dtype=np.float32)
    mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
    mb_dones = np.asarray(mb_dones, dtype=bool)
    last_values = self.model.value(self.obs, S=self.states, M=self.dones)

    # discount/bootstrap off value fn
    mb_returns = np.zeros_like(mb_rewards)
    mb_advs = np.zeros_like(mb_rewards)
    lastgaelam = 0
    for t in reversed(range(self.nsteps)):
        if t == self.nsteps - 1:
            nextnonterminal = 1.0 - self.dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - mb_dones[t+1]
            nextvalues = mb_values[t+1]
        delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
        mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
    mb_returns = mb_advs + mb_values
    return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
        mb_states, epinfos)
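
The backward loop above is Generalized Advantage Estimation: delta_t = r_t + gamma * V_{t+1} * (1 - done_{t+1}) - V_t and A_t = delta_t + gamma * lam * (1 - done_{t+1}) * A_{t+1}, bootstrapped from the value of the last observation, with returns recovered as A_t + V_t. Below is a small self-contained sketch of the same recurrence on toy arrays; compute_gae is an illustrative helper, not part of the module.

import numpy as np

def compute_gae(rewards, values, dones, last_value, last_done, gamma=0.99, lam=0.95):
    """Illustrative re-implementation of the advantage recurrence used in run()."""
    nsteps = len(rewards)
    advs = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal = 1.0 - last_done
            nextvalues = last_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam
    return advs, advs + values  # (advantages, returns used as value targets)

rewards = np.array([1.0, 1.0, 1.0], dtype=np.float32)
values  = np.array([0.5, 0.5, 0.5], dtype=np.float32)
dones   = np.array([0.0, 0.0, 0.0], dtype=np.float32)
advs, returns = compute_gae(rewards, values, dones, last_value=0.5, last_done=0.0)
print(advs, returns)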