# Source code for pytorchrl.agent.actors.reward_functions.gym_reward_functions

import torch
import math

"""Good Resource of reward functions: https://arxiv.org/pdf/1907.02057.pdf 
    including gym envrionemnts as well as pybullet environments
"""


def cartpole(state: torch.Tensor, action: torch.Tensor, next_state: torch.Tensor) -> torch.Tensor:
    """Compute the CartPole reward ``cos(theta) - 0.01 * x ** 2``.

    Formula taken from https://arxiv.org/pdf/1907.02057.pdf.

    Args:
        state: Batch of observations. Column 0 is read as the cart position
            ``x`` and column 2 as the pole angle ``theta``.
        action: Not used by this function.
        next_state: Not used by this function.

    Returns:
        The reward with a singleton dimension inserted at position 1, i.e.
        shape ``(batch, 1)`` for a 2-D ``state``.
    """
    x = state[:, 0]
    theta = state[:, 2]
    # Insert the trailing axis once, after the arithmetic, instead of on
    # every operand; the result is the same.
    return (torch.cos(theta) - 0.01 * x ** 2)[:, None]


def pendulum(state: torch.Tensor, action: torch.Tensor, next_state: torch.Tensor) -> torch.Tensor:
    """Compute the Pendulum reward from https://arxiv.org/pdf/1907.02057.pdf.

    reward = -cos(theta) - 0.1 * sin(theta) - 0.1 * theta_dot ** 2 - 0.001 * action ** 2

    Args:
        state: Batch of observations. Columns 0, 1 and 2 are read as
            ``cos(theta)``, ``sin(theta)`` and ``theta_dot`` respectively.
        action: Batch of actions, squared element-wise and broadcast against
            the ``(batch, 1)`` state terms (assumes shape ``(batch, 1)`` --
            TODO confirm against callers).
        next_state: Not used by this function.

    Returns:
        The reward tensor; shape ``(batch, 1)`` when ``state`` is 2-D and
        ``action`` is ``(batch, 1)``.
    """
    # NOTE: the environment's own angle-based reward (acos of state[:, 0],
    # torque clamped to +/-2.0, angle_normalize) was tried by the original
    # author and reported not to work, so the paper formula is used instead.
    cos_theta = state[:, 0][:, None]
    sin_theta = state[:, 1][:, None]
    theta_dot = state[:, 2][:, None]
    return -cos_theta - 0.1 * sin_theta - 0.1 * theta_dot ** 2 - 0.001 * action ** 2


def angle_normalize(x: torch.Tensor) -> torch.Tensor:
    """Wrap angle(s) ``x`` (in radians) into the half-open interval ``[-pi, pi)``.

    Args:
        x: Angle value(s) in radians.

    Returns:
        ``x`` shifted by a multiple of ``2 * pi`` so that it lies in
        ``[-pi, pi)``; same shape as ``x``.
    """
    full_turn = 2 * math.pi
    # Shift by pi so the modulo maps into [0, 2*pi), then shift back.
    return (x + math.pi) % full_turn - math.pi


def inverted_pendulum_mujoco(state: torch.Tensor, action: torch.Tensor, next_state: torch.Tensor) -> torch.Tensor:
    """Compute the MuJoCo InvertedPendulum reward ``-theta ** 2``.

    Env info: https://github.com/openai/gym/blob/master/gym/envs/mujoco/inverted_pendulum.py
    Reward function based on: https://arxiv.org/pdf/1907.02057.pdf

    Args:
        state: Batch of observations. Column 1 is read as the pole angle
            ``theta``.
        action: Not used by this function.
        next_state: Not used by this function.

    Returns:
        The reward with a singleton dimension inserted at position 1, i.e.
        shape ``(batch, 1)`` for a 2-D ``state``.
    """
    theta = state[:, 1]
    return -(theta ** 2).unsqueeze(1)


def halfcheetah_mujoco(state: torch.Tensor, action: torch.Tensor, next_state: torch.Tensor) -> torch.Tensor:
    """Compute the HalfCheetah reward ``x_velocity - 0.1 * sum(action ** 2)``.

    Per the original author's note, the first 8 values of the state are
    position data and the following 9 are velocities, which puts the
    x-velocity at index 8.

    Args:
        state: Batch of observations; column 8 is read as the x-velocity.
        action: Batch of actions; squared and summed over dimension 1
            (assumes shape ``(batch, action_dim)`` -- TODO confirm).
        next_state: Not used by this function.

    Returns:
        The reward tensor of shape ``(batch, 1)``.
    """
    x_velocity = state[:, 8]
    control_cost = 0.1 * action.pow(2).sum(dim=1)
    reward = (x_velocity - control_cost).unsqueeze(1)
    # Sanity check: broadcasting between state and action must not have
    # changed the batch dimension.
    assert reward.shape == (state.shape[0], 1)
    return reward