diff --git a/README.rst b/README.rst
index d9bfb4fda04..21b7cf6a5fa 100644
--- a/README.rst
+++ b/README.rst
@@ -4,7 +4,7 @@ OpenAI Gym
 **OpenAI Gym is a toolkit for developing and comparing reinforcement learning algorithms.** This is the ``gym`` open-source library, which gives you access to an ever-growing variety of environments.
 
 .. image:: https://travis-ci.org/openai/gym.svg?branch=master
-    :target: https://travis-ci.org/openai/gym
+    :target: https://travis-ci.org/openai/gym
 
 `See What's New section below <#what-s-new>`_
 
@@ -126,7 +126,7 @@ fake display. The easiest way to do this is by running under
 
 .. code:: shell
 
-    xvfb-run -s "-screen 0 1400x900x24" bash
+    xvfb-run -s "-screen 0 1400x900x24" bash
 
 Installing dependencies for specific environments
 -------------------------------------------------
@@ -262,6 +262,10 @@ We are using `pytest `_ for tests. You can run them via:
 
 What's new
 ==========
+- 2018-01-24: All continuous control environments now use mujoco_py >= 1.50.
+  Versions have been updated accordingly to -v2, e.g. HalfCheetah-v2. Performance
+  should be similar (see https://github.com/openai/gym/pull/834) but there are likely
+  some differences due to changes in MuJoCo.
 - 2017-06-16: Make env.spec into a property to fix a bug that occurs when
   you try to print out an unregistered Env.
 - 2017-05-13: BACKWARDS INCOMPATIBILITY: The Atari environments are now at
diff --git a/gym/envs/__init__.py b/gym/envs/__init__.py
index db015440229..32fb6b241da 100644
--- a/gym/envs/__init__.py
+++ b/gym/envs/__init__.py
@@ -204,89 +204,89 @@
 # 2D
 
 register(
-    id='Reacher-v1',
+    id='Reacher-v2',
     entry_point='gym.envs.mujoco:ReacherEnv',
     max_episode_steps=50,
     reward_threshold=-3.75,
 )
 
 register(
-    id='Pusher-v0',
+    id='Pusher-v2',
     entry_point='gym.envs.mujoco:PusherEnv',
     max_episode_steps=100,
     reward_threshold=0.0,
 )
 
 register(
-    id='Thrower-v0',
+    id='Thrower-v2',
     entry_point='gym.envs.mujoco:ThrowerEnv',
     max_episode_steps=100,
     reward_threshold=0.0,
 )
 
 register(
-    id='Striker-v0',
+    id='Striker-v2',
     entry_point='gym.envs.mujoco:StrikerEnv',
     max_episode_steps=100,
     reward_threshold=0.0,
 )
 
 register(
-    id='InvertedPendulum-v1',
+    id='InvertedPendulum-v2',
     entry_point='gym.envs.mujoco:InvertedPendulumEnv',
     max_episode_steps=1000,
     reward_threshold=950.0,
 )
 
 register(
-    id='InvertedDoublePendulum-v1',
+    id='InvertedDoublePendulum-v2',
     entry_point='gym.envs.mujoco:InvertedDoublePendulumEnv',
     max_episode_steps=1000,
     reward_threshold=9100.0,
 )
 
 register(
-    id='HalfCheetah-v1',
+    id='HalfCheetah-v2',
     entry_point='gym.envs.mujoco:HalfCheetahEnv',
     max_episode_steps=1000,
     reward_threshold=4800.0,
 )
 
 register(
-    id='Hopper-v1',
+    id='Hopper-v2',
     entry_point='gym.envs.mujoco:HopperEnv',
     max_episode_steps=1000,
     reward_threshold=3800.0,
 )
 
 register(
-    id='Swimmer-v1',
+    id='Swimmer-v2',
     entry_point='gym.envs.mujoco:SwimmerEnv',
     max_episode_steps=1000,
     reward_threshold=360.0,
 )
 
 register(
-    id='Walker2d-v1',
+    id='Walker2d-v2',
     max_episode_steps=1000,
     entry_point='gym.envs.mujoco:Walker2dEnv',
 )
 
 register(
-    id='Ant-v1',
+    id='Ant-v2',
     entry_point='gym.envs.mujoco:AntEnv',
     max_episode_steps=1000,
     reward_threshold=6000.0,
 )
 
 register(
-    id='Humanoid-v1',
+    id='Humanoid-v2',
     entry_point='gym.envs.mujoco:HumanoidEnv',
     max_episode_steps=1000,
 )
 
 register(
-    id='HumanoidStandup-v1',
+    id='HumanoidStandup-v2',
     entry_point='gym.envs.mujoco:HumanoidStandupEnv',
     max_episode_steps=1000,
 )
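With the registrations above, the MuJoCo tasks are only exposed under the new -v2 IDs. A minimal usage sketch (assuming gym 0.9.5 with a working mujoco_py >= 1.50 install; the random-action loop is purely illustrative and not part of this patch):

import gym

# The -v0/-v1 IDs registered before this change no longer exist; use the -v2 names.
env = gym.make('HalfCheetah-v2')

observation = env.reset()
for _ in range(100):
    action = env.action_space.sample()              # random policy, just to exercise the env
    observation, reward, done, info = env.step(action)
    if done:
        observation = env.reset()
env.close()
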
diff --git a/gym/envs/mujoco/ant.py b/gym/envs/mujoco/ant.py
index 57947912582..09ac3cc44af 100644
--- a/gym/envs/mujoco/ant.py
+++ b/gym/envs/mujoco/ant.py
@@ -14,7 +14,7 @@ def _step(self, a):
         forward_reward = (xposafter - xposbefore)/self.dt
         ctrl_cost = .5 * np.square(a).sum()
         contact_cost = 0.5 * 1e-3 * np.sum(
-            np.square(np.clip(self.model.data.cfrc_ext, -1, 1)))
+            np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
         survive_reward = 1.0
         reward = forward_reward - ctrl_cost - contact_cost + survive_reward
         state = self.state_vector()
@@ -30,9 +30,9 @@ def _step(self, a):
 
     def _get_obs(self):
         return np.concatenate([
-            self.model.data.qpos.flat[2:],
-            self.model.data.qvel.flat,
-            np.clip(self.model.data.cfrc_ext, -1, 1).flat,
+            self.sim.data.qpos.flat[2:],
+            self.sim.data.qvel.flat,
+            np.clip(self.sim.data.cfrc_ext, -1, 1).flat,
         ])
 
     def reset_model(self):
diff --git a/gym/envs/mujoco/half_cheetah.py b/gym/envs/mujoco/half_cheetah.py
index 9a49e0a32e0..c2b6ba2209c 100644
--- a/gym/envs/mujoco/half_cheetah.py
+++ b/gym/envs/mujoco/half_cheetah.py
@@ -8,9 +8,9 @@ def __init__(self):
         utils.EzPickle.__init__(self)
 
     def _step(self, action):
-        xposbefore = self.model.data.qpos[0, 0]
+        xposbefore = self.sim.data.qpos[0]
         self.do_simulation(action, self.frame_skip)
-        xposafter = self.model.data.qpos[0, 0]
+        xposafter = self.sim.data.qpos[0]
         ob = self._get_obs()
         reward_ctrl = - 0.1 * np.square(action).sum()
         reward_run = (xposafter - xposbefore)/self.dt
@@ -20,8 +20,8 @@ def _step(self, action):
 
     def _get_obs(self):
         return np.concatenate([
-            self.model.data.qpos.flat[1:],
-            self.model.data.qvel.flat,
+            self.sim.data.qpos.flat[1:],
+            self.sim.data.qvel.flat,
         ])
 
     def reset_model(self):
diff --git a/gym/envs/mujoco/hopper.py b/gym/envs/mujoco/hopper.py
index 2a5a3999fb3..28fb1449b43 100644
--- a/gym/envs/mujoco/hopper.py
+++ b/gym/envs/mujoco/hopper.py
@@ -8,9 +8,9 @@ def __init__(self):
         utils.EzPickle.__init__(self)
 
     def _step(self, a):
-        posbefore = self.model.data.qpos[0, 0]
+        posbefore = self.sim.data.qpos[0]
         self.do_simulation(a, self.frame_skip)
-        posafter, height, ang = self.model.data.qpos[0:3, 0]
+        posafter, height, ang = self.sim.data.qpos[0:3]
         alive_bonus = 1.0
         reward = (posafter - posbefore) / self.dt
         reward += alive_bonus
@@ -23,8 +23,8 @@ def _step(self, a):
 
     def _get_obs(self):
         return np.concatenate([
-            self.model.data.qpos.flat[1:],
-            np.clip(self.model.data.qvel.flat, -10, 10)
+            self.sim.data.qpos.flat[1:],
+            np.clip(self.sim.data.qvel.flat, -10, 10)
         ])
 
     def reset_model(self):
diff --git a/gym/envs/mujoco/humanoid.py b/gym/envs/mujoco/humanoid.py
index 83bb2c13d6a..7c926b60f0c 100644
--- a/gym/envs/mujoco/humanoid.py
+++ b/gym/envs/mujoco/humanoid.py
@@ -2,9 +2,9 @@
 from gym.envs.mujoco import mujoco_env
 from gym import utils
 
-def mass_center(model):
-    mass = model.body_mass
-    xpos = model.data.xipos
+def mass_center(model, sim):
+    mass = np.expand_dims(model.body_mass, 1)
+    xpos = sim.data.xipos
     return (np.sum(mass * xpos, 0) / np.sum(mass))[0]
 
 class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
@@ -13,7 +13,7 @@ def __init__(self):
         utils.EzPickle.__init__(self)
 
     def _get_obs(self):
-        data = self.model.data
+        data = self.sim.data
         return np.concatenate([data.qpos.flat[2:],
                                data.qvel.flat,
                                data.cinert.flat,
@@ -22,17 +22,17 @@ def _get_obs(self):
                                data.cfrc_ext.flat])
 
     def _step(self, a):
-        pos_before = mass_center(self.model)
+        pos_before = mass_center(self.model, self.sim)
         self.do_simulation(a, self.frame_skip)
-        pos_after = mass_center(self.model)
+        pos_after = mass_center(self.model, self.sim)
         alive_bonus = 5.0
-        data = self.model.data
+        data = self.sim.data
         lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep
         quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
         quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum()
         quad_impact_cost = min(quad_impact_cost, 10)
         reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
-        qpos = self.model.data.qpos
+        qpos = self.sim.data.qpos
         done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
         return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost)
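The humanoid change also alters mass_center's signature: mujoco_py 1.50 appears to expose body_mass as a flat (nbody,) array rather than a column, so it must be expanded before broadcasting against xipos. A NumPy-only sketch of that weighted average (the array values are illustrative, not real MuJoCo data):

import numpy as np

body_mass = np.array([1.0, 2.0, 3.0])          # illustrative per-body masses, shape (3,)
xipos = np.array([[0.0, 0.0, 1.0],
                  [1.0, 0.0, 1.0],
                  [2.0, 0.0, 1.0]])            # illustrative body COM positions, shape (3, 3)

# (3,) * (3, 3) would broadcast the masses across coordinates (columns) rather than
# bodies (rows); expanding to (3, 1) weights each body's row by its mass.
mass = np.expand_dims(body_mass, 1)
com = np.sum(mass * xipos, 0) / np.sum(mass)   # overall center of mass, shape (3,)
print(com[0])                                  # x-coordinate, which mass_center returns
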
diff --git a/gym/envs/mujoco/humanoidstandup.py b/gym/envs/mujoco/humanoidstandup.py
index ebc5fb500af..98b771853eb 100644
--- a/gym/envs/mujoco/humanoidstandup.py
+++ b/gym/envs/mujoco/humanoidstandup.py
@@ -2,18 +2,13 @@
 from gym.envs.mujoco import mujoco_env
 from gym import utils
 
-def mass_center(model):
-    mass = model.body_mass
-    xpos = model.data.xipos
-    return (np.sum(mass * xpos, 0) / np.sum(mass))[0]
-
 class HumanoidStandupEnv(mujoco_env.MujocoEnv, utils.EzPickle):
     def __init__(self):
         mujoco_env.MujocoEnv.__init__(self, 'humanoidstandup.xml', 5)
         utils.EzPickle.__init__(self)
 
     def _get_obs(self):
-        data = self.model.data
+        data = self.sim.data
         return np.concatenate([data.qpos.flat[2:],
                                data.qvel.flat,
                                data.cinert.flat,
@@ -23,8 +18,8 @@ def _get_obs(self):
 
     def _step(self, a):
         self.do_simulation(a, self.frame_skip)
-        pos_after = self.model.data.qpos[2][0]
-        data = self.model.data
+        pos_after = self.sim.data.qpos[2]
+        data = self.sim.data
         uph_cost = (pos_after - 0) / self.model.opt.timestep
         quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
diff --git a/gym/envs/mujoco/inverted_double_pendulum.py b/gym/envs/mujoco/inverted_double_pendulum.py
index 0c29659581d..ae9f2e44eed 100644
--- a/gym/envs/mujoco/inverted_double_pendulum.py
+++ b/gym/envs/mujoco/inverted_double_pendulum.py
@@ -11,22 +11,22 @@ def __init__(self):
     def _step(self, action):
         self.do_simulation(action, self.frame_skip)
         ob = self._get_obs()
-        x, _, y = self.model.data.site_xpos[0]
+        x, _, y = self.sim.data.site_xpos[0]
         dist_penalty = 0.01 * x ** 2 + (y - 2) ** 2
-        v1, v2 = self.model.data.qvel[1:3]
+        v1, v2 = self.sim.data.qvel[1:3]
         vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
         alive_bonus = 10
-        r = (alive_bonus - dist_penalty - vel_penalty)[0]
+        r = alive_bonus - dist_penalty - vel_penalty
         done = bool(y <= 1)
         return ob, r, done, {}
 
     def _get_obs(self):
         return np.concatenate([
-            self.model.data.qpos[:1],  # cart x pos
-            np.sin(self.model.data.qpos[1:]),  # link angles
-            np.cos(self.model.data.qpos[1:]),
-            np.clip(self.model.data.qvel, -10, 10),
-            np.clip(self.model.data.qfrc_constraint, -10, 10)
+            self.sim.data.qpos[:1],  # cart x pos
+            np.sin(self.sim.data.qpos[1:]),  # link angles
+            np.cos(self.sim.data.qpos[1:]),
+            np.clip(self.sim.data.qvel, -10, 10),
+            np.clip(self.sim.data.qfrc_constraint, -10, 10)
         ]).ravel()
 
     def reset_model(self):
diff --git a/gym/envs/mujoco/inverted_pendulum.py b/gym/envs/mujoco/inverted_pendulum.py
index 86a1f27d149..371f156babc 100644
--- a/gym/envs/mujoco/inverted_pendulum.py
+++ b/gym/envs/mujoco/inverted_pendulum.py
@@ -22,7 +22,7 @@ def reset_model(self):
         return self._get_obs()
 
     def _get_obs(self):
-        return np.concatenate([self.model.data.qpos, self.model.data.qvel]).ravel()
+        return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel()
 
     def viewer_setup(self):
         v = self.viewer
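A recurring pattern in these hunks is qpos[0, 0] becoming qpos[0], and the double-pendulum reward dropping its trailing [0]: the older bindings exposed qpos/qvel as (n, 1) column arrays, while mujoco_py 1.50 returns flat (n,) arrays. A small NumPy sketch of what that means for indexing (shapes are illustrative only):

import numpy as np

qpos_old_style = np.arange(5.0).reshape(5, 1)   # column array, as in mujoco_py < 1.0
qpos_new_style = np.arange(5.0)                 # flat array, as in mujoco_py >= 1.50

x_old = qpos_old_style[0, 0]                    # old code needed the trailing ", 0"
x_new = qpos_new_style[0]                       # new code indexes once

# Slices shrink the same way, so tuple unpacking works without the extra index:
posafter, height, ang = qpos_new_style[0:3]

# Arithmetic on flat arrays also yields plain scalars, which is why the inverted
# double pendulum reward no longer needs a final "[0]".
print(x_old == x_new, posafter, height, ang)
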
diff --git a/gym/envs/mujoco/mujoco_env.py b/gym/envs/mujoco/mujoco_env.py
index 32378c3e33d..e67621c67f5 100644
--- a/gym/envs/mujoco/mujoco_env.py
+++ b/gym/envs/mujoco/mujoco_env.py
@@ -9,7 +9,6 @@
 try:
     import mujoco_py
-    from mujoco_py.mjlib import mjlib
 except ImportError as e:
     raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e))
@@ -25,8 +24,9 @@ def __init__(self, model_path, frame_skip):
         if not path.exists(fullpath):
             raise IOError("File %s does not exist" % fullpath)
         self.frame_skip = frame_skip
-        self.model = mujoco_py.MjModel(fullpath)
-        self.data = self.model.data
+        self.model = mujoco_py.load_model_from_path(fullpath)
+        self.sim = mujoco_py.MjSim(self.model)
+        self.data = self.sim.data
         self.viewer = None
 
         self.metadata = {
@@ -34,8 +34,8 @@ def __init__(self, model_path, frame_skip):
             'video.frames_per_second': int(np.round(1.0 / self.dt))
         }
 
-        self.init_qpos = self.model.data.qpos.ravel().copy()
-        self.init_qvel = self.model.data.qvel.ravel().copy()
+        self.init_qpos = self.sim.data.qpos.ravel().copy()
+        self.init_qvel = self.sim.data.qvel.ravel().copy()
         observation, _reward, done, _info = self._step(np.zeros(self.model.nu))
         assert not done
         self.obs_dim = observation.size
@@ -76,33 +76,33 @@ def viewer_setup(self):
     # -----------------------------
 
     def _reset(self):
-        mjlib.mj_resetData(self.model.ptr, self.data.ptr)
+        self.sim.reset()
         ob = self.reset_model()
         if self.viewer is not None:
-            self.viewer.autoscale()
             self.viewer_setup()
         return ob
 
     def set_state(self, qpos, qvel):
         assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
-        self.model.data.qpos = qpos
-        self.model.data.qvel = qvel
-        self.model._compute_subtree() # pylint: disable=W0212
-        self.model.forward()
+        old_state = self.sim.get_state()
+        new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel,
+                                         old_state.act, old_state.udd_state)
+        self.sim.set_state(new_state)
+        self.sim.forward()
 
     @property
     def dt(self):
         return self.model.opt.timestep * self.frame_skip
 
     def do_simulation(self, ctrl, n_frames):
-        self.model.data.ctrl = ctrl
+        self.sim.data.ctrl[:] = ctrl
         for _ in range(n_frames):
-            self.model.step()
+            self.sim.step()
 
     def _render(self, mode='human', close=False):
         if close:
             if self.viewer is not None:
-                self._get_viewer().finish()
+                self._get_viewer()
                 self.viewer = None
             return
@@ -111,30 +111,19 @@ def _render(self, mode='human', close=False):
             data, width, height = self._get_viewer().get_image()
             return np.fromstring(data, dtype='uint8').reshape(height, width, 3)[::-1, :, :]
         elif mode == 'human':
-            self._get_viewer().loop_once()
+            self._get_viewer().render()
 
     def _get_viewer(self):
         if self.viewer is None:
-            self.viewer = mujoco_py.MjViewer()
-            self.viewer.start()
-            self.viewer.set_model(self.model)
+            self.viewer = mujoco_py.MjViewer(self.sim)
             self.viewer_setup()
         return self.viewer
 
     def get_body_com(self, body_name):
-        idx = self.model.body_names.index(six.b(body_name))
-        return self.model.data.com_subtree[idx]
-
-    def get_body_comvel(self, body_name):
-        idx = self.model.body_names.index(six.b(body_name))
-        return self.model.body_comvels[idx]
-
-    def get_body_xmat(self, body_name):
-        idx = self.model.body_names.index(six.b(body_name))
-        return self.model.data.xmat[idx].reshape((3, 3))
+        return self.data.get_body_xpos(body_name)
 
     def state_vector(self):
         return np.concatenate([
-            self.model.data.qpos.flat,
-            self.model.data.qvel.flat
+            self.sim.data.qpos.flat,
+            self.sim.data.qvel.flat
         ])
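For anyone porting their own MuJoCo wrapper to this base class, here is a standalone sketch of the model/sim split and the MjSimState-based set_state used above (assumes mujoco_py >= 1.50; 'model.xml' is a placeholder for any valid MJCF file):

import numpy as np
import mujoco_py

model = mujoco_py.load_model_from_path('model.xml')   # placeholder path
sim = mujoco_py.MjSim(model)                           # MjSim now owns the mjData

# Reading state, as the envs now do through self.sim.data.
print(sim.data.qpos.shape, sim.data.qvel.shape)

# qpos/qvel can no longer be assigned directly, so state is swapped wholesale,
# preserving time, actuator activations and user-defined dynamics state.
old_state = sim.get_state()
new_state = mujoco_py.MjSimState(old_state.time,
                                 np.zeros(model.nq),
                                 np.zeros(model.nv),
                                 old_state.act,
                                 old_state.udd_state)
sim.set_state(new_state)
sim.forward()                                          # recompute derived quantities

# Controls are written in place before stepping.
sim.data.ctrl[:] = 0.0
for _ in range(5):
    sim.step()
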
diff --git a/gym/envs/mujoco/pusher.py b/gym/envs/mujoco/pusher.py
index 96bfe9dbd5e..c5bed3db668 100644
--- a/gym/envs/mujoco/pusher.py
+++ b/gym/envs/mujoco/pusher.py
@@ -3,7 +3,6 @@
 from gym.envs.mujoco import mujoco_env
 
 import mujoco_py
-from mujoco_py.mjlib import mjlib
 
 class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle):
     def __init__(self):
@@ -50,8 +49,8 @@ def reset_model(self):
 
     def _get_obs(self):
         return np.concatenate([
-            self.model.data.qpos.flat[:7],
-            self.model.data.qvel.flat[:7],
+            self.sim.data.qpos.flat[:7],
+            self.sim.data.qvel.flat[:7],
             self.get_body_com("tips_arm"),
             self.get_body_com("object"),
             self.get_body_com("goal"),
diff --git a/gym/envs/mujoco/reacher.py b/gym/envs/mujoco/reacher.py
index 1730db9c30b..e288df5ae02 100644
--- a/gym/envs/mujoco/reacher.py
+++ b/gym/envs/mujoco/reacher.py
@@ -33,11 +33,11 @@ def reset_model(self):
         return self._get_obs()
 
     def _get_obs(self):
-        theta = self.model.data.qpos.flat[:2]
+        theta = self.sim.data.qpos.flat[:2]
         return np.concatenate([
             np.cos(theta),
             np.sin(theta),
-            self.model.data.qpos.flat[2:],
-            self.model.data.qvel.flat[:2],
+            self.sim.data.qpos.flat[2:],
+            self.sim.data.qvel.flat[:2],
             self.get_body_com("fingertip") - self.get_body_com("target")
         ])
diff --git a/gym/envs/mujoco/striker.py b/gym/envs/mujoco/striker.py
index 24a01e3761b..7855d801c77 100644
--- a/gym/envs/mujoco/striker.py
+++ b/gym/envs/mujoco/striker.py
@@ -67,8 +67,8 @@ def reset_model(self):
 
     def _get_obs(self):
         return np.concatenate([
-            self.model.data.qpos.flat[:7],
-            self.model.data.qvel.flat[:7],
+            self.sim.data.qpos.flat[:7],
+            self.sim.data.qvel.flat[:7],
             self.get_body_com("tips_arm"),
             self.get_body_com("object"),
             self.get_body_com("goal"),
diff --git a/gym/envs/mujoco/swimmer.py b/gym/envs/mujoco/swimmer.py
index b79829e717e..8d351ab79b3 100644
--- a/gym/envs/mujoco/swimmer.py
+++ b/gym/envs/mujoco/swimmer.py
@@ -9,9 +9,9 @@ def __init__(self):
 
     def _step(self, a):
         ctrl_cost_coeff = 0.0001
-        xposbefore = self.model.data.qpos[0, 0]
+        xposbefore = self.sim.data.qpos[0]
         self.do_simulation(a, self.frame_skip)
-        xposafter = self.model.data.qpos[0, 0]
+        xposafter = self.sim.data.qpos[0]
         reward_fwd = (xposafter - xposbefore) / self.dt
         reward_ctrl = - ctrl_cost_coeff * np.square(a).sum()
         reward = reward_fwd + reward_ctrl
@@ -19,8 +19,8 @@ def _step(self, a):
         return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl)
 
     def _get_obs(self):
-        qpos = self.model.data.qpos
-        qvel = self.model.data.qvel
+        qpos = self.sim.data.qpos
+        qvel = self.sim.data.qvel
         return np.concatenate([qpos.flat[2:], qvel.flat])
 
     def reset_model(self):
diff --git a/gym/envs/mujoco/thrower.py b/gym/envs/mujoco/thrower.py
index 2627e0a8c94..116e43ca295 100644
--- a/gym/envs/mujoco/thrower.py
+++ b/gym/envs/mujoco/thrower.py
@@ -52,8 +52,8 @@ def reset_model(self):
 
     def _get_obs(self):
         return np.concatenate([
-            self.model.data.qpos.flat[:7],
-            self.model.data.qvel.flat[:7],
+            self.sim.data.qpos.flat[:7],
+            self.sim.data.qvel.flat[:7],
             self.get_body_com("r_wrist_roll_link"),
             self.get_body_com("ball"),
             self.get_body_com("goal"),
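Several of these tasks (swimmer above, and half-cheetah, hopper, walker elsewhere in the patch) reward average forward velocity over one env.step(): displacement divided by self.dt, where the base class defines dt = model.opt.timestep * frame_skip. A tiny numeric sketch with illustrative values (the real timestep and frame_skip come from each env's XML and constructor):

timestep = 0.01                       # illustrative MuJoCo integration step (model.opt.timestep)
frame_skip = 4                        # illustrative sub-steps per env.step()
dt = timestep * frame_skip            # what MujocoEnv.dt returns

xposbefore, xposafter = 0.30, 0.32    # torso x-position before/after one step
reward_fwd = (xposafter - xposbefore) / dt
print(dt, reward_fwd)                 # 0.04, 0.5 -> average forward velocity in m/s
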
diff --git a/gym/envs/mujoco/walker2d.py b/gym/envs/mujoco/walker2d.py
index 2fa94593306..728a6b18400 100644
--- a/gym/envs/mujoco/walker2d.py
+++ b/gym/envs/mujoco/walker2d.py
@@ -9,9 +9,9 @@ def __init__(self):
         utils.EzPickle.__init__(self)
 
     def _step(self, a):
-        posbefore = self.model.data.qpos[0, 0]
+        posbefore = self.sim.data.qpos[0]
         self.do_simulation(a, self.frame_skip)
-        posafter, height, ang = self.model.data.qpos[0:3, 0]
+        posafter, height, ang = self.sim.data.qpos[0:3]
         alive_bonus = 1.0
         reward = ((posafter - posbefore) / self.dt)
         reward += alive_bonus
@@ -22,8 +22,8 @@ def _step(self, a):
         return ob, reward, done, {}
 
     def _get_obs(self):
-        qpos = self.model.data.qpos
-        qvel = self.model.data.qvel
+        qpos = self.sim.data.qpos
+        qvel = self.sim.data.qvel
         return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel()
 
     def reset_model(self):
diff --git a/gym/version.py b/gym/version.py
index aa111788f04..ce86b2da7d7 100644
--- a/gym/version.py
+++ b/gym/version.py
@@ -1 +1 @@
-VERSION = '0.9.3'
+VERSION = '0.9.5'
diff --git a/setup.py b/setup.py
index 666cb8a7543..79f990906b0 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
   'board_game' : ['pachi-py>=0.0.19'],
   'box2d': ['Box2D-kengz'],
   'classic_control': ['PyOpenGL'],
-  'mujoco': ['mujoco_py<1.0.0,>=0.4.3', 'imageio'],
+  'mujoco': ['mujoco_py>=1.50', 'imageio'],
   'parameter_tuning': ['keras', 'theano'],
 }
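With the setup.py requirement bumped to mujoco_py >= 1.50 and the package version at 0.9.5, a quick smoke test over the renamed environments might look like the following (assumes the mujoco extra is installed along with a licensed MuJoCo binary; the script is illustrative and not part of the repository):

import gym

# The -v2 IDs registered in gym/envs/__init__.py above.
ENV_IDS = [
    'Reacher-v2', 'Pusher-v2', 'Thrower-v2', 'Striker-v2',
    'InvertedPendulum-v2', 'InvertedDoublePendulum-v2',
    'HalfCheetah-v2', 'Hopper-v2', 'Swimmer-v2', 'Walker2d-v2',
    'Ant-v2', 'Humanoid-v2', 'HumanoidStandup-v2',
]

for env_id in ENV_IDS:
    env = gym.make(env_id)
    obs = env.reset()
    obs, reward, done, info = env.step(env.action_space.sample())
    print('{}: obs dim {}, first reward {:.3f}'.format(env_id, obs.shape[0], reward))
    env.close()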