From 94caa24fa8002263f49d095505e6d910611d286f Mon Sep 17 00:00:00 2001 From: elliottower Date: Thu, 9 Mar 2023 14:36:10 -0500 Subject: [PATCH] Added pre-commit hooks and fixed linting errors --- .pre-commit-config.yaml | 62 +++ gobblet/__init__.py | 2 +- gobblet/__version__.py | 2 +- gobblet/examples/example_RLlib.py | 31 +- gobblet/examples/example_basic.py | 28 +- gobblet/examples/example_record_game.py | 38 +- gobblet/examples/example_tianshou_DQN.py | 242 +++++--- gobblet/examples/example_tianshou_greedy.py | 121 ++-- gobblet/examples/example_user_input.py | 32 +- gobblet/game/board.py | 89 +-- gobblet/game/collector_manual_policy.py | 41 +- gobblet/game/gobblet.py | 247 ++++++--- gobblet/game/greedy_policy.py | 124 +++-- gobblet/game/greedy_policy_rllib.py | 22 +- gobblet/game/greedy_policy_tianshou.py | 25 +- gobblet/game/manual_policy.py | 98 ++-- .../game/random_admissible_policy_rllib.py | 37 +- gobblet/game/utils.py | 128 +++-- gobblet/gobblet_v1.py | 4 +- requirements.txt | 1 + setup.py | 4 +- tests/test_gobblet_env.py | 8 +- tests/test_manual_policy_collector.py | 516 +++++++++++++++--- 23 files changed, 1376 insertions(+), 526 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..97ad3a7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,62 @@ +--- +repos: + - repo: https://github.com/python/black + rev: 23.1.0 + hooks: + - id: black + - repo: https://github.com/codespell-project/codespell + rev: v2.2.2 + hooks: + - id: codespell + args: + - --skip=*.css,*.js,*.map,*.scss,*svg + - --ignore-words-list=magent + - repo: https://github.com/PyCQA/flake8 + rev: 6.0.0 + hooks: + - id: flake8 + args: + - '--per-file-ignores=*/__init__.py:F401 test/all_parameter_combs_test.py:F405 pettingzoo/classic/go/go.py:W605' + - --extend-ignore=E203 + - --max-complexity=205 + - --max-line-length=300 + - --show-source + - --statistics + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black"] + - repo: https://github.com/asottile/pyupgrade + rev: v3.3.1 + hooks: + - id: pyupgrade + # TODO: remove `--keep-runtime-typing` option + args: ["--py37-plus", "--keep-runtime-typing"] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/pycqa/pydocstyle + rev: 6.3.0 + hooks: + - id: pydocstyle + args: + - --source + - --explain + - --convention=google + - --count + # TODO: Remove ignoring rules D101, D102, D103, D105 + - --add-ignore=D100,D107,D101,D102,D103,D105 + exclude: "__init__.py$|^pettingzoo.test|^docs" + additional_dependencies: ["toml"] +# - repo: local +# hooks: +# - id: pyright +# name: pyright +# entry: pyright +# language: node +# pass_filenames: false +# types: [python] +# additional_dependencies: ["pyright"] diff --git a/gobblet/__init__.py b/gobblet/__init__.py index a80c3f6..9226fe7 100644 --- a/gobblet/__init__.py +++ b/gobblet/__init__.py @@ -1 +1 @@ -from . __version__ import __version__ \ No newline at end of file +from .__version__ import __version__ diff --git a/gobblet/__version__.py b/gobblet/__version__.py index ac422f1..f9e47b6 100644 --- a/gobblet/__version__.py +++ b/gobblet/__version__.py @@ -1 +1 @@ -__version__ = '1.3.4' +__version__ = "1.3.4" diff --git a/gobblet/examples/example_RLlib.py b/gobblet/examples/example_RLlib.py index 68e4cf3..e6e023d 100644 --- a/gobblet/examples/example_RLlib.py +++ b/gobblet/examples/example_RLlib.py @@ -1,7 +1,6 @@ import glob import os from typing import Tuple -from gymnasium import spaces import ray.tune from ray import init @@ -13,8 +12,8 @@ from ray.tune.registry import register_env from gobblet import gobblet_v1 -from gobblet.models.action_mask_model import TorchActionMaskModel from gobblet.game.utils import get_project_root +from gobblet.models.action_mask_model import TorchActionMaskModel torch, nn = try_import_torch() @@ -32,11 +31,6 @@ def env_creator(): # wrap the pettingzoo env in MultiAgent RLLib env = PettingZooEnv(env_creator()) - # Convert obs space and action space to gym - # observation_space = env.observation_space["observation"] - # observation_space = spaces.Box(observation_space.low, observation_space.high, observation_space.shape, observation_space.dtype) - # action_space = spaces.Discrete(env.action_space.n) - agents = ["player_1", "player_2"] custom_config = { "env": env_name, @@ -47,7 +41,7 @@ def env_creator(): # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. # "num_gpus": int(torch.cuda.device_count()), "num_gpus": 0, - "num_workers": 2, #os.cpu_count() - 1, + "num_workers": 2, # os.cpu_count() - 1, "multiagent": { "policies": { name: (None, env.observation_space, env.action_space, {}) @@ -97,13 +91,17 @@ def train_ray(ppo_config, timesteps_total: int = 10): def load_ray(path, ppo_config): - """ + """Load ray. + Load a trained RLlib agent from the specified path. Call this before testing a trained agent. - :param path: - Path pointing to the agent's saved checkpoint (only used for RLlib agents) - :param ppo_config: - dict config + + Args: + path: Path pointing to the agent's saved checkpoint (only used for RLlib agents) + ppo_config: dict config + + Returns: + trainer: RLlib trainer object """ trainer = ppo.PPOTrainer(config=ppo_config) trainer.restore(path) @@ -145,7 +143,7 @@ def sample_trainer(trainer, env): def tune_training_loop(timesteps_total=10000): - """train trainer and sample""" + """Train trainer and sample.""" trainer, env, ppo_config = prepare_train() # train trainer @@ -162,8 +160,7 @@ def tune_training_loop(timesteps_total=10000): def manual_training_loop(timesteps_total=10000): - """train trainer and sample""" - + """Train trainer and sample.""" trainer, env, ppo_config = prepare_train() trainer_trained = train(trainer, max_steps=timesteps_total) @@ -172,4 +169,4 @@ def manual_training_loop(timesteps_total=10000): if __name__ == "__main__": init(local_mode=True) - tune_training_loop() \ No newline at end of file + tune_training_loop() diff --git a/gobblet/examples/example_basic.py b/gobblet/examples/example_basic.py index 58adacd..6af71c8 100644 --- a/gobblet/examples/example_basic.py +++ b/gobblet/examples/example_basic.py @@ -1,14 +1,19 @@ -from gobblet import gobblet_v1 import argparse -import numpy as np import time +import numpy as np + +from gobblet import gobblet_v1 + def get_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument( - "--render_mode", type=str, default="human", choices=["human", "rgb_array", "text", "text_full"], - help="Choose the rendering mode for the game." + "--render_mode", + type=str, + default="human", + choices=["human", "rgb_array", "text", "text_full"], + help="Choose the rendering mode for the game.", ) parser.add_argument( @@ -23,10 +28,12 @@ def get_parser() -> argparse.ArgumentParser: return parser + def get_args() -> argparse.Namespace: parser = get_parser() return parser.parse_known_args()[0] + if __name__ == "__main__": # train the agent and watch its performance in a match! args = get_args() @@ -41,7 +48,6 @@ def get_args() -> argparse.Namespace: env.render() # need to render the environment before pygame can take user input for agent in env.agent_iter(): - observation, reward, termination, truncation, info = env.last() if termination or truncation: @@ -49,10 +55,14 @@ def get_args() -> argparse.Namespace: env.step(None) else: - action_mask = observation['action_mask'] - action = np.random.choice(np.arange(len(action_mask)), p=action_mask / np.sum(action_mask)) + action_mask = observation["action_mask"] + action = np.random.choice( + np.arange(len(action_mask)), p=action_mask / np.sum(action_mask) + ) if args.render_mode == "human": - time.sleep(.5) # Wait .5 seconds between moves so the user can follow the sequence of moves + time.sleep( + 0.5 + ) # Wait .5 seconds between moves so the user can follow the sequence of moves - env.step(action) \ No newline at end of file + env.step(action) diff --git a/gobblet/examples/example_record_game.py b/gobblet/examples/example_record_game.py index 7bad479..694a11b 100644 --- a/gobblet/examples/example_record_game.py +++ b/gobblet/examples/example_record_game.py @@ -1,18 +1,32 @@ +import argparse + +import numpy as np import pygame + from gobblet.game.utils import GIFRecorder -import numpy as np -import argparse + def get_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument( - "--seed", type=int, default=None, help="Set random seed manually (will only affect CPU agents)" + "--seed", + type=int, + default=None, + help="Set random seed manually (will only affect CPU agents)", ) parser.add_argument( - "--cpu-players", type=int, default=1, choices=[0, 1, 2], help="Number of CPU players (options: 0, 1, 2)" + "--cpu-players", + type=int, + default=1, + choices=[0, 1, 2], + help="Number of CPU players (options: 0, 1, 2)", ) parser.add_argument( - "--player", type=int, default=0, choices=[0,1], help="Choose which player to play as: red = 0, yellow = 1" + "--player", + type=int, + default=0, + choices=[0, 1], + help="Choose which player to play as: red = 0, yellow = 1", ) parser.add_argument( "--screen-width", type=int, default=640, help="Width of pygame screen in pixels" @@ -20,10 +34,12 @@ def get_parser() -> argparse.ArgumentParser: return parser + def get_args() -> argparse.Namespace: parser = get_parser() return parser.parse_known_args()[0] + if __name__ == "__main__": from gobblet import gobblet_v1 @@ -44,7 +60,7 @@ def get_args() -> argparse.Namespace: # Record the first frame (empty board) recorder.capture_frame(env.unwrapped.screen) - manual_policy = gobblet_v1.ManualPolicy(env, recorder=recorder) + manual_policy = gobblet_v1.ManualGobbletPolicy(env, recorder=recorder) for agent in env.agent_iter(): clock.tick(env.metadata["render_fps"]) @@ -60,11 +76,13 @@ def get_args() -> argparse.Namespace: continue if agent == manual_policy.agent and args.cpu_players < 2: - action = manual_policy(observation, agent) + action = manual_policy(observation, agent) else: - action_mask = observation['action_mask'] - action = np.random.choice(np.arange(len(action_mask)), p=action_mask / np.sum(action_mask)) + action_mask = observation["action_mask"] + action = np.random.choice( + np.arange(len(action_mask)), p=action_mask / np.sum(action_mask) + ) env.step(action) - env.render() \ No newline at end of file + env.render() diff --git a/gobblet/examples/example_tianshou_DQN.py b/gobblet/examples/example_tianshou_DQN.py index 8e72953..f11fdde 100644 --- a/gobblet/examples/example_tianshou_DQN.py +++ b/gobblet/examples/example_tianshou_DQN.py @@ -1,18 +1,8 @@ # adapted from https://github.com/Farama-Foundation/PettingZoo/blob/master/tutorials/Tianshou/3_cli_and_logging.py -""" -This is a full example of using Tianshou with MARL to train agents, complete with argument parsing (CLI) and logging. - -Author: Will (https://github.com/WillDudley) - -Python version used: 3.8.10 - -Requirements: -pettingzoo == 1.22.0 -git+https://github.com/thu-ml/tianshou -""" import argparse import os +import time from copy import deepcopy from typing import Optional, Tuple @@ -23,7 +13,7 @@ from tianshou.data import Collector, VectorReplayBuffer from tianshou.env import DummyVectorEnv from tianshou.env.pettingzoo_env import PettingZooEnv -from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy +from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager from tianshou.trainer import offpolicy_trainer from tianshou.utils import TensorboardLogger from tianshou.utils.net.common import Net @@ -31,9 +21,8 @@ from gobblet import gobblet_v1 from gobblet.game.collector_manual_policy import ManualPolicyCollector -from gobblet.game.utils import GIFRecorder from gobblet.game.greedy_policy_tianshou import GreedyPolicy -import time +from gobblet.game.utils import GIFRecorder def get_parser() -> argparse.ArgumentParser: @@ -42,7 +31,9 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument("--eps-test", type=float, default=0.05) parser.add_argument("--eps-train", type=float, default=0.1) parser.add_argument("--buffer-size", type=int, default=20000) - parser.add_argument("--lr", type=float, default=1e-4) # TODO: Changing this to 1e-5 for some reason makes it pause after 3 or 4 epochs + parser.add_argument( + "--lr", type=float, default=1e-4 + ) # TODO: Changing this to 1e-5 for some reason makes it pause after 3 or 4 epochs parser.add_argument( "--gamma", type=float, default=0.9, help="a smaller gamma favors earlier win" ) @@ -60,14 +51,53 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument("--test-num", type=int, default=10) parser.add_argument("--logdir", type=str, default="log") parser.add_argument("--render", type=float, default=0.1) - parser.add_argument("--render_mode", type=str, default="human", choices=["human","rgb_array", "text", "text_full"], help="Choose the rendering mode for the game.") - parser.add_argument("--debug", action="store_true", help="Flag to enable to print extra debugging info") - parser.add_argument("--self_play", action="store_true", help="Flag to enable training via self-play (as opposed to fixed opponent)") - parser.add_argument("--self_play_generations", type=int, default=5, help="Number of generations of self-play agents to train (each generation can have multiple epochs of training)") - parser.add_argument("--self_play_greedy", action="store_true", help="Flag to have self-play train against a greedy agent for the first generation") - parser.add_argument("--cpu-players", type=int, default=2, choices=[1, 2], help="Number of CPU players (options: 1, 2)") - parser.add_argument("--player", type=int, default=0, choices=[0,1], help="Choose which player to play as: red = 0, yellow = 1") - parser.add_argument("--record", action="store_true", help="Flag to save a recording of the game (game.gif)") + parser.add_argument( + "--render_mode", + type=str, + default="human", + choices=["human", "rgb_array", "text", "text_full"], + help="Choose the rendering mode for the game.", + ) + parser.add_argument( + "--debug", + action="store_true", + help="Flag to enable to print extra debugging info", + ) + parser.add_argument( + "--self_play", + action="store_true", + help="Flag to enable training via self-play (as opposed to fixed opponent)", + ) + parser.add_argument( + "--self_play_generations", + type=int, + default=5, + help="Number of generations of self-play agents to train (each generation can have multiple epochs of training)", + ) + parser.add_argument( + "--self_play_greedy", + action="store_true", + help="Flag to have self-play train against a greedy agent for the first generation", + ) + parser.add_argument( + "--cpu-players", + type=int, + default=2, + choices=[1, 2], + help="Number of CPU players (options: 1, 2)", + ) + parser.add_argument( + "--player", + type=int, + default=0, + choices=[0, 1], + help="Choose which player to play as: red = 0, yellow = 1", + ) + parser.add_argument( + "--record", + action="store_true", + help="Flag to save a recording of the game (game.gif)", + ) parser.add_argument( "--win-rate", type=float, @@ -120,12 +150,11 @@ def get_agents( env = get_env() observation_space = ( env.observation_space["observation"] - if isinstance(env.observation_space, gym.spaces.Dict) or isinstance(env.observation_space, gymnasium.spaces.Dict) + if isinstance(env.observation_space, gym.spaces.Dict) + or isinstance(env.observation_space, gymnasium.spaces.Dict) else env.observation_space ) - args.state_shape = ( - observation_space.shape or observation_space.n - ) + args.state_shape = observation_space.shape or observation_space.n args.action_shape = env.action_space.shape or env.action_space.n if agent_learn is None: # model @@ -168,7 +197,9 @@ def get_agents( agent_opponent.load_state_dict(torch.load(args.opponent_path)) else: # agent_opponent = RandomPolicy() - agent_opponent = GreedyPolicy() # Greedy policy is a difficult opponent, should yeild much better results than random + agent_opponent = ( + GreedyPolicy() + ) # Greedy policy is a difficult opponent, should yield much better results than random if args.agent_id == 1: agents = [agent_learn, agent_opponent] @@ -203,33 +234,46 @@ def train_selfplay( env = get_env() observation_space = ( env.observation_space["observation"] - if isinstance(env.observation_space, gymnasium.spaces.Dict) or isinstance(env.observation_space, gym.spaces.Dict) + if isinstance(env.observation_space, gymnasium.spaces.Dict) + or isinstance(env.observation_space, gym.spaces.Dict) else env.observation_space ) - args.state_shape = ( - observation_space.shape or observation_space.n - ) + args.state_shape = observation_space.shape or observation_space.n args.action_shape = env.action_space.shape or env.action_space.n # ======== model setup ========= # Note: custom models can be specified using the agent_fixed and agent_learn arguments if agent_learn is None: - net = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, device=args.device - ).to(args.device) + net = Net( + args.state_shape, + args.action_shape, + hidden_sizes=args.hidden_sizes, + device=args.device, + ).to(args.device) optim = torch.optim.Adam(net.parameters(), lr=args.lr) agent_learn = DQNPolicy( - net, optim, args.gamma, args.n_step, - target_update_freq=args.target_update_freq) + net, + optim, + args.gamma, + args.n_step, + target_update_freq=args.target_update_freq, + ) if agent_fixed is None: - net_fixed = Net(args.state_shape, args.action_shape, - hidden_sizes=args.hidden_sizes, device=args.device - ).to(args.device) + net_fixed = Net( + args.state_shape, + args.action_shape, + hidden_sizes=args.hidden_sizes, + device=args.device, + ).to(args.device) optim_fixed = torch.optim.SGD(net_fixed.parameters(), lr=0) agent_fixed = DQNPolicy( - net_fixed, optim_fixed, args.gamma, args.n_step, - target_update_freq=args.target_update_freq) + net_fixed, + optim_fixed, + args.gamma, + args.n_step, + target_update_freq=args.target_update_freq, + ) # Load initial opponent from file # path = os.path.join(args.logdir, 'gobblet', 'dqn', 'policy.pth') @@ -240,7 +284,6 @@ def train_selfplay( policy = MultiAgentPolicyManager(agents, env) agents_list = list(policy.policies.keys()) - # ======== collector setup ========= train_collector = Collector( policy, @@ -266,7 +309,8 @@ def save_best_fn(policy): args.logdir, "gobblet", "dqn-selfplay", "policy.pth" ) torch.save( - policy.policies[agents_list[args.agent_id - 1]].state_dict(), model_save_path + policy.policies[agents_list[args.agent_id - 1]].state_dict(), + model_save_path, ) def stop_fn(mean_rewards): @@ -287,29 +331,44 @@ def test_fn(epoch, env_step): def reward_metric(rews): return rews[:, 0] - # Self-play loop for i in range(args.self_play_generations): result = offpolicy_trainer( - policy, train_collector, test_collector, args.epoch, - args.step_per_epoch, args.step_per_collect, args.test_num, - args.batch_size, train_fn=train_fn, test_fn=test_fn, - stop_fn=stop_fn, save_best_fn=save_best_fn, update_per_step=args.update_per_step, - logger=logger, test_in_train=False, reward_metric=reward_metric) + policy, + train_collector, + test_collector, + args.epoch, + args.step_per_epoch, + args.step_per_collect, + args.test_num, + args.batch_size, + train_fn=train_fn, + test_fn=test_fn, + stop_fn=stop_fn, + save_best_fn=save_best_fn, + update_per_step=args.update_per_step, + logger=logger, + test_in_train=False, + reward_metric=reward_metric, + ) - print(f"Launching game between learned agent ({type(policy.policies[agents_list[0]]).__name__}) and fixed agent ({type(policy.policies[agents_list[1]]).__name__}):") + print( + f"Launching game between learned agent ({type(policy.policies[agents_list[0]]).__name__}) and fixed agent ({type(policy.policies[agents_list[1]]).__name__}):" + ) # Render a single game between the learned policy and fixed policy from last generation watch_selfplay(args, policy.policies[agents_list[0]]) # Set fixed opponent policy as to the current trained policy (updates every epoch) policy.policies[agents_list[1]] = deepcopy(policy.policies[agents_list[0]]) - print('--- SELF-PLAY GENERATION: {} ---'.format(i + 1)) + print(f"--- SELF-PLAY GENERATION: {i + 1} ---") - print(f"Launching game between learned agent ({type(policy.policies[agents_list[0]]).__name__}) and itself ({type(policy.policies[agents_list[1]]).__name__}):") + print( + f"Launching game between learned agent ({type(policy.policies[agents_list[0]]).__name__}) and itself ({type(policy.policies[agents_list[1]]).__name__}):" + ) # Render a single game between the learned policy and itself watch_selfplay(args, policy.policies[agents_list[0]]) - model_save_path = os.path.join(args.logdir, 'gobblet', 'dqn-selfplay', 'policy.pth') + model_save_path = os.path.join(args.logdir, "gobblet", "dqn-selfplay", "policy.pth") torch.save(policy.policies[agents_list[0]].state_dict(), model_save_path) return result, policy.policies[agents_list[0]] @@ -358,11 +417,10 @@ def save_best_fn(policy): if hasattr(args, "model_save_path"): model_save_path = args.model_save_path else: - model_save_path = os.path.join( - args.logdir, "gobblet", "dqn", "policy.pth" - ) + model_save_path = os.path.join(args.logdir, "gobblet", "dqn", "policy.pth") torch.save( - policy.policies[agents_list[args.agent_id - 1]].state_dict(), model_save_path + policy.policies[agents_list[args.agent_id - 1]].state_dict(), + model_save_path, ) def stop_fn(mean_rewards): @@ -416,7 +474,9 @@ def watch( policy.policies[agents_list[args.agent_id - 1]].set_eps(args.eps_test) collector = Collector(policy, env, exploration_noise=True) - pettingzoo_env = env.workers[0].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env + pettingzoo_env = env.workers[ + 0 + ].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env if args.record: recorder = GIFRecorder() recorder.capture_frame(pettingzoo_env.unwrapped.screen) @@ -432,30 +492,39 @@ def watch( if collector.data.terminated or collector.data.truncated: rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()} [{type(policy.policies[agents_list[0]]).__name__}]") - print(f"Final reward: {rews[:, 1].mean()}, length: {lens.mean()} [{type(policy.policies[agents_list[1]]).__name__}]") + print( + f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()} [{type(policy.policies[agents_list[0]]).__name__}]" + ) + print( + f"Final reward: {rews[:, 1].mean()}, length: {lens.mean()} [{type(policy.policies[agents_list[1]]).__name__}]" + ) if recorder is not None: recorder.end_recording(pettingzoo_env.unwrapped.screen) recorder = None + def watch_selfplay( - args: argparse.Namespace = get_args(), - agent: Optional[BasePolicy] = None, + args: argparse.Namespace = get_args(), + agent: Optional[BasePolicy] = None, ) -> None: # env = get_env(render_mode=args.render_mode, args=args) env = DummyVectorEnv([lambda: get_env(render_mode=args.render_mode, args=args)]) - policy = MultiAgentPolicyManager(policies=[agent, deepcopy(agent)], env=get_env(render_mode=args.render_mode, args=args)) # fixed here + policy = MultiAgentPolicyManager( + policies=[agent, deepcopy(agent)], + env=get_env(render_mode=args.render_mode, args=args), + ) # fixed here policy.eval() collector = Collector(policy, env, exploration_noise=True) result = collector.collect(n_episode=1, render=True) rews, lens = result["rews"], result["lens"] print(f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()}") + # ======== allows the user to input moves and play vs a pre-trained agent ====== def play( - args: argparse.Namespace = get_args(), - agent_learn: Optional[BasePolicy] = None, - agent_opponent: Optional[BasePolicy] = None, + args: argparse.Namespace = get_args(), + agent_learn: Optional[BasePolicy] = None, + agent_opponent: Optional[BasePolicy] = None, ) -> None: env = DummyVectorEnv([lambda: get_env(render_mode=args.render_mode, args=args)]) # env = get_env(render_mode=args.render_mode, args=args) # Throws error because collector looks for length, could just override though since I'm using my own collector @@ -469,24 +538,34 @@ def play( # Experimental: let the CPU agent to continue training (TODO: check if this actually changes things meaningfully) # policy.policies[agents[args.agent_id - 1]].set_eps(args.eps_train) - collector = ManualPolicyCollector(policy, env, exploration_noise=True) # Collector for CPU actions + collector = ManualPolicyCollector( + policy, env, exploration_noise=True + ) # Collector for CPU actions - pettingzoo_env = env.workers[0].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env + pettingzoo_env = env.workers[ + 0 + ].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env if args.record: recorder = GIFRecorder() else: recorder = None - manual_policy = gobblet_v1.ManualPolicy(env=pettingzoo_env, agent_id=args.player, recorder=recorder) # Gobblet keyboard input requires access to raw_env (uses functions from board) + manual_policy = gobblet_v1.ManualGobbletPolicy( + env=pettingzoo_env, agent_id=args.player, recorder=recorder + ) # Gobblet keyboard input requires access to raw_env (uses functions from board) while pettingzoo_env.agents: agent_id = collector.data.obs.agent_id # If it is the players turn and there are less than 2 CPU players (at least one human player) if agent_id == pettingzoo_env.agents[args.player]: - observation = {"observation": collector.data.obs.obs.flatten(), - "action_mask": collector.data.obs.mask.flatten()} # PettingZoo expects a dict with this format + observation = { + "observation": collector.data.obs.obs.flatten(), + "action_mask": collector.data.obs.mask.flatten(), + } # PettingZoo expects a dict with this format action = manual_policy(observation, agent_id) - result = collector.collect_result(action=action.reshape(1), render=args.render) + result = collector.collect_result( + action=action.reshape(1), render=args.render + ) else: result = collector.collect(n_step=1, render=args.render) @@ -495,24 +574,31 @@ def play( if collector.data.terminated or collector.data.truncated: rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews[:, args.player].mean()}, length: {lens.mean()} [Human]") - print(f"Final reward: {rews[:, 1-args.player].mean()}, length: {lens.mean()} [{type(policy.policies[agents[1-args.player]]).__name__}]") + print( + f"Final reward: {rews[:, args.player].mean()}, length: {lens.mean()} [Human]" + ) + print( + f"Final reward: {rews[:, 1-args.player].mean()}, length: {lens.mean()} [{type(policy.policies[agents[1-args.player]]).__name__}]" + ) if recorder is not None: recorder.end_recording(pettingzoo_env.unwrapped.screen) recorder = None + if __name__ == "__main__": # train the agent and watch its performance in a match! args = get_args() if args.player == 1: - args.agent_id = 1 # Ensures trained agent is in the correct spot + args.agent_id = 1 # Ensures trained agent is in the correct spot if args.self_play: print("Training agent...") # Hard code the first fixed agent to be the greedy policy (after one generation it will be switched to a copy of the learned agent) agent_fixed = GreedyPolicy() if args.self_play_greedy else None - result, agent = train_selfplay(args=args, agent_fixed=agent_fixed) # Hard code the first fixed agent to be the greedy policy + result, agent = train_selfplay( + args=args, agent_fixed=agent_fixed + ) # Hard code the first fixed agent to be the greedy policy print("Starting game...") watch_selfplay(args, agent) @@ -524,4 +610,4 @@ def play( if args.cpu_players == 2: watch(args, agent) else: - play(args, agent) \ No newline at end of file + play(args, agent) diff --git a/gobblet/examples/example_tianshou_greedy.py b/gobblet/examples/example_tianshou_greedy.py index 005d7aa..1952617 100644 --- a/gobblet/examples/example_tianshou_greedy.py +++ b/gobblet/examples/example_tianshou_greedy.py @@ -1,31 +1,18 @@ # adapted from https://github.com/Farama-Foundation/PettingZoo/blob/master/tutorials/Tianshou/3_cli_and_logging.py -""" -This is a full example of using Tianshou with MARL to train agents, complete with argument parsing (CLI) and logging. - -Author: Will (https://github.com/WillDudley) - -Python version used: 3.8.10 - -Requirements: -pettingzoo == 1.22.0 -git+https://github.com/thu-ml/tianshou -""" import argparse -from typing import Optional, Tuple - +import time +from typing import Tuple import torch from tianshou.env import DummyVectorEnv from tianshou.env.pettingzoo_env import PettingZooEnv -from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy - +from tianshou.policy import BasePolicy, MultiAgentPolicyManager from gobblet import gobblet_v1 from gobblet.game.collector_manual_policy import ManualPolicyCollector -from gobblet.game.utils import GIFRecorder from gobblet.game.greedy_policy_tianshou import GreedyPolicy -import time +from gobblet.game.utils import GIFRecorder def get_parser() -> argparse.ArgumentParser: @@ -34,7 +21,9 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument("--eps-test", type=float, default=0.05) parser.add_argument("--eps-train", type=float, default=0.1) parser.add_argument("--buffer-size", type=int, default=20000) - parser.add_argument("--lr", type=float, default=1e-4) # TODO: Changing this to 1e-5 for some reason makes it pause after 3 or 4 epochs + parser.add_argument( + "--lr", type=float, default=1e-4 + ) # TODO: Changing this to 1e-5 for some reason makes it pause after 3 or 4 epochs parser.add_argument( "--gamma", type=float, default=0.9, help="a smaller gamma favors earlier win" ) @@ -52,13 +41,49 @@ def get_parser() -> argparse.ArgumentParser: parser.add_argument("--test-num", type=int, default=10) parser.add_argument("--logdir", type=str, default="log") parser.add_argument("--render", type=float, default=0.1) - parser.add_argument("--render_mode", type=str, default="human", choices=["human","rgb_array", "text", "text_full"], help="Choose the rendering mode for the game.") - parser.add_argument("--debug", action="store_true", help="Flag to enable to print extra debugging info") - parser.add_argument("--self_play", action="store_true", help="Flag to enable training via self-play (as opposed to fixed opponent)") - parser.add_argument("--cpu-players", type=int, default=1, choices=[1, 2], help="Number of CPU players (options: 1, 2)") - parser.add_argument("--player", type=int, default=0, choices=[0,1], help="Choose which player to play as: red = 0, yellow = 1") - parser.add_argument("--record", action="store_true", help="Flag to save a recording of the game (game.gif)") - parser.add_argument("--depth", type=int, default=2, choices=[1, 2, 3], help="Search depth for greedy agent. Options: 1,2,3") + parser.add_argument( + "--render_mode", + type=str, + default="human", + choices=["human", "rgb_array", "text", "text_full"], + help="Choose the rendering mode for the game.", + ) + parser.add_argument( + "--debug", + action="store_true", + help="Flag to enable to print extra debugging info", + ) + parser.add_argument( + "--self_play", + action="store_true", + help="Flag to enable training via self-play (as opposed to fixed opponent)", + ) + parser.add_argument( + "--cpu-players", + type=int, + default=1, + choices=[1, 2], + help="Number of CPU players (options: 1, 2)", + ) + parser.add_argument( + "--player", + type=int, + default=0, + choices=[0, 1], + help="Choose which player to play as: red = 0, yellow = 1", + ) + parser.add_argument( + "--record", + action="store_true", + help="Flag to save a recording of the game (game.gif)", + ) + parser.add_argument( + "--depth", + type=int, + default=2, + choices=[1, 2, 3], + help="Search depth for greedy agent. Options: 1,2,3", + ) parser.add_argument( "--win-rate", type=float, @@ -120,7 +145,9 @@ def watch() -> None: collector = ManualPolicyCollector(policy, env) - pettingzoo_env = env.workers[0].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env + pettingzoo_env = env.workers[ + 0 + ].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env if args.record: recorder = GIFRecorder() else: @@ -134,35 +161,50 @@ def watch() -> None: if collector.data.terminated or collector.data.truncated: rews, lens = result["rews"], result["lens"] - print(f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()} [{type(policy.policies[agents[0]]).__name__}]") - print(f"Final reward: {rews[:, 1].mean()}, length: {lens.mean()} [{type(policy.policies[agents[1]]).__name__}]") + print( + f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()} [{type(policy.policies[agents[0]]).__name__}]" + ) + print( + f"Final reward: {rews[:, 1].mean()}, length: {lens.mean()} [{type(policy.policies[agents[1]]).__name__}]" + ) if recorder is not None: recorder.end_recording(pettingzoo_env.unwrapped.screen) recorder = None + # ======== allows the user to input moves and play vs a greedy agent ====== def play() -> None: env = DummyVectorEnv([lambda: get_env(render_mode=args.render_mode, args=args)]) policy, agents = get_agents() - collector = ManualPolicyCollector(policy, env, exploration_noise=True) # Collector for CPU actions + collector = ManualPolicyCollector( + policy, env, exploration_noise=True + ) # Collector for CPU actions - pettingzoo_env = env.workers[0].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env + pettingzoo_env = env.workers[ + 0 + ].env.env # DummyVectorEnv -> Tianshou PettingZoo Wrapper -> PettingZoo Env if args.record: recorder = GIFRecorder() else: recorder = None - manual_policy = gobblet_v1.ManualPolicy(env=pettingzoo_env, agent_id=args.player, recorder=recorder) # Gobblet keyboard input requires access to raw_env (uses functions from board) + manual_policy = gobblet_v1.ManualGobbletPolicy( + env=pettingzoo_env, agent_id=args.player, recorder=recorder + ) # Gobblet keyboard input requires access to raw_env (uses functions from board) while pettingzoo_env.agents: agent_id = collector.data.obs.agent_id # If it is the players turn and there are less than 2 CPU players (at least one human player) if agent_id == pettingzoo_env.agents[args.player]: - observation = {"observation": collector.data.obs.obs.flatten(), - "action_mask": collector.data.obs.mask.flatten()} # PettingZoo expects a dict with this format + observation = { + "observation": collector.data.obs.obs.flatten(), + "action_mask": collector.data.obs.mask.flatten(), + } # PettingZoo expects a dict with this format action = manual_policy(observation, agent_id) - result = collector.collect_result(action=action.reshape(1), render=args.render) + result = collector.collect_result( + action=action.reshape(1), render=args.render + ) else: result = collector.collect(n_step=1, render=args.render) if recorder is not None: @@ -170,11 +212,16 @@ def play() -> None: if collector.data.terminated or collector.data.truncated: rews, lens = result["rews"], result["lens"] - print(f"\nFinal reward: {rews[:, args.player].mean()}, length: {lens.mean()} [Human]") - print(f"Final reward: {rews[:, 1-args.player].mean()}, length: {lens.mean()} [{type(policy.policies[agents[1-args.player]]).__name__}]") + print( + f"\nFinal reward: {rews[:, args.player].mean()}, length: {lens.mean()} [Human]" + ) + print( + f"Final reward: {rews[:, 1-args.player].mean()}, length: {lens.mean()} [{type(policy.policies[agents[1-args.player]]).__name__}]" + ) if recorder is not None: recorder.end_recording(pettingzoo_env.unwrapped.screen) + if __name__ == "__main__": # train the agent and watch its performance in a match! args = get_args() @@ -182,4 +229,4 @@ def play() -> None: if args.cpu_players == 2: watch() else: - play() \ No newline at end of file + play() diff --git a/gobblet/examples/example_user_input.py b/gobblet/examples/example_user_input.py index 0ba7f7f..628b751 100644 --- a/gobblet/examples/example_user_input.py +++ b/gobblet/examples/example_user_input.py @@ -1,18 +1,30 @@ -import pygame -import numpy as np import argparse +import numpy as np +import pygame + def get_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument( - "--seed", type=int, default=None, help="Set random seed manually (will only affect CPU agents)" + "--seed", + type=int, + default=None, + help="Set random seed manually (will only affect CPU agents)", ) parser.add_argument( - "--cpu-players", type=int, default=1, choices=[0, 1, 2], help="Number of CPU players (options: 0, 1, 2)" + "--cpu-players", + type=int, + default=1, + choices=[0, 1, 2], + help="Number of CPU players (options: 0, 1, 2)", ) parser.add_argument( - "--player", type=int, default=0, choices=[0,1], help="Choose which player to play as: red = 0, yellow = 1" + "--player", + type=int, + default=0, + choices=[0, 1], + help="Choose which player to play as: red = 0, yellow = 1", ) parser.add_argument( "--screen-width", type=int, default=640, help="Width of pygame screen in pixels" @@ -38,7 +50,7 @@ def get_args() -> argparse.Namespace: env = gobblet_v1.env(render_mode="human", args=args) env.reset() - manual_policy = gobblet_v1.ManualPolicy(env) + manual_policy = gobblet_v1.ManualGobbletPolicy(env) for agent in env.agent_iter(): clock.tick(env.metadata["render_fps"]) @@ -53,9 +65,11 @@ def get_args() -> argparse.Namespace: if agent == manual_policy.agent and args.cpu_players < 2: action = manual_policy(observation, agent) else: - action_mask = observation['action_mask'] - action = np.random.choice(np.arange(len(action_mask)), p=action_mask / np.sum(action_mask)) + action_mask = observation["action_mask"] + action = np.random.choice( + np.arange(len(action_mask)), p=action_mask / np.sum(action_mask) + ) env.step(action) - env.render() \ No newline at end of file + env.render() diff --git a/gobblet/game/board.py b/gobblet/game/board.py index c0ef6b8..abff10c 100644 --- a/gobblet/game/board.py +++ b/gobblet/game/board.py @@ -1,5 +1,6 @@ import numpy as np + class Board: def __init__(self, squares=None): # internally self.board.squares holds a representation of the gobblet board, consisting of three stacked 3x3 boards, one for each piece size. @@ -26,8 +27,6 @@ def __init__(self, squares=None): # [0, 0, 0], # [0, 0, 6] - - # empty -- 0 # player 0 -- 1 # player 1 -- -1 # Default: 2 @@ -41,7 +40,7 @@ def setup(self): self.calculate_winners() def get_action_from_pos_piece(self, pos, piece): - if pos in range(9) and piece in range(1,7): + if pos in range(9) and piece in range(1, 7): return 9 * (piece - 1) + pos else: return -1 @@ -76,29 +75,30 @@ def get_piece_size_from_action(self, action): # Returns the index on the board [0-26] for a given action def get_index_from_action(self, action): pos = self.get_pos_from_action(action) # [0-8] - piece_size = self.get_piece_size_from_action(action) # [1-3] - return pos + 9 * (piece_size - 1) # [0-26] - + piece_size = self.get_piece_size_from_action(action) # [1-3] + return pos + 9 * (piece_size - 1) # [0-26] # Return true if an action is legal, false otherwise. def is_legal(self, action, agent_index=0): - pos = self.get_pos_from_action(action) # [0-8] - piece = self.get_piece_from_action(action) # [1-6] - piece_size = self.get_piece_size_from_action(action) # [1-3] + pos = self.get_pos_from_action(action) # [0-8] + piece = self.get_piece_from_action(action) # [1-6] + piece_size = self.get_piece_size_from_action(action) # [1-3] agent_multiplier = 1 if agent_index == 0 else -1 board = self.squares.reshape(3, 9) # Check if this piece has been placed (if the piece number occurs anywhere on the level of that piece size) - if any(board[piece_size-1] == piece * agent_multiplier): - current_loc = np.where(board[piece_size-1] == piece * agent_multiplier)[0] # Returns array of values where piece is placed + if any(board[piece_size - 1] == piece * agent_multiplier): + current_loc = np.where(board[piece_size - 1] == piece * agent_multiplier)[ + 0 + ] # Returns array of values where piece is placed if len(current_loc) > 1: raise Exception("PIECE HAS BEEN USED TWICE") - return False # Piece has been used twice (not valid) + return False # Piece has been used twice (not valid) else: - current_loc = current_loc[0] # Current location [0-27] + current_loc = current_loc[0] # Current location [0-27] # If this piece is currently covered, moving it is not a legal action - if self.check_covered().reshape(3,9)[piece_size - 1][current_loc] == 1: + if self.check_covered().reshape(3, 9)[piece_size - 1][current_loc] == 1: return False # If this piece has not been placed @@ -107,8 +107,8 @@ def is_legal(self, action, agent_index=0): if flatboard[pos] == 0: return True else: - existing_piece_number = flatboard[pos] # [1-6] - existing_piece_size = (abs(existing_piece_number) + 1) // 2 # [1-3] + existing_piece_number = flatboard[pos] # [1-6] + existing_piece_size = (abs(existing_piece_number) + 1) // 2 # [1-3] if piece_size > existing_piece_size: return True else: @@ -153,20 +153,27 @@ def calculate_winners(self): self.winning_combinations = winning_combinations def print(self): - print(self.get_flatboard().reshape(3,3).transpose()) + print(self.get_flatboard().reshape(3, 3).transpose()) # returns flattened board consisting of only top pieces (excluding pieces which are gobbled by other pieces) def get_flatboard(self): flatboard = np.zeros(9) board = self.squares.reshape(3, 9) - for i in range(9): # For every square in the 3x3 grid, find the topmost element (largest piece) - top_piece_size = (np.amax( - abs(board[:, i]))) # [-3, 2, 0] denotes a large piece gobbling a medium piece, this will return 3 + for i in range( + 9 + ): # For every square in the 3x3 grid, find the topmost element (largest piece) + top_piece_size = np.amax( + abs(board[:, i]) + ) # [-3, 2, 0] denotes a large piece gobbling a medium piece, this will return 3 top_piece_index = list(abs(board[:, i])).index( - top_piece_size) # Get the row of the top piece (have to index [0] - top_piece_color = np.sign(board[ - top_piece_index, i]) # Get the color of the top piece: 1 for player_1 and -1 for player_2, 0 for neither - flatboard[i] = top_piece_color * top_piece_size # Simplify the board into only the top elements + top_piece_size + ) # Get the row of the top piece (have to index [0] + top_piece_color = np.sign( + board[top_piece_index, i] + ) # Get the color of the top piece: 1 for player_1 and -1 for player_2, 0 for neither + flatboard[i] = ( + top_piece_color * top_piece_size + ) # Simplify the board into only the top elements return flatboard # returns: @@ -193,26 +200,38 @@ def check_game_over(self): else: return False - def check_covered(self): # Return a 27 length array indicating which positions have a piece which is covered + def check_covered( + self, + ): # Return a 27 length array indicating which positions have a piece which is covered board = self.squares.reshape(3, 9) covered = np.zeros((3, 9)) - for i in range(9): # Check small pieces - if board[0, i] != 0 and (board[1, i] != 0 or board[2, i] != 0): # If there is a small piece, and either a large or medium piece covering it + for i in range(9): # Check small pieces + if board[0, i] != 0 and ( + board[1, i] != 0 or board[2, i] != 0 + ): # If there is a small piece, and either a large or medium piece covering it covered[0, i] = 1 - for i in range(9): # Check medium pieces - if board[1, i] != 0 and board[2, i] != 0: # If there is a meidum piece and a large piece covering it + for i in range(9): # Check medium pieces + if ( + board[1, i] != 0 and board[2, i] != 0 + ): # If there is a meidum piece and a large piece covering it covered[1, i] = 1 - covered[2, :] = 0 # Large pieces can't be covered + covered[2, :] = 0 # Large pieces can't be covered # Doesn't matter about what color is covering it, you can't move that piece this turn (allows self-gobbling) return covered.flatten() -# DEBUG + # DEBUG def print_pieces(self): open_indices = [i for i in range(len(self.squares)) if self.squares[i] == 0] open_squares = [np.where(self.get_flatboard() == 0)[0]] - occupied_squares = [i % 9 for i in range(len(self.squares)) if self.squares[i] != 0] # List with entries 0-9 - movable_squares = [i % 9 for i in occupied_squares if self.check_covered()[i] == 0] # List with entries 0-9 - covered_squares = [i % 9 for i in np.where(self.check_covered() == 1)[0] ] # List with entries 0-9 + occupied_squares = [ + i % 9 for i in range(len(self.squares)) if self.squares[i] != 0 + ] # List with entries 0-9 + movable_squares = [ + i % 9 for i in occupied_squares if self.check_covered()[i] == 0 + ] # List with entries 0-9 + covered_squares = [ + i % 9 for i in np.where(self.check_covered() == 1)[0] + ] # List with entries 0-9 print("open_indices: ", open_indices) print("open_squares: ", open_squares) print("squares with pieces: ", occupied_squares) @@ -220,4 +239,4 @@ def print_pieces(self): print("squares with covered pieces: ", covered_squares) def __str__(self): - return str(self.squares.reshape(3,3,3)) + return str(self.squares.reshape(3, 3, 3)) diff --git a/gobblet/game/collector_manual_policy.py b/gobblet/game/collector_manual_policy.py index 1dc8b7e..ad8983a 100644 --- a/gobblet/game/collector_manual_policy.py +++ b/gobblet/game/collector_manual_policy.py @@ -1,20 +1,13 @@ # Extending tianshou collector class to work with manual policy (for user input) import time -import warnings -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, Optional, Union import gym import numpy as np -import torch - -from tianshou.data import ( - Batch, - ReplayBuffer, -) -from tianshou.env import BaseVectorEnv, DummyVectorEnv -from tianshou.policy import BasePolicy - +from tianshou.data import Batch, ReplayBuffer from tianshou.data.collector import Collector +from tianshou.env import BaseVectorEnv +from tianshou.policy import BasePolicy class ManualPolicyCollector(Collector): @@ -26,15 +19,15 @@ def __init__( preprocess_fn: Optional[Callable[..., Batch]] = None, exploration_noise: bool = False, ) -> None: - super(ManualPolicyCollector, self).__init__(policy=policy, env=env, exploration_noise=exploration_noise) + super().__init__(policy=policy, env=env, exploration_noise=exploration_noise) # Custom function to collect the result of an inputted action def collect_result( - self, - action: int = None, - render: Optional[float] = None, - no_grad: bool = True, - gym_reset_kwargs: Optional[Dict[str, Any]] = None, + self, + action: int = None, + render: Optional[float] = None, + no_grad: bool = True, + gym_reset_kwargs: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Collect the results of an inputted action.. @@ -73,7 +66,7 @@ def collect_result( while True: assert len(self.data) == len(ready_env_ids) # restore the state: if the last state is None, it won't store - last_state = self.data.policy.pop("hidden_state", None) + last_state = self.data.policy.pop("hidden_state", None) # noqa: F841 # use hard coded action (rather than using a policy or randomly sampling) self.data.update(act=action) @@ -109,7 +102,7 @@ def collect_result( terminated=terminated, truncated=truncated, done=done, - info=info + info=info, ) if self.preprocess_fn: self.data.update( @@ -154,10 +147,9 @@ def collect_result( # remove surplus env id from ready_env_ids # to avoid bias in selecting environments - self.data.obs = self.data.obs_next - if (n_step and step_count >= n_step): + if n_step and step_count >= n_step: break # generate statistics @@ -167,10 +159,7 @@ def collect_result( if episode_count > 0: rews, lens, idxs = list( - map( - np.concatenate, - [episode_rews, episode_lens, episode_start_indices] - ) + map(np.concatenate, [episode_rews, episode_lens, episode_start_indices]) ) rew_mean, rew_std = rews.mean(), rews.std() len_mean, len_std = lens.mean(), lens.std() @@ -188,4 +177,4 @@ def collect_result( "len": len_mean, "rew_std": rew_std, "len_std": len_std, - } \ No newline at end of file + } diff --git a/gobblet/game/gobblet.py b/gobblet/game/gobblet.py index edc9efc..d2aaad4 100644 --- a/gobblet/game/gobblet.py +++ b/gobblet/game/gobblet.py @@ -93,15 +93,16 @@ """ +import os + import gymnasium import numpy as np -from gymnasium import spaces import pygame -import os - +from gymnasium import spaces from pettingzoo import AECEnv from pettingzoo.utils import agent_selector, wrappers from pettingzoo.utils.conversions import parallel_wrapper_fn + from .board import Board from .utils import get_image, load_chip, load_chip_preview @@ -115,8 +116,10 @@ def env(render_mode=None, args=None): env = wrappers.OrderEnforcingWrapper(env) return env + parallel_env = parallel_wrapper_fn(env) + class raw_env(AECEnv): metadata = { "render_modes": ["human", "rgb_array", "text", "text_full"], @@ -129,7 +132,7 @@ class raw_env(AECEnv): def __init__(self, render_mode=None, args=None): super().__init__() self.board = Board() - self.board_size = 3 # Will need to make a separate file for 4x4 + self.board_size = 3 # Will need to make a separate file for 4x4 self.agents = ["player_1", "player_2"] self.possible_agents = self.agents[:] @@ -141,7 +144,9 @@ def __init__(self, render_mode=None, args=None): "observation": spaces.Box( low=0, high=1, shape=(3, 3, 13), dtype=np.int8 ), - "action_mask": spaces.Box(low=0, high=1, shape=(54,), dtype=np.int8), + "action_mask": spaces.Box( + low=0, high=1, shape=(54,), dtype=np.int8 + ), } ) for i in self.agents @@ -175,22 +180,30 @@ def observe(self, agent): board = self.board.squares.reshape(3, 3, 3) if self.agents.index(agent) == 1: - board = board * -1 # Swap the signs on the board for the two different agents + board = ( + board * -1 + ) # Swap the signs on the board for the two different agents # Represent observations in the same way as pettingzoo.chess: specific channel for each color piece (e.g., two for each white small piece) layers = [] for i in range(1, 7): - layers.append(board[(i - 1) // 2] == i) # 3x3 array with an entry of 1 for squares with each white piece (1, ..., 6) + layers.append( + board[(i - 1) // 2] == i + ) # 3x3 array with an entry of 1 for squares with each white piece (1, ..., 6) for i in range(1, 7): - layers.append(board[(i - 1) // 2] == -i) # 3x3 array with an entry of 1 for squares with each black piece (-1, ..., -6) + layers.append( + board[(i - 1) // 2] == -i + ) # 3x3 array with an entry of 1 for squares with each black piece (-1, ..., -6) if self.agents.index(agent) == 1: agents_layer = np.ones((3, 3)) else: agents_layer = np.zeros((3, 3)) - layers.append(agents_layer) # Thirteenth layer encoding the current player (agent index 0 or 1) + layers.append( + agents_layer + ) # Thirteenth layer encoding the current player (agent index 0 or 1) observation = np.stack(layers, axis=2).astype(np.int8) legal_moves = self._legal_moves() if agent == self.agent_selection else [] @@ -234,7 +247,7 @@ def step(self, action): if self.board.check_game_over(): winner = self.board.check_for_winner() - if winner == 0: # NOTE: don't think ties are possible in gobblet + if winner == 0: # NOTE: don't think ties are possible in gobblet # tie pass elif winner == 1: @@ -259,7 +272,6 @@ def step(self, action): if self.render_mode in ["human", "text", "text_full", "rgb_array"]: self.render() - def reset(self, seed=None, return_info=False, options=None): # reset environment self.board = Board() @@ -288,17 +300,17 @@ def getSymbolFull(input): if input == 0: return "- " if input > 0: - return "+{}".format(int(input)) + return f"+{int(input)}" else: - return "{}".format(int(input)) + return f"{int(input)}" def getSymbol(input): if input == 0: return "- " if input > 0: - return "+{}".format(int((input + 1) // 2)) + return f"+{int((input + 1) // 2)}" else: - return "{}".format(int((input) // 2)) + return f"{int((input) // 2)}" if self.debug: self.board.print_pieces() @@ -306,48 +318,111 @@ def getSymbol(input): pos = self.action % 9 piece = (self.action // 9) + 1 piece = (piece + 1) // 2 - print(f"TURN: {self.turn}, AGENT: {self.agent_selection}, ACTION: {self.action}, POSITION: {pos}, PIECE: {piece}") + print( + f"TURN: {self.turn}, AGENT: {self.agent_selection}, ACTION: {self.action}, POSITION: {pos}, PIECE: {piece}" + ) board = list(map(getSymbol, self.board.get_flatboard())) print(" " * 7 + "|" + " " * 7 + "|" + " " * 7) - print(f" {board[0]} " + "|" + f" {board[3]} " + "|" + f" {board[6]} ") + print( + f" {board[0]} " + "|" + f" {board[3]} " + "|" + f" {board[6]} " + ) print("_" * 7 + "|" + "_" * 7 + "|" + "_" * 7) print(" " * 7 + "|" + " " * 7 + "|" + " " * 7) - print(f" {board[1]} " + "|" + f" {board[4]} " + "|" + f" {board[7]} ") + print( + f" {board[1]} " + "|" + f" {board[4]} " + "|" + f" {board[7]} " + ) print("_" * 7 + "|" + "_" * 7 + "|" + "_" * 7) print(" " * 7 + "|" + " " * 7 + "|" + " " * 7) - print(f" {board[2]} " + "|" + f" {board[5]} " + "|" + f" {board[8]} ") + print( + f" {board[2]} " + "|" + f" {board[5]} " + "|" + f" {board[8]} " + ) print(" " * 7 + "|" + " " * 7 + "|" + " " * 7) print() elif self.render_mode == "text_full": pos = self.action % 9 piece = (self.action // 9) + 1 - print(f"TURN: {self.turn}, AGENT: {self.agent_selection}, ACTION: {self.action}, POSITION: {pos}, PIECE: {piece}") + print( + f"TURN: {self.turn}, AGENT: {self.agent_selection}, ACTION: {self.action}, POSITION: {pos}, PIECE: {piece}" + ) board = list(map(getSymbolFull, self.board.squares)) - print(" " * 9 + "SMALL" + " " * 9 + " " + - " " * 10 + "MED" + " " * 10 + " " + - " " * 9 + "LARGE" + " " * 9 + " ") - top= " " * 7 + "|" + " " * 7 + "|" + " " * 7 + print( + " " * 9 + + "SMALL" + + " " * 9 + + " " + + " " * 10 + + "MED" + + " " * 10 + + " " + + " " * 9 + + "LARGE" + + " " * 9 + + " " + ) + top = " " * 7 + "|" + " " * 7 + "|" + " " * 7 bottom = "_" * 7 + "|" + "_" * 7 + "|" + "_" * 7 - top1= f" {board[0]} " + "|" + f" {board[3]} " + "|" + f" {board[6]} " - top2 = f" {board[9]} " + "|" + f" {board[12]} " + "|" + f" {board[15]} " - top3 = f" {board[18]} " + "|" + f" {board[21]} " + "|" + f" {board[24]} " + top1 = ( + f" {board[0]} " + "|" + f" {board[3]} " + "|" + f" {board[6]} " + ) + top2 = ( + f" {board[9]} " + + "|" + + f" {board[12]} " + + "|" + + f" {board[15]} " + ) + top3 = ( + f" {board[18]} " + + "|" + + f" {board[21]} " + + "|" + + f" {board[24]} " + ) print(top + " " + top + " " + top) print(top1 + " " + top2 + " " + top3) print(bottom + " " + bottom + " " + bottom) - mid1 = f" {board[1]} " + "|" + f" {board[4]} " + "|" + f" {board[7]} " - mid2 = f" {board[10]} " + "|" + f" {board[13]} " + "|" + f" {board[16]} " - mid3 = f" {board[19]} " + "|" + f" {board[22]} " + "|" + f" {board[25]} " + mid1 = ( + f" {board[1]} " + "|" + f" {board[4]} " + "|" + f" {board[7]} " + ) + mid2 = ( + f" {board[10]} " + + "|" + + f" {board[13]} " + + "|" + + f" {board[16]} " + ) + mid3 = ( + f" {board[19]} " + + "|" + + f" {board[22]} " + + "|" + + f" {board[25]} " + ) print(top + " " + top + " " + top) print(mid1 + " " + mid2 + " " + mid3) print(bottom + " " + bottom + " " + bottom) - bot1 = f" {board[2]} " + "|" + f" {board[5]} " + "|" + f" {board[8]} " - bot2 = f" {board[9+2]} " + "|" + f" {board[9+5]} " + "|" + f" {board[9+8]} " - bot3 = f" {board[18+2]} " + "|" + f" {board[18+5]} " + "|" + f" {board[18+8]} " + bot1 = ( + f" {board[2]} " + "|" + f" {board[5]} " + "|" + f" {board[8]} " + ) + bot2 = ( + f" {board[9+2]} " + + "|" + + f" {board[9+5]} " + + "|" + + f" {board[9+8]} " + ) + bot3 = ( + f" {board[18+2]} " + + "|" + + f" {board[18+5]} " + + "|" + + f" {board[18+8]} " + ) print(top + " " + top + " " + top) print(bot1 + " " + bot2 + " " + bot3) print(top + " " + top + " " + top) @@ -358,7 +433,9 @@ def getSymbol(input): if self.render_mode == "human": if self.screen is None: pygame.init() - self.screen = pygame.display.set_mode((self.screen_width, self.screen_height)) + self.screen = pygame.display.set_mode( + (self.screen_width, self.screen_height) + ) pygame.event.get() elif self.screen is None: self.screen = pygame.Surface((self.screen_width, self.screen_height)) @@ -381,14 +458,26 @@ def getSymbol(input): self.preview = {} self.preview["player_1"] = {} - self.preview["player_1"][3] = load_chip_preview(tile_size, "GobbletLargeRedPreview.png", scale_large) - self.preview["player_1"][2] = load_chip_preview(tile_size, "GobbletMedRedPreview.png", scale_med) - self.preview["player_1"][1] = load_chip_preview(tile_size, "GobbletSmallRedPreview.png", scale_small) + self.preview["player_1"][3] = load_chip_preview( + tile_size, "GobbletLargeRedPreview.png", scale_large + ) + self.preview["player_1"][2] = load_chip_preview( + tile_size, "GobbletMedRedPreview.png", scale_med + ) + self.preview["player_1"][1] = load_chip_preview( + tile_size, "GobbletSmallRedPreview.png", scale_small + ) self.preview["player_2"] = {} - self.preview["player_2"][3] = load_chip_preview(tile_size, "GobbletLargeYellowPreview.png", scale_large) - self.preview["player_2"][2] = load_chip_preview(tile_size, "GobbletMedYellowPreview.png", scale_med) - self.preview["player_2"][1] = load_chip_preview(tile_size, "GobbletSmallYellowPreview.png", scale_small) + self.preview["player_2"][3] = load_chip_preview( + tile_size, "GobbletLargeYellowPreview.png", scale_large + ) + self.preview["player_2"][2] = load_chip_preview( + tile_size, "GobbletMedYellowPreview.png", scale_med + ) + self.preview["player_2"][1] = load_chip_preview( + tile_size, "GobbletSmallYellowPreview.png", scale_small + ) # preview_chips = {self.agents[0]: self.preview["player_1"], self.agents[1]: self.preview["player_1"]} @@ -399,34 +488,50 @@ def getSymbol(input): self.screen.blit(board_img, (0, 0)) - offset = (self.screen_width * ((9+4) / 47)) # Piece is 9px wide, gap between pieces 4px, total width is 47px - offset_side = (self.screen_width * (6 / 47)) - 1 # Distance from the side of the board to the first piece is 6px - offset_centering = offset * 1/3 + \ - (5 * self.screen_width/1000 if self.screen_width > 500 else 8 * self.screen_width/1000) + offset = self.screen_width * ( + (9 + 4) / 47 + ) # Piece is 9px wide, gap between pieces 4px, total width is 47px + offset_side = ( + self.screen_width * (6 / 47) + ) - 1 # Distance from the side of the board to the first piece is 6px + offset_centering = offset * 1 / 3 + ( + 5 * self.screen_width / 1000 + if self.screen_width > 500 + else 8 * self.screen_width / 1000 + ) # Extra 5px fixed alignment issues at 1000x1000, but ratio needs to be higher for lower res (trial & error) # Blit the chips and their positions for i in range(9): for j in range(1, 4): - if self.board.squares[i + 9 * (j - 1)] == 2 * j - 1 or \ - self.board.squares[i + 9 * (j - 1)] == 2 * j: # small pieces (1,2), medium pieces (3,4), large pieces (5,6) + if ( + self.board.squares[i + 9 * (j - 1)] == 2 * j - 1 + or self.board.squares[i + 9 * (j - 1)] == 2 * j + ): # small pieces (1,2), medium pieces (3,4), large pieces (5,6) self.screen.blit( red[j], - red[j].get_rect(center = - (int(i / 3) * (offset) + offset_side + offset_centering, - (i % 3) * (offset) + offset_side + offset_centering - ) - ) + red[j].get_rect( + center=( + int(i / 3) * (offset) + + offset_side + + offset_centering, + (i % 3) * (offset) + offset_side + offset_centering, + ) + ), ) - if self.board.squares[i + 9 * (j - 1)] == -1 * (2 * j - 1) or\ - self.board.squares[i + 9 * (j - 1)] == -1 * (2 * j): + if self.board.squares[i + 9 * (j - 1)] == -1 * ( + 2 * j - 1 + ) or self.board.squares[i + 9 * (j - 1)] == -1 * (2 * j): self.screen.blit( yellow[j], - yellow[j].get_rect(center = - (int(i / 3) * (offset) + offset_side + offset_centering, - (i % 3) * (offset) + offset_side + offset_centering - ) - ) + yellow[j].get_rect( + center=( + int(i / 3) * (offset) + + offset_side + + offset_centering, + (i % 3) * (offset) + offset_side + offset_centering, + ) + ), ) # Blit the preview chips and their positions @@ -435,20 +540,26 @@ def getSymbol(input): if self.board.squares_preview[i + 9 * (j - 1)] == 1: self.screen.blit( self.preview["player_1"][j], - self.preview["player_1"][j].get_rect(center = - (int(i / 3) * (offset) + offset_side + offset_centering, - (i % 3) * (offset) + offset_side + offset_centering - ) - ) + self.preview["player_1"][j].get_rect( + center=( + int(i / 3) * (offset) + + offset_side + + offset_centering, + (i % 3) * (offset) + offset_side + offset_centering, + ) + ), ) if self.board.squares_preview[i + 9 * (j - 1)] == -1: self.screen.blit( self.preview["player_2"][j], - self.preview["player_2"][j].get_rect(center = - (int(i / 3) * (offset) + offset_side + offset_centering, - (i % 3) * (offset) + offset_side + offset_centering - ) - ) + self.preview["player_2"][j].get_rect( + center=( + int(i / 3) * (offset) + + offset_side + + offset_centering, + (i % 3) * (offset) + offset_side + offset_centering, + ) + ), ) pygame.display.update() diff --git a/gobblet/game/greedy_policy.py b/gobblet/game/greedy_policy.py index acbf540..d4ef434 100644 --- a/gobblet/game/greedy_policy.py +++ b/gobblet/game/greedy_policy.py @@ -1,14 +1,16 @@ -from typing import Optional, Any +from typing import Any, Optional import numpy as np + from gobblet.game.board import Board + class GreedyGobbletPolicy: def __init__( - self, - depth: Optional[int] = 2, - seed: Optional[int] = 0, - **kwargs: Any, + self, + depth: Optional[int] = 2, + seed: Optional[int] = 0, + **kwargs: Any, ) -> None: super().__init__(**kwargs) self.board = None @@ -17,7 +19,9 @@ def __init__( def compute_actions_rllib(self, obs_batch): observations = obs_batch["observation"] - observations = observations.reshape(observations.shape[0], 3, 3, -1) # Infer observation dimension and batch size + observations = observations.reshape( + observations.shape[0], 3, 3, -1 + ) # Infer observation dimension and batch size masks = obs_batch["action_mask"] actions = [] for i in range(len(observations)): @@ -25,30 +29,38 @@ def compute_actions_rllib(self, obs_batch): actions.append(act) return actions - def compute_action_tianshou( - self, - obs - ): + def compute_action_tianshou(self, obs): mask = obs.mask obs = obs.obs if hasattr(obs, "obs") else obs # return self.compute_action(obs, mask) def compute_action( - self, - obs, - mask, + self, + obs, + mask, ) -> np.ndarray: - obs_player = obs[..., :6] # Index the last dimension - board_player = np.array([(i + 1) * obs_player[..., i] + (i + 2) * obs_player[..., i + 1] for i in - range(0, 6, 2)]) # Reshapes [3,3,6] to [3,3,3] + board_player = np.array( + [ + (i + 1) * obs_player[..., i] + (i + 2) * obs_player[..., i + 1] + for i in range(0, 6, 2) + ] + ) # Reshapes [3,3,6] to [3,3,3] obs_opponent = obs[..., 6:12] board_opponent = np.array( - [(i + 1) * obs_opponent[..., i] + (i + 2) * obs_opponent[..., i + 1] for i in range(0, 6, 2)]) + [ + (i + 1) * obs_opponent[..., i] + (i + 2) * obs_opponent[..., i + 1] + for i in range(0, 6, 2) + ] + ) board = np.where(board_player > board_opponent, board_player, -board_opponent) - agent_index = obs[..., 12].max() # Thirteenth layer of obs encodes agent_index (all zeros or all ones) - opponent_index = 1 - agent_index # If agent index is 1, we want 0; if agent index is 0, we want 1 + agent_index = obs[ + ..., 12 + ].max() # Thirteenth layer of obs encodes agent_index (all zeros or all ones) + opponent_index = ( + 1 - agent_index + ) # If agent index is 1, we want 0; if agent index is 0, we want 1 # If we are playing as the second agent, we need to adjust the representation of the board to reflect that if agent_index == 1: @@ -61,7 +73,9 @@ def compute_action( winner_values = [1, -1] legal_actions = mask.flatten().nonzero()[0] - actions_depth1 = list(legal_actions) # Initialize the same as legal actions, then remove ones that cause a loss + actions_depth1 = list( + legal_actions + ) # Initialize the same as legal actions, then remove ones that cause a loss chosen_action = None results = {} @@ -77,7 +91,9 @@ def compute_action( if results[action] == winner_values[agent_index]: # Win for our agent chosen_action = action break - elif results[action] == winner_values[opponent_index]: # Loss for our agent + elif ( + results[action] == winner_values[opponent_index] + ): # Loss for our agent if len(actions_depth1) > 1: actions_depth1.remove(action) else: @@ -92,20 +108,26 @@ def compute_action( depth1_board.play_turn(agent_index=agent_index, action=action) # Check what the opponent can do after this potential move - legal_actions_depth2 = [act for act in range(len(mask.flatten())) if - depth1_board.is_legal(agent_index=opponent_index, action=act)] + legal_actions_depth2 = [ + act + for act in range(len(mask.flatten())) + if depth1_board.is_legal(agent_index=opponent_index, action=act) + ] results_depth2 = {} for action_depth2 in legal_actions_depth2: depth2_board = Board() depth2_board.squares = depth1_board.squares.copy() - depth2_board.play_turn(agent_index=opponent_index, action=action_depth2) + depth2_board.play_turn( + agent_index=opponent_index, action=action_depth2 + ) results_depth2[action_depth2] = depth2_board.check_for_winner() # If the opponent can win in the next move, we don't want to do this action - if results_depth2[action_depth2] == winner_values[ - opponent_index]: # Check if it's possible for the opponent to win next turn after + if ( + results_depth2[action_depth2] == winner_values[opponent_index] + ): # Check if it's possible for the opponent to win next turn after if len(actions_depth1) > 1: if action in actions_depth1: actions_depth1.remove(action) @@ -116,15 +138,21 @@ def compute_action( # BUT this might miss us from winning the game ourselves in blcoking them, so don't exit the loop # So we only do it if there aren't deterministic wins already chosen (and we don't break, keep looking after) if self.board.is_legal(action_depth2, agent_index=agent_index): - if chosen_action == None: + if chosen_action is None: chosen_action = action_depth2 # Depth 2: if this move sets the opponent up with only moves that win the game for us, pick this move - if all(winner == winner_values[agent_index] for winner in results_depth2.values()): + if all( + winner == winner_values[agent_index] + for winner in results_depth2.values() + ): chosen_action = action break # Depth 2: if this move blocks the opponent from winning, pick this if we cannot find any guaranteed wins - if all(winner != winner_values[opponent_index] for winner in results_depth2.values()): + if all( + winner != winner_values[opponent_index] + for winner in results_depth2.values() + ): chosen_action = action # BLOCKING ACTION # Depth 3: Given that we block the opponent this way, can we win the next turn? @@ -134,13 +162,22 @@ def compute_action( depth1_board.play_turn(agent_index=agent_index, action=action) # Search over depth 2 actions where the opponent doesn't win - for action_depth2 in [key for key, value in results_depth2.items() if value == 0]: + for action_depth2 in [ + key for key, value in results_depth2.items() if value == 0 + ]: depth2_board = Board() depth2_board.squares = depth1_board.squares.copy() - depth2_board.play_turn(agent_index=agent_index, action=action_depth2) - - legal_actions_depth3 = [act for act in range(len(mask.flatten())) if - depth2_board.is_legal(agent_index=agent_index, action=act)] + depth2_board.play_turn( + agent_index=agent_index, action=action_depth2 + ) + + legal_actions_depth3 = [ + act + for act in range(len(mask.flatten())) + if depth2_board.is_legal( + agent_index=agent_index, action=act + ) + ] actions_depth3 = list(legal_actions_depth3) results_depth3 = {} @@ -148,15 +185,24 @@ def compute_action( for action_depth3 in legal_actions_depth3: depth3_board = Board() depth3_board.squares = depth2_board.squares.copy() - depth3_board.play_turn(agent_index=agent_index, action=action) + depth3_board.play_turn( + agent_index=agent_index, action=action + ) # If we can win next turn, do it - results_depth3[action_depth3] = depth3_board.check_for_winner() - if results_depth3[action_depth3] == winner_values[agent_index]: # Win for our agent + results_depth3[ + action_depth3 + ] = depth3_board.check_for_winner() + if ( + results_depth3[action_depth3] + == winner_values[agent_index] + ): # Win for our agent chosen_action = action # If we can win in depth 3, then we know this blocking action is good break - elif results_depth3[action_depth3] == winner_values[ - opponent_index]: # Loss for our agent + elif ( + results_depth3[action_depth3] + == winner_values[opponent_index] + ): # Loss for our agent if len(actions_depth3) > 1: actions_depth3.remove(action) else: diff --git a/gobblet/game/greedy_policy_rllib.py b/gobblet/game/greedy_policy_rllib.py index 6d57f05..55fa865 100644 --- a/gobblet/game/greedy_policy_rllib.py +++ b/gobblet/game/greedy_policy_rllib.py @@ -1,15 +1,11 @@ -from typing import ( - List, - Optional, - Union, -) +from typing import List, Optional, Union +import numpy as np +from ray.rllib.examples.policy.random_policy import RandomPolicy from ray.rllib.utils.annotations import override from ray.rllib.utils.typing import TensorStructType, TensorType -from ray.rllib.examples.policy.random_policy import RandomPolicy from gobblet.game.greedy_policy import GreedyGobbletPolicy -import numpy as np class GreedyPolicy(RandomPolicy): @@ -19,12 +15,12 @@ def __init__(self, *args, **kwargs): @override(RandomPolicy) def compute_actions( - self, - obs_batch: Union[List[TensorStructType], TensorStructType], - state_batches: Optional[List[TensorType]] = None, - prev_action_batch: Union[List[TensorStructType], TensorStructType] = None, - prev_reward_batch: Union[List[TensorStructType], TensorStructType] = None, - **kwargs + self, + obs_batch: Union[List[TensorStructType], TensorStructType], + state_batches: Optional[List[TensorType]] = None, + prev_action_batch: Union[List[TensorStructType], TensorStructType] = None, + prev_reward_batch: Union[List[TensorStructType], TensorStructType] = None, + **kwargs, ): actions = self.policy.compute_actions_rllib(obs_batch) return ( diff --git a/gobblet/game/greedy_policy_tianshou.py b/gobblet/game/greedy_policy_tianshou.py index 062a7c4..4f97db9 100644 --- a/gobblet/game/greedy_policy_tianshou.py +++ b/gobblet/game/greedy_policy_tianshou.py @@ -1,19 +1,22 @@ -from typing import Any, Dict, Optional, Union from copy import deepcopy +from typing import Any, Dict, Optional, Union + import numpy as np from numpy import ndarray - -from tianshou.data import Batch, ReplayBuffer, to_numpy, to_torch_as +from tianshou.data import Batch from tianshou.policy import BasePolicy from gobblet.game.greedy_policy import GreedyGobbletPolicy + + class GreedyPolicy(BasePolicy): - """ + """Greedy Policy. + Basic greedy policy which checks if a move results in a victory, and if it sets the opponent up to win (or lose) in the next turn. The depth argument controls the agent's search depth (default of 2 is a balance between computational efficiency and optimal play) * depth = 1: Agent considers moves which it can use to directly win * depth = 2: Agent also considers moves it can take to block the opponent from winning next turn - * depth = 3: Agent also considers moves which set it up to win in two moves: no matter waht opponents does in retaliation (unblockable wins) + * depth = 3: Agent also considers moves which set it up to win in two moves: no matter what opponents does in retaliation (unblockable wins) """ def __init__( @@ -65,11 +68,15 @@ def forward( if len(batch_single[input].agent_id) > 1: batch_single[input].obs = batch_single[input].obs[b, ...] batch_single[input].mask = batch_single[input].mask[b, ...] - batch_single[input].agent_id = batch_single[input].agent_id[b, ...].reshape(1) + batch_single[input].agent_id = ( + batch_single[input].agent_id[b, ...].reshape(1) + ) act = self.forward_unbatched(batch=batch_single) # array(1) - acts.append(act) # [ array(1), array(2), ... array(10) ] + acts.append(act) # [ array(1), array(2), ... array(10) ] acts = np.array(acts) - return Batch(act=acts) # Batch( act: [ array(1), array(2), ... array(10) ] ) + return Batch( + act=acts + ) # Batch( act: [ array(1), array(2), ... array(10) ] ) else: batch_single = deepcopy(full_batch) act = self.forward_unbatched(batch=batch_single) @@ -88,4 +95,4 @@ def forward_unbatched( def learn(self, batch: Batch, **kwargs: Any) -> Dict[str, float]: """Since a random agent learns nothing, it returns an empty dict.""" - return {} \ No newline at end of file + return {} diff --git a/gobblet/game/manual_policy.py b/gobblet/game/manual_policy.py index 354d92a..adbcea9 100644 --- a/gobblet/game/manual_policy.py +++ b/gobblet/game/manual_policy.py @@ -1,13 +1,14 @@ # Adapted from https://github.com/Farama-Foundation/PettingZoo/blob/master/pettingzoo/butterfly/knights_archers_zombies/manual_policy.py -import pygame -from .utils import GIFRecorder import sys + import numpy as np +import pygame +from .utils import GIFRecorder -class ManualPolicy: - def __init__(self, env, agent_id: int = 0, recorder: GIFRecorder = None): +class ManualGobbletPolicy: + def __init__(self, env, agent_id: int = 0, recorder: GIFRecorder = None): self.env = env self.agent_id = agent_id self.agent = self.env.agents[self.agent_id] @@ -34,7 +35,7 @@ def __call__(self, observation, agent): pygame.display.quit() sys.exit() - ''' GET MOUSE INPUT''' + """ GET MOUSE INPUT""" mousex, mousey = pygame.mouse.get_pos() width, height = pygame.display.get_surface().get_size() pos_x = 0 @@ -55,45 +56,57 @@ def __call__(self, observation, agent): agent_multiplier = 1 if agent == env.agents[0] else -1 - ''' FIND PLACED PIECES ''' - placed_pieces = env.unwrapped.board.squares[env.unwrapped.board.squares.nonzero()] - placed_pieces_agent = [a for a in placed_pieces if np.sign(a) == agent_multiplier] + """ FIND PLACED PIECES """ + placed_pieces = env.unwrapped.board.squares[ + env.unwrapped.board.squares.nonzero() + ] + placed_pieces_agent = [ + a for a in placed_pieces if np.sign(a) == agent_multiplier + ] placed_pieces_agent_abs = [abs(p) for p in placed_pieces_agent] pieces = np.arange(1, 7) unplaced = [p for p in pieces if p not in placed_pieces_agent_abs] flatboard = env.unwrapped.board.get_flatboard() # Selected piece size persists as we loop through events, allowing the user to cycle through sizes - if piece_size_selected > 0: - piece = piece - else: + if piece_size_selected == 0: if len(unplaced) > 0: piece = unplaced[-1] piece_size_selected = (piece + 1) // 2 else: piece = -1 - ''' READ KEYBOARD INPUT''' + """ READ KEYBOARD INPUT""" if event.type == pygame.KEYDOWN: if not picked_up: - if event.key == pygame.K_SPACE: # Cycle through pieces (from largest to smallest) + if ( + event.key == pygame.K_SPACE + ): # Cycle through pieces (from largest to smallest) piece_cycle += 1 cycle_choices = np.unique( - [(p + 1) // 2 for p in unplaced]) # Transform [1,2,3,4,5,6] to [1,2,3) + [(p + 1) // 2 for p in unplaced] + ) # Transform [1,2,3,4,5,6] to [1,2,3) if len(cycle_choices) > 0: - piece_size = cycle_choices[(np.amax(cycle_choices) - (piece_cycle + 1)) % len(cycle_choices)] + piece_size = cycle_choices[ + (np.amax(cycle_choices) - (piece_cycle + 1)) + % len(cycle_choices) + ] # else: # piece_size = -1 piece_size_selected = piece_size - if (piece_size * 2) - 1 in unplaced: # Check if the first of this piece size is available + if ( + piece_size * 2 + ) - 1 in unplaced: # Check if the first of this piece size is available piece = piece_size * 2 - 1 else: - piece = piece_size * 2 # Otherwise choose the second of this piece size + piece = ( + piece_size * 2 + ) # Otherwise choose the second of this piece size else: - if event.key == pygame.K_1: # Select piece size 1 + if event.key == pygame.K_1: # Select piece size 1 piece_size_selected = 1 if 1 in unplaced: piece = 1 @@ -103,7 +116,7 @@ def __call__(self, observation, agent): piece_cycle = 2 else: piece = -1 - elif event.key == pygame.K_2: # Select piece size 2 + elif event.key == pygame.K_2: # Select piece size 2 piece_size_selected = 2 if 3 in unplaced: piece = 3 @@ -113,7 +126,7 @@ def __call__(self, observation, agent): piece_cycle = 1 else: piece = -1 - elif event.key == pygame.K_3: # Select piece size 3 + elif event.key == pygame.K_3: # Select piece size 3 piece_size_selected = 3 if 5 in unplaced: piece = 5 @@ -129,31 +142,36 @@ def __call__(self, observation, agent): piece_size = (piece + 1) // 2 # Don't render a preview if both pieces of a given size have been placed - ''' GET PREVIEW ACTION ''' + """ GET PREVIEW ACTION """ # Get the action from the preview (in position the mouse cursor is currently hovering over) - action_prev = env.unwrapped.board.get_action(pos, piece_size, env.agents.index(agent)) + action_prev = env.unwrapped.board.get_action( + pos, piece_size, env.agents.index(agent) + ) - ''' CLEAR ACTION PREVIEW FOR ILLEGAL MOVES''' + """ CLEAR ACTION PREVIEW FOR ILLEGAL MOVES""" # If the hovered over position means placing a picked up piece in the same spot, mark it as illegal if pos == picked_up_pos or piece == -1: action_prev = -1 - ''' CLEAR PREVIOUSLY PREVIEWED MOVES ''' + """ CLEAR PREVIOUSLY PREVIEWED MOVES """ env.unwrapped.board.squares_preview[:] = 0 if action_prev != -1: - if not env.unwrapped.board.is_legal(action_prev, env.agents.index(agent)): # If this action is illegal + if not env.unwrapped.board.is_legal( + action_prev, env.agents.index(agent) + ): # If this action is illegal action_prev = -1 else: env.unwrapped.board.squares_preview[ - pos + 9 * (piece_size - 1)] = agent_multiplier # Preview this position + pos + 9 * (piece_size - 1) + ] = agent_multiplier # Preview this position - ''' UPDATE DISPLAY with previewed move''' + """ UPDATE DISPLAY with previewed move""" env.render() pygame.display.update() if recorder is not None: recorder.capture_frame(env.unwrapped.screen) - ''' PICK UP / PLACE A PIECE ''' + """ PICK UP / PLACE A PIECE """ if event.type == pygame.MOUSEBUTTONDOWN: # Pick up a piece (only able to if it has already been placed, and is not currently picked up) if flatboard[pos] in placed_pieces_agent and not picked_up: @@ -164,28 +182,38 @@ def __call__(self, observation, agent): # If the piece size selected is larger, clicking here should self-gobble the smaller piece if piece_size_on_board >= piece_size_selected: # Can only pick up a piece if there is a legal move to place it, other than where it was before - if not all(observation["action_mask"][9 * (piece - 1): 9 * piece] == 0): + if not all( + observation["action_mask"][9 * (piece - 1) : 9 * piece] == 0 + ): picked_up = True picked_up_pos = pos # Remove a placed piece piece_size_selected = (piece + 1) // 2 - index = np.where(env.unwrapped.board.squares == piece_to_pick_up)[0][0] + index = np.where( + env.unwrapped.board.squares == piece_to_pick_up + )[0][0] env.unwrapped.board.squares[index] = 0 # Set the only possible actions to be moving this piece to a new square - observation["action_mask"][pos + 9 * (piece - 1)] = 0 # TODO: check if this is already zero - observation["action_mask"][:9 * (piece - 1)] = 0 # Zero out all the possible actions - observation["action_mask"][9 * (piece):] = 0 + observation["action_mask"][ + pos + 9 * (piece - 1) + ] = 0 # TODO: check if this is already zero + observation["action_mask"][ + : 9 * (piece - 1) + ] = 0 # Zero out all the possible actions + observation["action_mask"][9 * (piece) :] = 0 # Place a piece (if it is legal to do so) else: if action_prev != -1: - env.unwrapped.board.squares_preview[pos + 9 * (piece_size - 1)] = 0 + env.unwrapped.board.squares_preview[ + pos + 9 * (piece_size - 1) + ] = 0 action = pos + 9 * (piece - 1) break return np.int32(action) @property def available_agents(self): - return self.env.agent_name_mapping \ No newline at end of file + return self.env.agent_name_mapping diff --git a/gobblet/game/random_admissible_policy_rllib.py b/gobblet/game/random_admissible_policy_rllib.py index 99cf7f2..3f7c308 100644 --- a/gobblet/game/random_admissible_policy_rllib.py +++ b/gobblet/game/random_admissible_policy_rllib.py @@ -1,14 +1,11 @@ -from typing import ( - List, - Optional, - Union, -) +from typing import List, Optional, Union -from ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import TensorStructType, TensorType -from ray.rllib.examples.policy.random_policy import RandomPolicy import numpy as np import tree # pip install dm_tree +from ray.rllib.examples.policy.random_policy import RandomPolicy +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import TensorStructType, TensorType + class RandomAdmissiblePolicy(RandomPolicy): def __init__(self, *args, **kwargs): @@ -16,20 +13,26 @@ def __init__(self, *args, **kwargs): @override(RandomPolicy) def compute_actions( - self, - obs_batch: Union[List[TensorStructType], TensorStructType], - state_batches: Optional[List[TensorType]] = None, - prev_action_batch: Union[List[TensorStructType], TensorStructType] = None, - prev_reward_batch: Union[List[TensorStructType], TensorStructType] = None, - **kwargs + self, + obs_batch: Union[List[TensorStructType], TensorStructType], + state_batches: Optional[List[TensorType]] = None, + prev_action_batch: Union[List[TensorStructType], TensorStructType] = None, + prev_reward_batch: Union[List[TensorStructType], TensorStructType] = None, + **kwargs, ): if "action_mask" in obs_batch.keys(): action_masks = obs_batch["action_mask"] - actions = [np.random.choice(np.arange(len(action_mask)), p=action_mask / np.sum(action_mask)) for - action_mask in action_masks] + actions = [ + np.random.choice( + np.arange(len(action_mask)), p=action_mask / np.sum(action_mask) + ) + for action_mask in action_masks + ] else: obs_batch_size = len(tree.flatten(obs_batch)[0]) - actions = [self.action_space_for_sampling.sample() for _ in range(obs_batch_size)] + actions = [ + self.action_space_for_sampling.sample() for _ in range(obs_batch_size) + ] return ( actions, [], diff --git a/gobblet/game/utils.py b/gobblet/game/utils.py index 73a439e..c42f8f5 100644 --- a/gobblet/game/utils.py +++ b/gobblet/game/utils.py @@ -1,99 +1,145 @@ # adapted from pettingzoo.classic.connect_four +import glob import os +import re +import subprocess +import time +from pathlib import Path +from typing import Union + import pygame -import sys + def get_image(path): + """Get image. + Load an image from file into a pygame object + + Returns: + sfc: Pygame surface object with the image + """ cwd = os.path.dirname(__file__) image = pygame.image.load(os.path.join(cwd, path)) sfc = pygame.Surface(image.get_size(), flags=pygame.SRCALPHA) sfc.blit(image, (0, 0)) return sfc + def load_chip(tile_size, filename, scale): + """Load chip. + + Load the image of a single chip + + Returns: + Chip: Pygame surface object containing the chip + """ chip = get_image(os.path.join("img", filename)) chip = pygame.transform.scale( chip, (int(tile_size * (scale)), int(tile_size * (scale))) ) return chip + def load_chip_preview(tile_size, filename, scale): - chip = get_image(os.path.join(os.path.join("img","preview"), filename)) + """Load chip preview. + + Load the preview image of a single chip + + Returns: + Chip: Pygame surface object containing the chip preview + """ + chip = get_image(os.path.join(os.path.join("img", "preview"), filename)) chip = pygame.transform.scale( chip, (int(tile_size * (scale)), int(tile_size * (scale))) ) return chip + # from https://github.com/michaelfeil/skyjo_rl/blob/dev/rlskyjo/utils.py -from pathlib import Path -from typing import Union -import glob -import os -import re + def get_project_root() -> Path: - """return Path to the project directory, top folder of gobblet-rl + """Get project root. + + return Path to the project directory, top folder of gobblet-rl. + Returns: Path: Path to the project directory """ return Path(__file__).parent.parent.parent.resolve() -def find_file_in_subdir(parent_dir: Union[Path, str], file_str: Union[Path, str], regex_match: str = None) -> Union[str, None]: - files = glob.glob( - os.path.join(parent_dir, "**", file_str), recursive=True - ) + +def find_file_in_subdir( + parent_dir: Union[Path, str], file_str: Union[Path, str], regex_match: str = None +) -> Union[str, None]: + """Find file in subdirectory. + + Finds the path of a file in a subdirectory + + Args: + parent_dir: directory to look for the file in + + Returns: + Path: Path to the project directory + """ + files = glob.glob(os.path.join(parent_dir, "**", file_str), recursive=True) if regex_match is not None: p = re.compile(regex_match) - files = [ s for s in files if p.match(s) ] + files = [s for s in files if p.match(s)] return sorted(files)[-1] if len(files) else None -# adapted from https://commons.wikimedia.org/wiki/File:SquareWaveFourierArrows.gif#Source_code -import time -import subprocess +# adapted from https://commons.wikimedia.org/wiki/File:SquareWaveFourierArrows.gif#Source_code class GIFRecorder: + """GIF Recorder. + + This class is used to record a PyGame surface and save it to a .gif file. """ - This class is used to record a PyGame surface and save it to a gif file. - """ - def __init__(self, out_file=f'game.gif'): - """ - Initialize the recorder - :param out_file: Output file to save the recording + + def __init__(self, out_file="game.gif"): + """Initialize the recorder. + + Args: + out_file: Output file to save the recording """ - print(f'Initializing GIF Recorder...') - print(f'Output of the recording will be saved to {out_file}.') + print("Initializing GIF Recorder...") + print(f"Output of the recording will be saved to {out_file}.") self.filename_list = [] self.frame_num = 0 self.start_time = time.time() self.path = get_project_root() - # self.path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir), os.pardir) # Root project directoy print(self.path) self.out_file = out_file self.ended = False - - def capture_frame(self, surf): - """ + """Capture frame. + Call this method every frame, pass in the pygame surface to capture. - :param surf: pygame surface to capture - :return: None - """ - """ + Note: surface must have the dimensions specified in the constructor. + + Args: + surf: pygame surface to capture - Note: surface must have the dimensions specified in the constructor. + Returns: + None """ - if not self.ended: # Stop saving frames after we have exported the recording + if not self.ended: # Stop saving frames after we have exported the recording # transform the pixels to the format used by open-cv - self.filename_list.append(os.path.join(self.path, f'temp_{time.time()}_' + str(self.frame_num) + '.png')) + self.filename_list.append( + os.path.join( + self.path, f"temp_{time.time()}_" + str(self.frame_num) + ".png" + ) + ) pygame.image.save(surf, self.filename_list[-1]) self.frame_num += 1 - # Convert indivual image files to GIF + # Convert individual image files to GIF def end_recording(self, surf): - """ + """End recording. + Call this method to stop recording. + :return: None """ if not self.ended: @@ -103,9 +149,13 @@ def end_recording(self, surf): # stop recording duration = time.time() - self.start_time - seconds_per_frame = duration/ self.frame_num + seconds_per_frame = duration / self.frame_num frame_delay = str(int(seconds_per_frame * 100)) - command_list = ['convert', '-delay', frame_delay, '-loop', '0'] + self.filename_list + [self.out_file] + command_list = ( + ["convert", "-delay", frame_delay, "-loop", "0"] + + self.filename_list + + [self.out_file] + ) # Use the "convert" command (part of ImageMagick) to build the animation subprocess.call(command_list, cwd=self.path) # Earlier, we saved an image file for each frame of the animation. Now diff --git a/gobblet/gobblet_v1.py b/gobblet/gobblet_v1.py index dceacff..c5cbbb9 100644 --- a/gobblet/gobblet_v1.py +++ b/gobblet/gobblet_v1.py @@ -1,2 +1,2 @@ -from gobblet.game.gobblet import env, raw_env, parallel_env # noqa: 401 -from gobblet.game.manual_policy import ManualPolicy \ No newline at end of file +from gobblet.game.gobblet import env, parallel_env, raw_env # noqa: 401 +from gobblet.game.manual_policy import ManualGobbletPolicy # noqa 401 diff --git a/requirements.txt b/requirements.txt index 74ced2a..a5b40ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ pytest==7.1.2 ray==2.2.0 tianshou==0.4.11 torch==1.12.1 +pre-commit==3.1.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 1977cec..374f6ae 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python from setuptools import setup -if __name__=="__main__": - setup() \ No newline at end of file +if __name__ == "__main__": + setup() diff --git a/tests/test_gobblet_env.py b/tests/test_gobblet_env.py index bd951e1..8834dfb 100644 --- a/tests/test_gobblet_env.py +++ b/tests/test_gobblet_env.py @@ -1,9 +1,11 @@ +import numpy as np import pettingzoo import pettingzoo.test import pytest -import numpy as np + from gobblet import gobblet_v1 + # Note: raw_env is required in order to test the board state, as env() only allows observations @pytest.fixture(scope="function") def env(): @@ -21,9 +23,7 @@ def test_reset(env): def test_reset_starting(env): "Verify that reset() sets the board state to the correct starting position" - assert ( - (env.board.squares == np.zeros(27)).all() - ) + assert (env.board.squares == np.zeros(27)).all() def test_api(env): diff --git a/tests/test_manual_policy_collector.py b/tests/test_manual_policy_collector.py index 9ac019b..d502c74 100644 --- a/tests/test_manual_policy_collector.py +++ b/tests/test_manual_policy_collector.py @@ -11,135 +11,501 @@ git+https://github.com/thu-ml/tianshou """ -from typing import Optional, Tuple +import time +from typing import Tuple -import gym import numpy as np -import torch from tianshou.env import DummyVectorEnv from tianshou.env.pettingzoo_env import PettingZooEnv from tianshou.policy import BasePolicy, MultiAgentPolicyManager - from gobblet import gobblet_v1 from gobblet.game.collector_manual_policy import ManualPolicyCollector -from gobblet.game.greedy_policy import GreedyPolicy -import time +from gobblet.game.greedy_policy import GreedyGobbletPolicy def get_agents() -> Tuple[BasePolicy, list]: env = get_env() - agents = [GreedyPolicy(), GreedyPolicy()] + agents = [GreedyGobbletPolicy(), GreedyGobbletPolicy()] policy = MultiAgentPolicyManager(agents, env) return policy, env.agents + def get_env(render_mode=None, args=None): return PettingZooEnv(gobblet_v1.env(render_mode=render_mode, args=args)) + # ======== allows the user to input moves and play vs a pre-trained agent ====== def test_collector() -> None: env = DummyVectorEnv([lambda: get_env(render_mode="human", args=None)]) policy, agents = get_agents() - collector = ManualPolicyCollector(policy, env, exploration_noise=True) # Collector for CPU actions + collector = ManualPolicyCollector( + policy, env, exploration_noise=True + ) # Collector for CPU actions pettingzoo_env = env.workers[0].env.env - output0 = np.array([[True, True, True, True, True, True, True, True, True, True, True, True, - True, True, True, True, True, True, True, True, True, True, True, True, - True, True, True, True, True, True, True, True, True, True, True, True, - True, True, True, True, True, True, True, True, True, True, True, True, - True, True, True, True, True, True]]) - assert(np.array_equal(collector.data.obs.mask, output0)) + output0 = np.array( + [ + [ + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + ] + ] + ) + assert np.array_equal(collector.data.obs.mask, output0) - ''' PLAYER 1''' + """ PLAYER 1""" action = np.array(18) result = collector.collect_result(action=action.reshape(1), render=0.1) - output1 = np.array([[False, True, True, True, True, True, True, True, True, False, True, True, - True, True, True, True, True, True, False, True, True, True, True, True, - True, True, True, False, True, True, True, True, True, True, True, True, - True, True, True, True, True, True, True, True, True, True, True, True, - True, True, True, True, True, True]]) - assert(np.array_equal(collector.data.obs.mask, output1)) + output1 = np.array( + [ + [ + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + ] + ] + ) + assert np.array_equal(collector.data.obs.mask, output1) - time.sleep(.25) + time.sleep(0.25) - ''' PLAYER 2 (covers it)''' + """ PLAYER 2 (covers it)""" action = np.array(36) result = collector.collect_result(action=action.reshape(1), render=0.1) - output2 = np.array([[False, True, True, True, True, True, True, True, True, - False, True, True, True, True, True, True, True, True, - False, False, False, False, False, False, False, False, False, - False, True, True, True, True, True, True, True, True, - False, True, True, True, True, True, True, True, True, - False, True, True, True, True, True, True, True, True]]) + output2 = np.array( + [ + [ + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + ] + ] + ) assert np.array_equal(collector.data.obs.mask, output2) - time.sleep(.25) + time.sleep(0.25) - ''' PLAYER 1''' - action = np.array(27+1) + """ PLAYER 1""" + action = np.array(27 + 1) result = collector.collect_result(action=action.reshape(1), render=0.1) - output3 = np.array([[False, False, True, True, True, True, True, True, True, - False, False, True, True, True, True, True, True, True, - False, False, True, True, True, True, True, True, True, - False, False, True, True, True, True, True, True, True, - False, True, True, True, True, True, True, True, True, - False, True, True, True, True, True, True, True, True]]) - assert(np.array_equal(collector.data.obs.mask, output3)) - - time.sleep(.25) - ''' PLAYER 2 (covers it)''' - action = np.array(45+1) + output3 = np.array( + [ + [ + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + False, + True, + True, + True, + True, + True, + True, + True, + True, + ] + ] + ) + assert np.array_equal(collector.data.obs.mask, output3) + + time.sleep(0.25) + """ PLAYER 2 (covers it)""" + action = np.array(45 + 1) result = collector.collect_result(action=action.reshape(1), render=0.1) - output4 = np.array([[False, False, True, True, True, True, True, True, True, - False, False, True, True, True, True, True, True, True, - False, False, False, False, False, False, False, False, False, - False, False, False, False, False, False, False, False, False, - False, False, True, True, True, True, True, True, True, - False, False, True, True, True, True, True, True, True]]) - assert(np.array_equal(collector.data.obs.mask, output4)) + output4 = np.array( + [ + [ + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + True, + True, + True, + True, + True, + True, + ] + ] + ) + assert np.array_equal(collector.data.obs.mask, output4) - time.sleep(.25) + time.sleep(0.25) - ''' PLAYER 1 (tries to move covered piece [ILLEGAL])''' - action = np.array(27+2) + """ PLAYER 1 (tries to move covered piece [ILLEGAL])""" + action = np.array(27 + 2) # Moves 18-35 should be illegal as they are with medium pieces. (36 and 37 as well but they are with a large piece) - output6 = [2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 38, 39, 40, 41, 42, 43, 44, 47, 48, 49, 50, 51, 52, 53] + output6 = [ + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 38, + 39, + 40, + 41, + 42, + 43, + 44, + 47, + 48, + 49, + 50, + 51, + 52, + 53, + ] legal_moves = pettingzoo_env.unwrapped._legal_moves() - assert(output6 == legal_moves) + assert output6 == legal_moves - output5 = np.array([[False, False, True, True, True, True, True, True, True, - False, False, True, True, True, True, True, True, True, - False, False, False, False, False, False, False, False, False, - False, False, False, False, False, False, False, False, False, - False, False, True, True, True, True, True, True, True, - False, False, True, True, True, True, True, True, True]]) - assert(np.array_equal(collector.data.obs.mask, output5)) + output5 = np.array( + [ + [ + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + True, + True, + True, + True, + True, + True, + ] + ] + ) + assert np.array_equal(collector.data.obs.mask, output5) result = collector.collect_result(action=action.reshape(1), render=0.1) # Result should be empty because this is an illegal move - output7 = {'n/ep': 0, 'n/st': 1, 'rews': np.array([], dtype=np.float64), 'lens': np.array([], dtype=np.int64), 'idxs': np.array([], dtype=np.int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0} - assert(str(result) == str(output7)) + output7 = { + "n/ep": 0, + "n/st": 1, + "rews": np.array([], dtype=np.float64), + "lens": np.array([], dtype=np.int64), + "idxs": np.array([], dtype=np.int64), + "rew": 0, + "len": 0, + "rew_std": 0, + "len_std": 0, + } + assert str(result) == str(output7) # Board state should be unchanged because the bot tried to execute an illegal move" - output8 = np.array([[[ 0., 0., 0.], - [ 0., 0., 0.], - [ 0., 0., 0.]], - [[ 3., 4., 0.], - [ 0., 0., 0.], - [ 0., 0., 0.]], - [[-5., -6., 0.], - [ 0., 0., 0.], - [ 0., 0., 0.]]]) - assert(np.array_equal(pettingzoo_env.unwrapped.board.squares.reshape(3,3,3), output8)) + output8 = np.array( + [ + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[3.0, 4.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [[-5.0, -6.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + ] + ) + assert np.array_equal( + pettingzoo_env.unwrapped.board.squares.reshape(3, 3, 3), output8 + ) if __name__ == "__main__": # train the agent and watch its performance in a match! print("Starting game...") - test_collector() \ No newline at end of file + test_collector()