Merge pull request #169 from l1onh3art88/tower_climb
Tower climb
jsuarez5341 authored Feb 18, 2025
2 parents 2299926 + 6c8c123 commit 3b5ab6a
Showing 20 changed files with 2,514 additions and 44 deletions.
59 changes: 59 additions & 0 deletions config/ocean/tower_climb.ini
@@ -0,0 +1,59 @@
[base]
package = ocean
env_name = puffer_tower_climb
policy_name = TowerClimb
rnn_name = TowerClimbLSTM

[env]
num_envs = 4096
reward_climb_row = 0.636873185634613
reward_fall_row = -0.15898257493972778
reward_illegal_move = -0.003928301855921745
reward_move_block = 0.235064297914505

[train]
total_timesteps = 1_500_000_000
checkpoint_interval = 50
num_envs = 2
num_workers = 2
env_batch_size = 1
batch_size = 131072
update_epochs = 1
minibatch_size = 16384
bptt_horizon = 8
ent_coef = 0.09437782543268033
gae_lambda = 0.8891903904462561
gamma = 0.9844750061125859
learning_rate = 0.0025992502188953532
max_grad_norm = 0.9384621977806091
vf_coef = 0.5290350317955017
anneal_lr = False
device = cuda

[sweep.metric]
goal = maximize
name = environment/levels_completed

[sweep.parameters.train.parameters.total_timesteps]
distribution = uniform
min = 50_000_000
max = 200_000_000

[sweep.parameters.env.parameters.reward_climb_row]
distribution = uniform
min = 0.0
max = 1.0

[sweep.parameters.env.parameters.reward_fall_row]
distribution = uniform
min = -1.0
max = 0.0

[sweep.parameters.env.parameters.reward_illegal_move]
distribution = uniform
min = -1e-2
max = -1e-4

[sweep.parameters.env.parameters.reward_move_block]
distribution = uniform
min = 0.0
max = 1.0
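A quick sanity check of the [train] batch geometry above (illustrative arithmetic only, not part of the commit): batch_size 131072 divides evenly into 8 minibatches of 16384, and with bptt_horizon 8 each minibatch holds 2048 truncated-BPTT segments.

batch_size, minibatch_size, bptt_horizon = 131072, 16384, 8
assert batch_size % minibatch_size == 0
num_minibatches = batch_size // minibatch_size            # 8 minibatches per update epoch
segments_per_minibatch = minibatch_size // bptt_horizon   # 2048 BPTT segments of length 8
print(num_minibatches, segments_per_minibatch)            # -> 8 2048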
3 changes: 2 additions & 1 deletion pufferlib/ocean/environment.py
@@ -17,6 +17,7 @@
from .grid.grid import PufferGrid
#from .rocket_lander import rocket_lander
from .trash_pickup.trash_pickup import TrashPickupEnv
from .tower_climb.tower_climb import TowerClimb

def make_foraging(width=1080, height=720, num_agents=4096, horizon=512,
discretize=True, food_reward=0.1, render_mode='rgb_array'):
@@ -140,7 +141,7 @@ def make_multiagent(buf=None, **kwargs):
'go': Go,
'rware': Rware,
'trash_pickup': TrashPickupEnv,

'tower_climb': TowerClimb,
#'rocket_lander': rocket_lander.RocketLander,
'foraging': make_foraging,
'predator_prey': make_predator_prey,
50 changes: 50 additions & 0 deletions pufferlib/ocean/torch.py
@@ -325,3 +325,53 @@ def decode_actions(self, flat_hidden, lookup, concat=None):
action = self.actor(flat_hidden)
value = self.value_fn(flat_hidden)
return action, value

class TowerClimbLSTM(pufferlib.models.LSTMWrapper):
    def __init__(self, env, policy, input_size=256, hidden_size=256, num_layers=1):
        super().__init__(env, policy, input_size, hidden_size, num_layers)

class TowerClimb(nn.Module):
    def __init__(self, env, cnn_channels=16, hidden_size=256, **kwargs):
        super().__init__()
        # 3D CNN over the 5x5x9 board volume
        self.network = nn.Sequential(
            pufferlib.pytorch.layer_init(
                nn.Conv3d(1, cnn_channels, 3, stride=1)),
            nn.ReLU(),
            pufferlib.pytorch.layer_init(
                nn.Conv3d(cnn_channels, cnn_channels, 3, stride=1)),
            nn.Flatten()
        )
        # Two valid 3x3x3 convolutions reduce the 5x5x9 volume to 1x1x5
        cnn_flat_size = cnn_channels * 1 * 1 * 5

        # Process player obs
        self.flat = pufferlib.pytorch.layer_init(nn.Linear(3, 16))

        # Combine board and player features
        self.proj = pufferlib.pytorch.layer_init(
            nn.Linear(cnn_flat_size + 16, hidden_size))
        self.actor = pufferlib.pytorch.layer_init(
            nn.Linear(hidden_size, env.single_action_space.n), std=0.01)
        self.value_fn = pufferlib.pytorch.layer_init(
            nn.Linear(hidden_size, 1), std=1)

    def forward(self, observations, state=None):
        hidden, lookup = self.encode_observations(observations)
        actions, value = self.decode_actions(hidden, lookup)
        return actions, value, state

    def encode_observations(self, observations):
        # First 225 values are the flattened 5x5x9 board; the last 3 describe the player
        board_state = observations[:, :225]
        player_info = observations[:, -3:]
        board_features = board_state.view(-1, 1, 5, 5, 9).float()
        cnn_features = self.network(board_features)
        flat_features = self.flat(player_info.float())

        features = torch.cat([cnn_features, flat_features], dim=1)
        features = self.proj(features)
        return features, None

    def decode_actions(self, flat_hidden, lookup, concat=None):
        action = self.actor(flat_hidden)
        value = self.value_fn(flat_hidden)
        return action, value
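For reference, a standalone shape walkthrough of the encoder above, derived from the layer definitions (the 5x5x9 board layout comes from the view call in encode_observations):

import torch
import torch.nn as nn

cnn_channels, hidden_size = 16, 256
board = torch.zeros(4, 1, 5, 5, 9)                          # (batch, channel, 5, 5, 9)
x = nn.Conv3d(1, cnn_channels, 3, stride=1)(board)          # -> (4, 16, 3, 3, 7)
x = nn.Conv3d(cnn_channels, cnn_channels, 3, stride=1)(x)   # -> (4, 16, 1, 1, 5)
x = x.flatten(1)                                            # -> (4, 80) == cnn_channels * 1 * 1 * 5
player = nn.Linear(3, 16)(torch.zeros(4, 3))                # -> (4, 16)
features = nn.Linear(80 + 16, hidden_size)(torch.cat([x, player], dim=1))  # -> (4, 256)

Two valid 3x3x3 convolutions shrink the 5x5x9 volume to 1x1x5, which is why cnn_flat_size is written as cnn_channels * 1 * 1 * 5.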

169 changes: 169 additions & 0 deletions pufferlib/ocean/tower_climb/cy_tower_climb.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
from libc.stdlib cimport calloc, free, rand
from libc.stdint cimport uint64_t
import numpy as np
cdef extern from "tower_climb.h":
int LOG_BUFFER_SIZE

ctypedef struct Log:
float episode_return;
float episode_length;
float rows_cleared;
float levels_completed;

ctypedef struct LogBuffer
LogBuffer* allocate_logbuffer(int)
void free_logbuffer(LogBuffer*)
Log aggregate_and_clear(LogBuffer*)

ctypedef struct Level:
const int* map;
int rows;
int cols;
int size;
int total_length;
int goal_location;
int spawn_location;

ctypedef struct PuzzleState:
unsigned char* blocks;
int robot_position;
int robot_orientation;
int robot_state;
int block_grabbed;

ctypedef struct VisitedNode:
PuzzleState state;
uint64_t hashVal;
VisitedNode* next;

ctypedef struct BFSNode:
PuzzleState state;
int depth;
int parent;
int action;

ctypedef struct CTowerClimb:
unsigned char* observations;
int* actions;
float* rewards;
unsigned char* dones;
LogBuffer* log_buffer;
Log log;
float score;
Level* level;
PuzzleState* state;
int distance_to_goal;
float reward_climb_row;
float reward_fall_row;
float reward_illegal_move;
float reward_move_block;

ctypedef struct Client:
int enable_animations;
int isMoving;

void init(CTowerClimb* env)
void free_allocated(CTowerClimb* env)
void init_level(Level* level)
void init_puzzle_state(PuzzleState* state)
void init_random_level(CTowerClimb* env, int goal_height, int max_moves, int min_moves, int seed)
void cy_init_random_level(Level* level, int goal_height, int max_moves, int min_moves, int seed)
void levelToPuzzleState(Level* level, PuzzleState* state)
void setPuzzle(CTowerClimb* dest, PuzzleState* src, Level* lvl)


Client* make_client(CTowerClimb* env)
void close_client(Client* client)
void c_render(Client* client, CTowerClimb* env)
void c_reset(CTowerClimb* env)
int c_step(CTowerClimb* env)

cdef class CyTowerClimb:
cdef:
CTowerClimb* envs
Level* levels
PuzzleState* puzzle_states
Client* client
LogBuffer* logs
int num_envs
int num_maps

def __init__(self, unsigned char[:, :] observations, int[:] actions,
float[:] rewards, unsigned char[:] terminals, int num_envs,
int num_maps, float reward_climb_row, float reward_fall_row,
float reward_illegal_move, float reward_move_block):

self.client = NULL
self.num_envs = num_envs
self.num_maps = num_maps
self.levels = <Level*> calloc(num_maps, sizeof(Level))
self.puzzle_states = <PuzzleState*> calloc(num_maps, sizeof(PuzzleState))
self.envs = <CTowerClimb*> calloc(num_envs, sizeof(CTowerClimb))
self.logs = allocate_logbuffer(LOG_BUFFER_SIZE)
cdef int i
for i in range(num_envs):
self.envs[i] = CTowerClimb(
observations=&observations[i, 0],
actions=&actions[i],
rewards=&rewards[i],
dones=&terminals[i],
log_buffer=self.logs,
reward_climb_row=reward_climb_row,
reward_fall_row=reward_fall_row,
reward_illegal_move=reward_illegal_move,
reward_move_block=reward_move_block,
)
init(&self.envs[i])
self.client = NULL

cdef int goal_height
cdef int max_moves
for i in range(num_maps):
goal_height = np.random.randint(5,9)
min_moves = 10
max_moves = 15
init_level(&self.levels[i])
init_puzzle_state(&self.puzzle_states[i])
cy_init_random_level(&self.levels[i], goal_height, max_moves, min_moves, i)
levelToPuzzleState(&self.levels[i], &self.puzzle_states[i])
if (i + 1 ) % 50 == 0:
print(f"Created {i+1} maps..")


def reset(self):
cdef int i, idx
for i in range(self.num_envs):
idx = np.random.randint(0, self.num_maps)
c_reset(&self.envs[i])
setPuzzle(&self.envs[i], &self.puzzle_states[idx], &self.levels[idx])

def step(self):
cdef int i, idx, done
for i in range(self.num_envs):
done = c_step(&self.envs[i])
if (done):
idx = np.random.randint(0, self.num_maps)
c_reset(&self.envs[i])
setPuzzle(&self.envs[i], &self.puzzle_states[idx], &self.levels[idx])

def render(self):
cdef CTowerClimb* env = &self.envs[0]
if self.client == NULL:
self.client = make_client(env)
self.client.enable_animations = 1
cdef int isMoving
while True:
c_render(self.client, &self.envs[0])
isMoving = self.client.isMoving
if not isMoving:
break
def close(self):
if self.client != NULL:
close_client(self.client)
self.client = NULL

free(self.envs)

def log(self):
cdef Log log = aggregate_and_clear(self.logs)
return log
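A minimal driver for CyTowerClimb might look like the sketch below. It is hypothetical: the constructor signature follows __init__ above, but the observation width of 228 uint8 per env (225 board cells plus 3 player values) is an assumption inferred from the slicing in torch.py, and the reward arguments are rounded versions of the defaults in config/ocean/tower_climb.ini.

import numpy as np
from pufferlib.ocean.tower_climb.cy_tower_climb import CyTowerClimb

num_envs, num_maps, obs_size = 8, 128, 228   # obs_size is an assumption (225 board + 3 player)
observations = np.zeros((num_envs, obs_size), dtype=np.uint8)
actions = np.zeros(num_envs, dtype=np.int32)
rewards = np.zeros(num_envs, dtype=np.float32)
terminals = np.zeros(num_envs, dtype=np.uint8)

envs = CyTowerClimb(observations, actions, rewards, terminals, num_envs, num_maps,
                    0.64, -0.16, -0.004, 0.24)  # reward_climb_row, reward_fall_row,
                                                # reward_illegal_move, reward_move_block
envs.reset()
actions[:] = 0     # a policy would write its actions into the shared buffer here
envs.step()        # finished envs are reset onto a random pre-generated map
log = envs.log()
envs.close()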