[RLlib] Change placement group strategy for learner #36929

Merged

Changes from 2 commits
@@ -9,9 +9,9 @@ head_node_type:

 worker_node_types:
 - name: worker_node
-  instance_type: g4dn.xlarge
-  min_workers: 2
-  max_workers: 2
+  instance_type: g3.8xlarge
+  min_workers: 1
+  max_workers: 1
Collaborator:
@avnishn since you changed the number of workers, you also need to change the number here https://github.com/ray-project/ray/blob/master/release/release_tests.yaml#L3679 so that the test waits for the correct number of nodes.

This broke this test, and some other RLlib tests, in the latest run: https://buildkite.com/ray-project/release-tests-branch/builds/1846

Please cherry-pick the fix as well, since the release branch has already been cut.
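For context, the linked release_tests.yaml entry pins how many nodes the release test waits for before it starts, so it has to stay in sync with the compute config above (one head node plus min_workers). A rough sketch of such an entry follows; the test name and surrounding fields are assumptions, the relevant knob is wait_for_nodes.num_nodes:

# Hypothetical release_tests.yaml entry; names and values are illustrative only.
- name: rllib_multi_node_e2e_training_smoke_test
  run:
    timeout: 3600
    script: pytest smoke_tests/smoke_test_basic_multi_node_training_learner.py
    wait_for_nodes:
      num_nodes: 2  # must equal head node + min_workers (1 + 1 after this change)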

Collaborator:

I'm putting up #37022 to fix this.

   use_spot: false

 aws:

@@ -11,7 +11,7 @@ head_node_type:

 worker_node_types:
 - name: worker_node
-  instance_type: n1-standard-4-nvidia-tesla-t4-1 # g4dn.xlarge
-  min_workers: 2
-  max_workers: 2
+  instance_type: n1-standard-4-nvidia-t4-16gb-2
+  min_workers: 1
+  max_workers: 1
   use_spot: false
101 changes: 59 additions & 42 deletions rllib/algorithms/algorithm.py
@@ -131,6 +131,42 @@
 from ray.util.timer import _Timer
 from ray.tune.registry import get_trainable_cls

+
+try:
+    from ray.rllib.extensions import AlgorithmBase
+except ImportError:
+
+    class AlgorithmBase:
+
+        @staticmethod
+        def _get_learner_bundles(cf: AlgorithmConfig):
Contributor:
Document what it returns?

Member (Author):
Done.

"""Selects the right resource bundles for learner workers based off of cf.

Args:
cf: The algorithm config.
"""
if cf.num_learner_workers > 0:
if cf.num_gpus_per_learner_worker:
learner_bundles = [
{"GPU": cf.num_learner_workers * cf.num_gpus_per_learner_worker}
]
elif cf.num_cpus_per_learner_worker:
learner_bundles = [
{
"CPU": cf.num_cpus_per_learner_worker * cf.num_learner_workers,
}
]
else:
learner_bundles = {
# sampling and training is not done concurrently when local is
# used, so pick the max.
"CPU": max(
cf.num_cpus_per_learner_worker, cf.num_cpus_for_local_worker
),
"GPU": cf.num_gpus_per_learner_worker,
}
return learner_bundles

tf1, tf, tfv = try_import_tf()

logger = logging.getLogger(__name__)
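To make the new bundling concrete, here is a small standalone sketch (not part of the PR) that mirrors the helper above, with the local-learner case wrapped in a one-element list for uniformity. It prints the bundle shapes for two hypothetical configs; the key change is that all remote learner workers are folded into a single bundle instead of one bundle per worker, and since each placement group bundle must fit on a single node, this effectively co-locates the learner workers on one machine:

# Standalone sketch, not from the PR: mirrors _get_learner_bundles with a plain
# namespace standing in for a real AlgorithmConfig.
from types import SimpleNamespace


def get_learner_bundles(cf):
    if cf.num_learner_workers > 0:
        if cf.num_gpus_per_learner_worker:
            # All learner GPUs packed into one bundle (the new strategy).
            return [{"GPU": cf.num_learner_workers * cf.num_gpus_per_learner_worker}]
        elif cf.num_cpus_per_learner_worker:
            return [{"CPU": cf.num_cpus_per_learner_worker * cf.num_learner_workers}]
    # Local learner (num_learner_workers == 0): one bundle sized for the local worker.
    return [
        {
            "CPU": max(cf.num_cpus_per_learner_worker, cf.num_cpus_for_local_worker),
            "GPU": cf.num_gpus_per_learner_worker,
        }
    ]


# Two remote learner workers with one GPU each -> a single {"GPU": 2} bundle.
print(get_learner_bundles(SimpleNamespace(
    num_learner_workers=2, num_gpus_per_learner_worker=1,
    num_cpus_per_learner_worker=1, num_cpus_for_local_worker=1,
)))  # [{'GPU': 2}]

# Local learner only -> one bundle sized to the larger of the two CPU settings.
print(get_learner_bundles(SimpleNamespace(
    num_learner_workers=0, num_gpus_per_learner_worker=0,
    num_cpus_per_learner_worker=1, num_cpus_for_local_worker=2,
)))  # [{'CPU': 2, 'GPU': 0}]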
@@ -146,7 +182,7 @@ def with_common_config(*args, **kwargs):


 @PublicAPI
-class Algorithm(Trainable):
+class Algorithm(Trainable, AlgorithmBase):
     """An RLlib algorithm responsible for optimizing one or more Policies.

     Algorithms contain a WorkerSet under `self.workers`. A WorkerSet is
@@ -2157,30 +2193,26 @@ def default_resource_request(
         eval_cf.freeze()

         # resources for the driver of this trainable
-        if cf._enable_learner_api:
-            if cf.num_learner_workers == 0:
-                # in this case local_worker only does sampling and training is done on
-                # local learner worker
-                driver = {
-                    # sampling and training is not done concurrently when local is
-                    # used, so pick the max.
-                    "CPU": max(
-                        cf.num_cpus_per_learner_worker, cf.num_cpus_for_local_worker
-                    ),
-                    "GPU": cf.num_gpus_per_learner_worker,
-                }
-            else:
-                # in this case local_worker only does sampling and training is done on
-                # remote learner workers
-                driver = {"CPU": cf.num_cpus_for_local_worker, "GPU": 0}
-        else:
-            # Without Learner API, the local_worker can do both sampling and training.
-            # So, we need to allocate the same resources for the driver as for the
-            # local_worker.
-            driver = {
-                "CPU": cf.num_cpus_for_local_worker,
-                "GPU": 0 if cf._fake_gpus else cf.num_gpus,
-            }
+        # if cf._enable_learner_api:
+        #     if cf.num_learner_workers == 0:
+        #         # in this case local_worker only does sampling and training is done on
+        #         # local learner worker
+        #         driver = {
+        #             # sampling and training is not done concurrently when local is
+        #             # used, so pick the max.
+        #             "CPU": max(
+        #                 cf.num_cpus_per_learner_worker, cf.num_cpus_for_local_worker
+        #             ),
+        #             "GPU": cf.num_gpus_per_learner_worker,
+        #         }
+        #     else:
+        #         # in this case local_worker only does sampling and training is done on
+        #         # remote learner workers
+        #         driver = {"CPU": cf.num_cpus_for_local_worker, "GPU": 0}
+        driver = {
+            "CPU": cf.num_cpus_for_local_worker,
+            "GPU": 0 if cf._fake_gpus else cf.num_gpus,
+        }

         # resources for remote rollout env samplers
         rollout_bundles = [
@@ -2222,23 +2254,8 @@ def default_resource_request(

         # resources for remote learner workers
         learner_bundles = []
-        if cf._enable_learner_api and cf.num_learner_workers > 0:
-            # can't specify cpus for learner workers at the same
-            # time as gpus
-            if cf.num_gpus_per_learner_worker:
-                learner_bundles = [
-                    {
-                        "GPU": cf.num_gpus_per_learner_worker,
-                    }
-                    for _ in range(cf.num_learner_workers)
-                ]
-            elif cf.num_cpus_per_learner_worker:
-                learner_bundles = [
-                    {
-                        "CPU": cf.num_cpus_per_learner_worker,
-                    }
-                    for _ in range(cf.num_learner_workers)
-                ]
+        if cf._enable_learner_api:
+            learner_bundles = cls._get_learner_bundles(cf)

         bundles = [driver] + rollout_bundles + evaluation_bundles + learner_bundles
29 changes: 3 additions & 26 deletions rllib/algorithms/impala/impala.py
@@ -839,32 +839,9 @@ def default_resource_request(
         # factories.
         if cf._enable_learner_api:
             # Resources for the Algorithm.
-            if cf.num_learner_workers == 0:
-                # if num_learner_workers is 0, then we need to allocate one gpu if
-                # num_gpus_per_learner_worker is greater than 0.
-                trainer_bundle = [
-                    {
-                        "CPU": cf.num_cpus_per_learner_worker,
-                        "GPU": cf.num_gpus_per_learner_worker,
-                    }
-                ]
-            else:
-                if cf.num_gpus_per_learner_worker:
-                    trainer_bundle = [
-                        {
-                            "GPU": cf.num_gpus_per_learner_worker,
-                        }
-                        for _ in range(cf.num_learner_workers)
-                    ]
-                elif cf.num_cpus_per_learner_worker:
-                    trainer_bundle = [
-                        {
-                            "CPU": cf.num_cpus_per_learner_worker,
-                        }
-                        for _ in range(cf.num_learner_workers)
-                    ]
-
-            bundles += trainer_bundle
+            learner_bundles = cls._get_learner_bundles(cf)
+
+            bundles += learner_bundles

         # Return PlacementGroupFactory containing all needed resources
         # (already properly defined as device bundles).
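For reference, the bundles assembled in default_resource_request are ultimately handed to Tune as a placement group request. A minimal, illustrative sketch of what that request looks like after this change, with made-up resource numbers; the "PACK" strategy is an assumption here (RLlib takes the actual value from the config's placement_strategy setting):

# Illustrative only: the shape of the placement group request after this PR.
from ray.tune import PlacementGroupFactory

driver = {"CPU": 1, "GPU": 0}          # local worker / driver bundle
rollout_bundles = [{"CPU": 1}] * 2     # one bundle per rollout worker
evaluation_bundles = []                # no dedicated evaluation workers here
learner_bundles = [{"GPU": 2}]         # single packed bundle for all learner workers

pgf = PlacementGroupFactory(
    bundles=[driver] + rollout_bundles + evaluation_bundles + learner_bundles,
    strategy="PACK",  # assumption; the real code uses cf.placement_strategy
)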