Skip to content

Commit

Permalink
[core][autoscaler] GCS Autoscaler V2: Add node type name to ray (#36714)
Browse files Browse the repository at this point in the history
Why are these changes needed?
This PR adds a way to pass the instance type (the Ray node type name) to Ray and makes it available to the autoscaler.

I will add e2e tests (installer -> GCS -> autoscaler) in a separate PR. This PR only adds unit tests.
---------

Signed-off-by: rickyyx <rickyx@anyscale.com>
  • Loading branch information
rickyyx authored Jun 24, 2023
1 parent 761b3e4 commit 789cf34
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 4 deletions.
2 changes: 2 additions & 0 deletions python/ray/autoscaler/v2/instance_manager/ray_installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,13 @@ def install_ray(self, instance: Instance, head_node_ip: str) -> bool:
# `RAY_HEAD_IP=<head_node_ip> \
# RAY_CLOUD_INSTANCE_ID=<instance_id> \
# ray start --head ...`
# See src/ray/common/constants.h for ENV name definitions.
ray_start_commands=with_envs(
ray_start_commands,
{
"RAY_HEAD_IP": head_node_ip,
"RAY_CLOUD_INSTANCE_ID": instance.instance_id,
"RAY_NODE_TYPE_NAME": instance.instance_type,
},
),
runtime_hash=self._config.runtime_hash,
Expand Down
5 changes: 4 additions & 1 deletion python/ray/tests/test_state_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2135,7 +2135,7 @@ def verify():


@pytest.mark.asyncio
async def test_node_instance_id(ray_start_cluster, monkeypatch):
async def test_cloud_envs(ray_start_cluster, monkeypatch):
cluster = ray_start_cluster
cluster.add_node(num_cpus=1, node_name="head_node")
ray.init(address=cluster.address)
Expand All @@ -2144,6 +2144,7 @@ async def test_node_instance_id(ray_start_cluster, monkeypatch):
"RAY_CLOUD_INSTANCE_ID",
"test_cloud_id",
)
m.setenv("RAY_NODE_TYPE_NAME", "test-node-type")
cluster.add_node(num_cpus=1, node_name="worker_node")
client = state_source_client(cluster.address)

Expand All @@ -2154,8 +2155,10 @@ async def verify():
for node_info in reply.node_info_list:
if node_info.node_name == "worker_node":
assert node_info.instance_id == "test_cloud_id"
assert node_info.node_type_name == "test-node-type"
else:
assert node_info.instance_id == ""
assert node_info.node_type_name == ""

return True

Expand Down
9 changes: 9 additions & 0 deletions src/ray/common/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,18 @@ constexpr char kSetupWorkerFilename[] = "setup_worker.py";
/// The version of Ray
constexpr char kRayVersion[] = "3.0.0.dev0";

/*****************************/
/* ENV labels for autoscaler */
/*****************************/
/// Name of the environment variable that carries the cloud instance id.
constexpr char kNodeCloudInstanceIdEnv[] = "RAY_CLOUD_INSTANCE_ID";

/// Name of the environment variable that carries the ray node type name.
constexpr char kNodeTypeNameEnv[] = "RAY_NODE_TYPE_NAME";

/**********************************/
/* ENV labels for autoscaler ends */
/**********************************/

/// Key for the placement group's bundle placement constraint.
/// Used by FormatPlacementGroupLabelName()
constexpr char kPlacementGroupConstraintKeyPrefix[] = "_PG_";
Expand Down
1 change: 1 addition & 0 deletions src/ray/gcs/gcs_server/gcs_autoscaler_state_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ void GcsAutoscalerStateManager::GetNodeStates(
auto node_state_proto = state->add_node_states();
node_state_proto->set_node_id(gcs_node_info.node_id());
node_state_proto->set_instance_id(gcs_node_info.instance_id());
node_state_proto->set_ray_node_type_name(gcs_node_info.node_type_name());
node_state_proto->set_node_state_version(last_cluster_resource_state_version_);
node_state_proto->set_status(status);

Expand Down
2 changes: 2 additions & 0 deletions src/ray/protobuf/experimental/autoscaler.proto
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,14 @@ message NodeState {

// The instance id that the node is running on.
// This is passed in when the node is registered.
// Set by ray from ENV at src/ray/common/constants.h::kNodeCloudInstanceIdEnv
string instance_id = 2;

// The node type name, e.g. ray-head-node, matching `available_node_types`
// in the autoscaler config. See `ray/autoscaler/ray-schema.json`
// Should be set when a ray node is starting - and this will be empty
// if it's not set when starting the node.
// Set by ray from ENV at src/ray/common/constants.h::kNodeTypeNameEnv
string ray_node_type_name = 3;

// The available resources on the node.
Expand Down
4 changes: 4 additions & 0 deletions src/ray/protobuf/gcs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,10 @@ message GcsNodeInfo {
// The instance id of the node if it's running on a cloud provider.
string instance_id = 13;

// The node type name of the node if it's running on a cloud provider.
// Set through ENV of src/ray/common/constants.h::kNodeTypeNameEnv
string node_type_name = 14;

// The unix ms timestamp the node was started at.
uint64 start_time_ms = 23;
// The unix ms timestamp the node was ended at.
Expand Down
9 changes: 6 additions & 3 deletions src/ray/raylet/raylet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,14 @@ Raylet::Raylet(instrumented_io_context &main_service,
resource_map.end());
self_node_info_.set_start_time_ms(current_sys_time_ms());
self_node_info_.set_is_head_node(is_head_node);
// Read from env
auto instance_id = std::getenv(kNodeCloudInstanceIdEnv);
self_node_info_.set_instance_id(instance_id ? instance_id : "");
self_node_info_.mutable_labels()->insert(node_manager_config.labels.begin(),
node_manager_config.labels.end());

// Setting up autoscaler related fields from ENV
auto instance_id = std::getenv(kNodeCloudInstanceIdEnv);
self_node_info_.set_instance_id(instance_id ? instance_id : "");
auto cloud_node_type_name = std::getenv(kNodeTypeNameEnv);
self_node_info_.set_node_type_name(cloud_node_type_name ? cloud_node_type_name : "");
}

Raylet::~Raylet() {}
Expand Down

0 comments on commit 789cf34

Please sign in to comment.