-
Notifications
You must be signed in to change notification settings - Fork 6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[core] Add ClusterID token to GCS server [3/n] #36535
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ | |
#include "ray/gcs/gcs_server/store_client_kv.h" | ||
#include "ray/gcs/store_client/observable_store_client.h" | ||
#include "ray/pubsub/publisher.h" | ||
#include "ray/util/util.h" | ||
|
||
namespace ray { | ||
namespace gcs { | ||
|
@@ -86,6 +87,7 @@ GcsServer::GcsServer(const ray::gcs::GcsServerConfig &config, | |
RAY_CHECK(status.ok()) << "Failed to put internal config"; | ||
this->main_service_.stop(); | ||
}; | ||
|
||
ray::rpc::StoredConfig stored_config; | ||
stored_config.set_config(config_.raylet_config_list); | ||
RAY_CHECK_OK(gcs_table_storage_->InternalConfigTable().Put( | ||
|
@@ -138,7 +140,45 @@ RedisClientOptions GcsServer::GetRedisClientOptions() const { | |
// Entry point for starting the GCS server: creates the GcsInitData loader,
// initializes the KV manager, and chains DoStart() behind the async table load
// and the cluster ID lookup.
void GcsServer::Start() { | ||
// Load gcs tables data asynchronously. | ||
auto gcs_init_data = std::make_shared<GcsInitData>(gcs_table_storage_); | ||
// NOTE(review): this text is a diff rendering with the +/- markers stripped. Per the
// hunk header (7 old lines -> 45 new lines) the AsyncLoad call on the next line looks
// like the pre-change row that the AsyncLoad block further down replaces — confirm
// against the real file before treating both calls as live code.
gcs_init_data->AsyncLoad([this, gcs_init_data] { DoStart(*gcs_init_data); }); | ||
// Init KV Manager. This needs to be initialized first here so that | ||
// it can be used to retrieve the cluster ID. | ||
InitKVManager(); | ||
// gcs_init_data is captured by value (shared_ptr) in both lambdas so it is still
// available when DoStart() finally dereferences it.
gcs_init_data->AsyncLoad([this, gcs_init_data] { | ||
GetOrGenerateClusterId([this, gcs_init_data](ClusterID cluster_id) { | ||
// The cluster ID is set on the RPC server before DoStart() runs.
rpc_server_.SetClusterId(cluster_id); | ||
DoStart(*gcs_init_data); | ||
}); | ||
}); | ||
} | ||
|
||
void GcsServer::GetOrGenerateClusterId( | ||
std::function<void(ClusterID cluster_id)> &&continuation) { | ||
static std::string const kTokenNamespace = "cluster"; | ||
kv_manager_->GetInstance().Get( | ||
kTokenNamespace, | ||
kClusterIdKey, | ||
[this, continuation = std::move(continuation)]( | ||
std::optional<std::string> provided_cluster_id) mutable { | ||
if (!provided_cluster_id.has_value()) { | ||
ClusterID cluster_id = ClusterID::FromRandom(); | ||
RAY_LOG(INFO) << "No existing server cluster ID found. Generating new ID: " | ||
<< cluster_id.Hex(); | ||
kv_manager_->GetInstance().Put( | ||
kTokenNamespace, | ||
kClusterIdKey, | ||
cluster_id.Binary(), | ||
false, | ||
[&cluster_id, | ||
continuation = std::move(continuation)](bool added_entry) mutable { | ||
RAY_CHECK(added_entry) << "Failed to persist new cluster ID!"; | ||
continuation(cluster_id); | ||
}); | ||
} else { | ||
ClusterID cluster_id = ClusterID::FromBinary(provided_cluster_id.value()); | ||
RAY_LOG(INFO) << "Found existing server token: " << cluster_id; | ||
continuation(cluster_id); | ||
} | ||
}); | ||
} | ||
|
||
void GcsServer::DoStart(const GcsInitData &gcs_init_data) { | ||
|
@@ -160,8 +200,8 @@ void GcsServer::DoStart(const GcsInitData &gcs_init_data) { | |
// Init gcs health check manager. | ||
InitGcsHealthCheckManager(gcs_init_data); | ||
|
||
// Init KV Manager | ||
InitKVManager(); | ||
// Init KV service. | ||
InitKVService(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we remove this? It's already called in […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. One is KV Manager and one is KV service. |
||
|
||
// Init function manager | ||
InitFunctionManager(); | ||
|
@@ -208,7 +248,6 @@ void GcsServer::DoStart(const GcsInitData &gcs_init_data) { | |
gcs_actor_manager_->SetUsageStatsClient(usage_stats_client_.get()); | ||
gcs_placement_group_manager_->SetUsageStatsClient(usage_stats_client_.get()); | ||
gcs_task_manager_->SetUsageStatsClient(usage_stats_client_.get()); | ||
|
||
RecordMetrics(); | ||
|
||
periodical_runner_.RunFnPeriodically( | ||
|
@@ -265,8 +304,10 @@ void GcsServer::Stop() { | |
|
||
void GcsServer::InitGcsNodeManager(const GcsInitData &gcs_init_data) { | ||
RAY_CHECK(gcs_table_storage_ && gcs_publisher_); | ||
gcs_node_manager_ = std::make_unique<GcsNodeManager>( | ||
gcs_publisher_, gcs_table_storage_, raylet_client_pool_); | ||
gcs_node_manager_ = std::make_unique<GcsNodeManager>(gcs_publisher_, | ||
gcs_table_storage_, | ||
raylet_client_pool_, | ||
rpc_server_.GetClusterId()); | ||
// Initialize by gcs tables data. | ||
gcs_node_manager_->Initialize(gcs_init_data); | ||
// Register service. | ||
|
@@ -547,6 +588,10 @@ void GcsServer::InitKVManager() { | |
} | ||
|
||
kv_manager_ = std::make_unique<GcsInternalKVManager>(std::move(instance)); | ||
} | ||
|
||
void GcsServer::InitKVService() { | ||
RAY_CHECK(kv_manager_); | ||
kv_service_ = std::make_unique<rpc::InternalKVGrpcService>(main_service_, *kv_manager_); | ||
// Register service. | ||
rpc_server_.RegisterService(*kv_service_, false /* token_auth */); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -177,6 +177,13 @@ service ActorInfoGcsService { | |
rpc KillActorViaGcs(KillActorViaGcsRequest) returns (KillActorViaGcsReply); | ||
} | ||
|
||
// Request message for the NodeInfoGcsService.GetClusterId RPC; it carries no fields.
message GetClusterIdRequest {} | ||
|
||
// Reply message for the NodeInfoGcsService.GetClusterId RPC.
message GetClusterIdReply { | ||
// NOTE(review): per the review thread on this field, it is not needed today and is
// kept only to match the convention of other RPC replies.
GcsStatus status = 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, but it seems to be convention in other RPC replies, even when it's not used. |
||
// The cluster ID as raw bytes — presumably the ClusterID::Binary() encoding;
// confirm against the server-side handler.
bytes cluster_id = 2; | ||
} | ||
|
||
message RegisterNodeRequest { | ||
// Info of node. | ||
GcsNodeInfo node_info = 1; | ||
|
@@ -618,6 +625,8 @@ message GcsStatus { | |
|
||
// Service for node info access. | ||
service NodeInfoGcsService { | ||
// Register a client to GCS Service. Must be called before any other RPC in GCSClient. | ||
rpc GetClusterId(GetClusterIdRequest) returns (GetClusterIdReply); | ||
// Register a node to GCS Service. | ||
rpc RegisterNode(RegisterNodeRequest) returns (RegisterNodeReply); | ||
// Drain a node from GCS Service. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
token -> cluster id