-
Notifications
You must be signed in to change notification settings - Fork 6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[core] Only get single node info rather than all when needed #49727
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -396,145 +396,106 @@ std::string GlobalStateAccessor::GetSystemConfig() { | |
return future.get(); | ||
} | ||
|
||
// Fetches every node record via the node accessor's AsyncGetAll and appends
// the ones whose state() is rpc::GcsNodeInfo::ALIVE to `nodes`.
//
// `nodes` is only appended to (std::back_inserter); it is not cleared first.
// Returns the status delivered to the AsyncGetAll callback. When that status
// is not OK it is returned immediately and `nodes` is left untouched.
ray::Status GlobalStateAccessor::GetAliveNodes(std::vector<rpc::GcsNodeInfo> &nodes) { | ||
// Carries the callback's (status, node list) pair back to this thread.
std::promise<std::pair<Status, std::vector<rpc::GcsNodeInfo>>> promise; | ||
{ | ||
// The reader lock on mutex_ is scoped to issuing the request only; the wait
// on the promise below happens after this scope has released it.
absl::ReaderMutexLock lock(&mutex_); | ||
RAY_CHECK_OK(gcs_client_->Nodes().AsyncGetAll( | ||
[&promise](Status status, std::vector<rpc::GcsNodeInfo> &&nodes) { | ||
promise.set_value( | ||
std::pair<Status, std::vector<rpc::GcsNodeInfo>>(status, std::move(nodes))); | ||
}, | ||
/*timeout_ms=*/-1)); | ||
} | ||
// Blocks until the callback has fulfilled the promise.
auto result = promise.get_future().get(); | ||
auto status = result.first; | ||
if (!status.ok()) { | ||
return status; | ||
} | ||
|
||
// Keep only the entries reported as ALIVE.
std::copy_if(result.second.begin(), | ||
result.second.end(), | ||
std::back_inserter(nodes), | ||
[](const rpc::GcsNodeInfo &node) { | ||
return node.state() == rpc::GcsNodeInfo::ALIVE; | ||
}); | ||
return status; | ||
} | ||
|
||
ray::Status GlobalStateAccessor::GetNode(const std::string &node_id, | ||
ray::Status GlobalStateAccessor::GetNode(const std::string &node_id_hex_str, | ||
std::string *node_info) { | ||
auto start_ms = current_time_ms(); | ||
auto node_id_binary = NodeID::FromHex(node_id).Binary(); | ||
const auto end_time_point = | ||
current_time_ms() + RayConfig::instance().raylet_start_wait_time_s() * 1000; | ||
const auto node_id_binary = NodeID::FromHex(node_id_hex_str).Binary(); | ||
|
||
std::vector<rpc::GcsNodeInfo> node_infos; | ||
while (true) { | ||
std::vector<rpc::GcsNodeInfo> nodes; | ||
auto status = GetAliveNodes(nodes); | ||
if (!status.ok()) { | ||
return status; | ||
rpc::GetAllNodeInfoRequest_Filters filters; | ||
filters.set_state(rpc::GcsNodeInfo_GcsNodeState::GcsNodeInfo_GcsNodeState_ALIVE); | ||
filters.set_node_id(node_id_binary); | ||
{ | ||
absl::ReaderMutexLock lock(&mutex_); | ||
auto timeout_ms = | ||
std::max(end_time_point - current_time_ms(), static_cast<int64_t>(0)); | ||
RAY_ASSIGN_OR_RETURN( | ||
node_infos, | ||
gcs_client_->Nodes().GetAllNoCacheWithFilters(timeout_ms, std::move(filters))); | ||
} | ||
|
||
if (nodes.empty()) { | ||
status = Status::NotFound("GCS has started but no raylets have registered yet."); | ||
} else { | ||
int relevant_client_index = -1; | ||
for (int i = 0; i < static_cast<int>(nodes.size()); i++) { | ||
const auto &node = nodes[i]; | ||
if (node_id_binary == node.node_id()) { | ||
relevant_client_index = i; | ||
break; | ||
} | ||
} | ||
|
||
if (relevant_client_index < 0) { | ||
status = Status::NotFound( | ||
"GCS cannot find the node with node ID " + node_id + | ||
". The node registration may not be complete yet before the timeout." + | ||
" Try increase the RAY_raylet_start_wait_time_s config."); | ||
} else { | ||
*node_info = nodes[relevant_client_index].SerializeAsString(); | ||
return Status::OK(); | ||
} | ||
if (!node_infos.empty()) { | ||
*node_info = node_infos[0].SerializeAsString(); | ||
return Status::OK(); | ||
} | ||
|
||
if (current_time_ms() - start_ms >= | ||
RayConfig::instance().raylet_start_wait_time_s() * 1000) { | ||
return status; | ||
if (current_time_ms() >= end_time_point) { | ||
return Status::NotFound( | ||
"GCS cannot find the node with node ID " + node_id_hex_str + | ||
". The node registration may not be complete yet before the timeout." + | ||
" Try increase the RAY_raylet_start_wait_time_s config."); | ||
} | ||
RAY_LOG(WARNING) << "Retrying to get node with node ID " << node_id; | ||
RAY_LOG(WARNING) << "Retrying to get node with node ID " << node_id_hex_str; | ||
// Some of the information may not be in GCS yet, so wait a little bit. | ||
std::this_thread::sleep_for(std::chrono::seconds(1)); | ||
} | ||
} | ||
|
||
ray::Status GlobalStateAccessor::GetNodeToConnectForDriver( | ||
const std::string &node_ip_address, std::string *node_to_connect) { | ||
auto start_ms = current_time_ms(); | ||
const auto end_time_point = | ||
current_time_ms() + RayConfig::instance().raylet_start_wait_time_s() * 1000; | ||
|
||
std::vector<rpc::GcsNodeInfo> node_infos; | ||
rpc::GetAllNodeInfoRequest_Filters filters; | ||
filters.set_state(rpc::GcsNodeInfo_GcsNodeState::GcsNodeInfo_GcsNodeState_ALIVE); | ||
filters.set_node_ip_address(node_ip_address); | ||
while (true) { | ||
std::vector<rpc::GcsNodeInfo> nodes; | ||
auto status = GetAliveNodes(nodes); | ||
if (!status.ok()) { | ||
return status; | ||
{ | ||
absl::ReaderMutexLock lock(&mutex_); | ||
auto timeout_ms = | ||
std::max(end_time_point - current_time_ms(), static_cast<int64_t>(0)); | ||
RAY_ASSIGN_OR_RETURN( | ||
node_infos, gcs_client_->Nodes().GetAllNoCacheWithFilters(timeout_ms, filters)); | ||
} | ||
if (!node_infos.empty()) { | ||
*node_to_connect = node_infos[0].SerializeAsString(); | ||
return Status::OK(); | ||
} | ||
|
||
if (nodes.empty()) { | ||
status = Status::NotFound("GCS has started but no raylets have registered yet."); | ||
} else { | ||
int relevant_client_index = -1; | ||
int head_node_client_index = -1; | ||
std::pair<std::string, int> gcs_address; | ||
std::string gcs_address; | ||
{ | ||
absl::WriterMutexLock lock(&mutex_); | ||
auto [address, _] = gcs_client_->GetGcsServerAddress(); | ||
gcs_address = std::move(address); | ||
} | ||
filters.set_node_ip_address(gcs_address); | ||
{ | ||
absl::ReaderMutexLock lock(&mutex_); | ||
auto timeout_ms = end_time_point - current_time_ms(); | ||
RAY_ASSIGN_OR_RETURN( | ||
node_infos, gcs_client_->Nodes().GetAllNoCacheWithFilters(timeout_ms, filters)); | ||
} | ||
if (node_infos.empty() && node_ip_address == gcs_address) { | ||
filters.set_node_ip_address("127.0.0.1"); | ||
Comment on lines
+470
to
+471
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you know, can you comment on what this case is? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Related code (listen_to_localhost_only sets GrpcServer to listen to 127.0.0.1) and the PR that introduced this: #16810. I don't see a concrete reason to have this last case anywhere; ideally the above case of looking for gcs_address should cover it, but maybe it handles some container case or something? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, let's keep it for now. |
||
{ | ||
absl::WriterMutexLock lock(&mutex_); | ||
gcs_address = gcs_client_->GetGcsServerAddress(); | ||
} | ||
|
||
for (int i = 0; i < static_cast<int>(nodes.size()); i++) { | ||
const auto &node = nodes[i]; | ||
std::string ip_address = node.node_manager_address(); | ||
if (ip_address == node_ip_address) { | ||
relevant_client_index = i; | ||
break; | ||
} | ||
// TODO(kfstorm): Do we need to replace `node_ip_address` with | ||
// `get_node_ip_address()`? | ||
if ((ip_address == "127.0.0.1" && gcs_address.first == node_ip_address) || | ||
ip_address == gcs_address.first) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I feel like the error messages don't totally align with the actual behavior: here we'll just return a node_info as long as it matches the gcs_address we fetched. |
||
head_node_client_index = i; | ||
} | ||
} | ||
|
||
if (relevant_client_index < 0 && head_node_client_index >= 0) { | ||
RAY_LOG(INFO) << "This node has an IP address of " << node_ip_address | ||
<< ", but we cannot find a local Raylet with the same address. " | ||
<< "This can happen when you connect to the Ray cluster " | ||
<< "with a different IP address or when connecting to a container."; | ||
relevant_client_index = head_node_client_index; | ||
} | ||
if (relevant_client_index < 0) { | ||
std::ostringstream oss; | ||
oss << "This node has an IP address of " << node_ip_address << ", and Ray " | ||
<< "expects this IP address to be either the GCS address or one of" | ||
<< " the Raylet addresses. Connected to GCS at " << gcs_address.first | ||
<< " and found raylets at "; | ||
for (size_t i = 0; i < nodes.size(); i++) { | ||
if (i > 0) { | ||
oss << ", "; | ||
} | ||
oss << nodes[i].node_manager_address(); | ||
} | ||
oss << " but none of these match this node's IP " << node_ip_address | ||
<< ". Are any of these actually a different IP address for the same node?" | ||
<< "You might need to provide --node-ip-address to specify the IP " | ||
<< "address that the head should use when sending to this node."; | ||
status = Status::NotFound(oss.str()); | ||
} else { | ||
*node_to_connect = nodes[relevant_client_index].SerializeAsString(); | ||
return Status::OK(); | ||
absl::ReaderMutexLock lock(&mutex_); | ||
auto timeout_ms = | ||
std::max(end_time_point - current_time_ms(), static_cast<int64_t>(0)); | ||
RAY_ASSIGN_OR_RETURN( | ||
node_infos, | ||
gcs_client_->Nodes().GetAllNoCacheWithFilters(timeout_ms, filters)); | ||
} | ||
} | ||
if (!node_infos.empty()) { | ||
RAY_LOG(INFO) << "This node has an IP address of " << node_ip_address | ||
<< ", but we cannot find a local Raylet with the same address. " | ||
<< "This can happen when you connect to the Ray cluster " | ||
<< "with a different IP address or when connecting to a container."; | ||
Comment on lines
+482
to
+485
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ?? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Previously we were still logging this when we didn't get any node info matching node_ip_address and had to fall back to the gcs_address we fetched instead of the node_ip_address passed into the function. |
||
*node_to_connect = node_infos[0].SerializeAsString(); | ||
return Status::OK(); | ||
} | ||
|
||
if (current_time_ms() - start_ms >= | ||
RayConfig::instance().raylet_start_wait_time_s() * 1000) { | ||
return status; | ||
if (current_time_ms() >= end_time_point) { | ||
std::ostringstream oss; | ||
oss << "This node has an IP address of " << node_ip_address << ", and Ray " | ||
<< "expects this IP address to be either the GCS address or one of" | ||
<< " the Raylet addresses. Connected to GCS at " << gcs_address | ||
<< ", and found no Raylet with this IP address. " | ||
<< "You might need to provide --node-ip-address to specify the IP " | ||
<< "address that the head should use when sending to this node."; | ||
return Status::NotFound(oss.str()); | ||
} | ||
RAY_LOG(WARNING) << "Some processes that the driver needs to connect to have " | ||
"not registered with GCS, so retrying. Have you run " | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why lock?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
gcs_client_ is protected by the mutex; this is how it's used throughout.