From e742bc6961ea31163f0497a0dbbb2c9227c69236 Mon Sep 17 00:00:00 2001
From: Alan Guo <aguo@anyscale.com>
Date: Fri, 21 Oct 2022 06:14:55 -0700
Subject: [PATCH] Change sort order of nodes list in dashboard to Headnode ->
 state -> node id (#29486)

Signed-off-by: Alan Guo <aguo@anyscale.com>

This sort order prevents dead nodes from showing up above alive nodes while
still maintaining consistent ordering to prevent nodes jumping around.

Currently this checks the head node by comparing the ip of the node to the
ip of the machine the dashboard process is running on.

The state and node id comparators return 0 for equal values, so that ties
fall through to the next sort key instead of being decided arbitrarily.
---
 .../client/src/pages/node/hook/useNodeList.ts | 26 +++++++++++++++++--
 dashboard/client/src/type/raylet.d.ts         |  1 +
 dashboard/datacenter.py                       | 16 +++++++++++++
 dashboard/head.py                             |  1 +
 dashboard/modules/node/tests/test_node.py     |  1 +
 5 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/dashboard/client/src/pages/node/hook/useNodeList.ts b/dashboard/client/src/pages/node/hook/useNodeList.ts
index f578082756cc4..d07a371ac87e3 100644
--- a/dashboard/client/src/pages/node/hook/useNodeList.ts
+++ b/dashboard/client/src/pages/node/hook/useNodeList.ts
@@ -50,11 +50,33 @@ export const useNodeList = () => {
     };
   }, [getList]);
 
+  const finalSortFunc = (a: NodeDetail, b: NodeDetail) => {
+    // Three-way compare: must return 0 on ties so the next sorter can run
+    const compare = (x: string, y: string) => (x > y ? 1 : x < y ? -1 : 0);
+    const sortFuncs: ((a: NodeDetail, b: NodeDetail) => number)[] = [
+      // user override first
+      sorterFunc,
+      // Head node is always first
+      (a, b) => (a.raylet.isHeadNode ? 0 : 1) - (b.raylet.isHeadNode ? 0 : 1),
+      // Then sort by state
+      (a, b) => compare(a.raylet.state, b.raylet.state),
+      // Finally sort by nodeId
+      (a, b) => compare(a.raylet.nodeId, b.raylet.nodeId),
+    ];
+
+    for (const sortFunc of sortFuncs) {
+      const val = sortFunc(a, b);
+      if (val !== 0) {
+        return val;
+      }
+    }
+    return 0;
+  };
+
   return {
     nodeList: nodeList
       .map((e) => ({ ...e, state: e.raylet.state }))
-      .sort((a, b) => (a.raylet.nodeId > b.raylet.nodeId ? 1 : -1))
-      .sort(sorterFunc)
+      .sort(finalSortFunc)
       .filter((node) =>
         filter.every((f) => node[f.key] && node[f.key].includes(f.val)),
       ),
diff --git a/dashboard/client/src/type/raylet.d.ts b/dashboard/client/src/type/raylet.d.ts
index 7698b39129db5..69c13928487a5 100644
--- a/dashboard/client/src/type/raylet.d.ts
+++ b/dashboard/client/src/type/raylet.d.ts
@@ -27,4 +27,5 @@ export type Raylet = {
   terminateTime: number;
   objectStoreAvailableMemory: number;
   objectStoreUsedMemory: number;
+  isHeadNode: boolean;
 };
diff --git a/dashboard/datacenter.py b/dashboard/datacenter.py
index 08559ad14ee7e..63a6031f83cd9 100644
--- a/dashboard/datacenter.py
+++ b/dashboard/datacenter.py
@@ -61,6 +61,8 @@ class DataSource:
 
 
 class DataOrganizer:
+    head_node_ip = None
+
     @staticmethod
     @async_loop_forever(dashboard_consts.PURGE_DATA_INTERVAL_SECONDS)
     async def purge():
@@ -175,6 +177,13 @@ async def get_node_info(cls, node_id):
 
         # Merge GcsNodeInfo to node physical stats
         node_info["raylet"].update(node)
+        # Add "is_head_node" field
+        # TODO(aguo): Grab head node information from a source of truth
+        node_info["raylet"]["is_head_node"] = (
+            cls.head_node_ip == node_physical_stats.get("ip")
+            if node_physical_stats.get("ip")
+            else False
+        )
         # Merge actors to node physical stats
         node_info["actors"] = DataSource.node_actors.get(node_id, {})
         # Update workers to node physical stats
@@ -205,6 +214,13 @@ async def get_node_summary(cls, node_id):
         node_summary["raylet"].update(ray_stats)
         # Merge GcsNodeInfo to node physical stats
         node_summary["raylet"].update(node)
+        # Add "is_head_node" field
+        # TODO(aguo): Grab head node information from a source of truth
+        node_summary["raylet"]["is_head_node"] = (
+            cls.head_node_ip == node_physical_stats.get("ip")
+            if node_physical_stats.get("ip")
+            else False
+        )
 
         await GlobalSignals.node_summary_fetched.send(node_summary)
 
diff --git a/dashboard/head.py b/dashboard/head.py
index 9c7822ecd06cd..dd8f5dfa41e46 100644
--- a/dashboard/head.py
+++ b/dashboard/head.py
@@ -108,6 +108,7 @@ def __init__(
         self.gcs_error_subscriber = None
         self.gcs_log_subscriber = None
         self.ip = ray.util.get_node_ip_address()
+        DataOrganizer.head_node_ip = self.ip
         ip, port = gcs_address.split(":")
 
         self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0),))
diff --git a/dashboard/modules/node/tests/test_node.py b/dashboard/modules/node/tests/test_node.py
index ff300025df773..b995d40695e02 100644
--- a/dashboard/modules/node/tests/test_node.py
+++ b/dashboard/modules/node/tests/test_node.py
@@ -104,6 +104,7 @@ def getpid(self):
             detail = detail["data"]["detail"]
             assert detail["hostname"] == hostname
             assert detail["raylet"]["state"] == "ALIVE"
+            assert detail["raylet"]["isHeadNode"] is True
             assert "raylet" in detail["cmdline"][0]
             assert len(detail["workers"]) >= 2
             assert len(detail["actors"]) == 2, detail["actors"]