Skip to content

Commit

Permalink
fix(metrics): switch-leader, restore configuration
Browse files Browse the repository at this point in the history
Make sure the metrics instance on the new leader has the same configuration as the one on the old leader
  • Loading branch information
gsanchietti committed Feb 25, 2025
1 parent 38136bd commit fbfd029
Showing 1 changed file with 27 additions and 2 deletions.
29 changes: 27 additions & 2 deletions core/imageroot/usr/local/sbin/switch-leader
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,18 @@ if install_loki and node_id == self_id:
}))
redis_pipeline.execute()

# Install metrics on the leader node, remove existing metrics instances
if install_metrics and node_id == self_id:
settings = None
custom_alerts = None
custom_templates = None
# Remove existing metrics instances
remove_tasks = []
for mkey in rdb.scan_iter("module/metrics*/environment"):
module_id = mkey.removeprefix("module/").removesuffix("/environment")
settings = rdb.hgetall(f'module/{module_id}/settings')
custom_alerts = rdb.hgetall(f'module/{module_id}/custom_alerts')
custom_templates = rdb.hgetall(f'module/{module_id}/custom_templates')
module_node = rdb.hget(mkey, 'NODE_ID')
remove_tasks.append({
'agent_id': f'node/{module_node}',
Expand All @@ -166,15 +173,14 @@ if install_metrics and node_id == self_id:
"preserve_data": False,
}
})

# Submit the collected removal tasks through the cluster-leader endpoint,
# but only when at least one metrics instance was found above.
if remove_tasks:
    subtasks = agent.tasks.runp_nowait(remove_tasks, endpoint="redis://cluster-leader")

# Install prometheus and grafana on the leader node
module = "ghcr.io/nethserver/metrics:latest" # FIXME
module = "ghcr.io/nethserver/metrics:save_to_redis" # FIXME
result = agent.tasks.run("cluster", "add-module", data={
"image": module,
"node": node_id,
Expand All @@ -187,6 +193,25 @@ if install_metrics and node_id == self_id:
# Handle the outcome of the add-module task: report a failure, or copy the
# configuration hashes read from the previous metrics instance into the newly
# created module and restart it.
if result['exit_code'] != 0:
    print(f"[ERROR] Failed to install {module} on the new leader node: {result['error']}", file=sys.stderr)
    errors += 1
else:
    mid = result['output']['module_id']  # New module ID
    # Restore every hash saved from the old instance. The dict must be passed
    # as mapping= (a positional dict would be taken as the field name), and
    # hset() raises on an empty mapping, so hashes that were never read
    # (still None) or came back empty are skipped.
    saved_hashes = (
        ('settings', settings),
        ('custom_alerts', custom_alerts),
        ('custom_templates', custom_templates),
    )
    for hash_name, saved in saved_hashes:
        if saved:
            rdb.hset(f'module/{mid}/{hash_name}', mapping=saved)

    # Restart the new module after the hashes have been written.
    # NOTE(review): presumably the restart is what makes the module load the
    # restored configuration -- confirm against the metrics module.
    result_config = agent.tasks.run(
        "cluster",
        "restart-module",
        data={
            "module_id": mid,
            "node": node_id,
            "check_idle_time": 0,
        },
        extra={
            "isNotificationHidden": True,
        },
        endpoint="redis://cluster-leader",
    )
    if result_config['exit_code'] != 0:
        print(f"[ERROR] Failed to restart {module} on the new leader node: {result_config['error']}", file=sys.stderr)
        errors += 1


if errors > 0:
sys.exit(1)

0 comments on commit fbfd029

Please sign in to comment.