Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SSmetrics - Elasticsearch powered metrics viewing #16549

Merged
merged 10 commits into from
Aug 17, 2021
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions code/__DEFINES/MC.dm
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,14 @@
/datum/controller/subsystem/##X/New(){\
NEW_SS_GLOBAL(SS##X);\
PreInit();\
ss_id=#X;\
}\
/datum/controller/subsystem/##X

/// Declares a processing subsystem named X: creates the real global SS##X typed as
/// /datum/controller/subsystem/processing/##X, generates its New() (which runs NEW_SS_GLOBAL,
/// calls PreInit() and sets ss_id to "processing_" followed by the stringified name), and then
/// leaves the type path open so the subsystem's own definition body can follow the macro.
#define PROCESSING_SUBSYSTEM_DEF(X) GLOBAL_REAL(SS##X, /datum/controller/subsystem/processing/##X);\
/datum/controller/subsystem/processing/##X/New(){\
NEW_SS_GLOBAL(SS##X);\
PreInit();\
ss_id="processing_[#X]";\
}\
/datum/controller/subsystem/processing/##X
3 changes: 2 additions & 1 deletion code/__HELPERS/time.dm
Original file line number Diff line number Diff line change
Expand Up @@ -203,5 +203,6 @@ GLOBAL_VAR_INIT(midnight_rollovers, 0)
GLOBAL_VAR_INIT(rollovercheck_last_timeofday, 0)
/**
  * Detects a midnight rollover and returns the running rollover count.
  *
  * world.timeofday being smaller than the value recorded on the previous call means it wrapped
  * around, so the counter is bumped. The last-seen value is then refreshed on every call so the
  * same wrap is not counted twice.
  *
  * Returns the updated GLOB.midnight_rollovers (the old form returned the pre-increment value
  * and never refreshed the last-seen time on a rollover).
  */
/proc/update_midnight_rollover()
	if(world.timeofday < GLOB.rollovercheck_last_timeofday) //TIME IS GOING BACKWARDS!
		GLOB.midnight_rollovers++
	GLOB.rollovercheck_last_timeofday = world.timeofday
	return GLOB.midnight_rollovers
6 changes: 5 additions & 1 deletion code/controllers/configuration/configuration_core.dm
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ GLOBAL_DATUM_INIT(configuration, /datum/server_configuration, new())
var/datum/configuration_section/logging_configuration/logging
/// Holder for the MC configuration datum
var/datum/configuration_section/mc_configuration/mc
/// Holder for the MC configuration datum
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a typo fix

/// Holder for the metrics configuration datum
var/datum/configuration_section/metrics_configuration/metrics
/// Holder for the movement configuration datum
var/datum/configuration_section/movement_configuration/movement
/// Holder for the overflow configuration datum
var/datum/configuration_section/overflow_configuration/overflow
Expand Down Expand Up @@ -71,6 +73,7 @@ GLOBAL_DATUM_INIT(configuration, /datum/server_configuration, new())
jobs = new()
logging = new()
mc = new()
metrics = new()
movement = new()
overflow = new()
ruins = new()
Expand Down Expand Up @@ -99,6 +102,7 @@ GLOBAL_DATUM_INIT(configuration, /datum/server_configuration, new())
jobs.load_data(raw_config_data["job_configuration"])
logging.load_data(raw_config_data["logging_configuration"])
mc.load_data(raw_config_data["mc_configuration"])
metrics.load_data(raw_config_data["metrics_configuration"])
movement.load_data(raw_config_data["movement_configuration"])
overflow.load_data(raw_config_data["overflow_configuration"])
ruins.load_data(raw_config_data["ruin_configuration"])
Expand Down
17 changes: 17 additions & 0 deletions code/controllers/configuration/sections/metrics_configuration.dm
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/// Config holder for stuff relating to metrics management.
/// Values keep the defaults below unless load_data() finds them in the config.
/datum/configuration_section/metrics_configuration
	// NO EDITS OR READS TO THIS EVER
	// (this section carries the API token, so it is marked private)
	protection_state = PROTECTION_PRIVATE
	/// Are metrics enabled or disabled. Off by default.
	var/enable_metrics = FALSE
	/// Endpoint to send metrics to, including protocol. Null until configured.
	var/metrics_endpoint = null
	/// Endpoint authorisation API key, sent in the "Authorization: ApiKey ..." header. Null until configured.
	var/metrics_api_token = null

/**
  * Loads the metrics settings out of the parsed "metrics_configuration" section.
  *
  * Arguments:
  * * data - Associative list of the raw config values for this section
  */
/datum/configuration_section/metrics_configuration/load_data(list/data)
	// Use the load wrappers here. That way the default isn't made 'null' if you comment out the config line
	CONFIG_LOAD_BOOL(enable_metrics, data["enable_metrics"])

	CONFIG_LOAD_STR(metrics_endpoint, data["metrics_endpoint"])
	CONFIG_LOAD_STR(metrics_api_token, data["metrics_api_token"])
16 changes: 16 additions & 0 deletions code/controllers/subsystem.dm
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
/datum/controller/subsystem
// Metadata; you should define these.
name = "fire codertrain" //name of the subsystem
/// Subsystem ID. Used for when we need a technical name for the SS
var/ss_id = "fire_codertrain_again"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👌

var/init_order = INIT_ORDER_DEFAULT //order of initialization. Higher numbers are initialized first, lower numbers later. Use defines in __DEFINES/subsystems.dm for easy understanding of order.
var/wait = 20 //time to wait (in deciseconds) between each call to fire(). Must be a positive integer.
var/priority = FIRE_PRIORITY_DEFAULT //When mutiple subsystems need to run in the same tick, higher priority subsystems will run first and be given a higher share of the tick before MC_TICK_CHECK triggers a sleep
Expand Down Expand Up @@ -227,3 +229,17 @@
if("queued_priority") //editing this breaks things.
return 0
. = ..()

/**
  * Builds the metrics payload for this subsystem.
  *
  * The returned associative list always carries three keys: "cost", "tick_usage" and "custom".
  * "custom" is an empty list here; subtypes can override this proc and replace it with
  * whatever extra figures could affect their tick usage.
  * Example: ATs on SSair
  */
/datum/controller/subsystem/proc/get_metrics()
	SHOULD_CALL_PARENT(TRUE)
	// "custom" is the slot children overwrite after calling the parent
	return list(
		"cost" = cost,
		"tick_usage" = tick_usage,
		"custom" = list()
	)
5 changes: 5 additions & 0 deletions code/controllers/subsystem/acid.dm
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ SUBSYSTEM_DEF(acid)
/datum/controller/subsystem/acid/stat_entry()
..("P:[processing.len]")

/datum/controller/subsystem/acid/get_metrics()
. = ..()
var/list/cust = list()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These being cust rather than just custom or something else entirely seems a bit odd. It's pretty obvious that it does mean "custom" of course, but it could probably be a bit clearer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's for easier differentiation between the list index and the entry itself. I also recall something having a custom var and I didn't want these to interfere.

cust["processing"] = length(processing)
.["custom"] = cust

/datum/controller/subsystem/acid/fire(resumed = 0)
if(!resumed)
Expand Down
6 changes: 6 additions & 0 deletions code/controllers/subsystem/air.dm
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ SUBSYSTEM_DEF(air)
msg += "AT/MS:[round((cost ? active_turfs.len/cost : 0),0.1)]"
..(msg)

/// Adds the active turf and hotspot counts to the base subsystem metrics.
/datum/controller/subsystem/air/get_metrics()
	. = ..()
	.["custom"] = list(
		"active_turfs" = length(active_turfs),
		"hotspots" = length(hotspots)
	)

/datum/controller/subsystem/air/Initialize(timeofday)
setup_overlays() // Assign icons and such for gas-turf-overlays
Expand Down
6 changes: 6 additions & 0 deletions code/controllers/subsystem/fires.dm
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ SUBSYSTEM_DEF(fires)
..("P:[processing.len]")


/// Adds the size of the processing list to the base subsystem metrics.
/datum/controller/subsystem/fires/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(processing))

/datum/controller/subsystem/fires/fire(resumed = 0)
if(!resumed)
src.currentrun = processing.Copy()
Expand Down
16 changes: 16 additions & 0 deletions code/controllers/subsystem/garbage.dm
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@ SUBSYSTEM_DEF(garbage)
msg += " | Fail:[fail_counts.Join(",")]"
..(msg)

/**
  * Adds garbage collection figures to the base subsystem metrics.
  *
  * Custom keys: "gcr" (share of last tick's deletions that were GCed, 0 when nothing was
  * deleted), "total_harddels", "total_softdels" and one "queue_N" length per list in queues.
  */
/datum/controller/subsystem/garbage/get_metrics()
	. = ..()
	var/list/custom_metrics = list()

	// Guard the division: with nothing deleted last tick the ratio is reported as 0
	var/handled_last_tick = delslasttick + gcedlasttick
	custom_metrics["gcr"] = (handled_last_tick == 0) ? 0 : (gcedlasttick / handled_last_tick)

	custom_metrics["total_harddels"] = totaldels
	custom_metrics["total_softdels"] = totalgcs

	var/queue_number = 0
	for(var/list/queue in queues)
		queue_number++
		custom_metrics["queue_[queue_number]"] = length(queue)

	.["custom"] = custom_metrics

/datum/controller/subsystem/garbage/Shutdown()
//Adds the del() log to the qdel log file
var/list/dellog = list()
Expand Down
8 changes: 8 additions & 0 deletions code/controllers/subsystem/lighting.dm
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ SUBSYSTEM_DEF(lighting)
/datum/controller/subsystem/lighting/stat_entry()
..("L:[length(sources_queue)]|C:[length(corners_queue)]|O:[length(objects_queue)]")

/// Adds the lengths of the three lighting update queues to the base subsystem metrics.
/datum/controller/subsystem/lighting/get_metrics()
	. = ..()
	.["custom"] = list(
		"sources_queue" = length(sources_queue),
		"corners_queue" = length(corners_queue),
		"objects_queue" = length(objects_queue)
	)

/datum/controller/subsystem/lighting/Initialize(timeofday)
if(!initialized)
if(GLOB.configuration.general.starlight)
Expand Down
6 changes: 6 additions & 0 deletions code/controllers/subsystem/machinery.dm
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ SUBSYSTEM_DEF(machines)
fire()
return ..()

/// Adds the size of the processing list to the base subsystem metrics.
/datum/controller/subsystem/machines/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(processing))

/datum/controller/subsystem/machines/proc/makepowernets()
for(var/datum/powernet/PN in powernets)
qdel(PN)
Expand Down
47 changes: 47 additions & 0 deletions code/controllers/subsystem/metrics.dm
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/// Subsystem that periodically posts server and subsystem metrics to the configured endpoint.
SUBSYSTEM_DEF(metrics)
	name = "Metrics"
	wait = 30 SECONDS // One metrics document is sent per fire
	offline_implications = "Server metrics will no longer be ingested into monitoring systems. No immediate action is needed."
	runlevels = RUNLEVEL_LOBBY | RUNLEVEL_SETUP | RUNLEVEL_GAME | RUNLEVEL_POSTGAME // ALL THE LEVELS
	// NOTE(review): this comment used to say "every IRL minute", but wait above is 30 seconds.
	// Presumably SS_KEEP_TIMING is here to keep ingestion on a steady real-time cadence - confirm.
	flags = SS_KEEP_TIMING
	/// The real time of day the server started. Used to calculate time drift
	var/world_init_time = 0 // Not set in here. Set in world/New()

/**
  * Decides whether the subsystem should fire at all.
  *
  * Firing is switched off when metrics are disabled in the config, and also when they are
  * enabled but no endpoint has been set (metrics_endpoint defaults to null). Otherwise fire()
  * would keep issuing POST requests to a null URL every wait period.
  *
  * Arguments:
  * * start_timeofday - Passed through to the parent Initialize() via ..()
  */
/datum/controller/subsystem/metrics/Initialize(start_timeofday)
	var/datum/configuration_section/metrics_configuration/metrics_config = GLOB.configuration.metrics
	if(!metrics_config.enable_metrics || !metrics_config.metrics_endpoint)
		flags |= SS_NO_FIRE // Disable firing to save CPU
	return ..()


/// Posts the current metrics JSON to the configured endpoint as an async HTTP request,
/// authenticating with the configured API token.
/datum/controller/subsystem/metrics/fire(resumed)
	var/datum/configuration_section/metrics_configuration/metrics_config = GLOB.configuration.metrics
	var/list/request_headers = list(
		"Authorization" = "ApiKey [metrics_config.metrics_api_token]",
		"Content-Type" = "application/json"
	)
	SShttp.create_async_request(RUSTG_HTTP_METHOD_POST, metrics_config.metrics_endpoint, get_metrics_json(), request_headers)

/datum/controller/subsystem/metrics/proc/get_metrics_json()
var/list/out = list()
out["@timestamp"] = time_stamp() // This is required by ElasticSearch, complete with this name. DO NOT REMOVE THIS.
out["cpu"] = world.cpu
// out["maptick"] = world.map_cpu // TODO: 514
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

out["elapsed_processed"] = world.time
out["elapsed_real"] = (REALTIMEOFDAY - world_init_time)
out["client_count"] = length(GLOB.clients)
out["round_id"] = text2num(GLOB.round_id) // This is so we can filter the metrics by a single round ID

// Funnel in all SS metrics
var/list/ss_data = list()
for(var/datum/controller/subsystem/SS in Master.subsystems)
ss_data[SS.ss_id] = SS.get_metrics()

out["subsystems"] = ss_data
// And send it all
return json_encode(out)

/*

// Uncomment this if you add new metrics to verify how the JSON formats

/client/verb/debugmetricts()
usr << browse(SSmetrics.get_metrics_json(), "window=aadebug")
*/
6 changes: 6 additions & 0 deletions code/controllers/subsystem/mobs.dm
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ SUBSYSTEM_DEF(mobs)
/// The amount of giant spiders that exist in the world. Used for mob capping.
var/giant_spiders = 0

/// Adds the number of living mobs (GLOB.mob_living_list) to the base subsystem metrics.
/datum/controller/subsystem/mobs/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(GLOB.mob_living_list))

/datum/controller/subsystem/mobs/stat_entry()
..("P:[GLOB.mob_living_list.len]")

Expand Down
6 changes: 6 additions & 0 deletions code/controllers/subsystem/processing/processing.dm
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ SUBSYSTEM_DEF(processing)
/datum/controller/subsystem/processing/stat_entry()
..("[stat_tag]:[processing.len]")

/// Adds the size of the processing list to the base subsystem metrics.
/datum/controller/subsystem/processing/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(processing))

/datum/controller/subsystem/processing/fire(resumed = 0)
if(!resumed)
currentrun = processing.Copy()
Expand Down
5 changes: 5 additions & 0 deletions code/controllers/subsystem/spacedrift.dm
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ SUBSYSTEM_DEF(spacedrift)
/datum/controller/subsystem/spacedrift/stat_entry()
..("P:[processing.len]")

/// Adds the size of the processing list to the base subsystem metrics.
/datum/controller/subsystem/spacedrift/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(processing))

/datum/controller/subsystem/spacedrift/fire(resumed = 0)
if(!resumed)
Expand Down
7 changes: 7 additions & 0 deletions code/controllers/subsystem/tgui.dm
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ SUBSYSTEM_DEF(tgui)
/datum/controller/subsystem/tgui/stat_entry()
..("P:[processing_uis.len]")

/// Adds the number of UIs being processed (processing_uis) to the base subsystem metrics.
/datum/controller/subsystem/tgui/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(processing_uis))


/datum/controller/subsystem/tgui/fire(resumed = 0)
if (!resumed)
src.currentrun = processing_uis.Copy()
Expand Down
6 changes: 6 additions & 0 deletions code/controllers/subsystem/throwing.dm
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ SUBSYSTEM_DEF(throwing)
/datum/controller/subsystem/throwing/stat_entry()
..("P:[processing.len]")

/// Adds the size of the processing list to the base subsystem metrics.
/datum/controller/subsystem/throwing/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(processing))

/datum/controller/subsystem/throwing/fire(resumed = 0)
if(!resumed)
src.currentrun = processing.Copy()
Expand Down
1 change: 1 addition & 0 deletions code/controllers/subsystem/tickets/mentor_tickets.dm
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ GLOBAL_REAL(SSmentor_tickets, /datum/controller/subsystem/tickets/mentor_tickets
/// Hand-written constructor for the mentor tickets subsystem: registers the SSmentor_tickets
/// global, runs PreInit(), and sets the technical subsystem ID by hand.
/// NOTE(review): this appears to mirror the New() bodies the *_SUBSYSTEM_DEF macros generate - confirm against MC.dm.
/datum/controller/subsystem/tickets/mentor_tickets/New()
	NEW_SS_GLOBAL(SSmentor_tickets)
	PreInit()
	ss_id = "mentor_tickets" // Set by hand because no macro assigns it for this type

/datum/controller/subsystem/tickets/mentor_tickets
name = "Mentor Tickets"
Expand Down
6 changes: 6 additions & 0 deletions code/controllers/subsystem/tickets/tickets.dm
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ SUBSYSTEM_DEF(tickets)

var/ticketCounter = 1

/// Adds the total number of tickets (allTickets) to the base subsystem metrics.
/datum/controller/subsystem/tickets/get_metrics()
	. = ..()
	// Not a perf metric but I want to see a graph where SSair usage spikes and 20 tickets come in
	.["custom"] = list("tickets" = length(allTickets))

/datum/controller/subsystem/tickets/Initialize()
if(!close_messages)
close_messages = list("<font color='red' size='4'><b>- [ticket_name] Rejected! -</b></font>",
Expand Down
6 changes: 6 additions & 0 deletions code/controllers/subsystem/timer.dm
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ SUBSYSTEM_DEF(timer)
/datum/controller/subsystem/timer/stat_entry(msg)
..("B:[bucket_count] P:[length(second_queue)] H:[length(hashes)] C:[length(clienttime_timers)] S:[length(timer_id_dict)]")

/// Adds the current bucket_count to the base subsystem metrics.
/datum/controller/subsystem/timer/get_metrics()
	. = ..()
	.["custom"] = list("bucket_count" = bucket_count)

/datum/controller/subsystem/timer/fire(resumed = FALSE)
var/lit = last_invoke_tick
var/last_check = world.time - TICKS2DS(BUCKET_LEN*1.5)
Expand Down
8 changes: 7 additions & 1 deletion code/controllers/subsystem/weather.dm
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,17 @@ SUBSYSTEM_DEF(weather)
flags = SS_BACKGROUND
wait = 10
runlevels = RUNLEVEL_GAME
offline_implications = "Ash storms will no longer trigger. No immediate action is needed."
offline_implications = "Ash storms will no longer trigger. No immediate action is needed."
var/list/processing = list()
var/list/eligible_zlevels = list()
var/list/next_hit_by_zlevel = list() //Used by barometers to know when the next storm is coming

/// Adds the size of the processing list to the base subsystem metrics.
/datum/controller/subsystem/weather/get_metrics()
	. = ..()
	.["custom"] = list("processing" = length(processing))

/datum/controller/subsystem/weather/fire()
// process active weather
for(var/V in processing)
Expand Down
2 changes: 2 additions & 0 deletions code/game/world.dm
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ GLOBAL_LIST_INIT(map_transition_config, list(CC_TRANSITION_CONFIG))
// Right off the bat
enable_auxtools_debugger()

SSmetrics.world_init_time = REALTIMEOFDAY

// Do sanity checks to ensure RUST actually exists
if(!fexists(RUST_G))
DIRECT_OUTPUT(world.log, "ERROR: RUSTG was not found and is required for the game to function. Server will now exit.")
Expand Down
1 change: 1 addition & 0 deletions code/modules/unit_tests/_unit_tests.dm
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "spawn_humans.dm"
#include "sql.dm"
#include "subsystem_init.dm"
#include "subsystem_metric_sanity.dm"
#include "timer_sanity.dm"
#include "unit_test.dm"
#endif
19 changes: 19 additions & 0 deletions code/modules/unit_tests/subsystem_metric_sanity.dm
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Unit test to ensure SS metrics are valid
/**
  * Checks the get_metrics() output of every subsystem in Master.subsystems.
  *
  * A subsystem fails when its metrics list does not have exactly three entries, when "cost",
  * "tick_usage" or "custom" is null, or when "custom" is not a list. Only the first problem
  * found for a subsystem is reported before moving on to the next one.
  */
/datum/unit_test/subsystem_metric_sanity/Run()
	for(var/datum/controller/subsystem/SS in Master.subsystems)
		var/list/data = SS.get_metrics()
		var/failure = null

		// Checks run in a fixed order; the first one that trips decides the message
		if(length(data) != 3)
			failure = "SS[SS.ss_id] has invalid metrics data!"
		else if(isnull(data["cost"]))
			failure = "SS[SS.ss_id] has invalid metrics data! No 'cost' found in [json_encode(data)]"
		else if(isnull(data["tick_usage"]))
			failure = "SS[SS.ss_id] has invalid metrics data! No 'tick_usage' found in [json_encode(data)]"
		else if(isnull(data["custom"]))
			failure = "SS[SS.ss_id] has invalid metrics data! No 'custom' found in [json_encode(data)]"
		else if(!islist(data["custom"]))
			failure = "SS[SS.ss_id] has invalid metrics data! 'custom' is not a list in [json_encode(data)]"

		if(failure)
			Fail(failure)
Loading