diff --git a/src/oomd/cfgen/src/cfgen.rs b/src/oomd/cfgen/src/cfgen.rs index 6b180797..6cdc9e62 100644 --- a/src/oomd/cfgen/src/cfgen.rs +++ b/src/oomd/cfgen/src/cfgen.rs @@ -7,9 +7,15 @@ mod types; use libcfgen::prelude::*; use types::*; +const CONFIG_VERSION: &str = "1.0.0"; + fn oomd_json(node: &Node) -> json::JsonValue { let attrs = get_attributes(node); - default_json_config(&attrs) + match attrs.host_type { + HostType::DevServer => devserver_json_config(node, &attrs), + HostType::OnDemand => od_json_config(&attrs), + _ => default_json_config(&attrs), + } // TODO(chengxiong) add other templates } @@ -29,24 +35,45 @@ fn default_json_config(attrs: &ConfigParams) -> json::JsonValue { rulesets.push(rule_protection_against_wdb_io_thrashing(attrs)); } if !attrs.fbtax2.disable_swap_protection { - rulesets.push(rule_protection_against_low_swap(attrs)); + rulesets.push(rule_fbtax2_protection_against_low_swap(attrs)); } if attrs.senpai.target.is_some() { rulesets.push(rule_senpai_ruleset(attrs)); } rulesets.append(&mut attrs.fbtax2.oomd_extra_rulesets.clone()); rulesets.push(rule_senpai_drop_in_ruleset(attrs)); - rulesets.push(rule_tw_container_drop_in_ruleset()); + rulesets.push(rule_tw_container_drop_in_ruleset(attrs)); // TODO(chengxiong): add more rule sections json::object! { "rulesets": rulesets, - "version": "1.0.0", + "version": CONFIG_VERSION, + } +} + +fn devserver_json_config(node: &Node, attrs: &ConfigParams) -> json::JsonValue { + let mut rulesets = json::Array::new(); + rulesets.push(rule_system_overview(attrs)); + rulesets.push(rule_user_session_protection(node, attrs)); + if !attrs.oomd2.disable_swap_protection { + rulesets.push(rule_oomd2_protection_against_low_swap(attrs)); + } + rulesets.push(rule_senpai_drop_in_ruleset(attrs)); + rulesets.append(&mut rules_restart_cgroup_on_mem_threshold(attrs)); + rulesets.push(rule_tw_container_drop_in_ruleset(attrs)); + json::object! { + "rulesets": rulesets, + "version": CONFIG_VERSION, } } +fn od_json_config(_attrs: &ConfigParams) -> json::JsonValue { + // TODO(chengxiong): implement this. + json::object! {} +} + fn rule_system_overview(attrs: &ConfigParams) -> json::JsonValue { - let cgroup = if [HostType::ShellServer, HostType::OnDemand].contains(&attrs.host_type) { + let cgroup = if [HostType::DevServer, HostType::OnDemand].contains(&attrs.host_type) { attrs.oomd2.oomd_target.as_str() } else { "workload.slice" @@ -288,7 +315,7 @@ fn rule_protection_against_wdb_io_thrashing(attrs: &ConfigParams) -> json::JsonV } } -fn rule_protection_against_low_swap(attrs: &ConfigParams) -> json::JsonValue { +fn rule_fbtax2_protection_against_low_swap(attrs: &ConfigParams) -> json::JsonValue { let mut detector = json::array! { format!("free swap goes below {} percent", attrs.fbtax2.low_swap_threshold) }; @@ -322,6 +349,33 @@ fn rule_protection_against_low_swap(attrs: &ConfigParams) -> json::JsonValue { } } +fn rule_oomd2_protection_against_low_swap(attrs: &ConfigParams) -> json::JsonValue { + json::object! { + "name": "protection against low swap", + "detectors": [ + [ + format!("free swap goes below {}%", attrs.oomd2.swap_protection_detect_threshold), + { + "name": "swap_free", + "args": { + "threshold_pct": attrs.oomd2.swap_protection_detect_threshold.as_str(), + } + } + ] + ], + "actions": [ + { + "name": "kill_by_swap_usage", + "args": { + "cgroup": attrs.oomd2.kill_target.as_str(), + "threshold": attrs.oomd2.swap_protection_kill_threshold.as_str(), + "recursive": true, + } + } + ] + } +} + fn rule_senpai_ruleset(attrs: &ConfigParams) -> json::JsonValue { let mut action_args = json::object! { "io_pressure_pct": attrs.senpai.io_pressure_pct.as_str(), @@ -399,10 +453,9 @@ fn rule_senpai_drop_in_ruleset(attrs: &ConfigParams) -> json::JsonValue { } } -fn rule_tw_container_drop_in_ruleset() -> json::JsonValue { - json::object! { +fn rule_tw_container_drop_in_ruleset(attrs: &ConfigParams) -> json::JsonValue { + let mut rule = json::object! { "name": "tw_container drop-in ruleset", - "prekill_hook_timeout": "45", "drop-in": { "detectors": true, "actions": true, @@ -423,6 +476,93 @@ fn rule_tw_container_drop_in_ruleset() -> json::JsonValue { "args": {} } ], + }; + + if attrs.host_type != HostType::DevServer { + rule["prekill_hook_timeout"] = json::JsonValue::String(String::from("45")); + } + + rule +} + +fn rule_user_session_protection(node: &Node, attrs: &ConfigParams) -> json::JsonValue { + let mut user_pressure_detector = json::array! { + format!("user pressure above {} for 300s", attrs.devserver.user_mempress), + { + "name": "pressure_above", + "args": { + "cgroup": "user.slice,workload.slice,www.slice", + "resource": "memory", + "threshold": attrs.devserver.user_mempress.as_str(), + "duration": "300", + } + }, + }; + + let mut system_pressure_detector = json::array! { + format!("system pressure above {} for 300s", attrs.devserver.system_mempress), + { + "name": "pressure_above", + "args": { + "cgroup": "system.slice", + "resource": "memory", + "threshold": attrs.devserver.system_mempress.as_str(), + "duration": "300" + }, + } + }; + + if node.in_dynamic_smc_tier("devbig") { + _ = user_pressure_detector.push(json::object! { + "name": "nr_dying_descendants", + "args": { + "cgroup": "/", + "count": "30000", + "lte": "true" + } + }); + + _ = system_pressure_detector.push(json::object! { + "name": "nr_dying_descendants", + "args": { + "cgroup": "/", + "count": "30000", + "lte": "true" + } + }); + } + + _ = user_pressure_detector.push(json::object! { + "name": "memory_reclaim", + "args": { + "cgroup": "user.slice,workload.slice,www.slice", + "duration": "30" + } + }); + + _ = system_pressure_detector.push(json::object! { + "name": "memory_reclaim", + "args": { + "cgroup": "system.slice", + "duration": "30" + } + }); + + json::object! { + "name": "user session protection", + "detectors": [ + user_pressure_detector, + system_pressure_detector, + ], + "actions": [ + { + "name": "kill_by_memory_size_or_growth", + "args": { + "cgroup": attrs.oomd2.kill_target.as_str(), + "recursive": true, + } + } + ] } } @@ -445,6 +585,7 @@ fn get_attributes(node: &Node) -> ConfigParams { oomd2: Oomd2Attributes { blacklisted_jobs: Vec::new(), disable_swap_protection: false, + kill_target: String::from("user.slice/,system.slice/,workload.slice/,www.slice/"), plugins: convert_args!(btreemap!( "pressure_above" => "pressure_above", "pressure_rising_beyond" => "pressure_rising_beyond", @@ -467,10 +608,14 @@ fn get_attributes(node: &Node) -> ConfigParams { oomd_restart_threshold: oomd2_oomd_restart_threshold(), oomd_reclaim_duation: String::from("10"), oomd_post_action_delay: String::from("15"), + swap_protection_detect_threshold: String::from("5"), + swap_protection_kill_threshold: String::from("5"), }, devserver: DevServerAttributes { - user_mempress: String::from("60"), - system_mempress: String::from("80"), + // TODO(chengxiong): add overriding logic for user_mempress and system_mempress. + // Like this: https://fburl.com/code/rjcg895c + user_mempress: String::from("40"), + system_mempress: String::from("60"), }, senpai: SenpaiAttributes { silence_logs: String::from("engine"), @@ -567,6 +712,10 @@ fn get_host_type(node: &Node) -> HostType { if node.hostname_prefix() == "twshared".into() { return HostType::TwShared; } + + if node.is_devserver() { + return HostType::DevServer; + } HostType::Default } @@ -586,6 +735,7 @@ mod tests { #[rstest] #[case::shard99("twshared2434.02.cco1", HostType::TwShared)] + #[case::shard99("devvm3170.cln0", HostType::DevServer)] fn test_get_host_type(#[case] hostname: &str, #[case] expected: HostType) { let node = FakeNodeBuilder::new().hostname(hostname).build(); assert_eq!(get_host_type(&node), expected); diff --git a/src/oomd/cfgen/src/types.rs b/src/oomd/cfgen/src/types.rs index ba941ef5..83cdb1e6 100644 --- a/src/oomd/cfgen/src/types.rs +++ b/src/oomd/cfgen/src/types.rs @@ -5,7 +5,7 @@ use libcfgen::prelude::json::JsonValue; #[derive(Debug, PartialEq, Eq)] pub enum HostType { Default, - ShellServer, + DevServer, OnDemand, TwShared, } @@ -36,6 +36,7 @@ pub struct FBTax2Attributes { pub struct Oomd2Attributes { pub blacklisted_jobs: Vec, pub disable_swap_protection: bool, + pub kill_target: String, pub plugins: BTreeMap, pub oomd_dry: bool, pub oomd_disable_on_drop_in: bool, @@ -49,6 +50,8 @@ pub struct Oomd2Attributes { pub oomd_restart_threshold: BTreeMap, pub oomd_reclaim_duation: String, pub oomd_post_action_delay: String, + pub swap_protection_detect_threshold: String, + pub swap_protection_kill_threshold: String, } pub struct DevServerAttributes { diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/devvm.json b/src/oomd/cfgen/test/cfgen_test_inputs/devvm.json new file mode 100644 index 00000000..eef94b7b --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_inputs/devvm.json @@ -0,0 +1,85 @@ +@generated SignedSource<> +@codegen-command arc cfgen update-inputs fb-oomd +{ + "fqdn": "devvm3170.cln0.facebook.com", + "region": "ireland", + "clusterType": "DEV", + "modelId": 4077, + "kernelRelease": "5.19.0-0_fbk11_hardened_11538_g61e82533119f", + "serverType": "TYPE_VIRTUAL_MACHINE", + "experiments": [], + "cpuArchitecture": "", + "metalosRootfs": false, + "provisioningConfig": { + "ethtoolByInterface": { + "eth0": { + "maxChannelsCombined": 8 + } + }, + "cpuCoreCount": 72, + "parentModelId": 333282, + "recoveryEnvironment": false, + "deviceType": "SERVER", + "datacenter": "cln0", + "cluster": "01", + "memTotal": 240305004544, + "osVersion": { + "distribution_name": "CentOS Stream release", + "version": 9, + "is_in_ramdisk": false, + "is_metalos": false + }, + "pciByAddress": { + "0000:00:02.0": { + "vendor_id": 6900, + "device_id": 4096, + "class_code": 131072 + } + }, + "static_smc_tiers": [], + "machine": "x86_64" + }, + "bootConfig": { + "ethtoolByInterface": { + "eth0": { + "driver": "virtio_net", + "driver_version": "1.0.0", + "firmware_version": "", + "bus_info": "0000:00:02.0" + } + } + }, + "runtimeConfig": { + "hasHighPrivCert": true, + "regionRoutableCluster": "cln1.02", + "block_devices": { + "block_devices": { + "vda": { + "vendor": "0x1af4", + "size_bytes": 1717986918400, + "is_rotational": true, + "physical_block_size": 512, + "logical_block_size": 512, + "is_root": true + } + } + }, + "dynamic_smc_tiers": [], + "cluster_state": "CLUSTER_IN_USE", + "installed_platforms": [ + "platform009", + "platform010", + "platform010-aarch64", + "platform010-compat" + ], + "device_nics_enum": [ + "ETH0", + "ETH1", + "SVC0" + ] + }, + "reservationConfig": { + "active_machine_materialization_id": "", + "current_reservation_host_profile_id": "NEWLY_PROVISIONED_PROFILE" + } +} diff --git a/src/oomd/cfgen/test/cfgen_test_manifest.yml b/src/oomd/cfgen/test/cfgen_test_manifest.yml index 605a6ef3..1c4b37a5 100644 --- a/src/oomd/cfgen/test/cfgen_test_manifest.yml +++ b/src/oomd/cfgen/test/cfgen_test_manifest.yml @@ -9,5 +9,6 @@ # You're only going to need it in rare cases, e.g. when new samples are added. # `arc cfgen ui fb-oomd --remcmd-use-globalcert` library_samples: + - devvm - twshared_vll_shard00 # Add more samples from https://fburl.com/code/vjwmkoa1 if needed diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/devvm/50-change-propagator.conf b/src/oomd/cfgen/test/cfgen_test_outputs/devvm/50-change-propagator.conf new file mode 100644 index 00000000..240e90fc --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/devvm/50-change-propagator.conf @@ -0,0 +1,4 @@ +@generated SignedSource<<31b3f2f747768088bd5523d8e690bfac>> +@codegen-command arc cfgen update-outputs fb-oomd +[Service] +[Unit] diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/devvm/oomd2.json b/src/oomd/cfgen/test/cfgen_test_outputs/devvm/oomd2.json new file mode 100644 index 00000000..a0943da1 --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/devvm/oomd2.json @@ -0,0 +1,176 @@ +@generated SignedSource<> +@codegen-command arc cfgen update-outputs fb-oomd +{ + "rulesets": [ + { + "name": "system overview", + "silence-logs": "engine", + "detectors": [ + [ + "records system stats", + { + "name": "dump_cgroup_overview", + "args": { + "cgroup": "system.slice" + } + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "user session protection", + "detectors": [ + [ + "user pressure above 40 for 300s", + { + "name": "pressure_above", + "args": { + "cgroup": "user.slice,workload.slice,www.slice", + "resource": "memory", + "threshold": "40", + "duration": "300" + } + }, + { + "name": "memory_reclaim", + "args": { + "cgroup": "user.slice,workload.slice,www.slice", + "duration": "30" + } + } + ], + [ + "system pressure above 60 for 300s", + { + "name": "pressure_above", + "args": { + "cgroup": "system.slice", + "resource": "memory", + "threshold": "60", + "duration": "300" + } + }, + { + "name": "memory_reclaim", + "args": { + "cgroup": "system.slice", + "duration": "30" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_memory_size_or_growth", + "args": { + "cgroup": "user.slice/,system.slice/,workload.slice/,www.slice/", + "recursive": true + } + } + ] + }, + { + "name": "protection against low swap", + "detectors": [ + [ + "free swap goes below 5%", + { + "name": "swap_free", + "args": { + "threshold_pct": "5" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_swap_usage", + "args": { + "cgroup": "user.slice/,system.slice/,workload.slice/,www.slice/", + "threshold": "5", + "recursive": true + } + } + ] + }, + { + "name": "senpai drop-in ruleset", + "silence-logs": "engine", + "drop-in": { + "actions": true, + "disable-on-drop-in": true + }, + "detectors": [ + [ + "continue detector group", + { + "name": "continue", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "restart smc_proxy.service on memory threshold", + "detectors": [ + [ + "memory usage above", + { + "name": "memory_above", + "args": { + "cgroup": "smc_proxy.service", + "threshold_anon": "15G", + "duration": "10" + } + } + ] + ], + "actions": [ + { + "name": "systemd_restart", + "args": { + "service": "smc_proxy.service", + "post_action_delay": "20", + "dry": "false" + } + } + ] + }, + { + "name": "tw_container drop-in ruleset", + "drop-in": { + "detectors": true, + "actions": true, + "disable-on-drop-in": true + }, + "detectors": [ + [ + "continue", + { + "name": "stop", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + } + ], + "version": "1.0.0" +} \ No newline at end of file diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/twshared_vll_shard00/oomd2.json b/src/oomd/cfgen/test/cfgen_test_outputs/twshared_vll_shard00/oomd2.json index 2d3dc117..d5379a34 100644 --- a/src/oomd/cfgen/test/cfgen_test_outputs/twshared_vll_shard00/oomd2.json +++ b/src/oomd/cfgen/test/cfgen_test_outputs/twshared_vll_shard00/oomd2.json @@ -1,4 +1,4 @@ -@generated SignedSource<> +@generated SignedSource<> @codegen-command arc cfgen update-outputs fb-oomd { "rulesets": [ @@ -175,7 +175,6 @@ }, { "name": "tw_container drop-in ruleset", - "prekill_hook_timeout": "45", "drop-in": { "detectors": true, "actions": true, @@ -195,7 +194,8 @@ "name": "continue", "args": {} } - ] + ], + "prekill_hook_timeout": "45" } ], "version": "1.0.0"