From 35a35270c0934b9c2703f54c2839a87b853491c1 Mon Sep 17 00:00:00 2001 From: Chengxiong Ruan Date: Wed, 12 Jun 2024 08:30:59 -0700 Subject: [PATCH] cfgen: config override for edge hosts Summary: same as title Reviewed By: anps77 Differential Revision: D58466056 fbshipit-source-id: 38276d0fc93d7e0bcbf6f56f5dd8426f42a1392d --- src/oomd/cfgen/src/cfgen.rs | 62 ++++-- src/oomd/cfgen/src/types.rs | 3 + .../test/cfgen_test_inputs/devgpu_T17.json | 12 +- .../cfgen_test_inputs/devgpu_non_T17.json | 4 +- .../devvm_with_samll_ram.json | 8 +- .../test/cfgen_test_inputs/fna_shard00.json | 148 +++++++++++++ .../cfgen/test/cfgen_test_inputs/fnedge.json | 149 +++++++++++++ .../cfgen/test/cfgen_test_inputs/gedge.json | 141 ++++++++++++ .../cfgen_test_inputs/twpool_no_senpai.json | 12 +- .../twshared_senpai_disabled.json | 12 +- .../twshared_senpai_disabled_sandisk.json | 12 +- src/oomd/cfgen/test/cfgen_test_manifest.yml | 3 + .../fna_shard00/50-change-propagator.conf | 6 + .../cfgen_test_outputs/fna_shard00/oomd2.json | 176 +++++++++++++++ .../fnedge/50-change-propagator.conf | 6 + .../test/cfgen_test_outputs/fnedge/oomd2.json | 202 ++++++++++++++++++ .../gedge/50-change-propagator.conf | 6 + .../test/cfgen_test_outputs/gedge/oomd2.json | 176 +++++++++++++++ 18 files changed, 1104 insertions(+), 34 deletions(-) create mode 100644 src/oomd/cfgen/test/cfgen_test_inputs/fna_shard00.json create mode 100644 src/oomd/cfgen/test/cfgen_test_inputs/fnedge.json create mode 100644 src/oomd/cfgen/test/cfgen_test_inputs/gedge.json create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/50-change-propagator.conf create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/oomd2.json create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/fnedge/50-change-propagator.conf create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/fnedge/oomd2.json create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/gedge/50-change-propagator.conf create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/gedge/oomd2.json diff --git a/src/oomd/cfgen/src/cfgen.rs b/src/oomd/cfgen/src/cfgen.rs index b495834a..2322641d 100644 --- a/src/oomd/cfgen/src/cfgen.rs +++ b/src/oomd/cfgen/src/cfgen.rs @@ -746,7 +746,7 @@ fn get_attributes(node: &Node) -> ConfigParams { oomd_high_threshold_duration: String::from("60"), oomd_threshold: String::from("60"), oomd_threshold_duration: String::from("90"), - oomd_restart_threshold: oomd2_oomd_restart_threshold(), + oomd_restart_threshold: oomd2_oomd_restart_threshold(node), oomd_reclaim_duation: String::from("10"), oomd_post_action_delay: String::from("15"), swap_protection_detect_threshold: String::from("5"), @@ -870,13 +870,17 @@ fn oomd_extra_rulesets(node: &Node) -> Vec { ] } -fn oomd2_oomd_restart_threshold() -> BTreeMap { - btreemap! { - String::from("smc_proxy.service") => OomdRestartThreshold{ - threshold: String::from("15G"), - duration: String::from("10"), - post_action_delay: String::from("20"), - service_name: String::from("smc_proxy.service")} +fn oomd2_oomd_restart_threshold(node: &Node) -> BTreeMap { + if [HostType::GEdge, HostType::Fna].contains(&get_host_type(node)) { + btreemap! {} + } else { + btreemap! { + String::from("smc_proxy.service") => OomdRestartThreshold{ + threshold: String::from("15G"), + duration: String::from("10"), + post_action_delay: String::from("20"), + service_name: String::from("smc_proxy.service")} + } } } @@ -913,7 +917,15 @@ fn should_setup_iocost(node: &Node) -> bool { } fn fbtax2_blacklisted_jobs(node: &Node) -> Vec<&'static str> { - if [HostType::TwShared, HostType::Tw].contains(&get_host_type(node)) { + if [ + HostType::TwShared, + HostType::Tw, + HostType::FnEdge, + HostType::GEdge, + HostType::Fna, + ] + .contains(&get_host_type(node)) + { return vec![ // This ML model has extremely high memory usage, they need to fix // their stuff at some point. @@ -945,16 +957,27 @@ fn senpai_targets(node: &Node) -> Option { } match get_host_type(node) { - HostType::TwShared | HostType::Tw => Some(String::from( - "system.slice,workload.slice/workload-wdb.slice,hostcritical.slice,workload.slice/workload-wdb.slice/*,hostcritical.slice/*", - )), + HostType::TwShared | HostType::Tw | HostType::FnEdge | HostType::GEdge | HostType::Fna => { + Some(String::from( + "system.slice,workload.slice/workload-wdb.slice,hostcritical.slice,workload.slice/workload-wdb.slice/*,hostcritical.slice/*", + )) + } HostType::Synmon => Some(String::from("system.slice")), _ => None, } } fn senpai_limit_min_bytes(node: &Node) -> Option { - if [HostType::TwShared, HostType::Tw, HostType::Synmon].contains(&get_host_type(node)) { + if [ + HostType::TwShared, + HostType::Tw, + HostType::Synmon, + HostType::FnEdge, + HostType::GEdge, + HostType::Fna, + ] + .contains(&get_host_type(node)) + { let min_bytes = 100 * 1024 * 1024; return Some(min_bytes.to_string()); } @@ -999,9 +1022,22 @@ fn get_host_type(node: &Node) -> HostType { return HostType::Dns; } + if node.hostname_prefix() == FNEDGE { + return HostType::FnEdge; + } + + if node.hostname_prefix() == GEDGE { + return HostType::GEdge; + } + + if node.hostname_prefix() == FNA { + return HostType::Fna; + } + if node.is_devserver() { return HostType::DevServer; } + HostType::Default } diff --git a/src/oomd/cfgen/src/types.rs b/src/oomd/cfgen/src/types.rs index a78dd80c..b3f95e09 100644 --- a/src/oomd/cfgen/src/types.rs +++ b/src/oomd/cfgen/src/types.rs @@ -14,6 +14,9 @@ pub enum HostType { Tw, Synmon, Dns, + FnEdge, + GEdge, + Fna, } pub struct ConfigParams { diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_T17.json b/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_T17.json index 0d6ea633..a69cd7b6 100644 --- a/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_T17.json +++ b/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_T17.json @@ -1,4 +1,4 @@ -@generated SignedSource<> +@generated SignedSource<> @codegen-command arc cfgen update-inputs fb-oomd { "fqdn": "devgpu109.cln3.facebook.com", @@ -36,7 +36,9 @@ "vendor_id": 5348, "device_id": 5969, "class_code": 131072, - "board_part_number": "BCM957504-N1100FXB" + "board_part_number": "BCM957504-N1100FXB", + "current_speed_mts": 8000, + "current_width": 16 } }, "static_smc_tiers": [], @@ -48,7 +50,8 @@ "driver": "bnxt_en", "driver_version": "5.19.0-0_fbk9_fbaccel.phvuisqkh", "firmware_version": "220.0.59.0/pkg 220.0.83.0", - "bus_info": "0000:b3:00.0" + "bus_info": "0000:b3:00.0", + "speed_mbps": 100000 } } }, @@ -115,7 +118,8 @@ "device_nics_enum": [ "ETH0", "SVC0" - ] + ], + "loaded_kernel_modules": [] }, "reservationConfig": { "active_machine_materialization_id": "", diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_non_T17.json b/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_non_T17.json index 35489401..73f905f8 100644 --- a/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_non_T17.json +++ b/src/oomd/cfgen/test/cfgen_test_inputs/devgpu_non_T17.json @@ -1,4 +1,4 @@ -@generated SignedSource<<94b019e8a8c2490e29066f90748585d9>> +@generated SignedSource<> @codegen-command arc cfgen update-inputs fb-oomd { "fqdn": "devgpu013.eag5.facebook.com", @@ -55,7 +55,7 @@ "deviceType": "SERVER", "datacenter": "eag5", "cluster": "04", - "memTotal": 2163675131904, + "memTotal": 2163675127808, "osVersion": { "distribution_name": "CentOS Stream release", "version": 9, diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/devvm_with_samll_ram.json b/src/oomd/cfgen/test/cfgen_test_inputs/devvm_with_samll_ram.json index 9892f0bd..07ea0dc3 100644 --- a/src/oomd/cfgen/test/cfgen_test_inputs/devvm_with_samll_ram.json +++ b/src/oomd/cfgen/test/cfgen_test_inputs/devvm_with_samll_ram.json @@ -1,4 +1,4 @@ -@generated SignedSource<<5c629f38a06ffc96c81ea393c0ff79c0>> +@generated SignedSource<<7094d5f857ef7db387134b6aaa98ec53>> @codegen-command arc cfgen update-inputs fb-oomd { "fqdn": "devvm010.atn6.facebook.com", @@ -45,7 +45,8 @@ "driver": "virtio_net", "driver_version": "1.0.0", "firmware_version": "", - "bus_info": "0000:00:02.0" + "bus_info": "0000:00:02.0", + "speed_mbps": 0 } } }, @@ -75,7 +76,8 @@ "device_nics_enum": [ "ETH0", "SVC0" - ] + ], + "loaded_kernel_modules": [] }, "reservationConfig": { "active_machine_materialization_id": "", diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/fna_shard00.json b/src/oomd/cfgen/test/cfgen_test_inputs/fna_shard00.json new file mode 100644 index 00000000..243d8f3c --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_inputs/fna_shard00.json @@ -0,0 +1,148 @@ +@generated SignedSource<<77a12826ffacd49093dab7d916d4ce1c>> +@codegen-command arc cfgen update-inputs fb-oomd +{ + "fqdn": "fna007.01.futp1.facebook.com", + "region": "other", + "clusterType": "FNA", + "modelId": 342361, + "kernelRelease": "5.19.0-0_fbk21_hardened_rc9_12630_gab30f3f5259a", + "serverType": "TYPE_VI_FEED", + "experiments": [], + "cpuArchitecture": "icelake", + "metalosRootfs": false, + "provisioningConfig": { + "ethtoolByInterface": { + "eth0": { + "maxChannelsCombined": 63 + }, + "ip6tnl0": {}, + "tunl0": {}, + "tunlany0": {} + }, + "cpuCoreCount": 48, + "parentModelId": 328972, + "recoveryEnvironment": false, + "deviceType": "SERVER", + "datacenter": "futp1", + "cluster": "01", + "memTotal": 265750892544, + "osVersion": { + "distribution_name": "EdgeOS", + "version": 1708953544, + "is_in_ramdisk": false, + "is_metalos": false + }, + "pciByAddress": { + "0000:47:00.0": { + "vendor_id": 5555, + "device_id": 4125, + "class_code": 131072, + "board_part_number": "MCX623435AC-CDAI", + "current_speed_mts": 16000, + "current_width": 16 + } + }, + "static_smc_tiers": [], + "machine": "x86_64" + }, + "bootConfig": { + "ethtoolByInterface": { + "eth0": { + "driver": "mlx5_core", + "driver_version": "5.19.0-0_fbk21_hardened_rc9_126", + "firmware_version": "22.34.4000 (MT_0000000695)", + "bus_info": "0000:47:00.0", + "speed_mbps": 100000 + }, + "ip6tnl0": { + "driver": "ip6tnl", + "driver_version": "5.19.0-0_fbk21_hardened_rc9_126", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + }, + "tunl0": { + "driver": "ipip", + "driver_version": "5.19.0-0_fbk21_hardened_rc9_126", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + }, + "tunlany0": { + "driver": "ip6tnl", + "driver_version": "5.19.0-0_fbk21_hardened_rc9_126", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + } + } + }, + "runtimeConfig": { + "hasHighPrivCert": true, + "regionRoutableCluster": "oth1.01", + "block_devices": { + "block_devices": { + "nvme0n1": { + "size_bytes": 3840755982336, + "is_rotational": false, + "model": "SAMSUNG MZQL23T8HCLS-00A07", + "serial": "S64HNJ0T644493", + "physical_block_size": 4096, + "logical_block_size": 4096, + "is_root": false + }, + "nvme1n1": { + "size_bytes": 960197124096, + "is_rotational": false, + "model": "SAMSUNG MZQL2960HCJR-00A07", + "serial": "S64FNE0T407240", + "physical_block_size": 4096, + "logical_block_size": 512, + "is_root": true + }, + "nvme2n1": { + "size_bytes": 3840755982336, + "is_rotational": false, + "model": "SAMSUNG MZQL23T8HCLS-00A07", + "serial": "S64HNJ0T644491", + "physical_block_size": 4096, + "logical_block_size": 4096, + "is_root": false + } + } + }, + "dynamic_smc_tiers": [], + "cluster_state": "CLUSTER_IN_USE", + "installed_platforms": [ + "platform010", + "platform010-compat" + ], + "device_nics_enum": [ + "ETH0", + "SVC0" + ], + "loaded_kernel_modules": [], + "mounts": { + "mounts": [ + { + "device": "/dev/mapper/transient", + "mount_point": "/", + "fstype": "btrfs", + "mount_options": [ + "rw", + "relatime", + "compress-force=zstd:3", + "ssd", + "space_cache=v2", + "subvolid=256", + "subvol=/volume" + ] + } + ] + } + }, + "reservationConfig": { + "active_machine_materialization_id": "", + "current_reservation_host_profile_id": "NEWLY_PROVISIONED_PROFILE" + } +} diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/fnedge.json b/src/oomd/cfgen/test/cfgen_test_inputs/fnedge.json new file mode 100644 index 00000000..a6b194ca --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_inputs/fnedge.json @@ -0,0 +1,149 @@ +@generated SignedSource<> +@codegen-command arc cfgen update-inputs fb-oomd +{ + "fqdn": "fnedge624.03.sin6.facebook.com", + "region": "oregon", + "clusterType": "POP", + "modelId": 342539, + "kernelRelease": "5.19.0-0_fbk21_hardened_rc9_12630_gab30f3f5259a", + "serverType": "TYPE_VI_FEED", + "experiments": [], + "cpuArchitecture": "skylake", + "metalosRootfs": false, + "provisioningConfig": { + "ethtoolByInterface": { + "eth0": { + "maxChannelsRx": 80, + "maxChannelsTx": 32 + }, + "ip6tnl0": {}, + "tunl0": {}, + "tunlany0": {} + }, + "cpuCoreCount": 40, + "parentModelId": 334413, + "recoveryEnvironment": false, + "deviceType": "SERVER", + "datacenter": "sin6", + "cluster": "03", + "memTotal": 269953482752, + "osVersion": { + "distribution_name": "EdgeOS", + "version": 1708953544, + "is_in_ramdisk": false, + "is_metalos": false + }, + "pciByAddress": { + "0000:5d:00.0": { + "vendor_id": 5555, + "device_id": 4103, + "class_code": 163840, + "board_part_number": "764285-B21", + "current_speed_mts": 8000, + "current_width": 8 + } + }, + "static_smc_tiers": [], + "machine": "x86_64" + }, + "bootConfig": { + "ethtoolByInterface": { + "eth0": { + "driver": "mlx4_en", + "driver_version": "4.0-0", + "firmware_version": "2.42.5700", + "bus_info": "0000:5d:00.0", + "speed_mbps": 40000 + }, + "ip6tnl0": { + "driver": "ip6tnl", + "driver_version": "5.19.0-0_fbk21_hardened_rc9_126", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + }, + "tunl0": { + "driver": "ipip", + "driver_version": "5.19.0-0_fbk21_hardened_rc9_126", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + }, + "tunlany0": { + "driver": "ip6tnl", + "driver_version": "5.19.0-0_fbk21_hardened_rc9_126", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + } + } + }, + "runtimeConfig": { + "hasHighPrivCert": true, + "regionRoutableCluster": "prn3.06", + "block_devices": { + "block_devices": { + "nvme0n1": { + "size_bytes": 3840755982336, + "is_rotational": false, + "model": "SAMSUNG MZQLB3T8HALS-00007", + "serial": "S438NC0R840659", + "physical_block_size": 4096, + "logical_block_size": 4096, + "is_root": false + }, + "nvme1n1": { + "size_bytes": 3840755982336, + "is_rotational": false, + "model": "SAMSUNG MZQLB3T8HALS-00007", + "serial": "S438NA0N755299", + "physical_block_size": 4096, + "logical_block_size": 4096, + "is_root": false + }, + "sda": { + "size_bytes": 480103981056, + "is_rotational": false, + "model": "VK000480GWSRR", + "serial": "S4NANA0N724609", + "physical_block_size": 4096, + "logical_block_size": 512, + "is_root": true + } + } + }, + "dynamic_smc_tiers": [], + "cluster_state": "CLUSTER_IN_USE", + "installed_platforms": [ + "platform010", + "platform010-compat" + ], + "device_nics_enum": [ + "ETH0", + "SVC0" + ], + "loaded_kernel_modules": [], + "mounts": { + "mounts": [ + { + "device": "/dev/mapper/transient", + "mount_point": "/", + "fstype": "btrfs", + "mount_options": [ + "rw", + "relatime", + "compress-force=zstd:3", + "ssd", + "space_cache=v2", + "subvolid=256", + "subvol=/volume" + ] + } + ] + } + }, + "reservationConfig": { + "active_machine_materialization_id": "", + "current_reservation_host_profile_id": "NEWLY_PROVISIONED_PROFILE" + } +} diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/gedge.json b/src/oomd/cfgen/test/cfgen_test_inputs/gedge.json new file mode 100644 index 00000000..fadcbc5c --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_inputs/gedge.json @@ -0,0 +1,141 @@ +@generated SignedSource<> +@codegen-command arc cfgen update-inputs fb-oomd +{ + "fqdn": "gedge774.50.ord5.facebook.com", + "region": "altoona", + "clusterType": "GAMING_POP", + "modelId": 340028, + "kernelRelease": "5.12.0-0_fbk16_hardened_7661_geb00762ce6d2", + "serverType": "TYPE_VI_FEED", + "experiments": [], + "cpuArchitecture": "skylake", + "metalosRootfs": false, + "provisioningConfig": { + "ethtoolByInterface": { + "eth0": { + "maxChannelsRx": 80, + "maxChannelsTx": 32 + }, + "ip6tnl0": {}, + "tunl0": {}, + "tunlany0": {} + }, + "cpuCoreCount": 40, + "parentModelId": 339661, + "recoveryEnvironment": false, + "deviceType": "SERVER", + "datacenter": "ord5", + "cluster": "50", + "memTotal": 202312957952, + "osVersion": { + "distribution_name": "EdgeOS", + "version": 1700056342, + "is_in_ramdisk": false, + "is_metalos": false + }, + "pciByAddress": { + "0000:5d:00.0": { + "vendor_id": 5555, + "device_id": 4103, + "class_code": 163840, + "board_part_number": "764285-B21", + "current_speed_mts": 8000, + "current_width": 8 + } + }, + "static_smc_tiers": [], + "machine": "x86_64" + }, + "bootConfig": { + "ethtoolByInterface": { + "eth0": { + "driver": "mlx4_en", + "driver_version": "4.0-0", + "firmware_version": "2.42.5700", + "bus_info": "0000:5d:00.0", + "speed_mbps": 40000 + }, + "ip6tnl0": { + "driver": "", + "driver_version": "", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + }, + "tunl0": { + "driver": "", + "driver_version": "", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + }, + "tunlany0": { + "driver": "", + "driver_version": "", + "firmware_version": "", + "bus_info": "", + "speed_mbps": 0 + } + } + }, + "runtimeConfig": { + "hasHighPrivCert": true, + "regionRoutableCluster": "atn1.05", + "block_devices": { + "block_devices": { + "sda": { + "size_bytes": 1920383410176, + "is_rotational": false, + "model": "VK001920GWTTC", + "serial": "PHYF021301PN1P9DGN", + "physical_block_size": 4096, + "logical_block_size": 512, + "is_root": true + } + } + }, + "dynamic_smc_tiers": [ + "edge_pool_gedge.ord5c50", + "edge_pool_gedge.ord5c50.spare" + ], + "cluster_state": "CLUSTER_IN_USE", + "installed_platforms": [ + "platform010", + "platform010-compat" + ], + "device_nics_enum": [ + "ETH0", + "ETH1", + "SVC0", + "SVC1" + ], + "loaded_kernel_modules": [ + { + "name": "nvidia", + "version": "525.105.17\n" + } + ], + "mounts": { + "mounts": [ + { + "device": "/dev/mapper/transient", + "mount_point": "/", + "fstype": "btrfs", + "mount_options": [ + "rw", + "relatime", + "compress-force=zstd:3", + "ssd", + "space_cache=v2", + "subvolid=256", + "subvol=/volume" + ] + } + ] + } + }, + "reservationConfig": { + "active_machine_materialization_id": "", + "current_reservation_host_profile_id": "EDGE:EDGE_NONE" + } +} diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/twpool_no_senpai.json b/src/oomd/cfgen/test/cfgen_test_inputs/twpool_no_senpai.json index d2b92443..f9d77a32 100644 --- a/src/oomd/cfgen/test/cfgen_test_inputs/twpool_no_senpai.json +++ b/src/oomd/cfgen/test/cfgen_test_inputs/twpool_no_senpai.json @@ -1,4 +1,4 @@ -@generated SignedSource<> +@generated SignedSource<> @codegen-command arc cfgen update-inputs fb-oomd { "fqdn": "tw130.03.ldc2.facebook.com", @@ -36,7 +36,9 @@ "vendor_id": 5348, "device_id": 5833, "class_code": 131072, - "board_part_number": "BCM957302M3023CBK" + "board_part_number": "BCM957302M3023CBK", + "current_speed_mts": 8000, + "current_width": 8 } }, "static_smc_tiers": [], @@ -48,7 +50,8 @@ "driver": "bnxt_en", "driver_version": "5.19.0-0_fbk12_11583_g0bef9520c", "firmware_version": "20.6.167.0/pkg 20.6.4.12", - "bus_info": "0000:5e:00.0" + "bus_info": "0000:5e:00.0", + "speed_mbps": 25000 } } }, @@ -77,7 +80,8 @@ "device_nics_enum": [ "ETH0", "SVC0" - ] + ], + "loaded_kernel_modules": [] }, "reservationConfig": { "active_machine_materialization_id": "", diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled.json b/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled.json index d7558326..192ef4f2 100644 --- a/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled.json +++ b/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled.json @@ -1,4 +1,4 @@ -@generated SignedSource<<9acea0c585c99a8d3339ae5cc6cc1725>> +@generated SignedSource<> @codegen-command arc cfgen update-inputs fb-oomd { "fqdn": "twshared44829.07.ash9.facebook.com", @@ -36,7 +36,9 @@ "vendor_id": 5348, "device_id": 5833, "class_code": 131072, - "board_part_number": "BCM957302M3023CBK" + "board_part_number": "BCM957302M3023CBK", + "current_speed_mts": 8000, + "current_width": 8 } }, "static_smc_tiers": [], @@ -48,7 +50,8 @@ "driver": "bnxt_en", "driver_version": "5.12.0-0_fbk13_clang_7455_gb24d", "firmware_version": "20.6.167.0/pkg 20.6.4.12", - "bus_info": "0000:5e:00.0" + "bus_info": "0000:5e:00.0", + "speed_mbps": 25000 } } }, @@ -88,7 +91,8 @@ "device_nics_enum": [ "ETH0", "SVC0" - ] + ], + "loaded_kernel_modules": [] }, "reservationConfig": { "active_machine_materialization_id": "6321b443b7f1a", diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled_sandisk.json b/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled_sandisk.json index 21f7c85f..1fe8de79 100644 --- a/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled_sandisk.json +++ b/src/oomd/cfgen/test/cfgen_test_inputs/twshared_senpai_disabled_sandisk.json @@ -1,4 +1,4 @@ -@generated SignedSource<> +@generated SignedSource<> @codegen-command arc cfgen update-inputs fb-oomd { "fqdn": "twshared15369.35.frc1.facebook.com", @@ -35,7 +35,9 @@ "vendor_id": 5555, "device_id": 4117, "class_code": 131072, - "board_part_number": "MCX4431M-GCAN_FB" + "board_part_number": "MCX4431M-GCAN_FB", + "current_speed_mts": 8000, + "current_width": 2 } }, "static_smc_tiers": [], @@ -47,7 +49,8 @@ "driver": "mlx5_core", "driver_version": "6.4.3-0_fbk2_785_gacbb203ea6ff", "firmware_version": "14.27.2606 (FB_2510111032)", - "bus_info": "0000:04:00.0" + "bus_info": "0000:04:00.0", + "speed_mbps": 50000 } } }, @@ -91,7 +94,8 @@ "device_nics_enum": [ "ETH0", "SVC0" - ] + ], + "loaded_kernel_modules": [] }, "reservationConfig": { "active_machine_materialization_id": "63d2bf3f97b2d", diff --git a/src/oomd/cfgen/test/cfgen_test_manifest.yml b/src/oomd/cfgen/test/cfgen_test_manifest.yml index 1e34adc9..c4635c10 100644 --- a/src/oomd/cfgen/test/cfgen_test_manifest.yml +++ b/src/oomd/cfgen/test/cfgen_test_manifest.yml @@ -12,6 +12,9 @@ library_samples: - devvm - twshared_vll_shard00 - twshared_t20_zionex + - fnedge + - gedge + - fna_shard00 # Add more samples from https://fburl.com/code/vjwmkoa1 if needed samples: ondemand: diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/50-change-propagator.conf b/src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/50-change-propagator.conf new file mode 100644 index 00000000..c44ba252 --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/50-change-propagator.conf @@ -0,0 +1,6 @@ +@generated SignedSource<<3dd8c7637bb7afa680fc168e9c49060d>> +@codegen-command arc cfgen update-outputs fb-oomd +[Service] +Environment=OOMD_ARGS='--interval 1 --config /etc/oomd2.json --drop-in-dir /run/oomd/dropin' + +[Unit] diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/oomd2.json b/src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/oomd2.json new file mode 100644 index 00000000..b9e2b5cb --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/fna_shard00/oomd2.json @@ -0,0 +1,176 @@ +@generated SignedSource<> +@codegen-command arc cfgen update-outputs fb-oomd +{ + "rulesets": [ + { + "name": "system overview", + "silence-logs": "engine", + "detectors": [ + [ + "records system stats", + { + "name": "dump_cgroup_overview", + "args": { + "cgroup": "workload.slice" + } + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "protection against heavy workload thrashing", + "drop-in": { + "disable-on-drop-in": true, + "detectors": true, + "actions": true + }, + "detectors": [ + [ + "sustained high workload memory pressure", + { + "name": "exists", + "args": { + "cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*", + "negate": true + } + }, + { + "name": "pressure_above", + "args": { + "cgroup": "workload.slice/workload-tw.slice", + "duration": "180", + "resource": "memory", + "threshold": "80" + } + }, + { + "name": "memory_reclaim", + "args": { + "cgroup": "workload.slice/workload-tw.slice", + "duration": "10" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_pg_scan", + "args": { + "cgroup": "workload.slice/workload-tw.slice/*", + "recursive": "true" + } + } + ] + }, + { + "name": "protection against low swap", + "detectors": [ + [ + "free swap goes below 10 percent", + { + "name": "exists", + "args": { + "cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*", + "negate": true + } + }, + { + "name": "swap_free", + "args": { + "threshold_pct": "10" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_swap_usage", + "args": { + "biased_swap_kill": "true", + "cgroup": "system.slice/*,workload.slice/workload-wdb.slice/*,workload.slice/workload-tw.slice/*", + "recursive": "true" + } + } + ] + }, + { + "name": "senpai ruleset", + "silence-logs": "engine", + "detectors": [ + [ + "continue detector group", + { + "name": "continue", + "args": {} + } + ] + ], + "actions": [ + { + "name": "senpai_poking", + "args": { + "cgroup": "system.slice,workload.slice/workload-wdb.slice,hostcritical.slice,workload.slice/workload-wdb.slice/*,hostcritical.slice/*", + "io_pressure_pct": "1.0", + "memory_high_timeout_ms": "20", + "scuba_logger_dataset": "perfpipe_senpai_events", + "limit_min_bytes": "104857600" + } + } + ] + }, + { + "name": "senpai drop-in ruleset", + "silence-logs": "engine", + "drop-in": { + "disable-on-drop-in": true, + "actions": true + }, + "detectors": [ + [ + "continue detector group", + { + "name": "continue", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "tw_container drop-in ruleset", + "drop-in": { + "disable-on-drop-in": true, + "detectors": true, + "actions": true + }, + "detectors": [ + [ + "continue", + { + "name": "stop", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ], + "prekill_hook_timeout": "45" + } + ], + "version": "1.0.0" +} \ No newline at end of file diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/fnedge/50-change-propagator.conf b/src/oomd/cfgen/test/cfgen_test_outputs/fnedge/50-change-propagator.conf new file mode 100644 index 00000000..c44ba252 --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/fnedge/50-change-propagator.conf @@ -0,0 +1,6 @@ +@generated SignedSource<<3dd8c7637bb7afa680fc168e9c49060d>> +@codegen-command arc cfgen update-outputs fb-oomd +[Service] +Environment=OOMD_ARGS='--interval 1 --config /etc/oomd2.json --drop-in-dir /run/oomd/dropin' + +[Unit] diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/fnedge/oomd2.json b/src/oomd/cfgen/test/cfgen_test_outputs/fnedge/oomd2.json new file mode 100644 index 00000000..c65bf849 --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/fnedge/oomd2.json @@ -0,0 +1,202 @@ +@generated SignedSource<<321bf87df3cdbf0d9f32602db039bec7>> +@codegen-command arc cfgen update-outputs fb-oomd +{ + "rulesets": [ + { + "name": "system overview", + "silence-logs": "engine", + "detectors": [ + [ + "records system stats", + { + "name": "dump_cgroup_overview", + "args": { + "cgroup": "workload.slice" + } + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "restart smc_proxy.service on memory threshold", + "detectors": [ + [ + "memory usage above", + { + "name": "memory_above", + "args": { + "cgroup": "smc_proxy.service", + "duration": "10", + "threshold_anon": "15G" + } + } + ] + ], + "actions": [ + { + "name": "systemd_restart", + "args": { + "dry": "false", + "post_action_delay": "20", + "service": "smc_proxy.service" + } + } + ] + }, + { + "name": "protection against heavy workload thrashing", + "drop-in": { + "disable-on-drop-in": true, + "detectors": true, + "actions": true + }, + "detectors": [ + [ + "sustained high workload memory pressure", + { + "name": "exists", + "args": { + "cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*", + "negate": true + } + }, + { + "name": "pressure_above", + "args": { + "cgroup": "workload.slice/workload-tw.slice", + "duration": "180", + "resource": "memory", + "threshold": "80" + } + }, + { + "name": "memory_reclaim", + "args": { + "cgroup": "workload.slice/workload-tw.slice", + "duration": "10" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_pg_scan", + "args": { + "cgroup": "workload.slice/workload-tw.slice/*", + "recursive": "true" + } + } + ] + }, + { + "name": "protection against low swap", + "detectors": [ + [ + "free swap goes below 10 percent", + { + "name": "exists", + "args": { + "cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*", + "negate": true + } + }, + { + "name": "swap_free", + "args": { + "threshold_pct": "10" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_swap_usage", + "args": { + "biased_swap_kill": "true", + "cgroup": "system.slice/*,workload.slice/workload-wdb.slice/*,workload.slice/workload-tw.slice/*", + "recursive": "true" + } + } + ] + }, + { + "name": "senpai ruleset", + "silence-logs": "engine", + "detectors": [ + [ + "continue detector group", + { + "name": "continue", + "args": {} + } + ] + ], + "actions": [ + { + "name": "senpai_poking", + "args": { + "cgroup": "system.slice,workload.slice/workload-wdb.slice,hostcritical.slice,workload.slice/workload-wdb.slice/*,hostcritical.slice/*", + "io_pressure_pct": "1.0", + "memory_high_timeout_ms": "20", + "scuba_logger_dataset": "perfpipe_senpai_events", + "limit_min_bytes": "104857600" + } + } + ] + }, + { + "name": "senpai drop-in ruleset", + "silence-logs": "engine", + "drop-in": { + "disable-on-drop-in": true, + "actions": true + }, + "detectors": [ + [ + "continue detector group", + { + "name": "continue", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "tw_container drop-in ruleset", + "drop-in": { + "disable-on-drop-in": true, + "detectors": true, + "actions": true + }, + "detectors": [ + [ + "continue", + { + "name": "stop", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ], + "prekill_hook_timeout": "45" + } + ], + "version": "1.0.0" +} \ No newline at end of file diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/gedge/50-change-propagator.conf b/src/oomd/cfgen/test/cfgen_test_outputs/gedge/50-change-propagator.conf new file mode 100644 index 00000000..c44ba252 --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/gedge/50-change-propagator.conf @@ -0,0 +1,6 @@ +@generated SignedSource<<3dd8c7637bb7afa680fc168e9c49060d>> +@codegen-command arc cfgen update-outputs fb-oomd +[Service] +Environment=OOMD_ARGS='--interval 1 --config /etc/oomd2.json --drop-in-dir /run/oomd/dropin' + +[Unit] diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/gedge/oomd2.json b/src/oomd/cfgen/test/cfgen_test_outputs/gedge/oomd2.json new file mode 100644 index 00000000..b9e2b5cb --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/gedge/oomd2.json @@ -0,0 +1,176 @@ +@generated SignedSource<> +@codegen-command arc cfgen update-outputs fb-oomd +{ + "rulesets": [ + { + "name": "system overview", + "silence-logs": "engine", + "detectors": [ + [ + "records system stats", + { + "name": "dump_cgroup_overview", + "args": { + "cgroup": "workload.slice" + } + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "protection against heavy workload thrashing", + "drop-in": { + "disable-on-drop-in": true, + "detectors": true, + "actions": true + }, + "detectors": [ + [ + "sustained high workload memory pressure", + { + "name": "exists", + "args": { + "cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*", + "negate": true + } + }, + { + "name": "pressure_above", + "args": { + "cgroup": "workload.slice/workload-tw.slice", + "duration": "180", + "resource": "memory", + "threshold": "80" + } + }, + { + "name": "memory_reclaim", + "args": { + "cgroup": "workload.slice/workload-tw.slice", + "duration": "10" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_pg_scan", + "args": { + "cgroup": "workload.slice/workload-tw.slice/*", + "recursive": "true" + } + } + ] + }, + { + "name": "protection against low swap", + "detectors": [ + [ + "free swap goes below 10 percent", + { + "name": "exists", + "args": { + "cgroup": "workload.slice/workload-tw.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.reservation.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/*.allotment.slice/sigrid_online_trainer*,workload.slice/workload-tw.slice/analyzer*,workload.slice/workload-tw.slice/*.reservation.slice/analyzer*,workload.slice/workload-tw.slice/*.allotment.slice/analyzer*,workload.slice/workload-tw.slice/bumblebee.*,workload.slice/workload-tw.slice/*.reservation.slice/bumblebee.*,workload.slice/workload-tw.slice/*.allotment.slice/bumblebee.*", + "negate": true + } + }, + { + "name": "swap_free", + "args": { + "threshold_pct": "10" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_swap_usage", + "args": { + "biased_swap_kill": "true", + "cgroup": "system.slice/*,workload.slice/workload-wdb.slice/*,workload.slice/workload-tw.slice/*", + "recursive": "true" + } + } + ] + }, + { + "name": "senpai ruleset", + "silence-logs": "engine", + "detectors": [ + [ + "continue detector group", + { + "name": "continue", + "args": {} + } + ] + ], + "actions": [ + { + "name": "senpai_poking", + "args": { + "cgroup": "system.slice,workload.slice/workload-wdb.slice,hostcritical.slice,workload.slice/workload-wdb.slice/*,hostcritical.slice/*", + "io_pressure_pct": "1.0", + "memory_high_timeout_ms": "20", + "scuba_logger_dataset": "perfpipe_senpai_events", + "limit_min_bytes": "104857600" + } + } + ] + }, + { + "name": "senpai drop-in ruleset", + "silence-logs": "engine", + "drop-in": { + "disable-on-drop-in": true, + "actions": true + }, + "detectors": [ + [ + "continue detector group", + { + "name": "continue", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "tw_container drop-in ruleset", + "drop-in": { + "disable-on-drop-in": true, + "detectors": true, + "actions": true + }, + "detectors": [ + [ + "continue", + { + "name": "stop", + "args": {} + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ], + "prekill_hook_timeout": "45" + } + ], + "version": "1.0.0" +} \ No newline at end of file