From 79f9a3e3ef853fb269ae6d3249884726e5df3385 Mon Sep 17 00:00:00 2001
From: Chengxiong Ruan <chengxiong@meta.com>
Date: Wed, 22 May 2024 09:03:29 -0700
Subject: [PATCH] cfgen: implement ondemand template

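Summary:
Implement the oomd2 config template for OnDemand hosts. Hosts whose hostname
prefix is `od` are detected as HostType::OnDemand, and od_json_config now emits
the system overview, high-memory-pressure, restart-on-memory-threshold, senpai
drop-in and low-swap rulesets. Also corrects the `pressure_rising_beyong` plugin
name typo and adds an `ondemand` sample to the cfgen tests.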

Differential Revision: D57641591

fbshipit-source-id: 66b4c3818a9ca3aecad19cd1f9601fdee913d993
---
 src/oomd/cfgen/src/cfgen.rs                   | 148 +++++++++++++--
 src/oomd/cfgen/src/types.rs                   |   3 +-
 .../test/cfgen_test_inputs/ondemand.json      |  93 ++++++++++
 src/oomd/cfgen/test/cfgen_test_manifest.yml   |   4 +
 .../ondemand/50-change-propagator.conf        |   4 +
 .../cfgen_test_outputs/ondemand/oomd2.json    | 169 ++++++++++++++++++
 6 files changed, 402 insertions(+), 19 deletions(-)
 create mode 100644 src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json
 create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/ondemand/50-change-propagator.conf
 create mode 100644 src/oomd/cfgen/test/cfgen_test_outputs/ondemand/oomd2.json

diff --git a/src/oomd/cfgen/src/cfgen.rs b/src/oomd/cfgen/src/cfgen.rs
index 6cdc9e62..2702a07d 100644
--- a/src/oomd/cfgen/src/cfgen.rs
+++ b/src/oomd/cfgen/src/cfgen.rs
@@ -67,18 +67,20 @@ fn devserver_json_config(node: &Node, attrs: &ConfigParams) -> json::JsonValue {
     }
 }
 
-fn od_json_config(_attrs: &ConfigParams) -> json::JsonValue {
-    // TODO(chengxiong): implement this.
-    json::object! {}
+/// Assembles the oomd2 config for on-demand hosts: the rulesets below, in this order,
+/// followed by the config version.
+fn od_json_config(attrs: &ConfigParams) -> json::JsonValue {
+    let mut rulesets: json::Array = vec![
+        rule_system_overview(attrs),
+        rule_protection_against_high_memory_pressure(attrs),
+    ];
+    rulesets.append(&mut rules_restart_cgroup_on_mem_threshold(attrs));
+    rulesets.push(rule_senpai_drop_in_ruleset(attrs));
+    rulesets.push(rule_od_protection_against_low_swap(attrs));
+    json::object! { "rulesets": rulesets, "version": CONFIG_VERSION }
 }
 
 fn rule_system_overview(attrs: &ConfigParams) -> json::JsonValue {
-    let cgroup = if [HostType::DevServer, HostType::OnDemand].contains(&attrs.host_type) {
-        attrs.oomd2.oomd_target.as_str()
-    } else {
-        "workload.slice"
-    };
-
     let mut rule = json::object! {
         "name": "system overview",
         "silence-logs": "engine",
@@ -88,7 +90,7 @@ fn rule_system_overview(attrs: &ConfigParams) -> json::JsonValue {
                 {
                     "name": "dump_cgroup_overview",
                     "args": {
-                        "cgroup": cgroup,
+                        "cgroup": attrs.oomd2.oomd_target.as_str(),
                     }
                 }
             ]
@@ -243,7 +245,7 @@ fn rule_protection_against_heavy_workload_thrashing_detectors(
     }
 
     _ = slow_growing_mem_pressure_detector.push(json::object! {
-      "name": "pressure_rising_beyong",
+      "name": "pressure_rising_beyond",
       "args": {
         "cgroup": attrs.fbtax2.workload_monitoring_slice.as_str(),
         "resource": "memory",
@@ -416,7 +418,7 @@ fn rule_senpai_ruleset(attrs: &ConfigParams) -> json::JsonValue {
 fn rule_senpai_drop_in_ruleset(attrs: &ConfigParams) -> json::JsonValue {
     json::object! {
       "name": "senpai drop-in ruleset",
-      "silence-logs": "engine",
+      "silence-logs": if attrs.host_type == HostType::OnDemand {"engine,plugins"} else {"engine"},
       "drop-in": {
         "actions": true,
         "disable-on-drop-in": true,
@@ -566,6 +568,97 @@ fn rule_user_session_protection(node: &Node, attrs: &ConfigParams) -> json::Json
     }
 }
 
+/// Builds the "protection against high memory pressure" ruleset.
+///
+/// Two detector groups watch `oomd_target`: a fast one (`pressure_above`) and a slow one
+/// (`pressure_rising_beyond`), each paired with `memory_reclaim`. The single action is
+/// `kill_by_memory_size_or_growth` on `oomd_action_target`, with "dry" taken from `oomd_dry`.
+/// Plugin names are looked up in `attrs.oomd2.plugins`; indexing a missing key panics.
+fn rule_protection_against_high_memory_pressure(attrs: &ConfigParams) -> json::JsonValue {
+    let oomd2 = &attrs.oomd2;
+    let target = oomd2.oomd_target.as_str();
+    let dry = if oomd2.oomd_dry { "true" } else { "false" };
+    json::object! {
+      "name": "protection against high memory pressure",
+      "drop-in": {
+        "detectors": true,
+        "actions": true,
+        "disable-on-drop-in": oomd2.oomd_disable_on_drop_in,
+      },
+      "detectors": [
+        [
+          "detects fast growing memory pressure",
+          {
+            "name": oomd2.plugins["pressure_above"].as_str(),
+            "args": {
+              "cgroup": target,
+              "resource": "memory",
+              "threshold": oomd2.oomd_high_threshold.as_str(),
+              "duration": oomd2.oomd_high_threshold_duration.as_str(),
+            }
+          },
+          {
+            "name": oomd2.plugins["memory_reclaim"].as_str(),
+            "args": { "cgroup": target, "duration": oomd2.oomd_reclaim_duation.as_str() }
+          }
+        ],
+        [
+          "detects slow growing memory pressure",
+          {
+            "name": oomd2.plugins["pressure_rising_beyond"].as_str(),
+            "args": {
+              "cgroup": target,
+              "resource": "memory",
+              "threshold": oomd2.oomd_threshold.as_str(),
+              "duration": oomd2.oomd_threshold_duration.as_str(),
+            }
+          },
+          {
+            "name": oomd2.plugins["memory_reclaim"].as_str(),
+            "args": { "cgroup": target, "duration": oomd2.oomd_reclaim_duation.as_str() }
+          }
+        ]
+      ],
+      "actions": [
+        {
+          "name": oomd2.plugins["kill_by_memory_size_or_growth"].as_str(),
+          "args": { "cgroup": oomd2.oomd_action_target.as_str(), "dry": dry }
+        }
+      ]
+    }
+}
+
+/// Builds the "protection against low swap" ruleset used by `od_json_config`: the `swap_free`
+/// detector (threshold_pct "5") paired with the `kill_by_swap_usage` action on
+/// `oomd_action_target`. Indexing `attrs.oomd2.plugins` with a missing name panics.
+fn rule_od_protection_against_low_swap(attrs: &ConfigParams) -> json::JsonValue {
+    let oomd2 = &attrs.oomd2;
+    let dry = if oomd2.oomd_dry { "true" } else { "false" };
+    json::object! {
+      "name": "protection against low swap",
+      "drop-in": {
+        "detectors": true,
+        "actions": true,
+        "disable-on-drop-in": oomd2.oomd_disable_on_drop_in,
+      },
+      "detectors": [
+        [
+          "free swap goes below 5 percent",
+          {
+            "name": oomd2.plugins["swap_free"].as_str(),
+            "args": { "threshold_pct": "5" }
+          }
+        ]
+      ],
+      "actions": [
+        {
+          "name": oomd2.plugins["kill_by_swap_usage"].as_str(),
+          "args": { "cgroup": oomd2.oomd_action_target.as_str(), "dry": dry }
+        }
+      ]
+    }
+}
+
 fn get_attributes(node: &Node) -> ConfigParams {
     ConfigParams {
         host_type: get_host_type(node),
@@ -597,14 +690,13 @@ fn get_attributes(node: &Node) -> ConfigParams {
               "senpai" => "senpai",
             )),
             oomd_dry: true,
-            oomd_disable_on_drop_in: false,
-            oomd_target: String::from("system.slice"),
-            oomd_action_target: String::from("system.slice"),
+            oomd_disable_on_drop_in: true,
+            oomd_target: oomd2_oomd_target(node),
+            oomd_action_target: String::from("system.slice/*"),
             oomd_high_threshold: String::from("80"),
             oomd_high_threshold_duration: String::from("60"),
             oomd_threshold: String::from("60"),
             oomd_threshold_duration: String::from("90"),
-            oomd_min_swap_pct: String::from("15"),
             oomd_restart_threshold: oomd2_oomd_restart_threshold(),
             oomd_reclaim_duation: String::from("10"),
             oomd_post_action_delay: String::from("15"),
@@ -625,7 +717,7 @@ fn get_attributes(node: &Node) -> ConfigParams {
             memory_high_timeout_ms: String::from("20"),
             scuba_logger_dataset: String::from("perfpipe_senpai_events"),
         },
-        disable_senpai_dropin: false,
+        disable_senpai_dropin: disable_senpai_dropin(node),
     }
 }
 
@@ -707,12 +799,33 @@ fn senpai_limit_min_bytes(node: &Node) -> Option<String> {
     None
 }
 
+/// Picks the cgroup target string stored in `oomd_target`, based on the node's host type.
+fn oomd2_oomd_target(node: &Node) -> String {
+    let target = match get_host_type(node) {
+        HostType::DevServer => "system.slice",
+        HostType::OnDemand => "system.slice,workload.slice/workload-tw.slice/quicksand*.service",
+        _ => "workload.slice",
+    };
+    target.to_owned()
+}
+
+/// Computes the `disable_senpai_dropin` config value for `node`.
+///
+/// Returns `true` only when `get_host_type` classifies the node as `HostType::OnDemand`.
+fn disable_senpai_dropin(node: &Node) -> bool {
+    get_host_type(node) == HostType::OnDemand
+}
+
 fn get_host_type(node: &Node) -> HostType {
     // TODO(chengxiong): add logic to determine host types.
     if node.hostname_prefix() == "twshared".into() {
         return HostType::TwShared;
     }
 
+    if node.hostname_prefix() == "od".into() {
+        return HostType::OnDemand;
+    }
+
     if node.is_devserver() {
         return HostType::DevServer;
     }
@@ -736,6 +849,7 @@ mod tests {
     #[rstest]
     #[case::shard99("twshared2434.02.cco1", HostType::TwShared)]
     #[case::shard99("devvm3170.cln0", HostType::DevServer)]
+    #[case::shard99("od2228.eag1", HostType::OnDemand)]
     fn test_get_host_type(#[case] hostname: &str, #[case] expected: HostType) {
         let node = FakeNodeBuilder::new().hostname(hostname).build();
         assert_eq!(get_host_type(&node), expected);
diff --git a/src/oomd/cfgen/src/types.rs b/src/oomd/cfgen/src/types.rs
index 83cdb1e6..72ebb27e 100644
--- a/src/oomd/cfgen/src/types.rs
+++ b/src/oomd/cfgen/src/types.rs
@@ -34,7 +34,7 @@ pub struct FBTax2Attributes {
 }
 
 pub struct Oomd2Attributes {
-    pub blacklisted_jobs: Vec<String>,
+    pub blacklisted_jobs: Vec<&'static str>,
     pub disable_swap_protection: bool,
     pub kill_target: String,
     pub plugins: BTreeMap<String, String>,
@@ -46,7 +46,6 @@ pub struct Oomd2Attributes {
     pub oomd_high_threshold_duration: String,
     pub oomd_threshold: String,
     pub oomd_threshold_duration: String,
-    pub oomd_min_swap_pct: String,
     pub oomd_restart_threshold: BTreeMap<String, OomdRestartThreshold>,
     pub oomd_reclaim_duation: String,
     pub oomd_post_action_delay: String,
diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json b/src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json
new file mode 100644
index 00000000..937aee44
--- /dev/null
+++ b/src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json
@@ -0,0 +1,93 @@
+@generated SignedSource<<bb1f0a1f74c2f6d1c6ca34e21fa3ca51>>
+@codegen-command arc cfgen update-inputs fb-oomd
+{
+  "fqdn": "od2228.eag1.facebook.com",
+  "region": "utah",
+  "clusterType": "SERVICE_GENERIC_NON_MEMCACHE",
+  "modelId": 341072,
+  "kernelRelease": "5.19.0-0_fbk12_11583_g0bef9520ca2b",
+  "serverType": "TYPE_X_SEARCH",
+  "experiments": [],
+  "cpuArchitecture": "cooperlake",
+  "metalosRootfs": false,
+  "provisioningConfig": {
+    "ethtoolByInterface": {
+      "eth0": {
+        "maxChannelsCombined": 52
+      }
+    },
+    "cpuCoreCount": 26,
+    "parentModelId": 338998,
+    "recoveryEnvironment": false,
+    "deviceType": "SERVER",
+    "datacenter": "eag1",
+    "cluster": "05",
+    "memTotal": 66870956032,
+    "osVersion": {
+      "distribution_name": "CentOS Stream release",
+      "version": 9,
+      "is_in_ramdisk": false,
+      "is_metalos": false
+    },
+    "pciByAddress": {
+      "0000:65:00.0": {
+        "vendor_id": 5555,
+        "device_id": 4125,
+        "class_code": 131072,
+        "board_part_number": "MCX623435MC-CDAE_FB"
+      }
+    },
+    "static_smc_tiers": [],
+    "machine": "x86_64"
+  },
+  "bootConfig": {
+    "ethtoolByInterface": {
+      "eth0": {
+        "driver": "mlx5_core",
+        "driver_version": "5.19.0-0_fbk12_11583_g0bef9520c",
+        "firmware_version": "22.32.1206 (FB_0000000018)",
+        "bus_info": "0000:65:00.0"
+      }
+    }
+  },
+  "runtimeConfig": {
+    "hasHighPrivCert": true,
+    "regionRoutableCluster": "eag1.02",
+    "block_devices": {
+      "block_devices": {
+        "nvme0n1": {
+          "size_bytes": 256055095296,
+          "is_rotational": false,
+          "model": "HFS512GDE9X083N",
+          "serial": "2621CDA6N79781110H6O",
+          "physical_block_size": 512,
+          "logical_block_size": 512,
+          "is_root": true
+        },
+        "nvme1n1": {
+          "size_bytes": 1800360124416,
+          "is_rotational": false,
+          "model": "MZOL21T9HCJR-00AFB",
+          "serial": "S5X8NG0T524955",
+          "physical_block_size": 4096,
+          "logical_block_size": 4096,
+          "is_root": false
+        }
+      }
+    },
+    "dynamic_smc_tiers": [],
+    "cluster_state": "CLUSTER_IN_USE",
+    "installed_platforms": [
+      "platform010",
+      "platform010-compat"
+    ],
+    "device_nics_enum": [
+      "ETH0",
+      "SVC0"
+    ]
+  },
+  "reservationConfig": {
+    "active_machine_materialization_id": "",
+    "current_reservation_host_profile_id": "NEWLY_PROVISIONED_PROFILE"
+  }
+}
diff --git a/src/oomd/cfgen/test/cfgen_test_manifest.yml b/src/oomd/cfgen/test/cfgen_test_manifest.yml
index 1c4b37a5..fb04a6d1 100644
--- a/src/oomd/cfgen/test/cfgen_test_manifest.yml
+++ b/src/oomd/cfgen/test/cfgen_test_manifest.yml
@@ -12,3 +12,7 @@ library_samples:
   - devvm
   - twshared_vll_shard00
   # Add more samples from https://fburl.com/code/vjwmkoa1 if needed
+samples:
+  ondemand:
+    # A random host with od hostname prefix.
+    production_host: od2228.eag1.facebook.com
diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/50-change-propagator.conf b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/50-change-propagator.conf
new file mode 100644
index 00000000..240e90fc
--- /dev/null
+++ b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/50-change-propagator.conf
@@ -0,0 +1,4 @@
+@generated SignedSource<<31b3f2f747768088bd5523d8e690bfac>>
+@codegen-command arc cfgen update-outputs fb-oomd
+[Service]
+[Unit]
diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/oomd2.json b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/oomd2.json
new file mode 100644
index 00000000..4fda3f59
--- /dev/null
+++ b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/oomd2.json
@@ -0,0 +1,169 @@
+@generated SignedSource<<60b23c1a40e237850d38e23d98d5a1bb>>
+@codegen-command arc cfgen update-outputs fb-oomd
+{
+  "rulesets": [
+    {
+      "name": "system overview",
+      "silence-logs": "engine",
+      "detectors": [
+        [
+          "records system stats",
+          {
+            "name": "dump_cgroup_overview",
+            "args": {
+              "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service"
+            }
+          }
+        ]
+      ],
+      "actions": [
+        {
+          "name": "continue",
+          "args": {}
+        }
+      ],
+      "drop-in": {
+        "detectors": true,
+        "actions": true
+      }
+    },
+    {
+      "name": "protection against high memory pressure",
+      "drop-in": {
+        "detectors": true,
+        "actions": true,
+        "disable-on-drop-in": true
+      },
+      "detectors": [
+        [
+          "detects fast growing memory pressure",
+          {
+            "name": "pressure_above",
+            "args": {
+              "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service",
+              "resource": "memory",
+              "threshold": "80",
+              "duration": "60"
+            }
+          },
+          {
+            "name": "memory_reclaim",
+            "args": {
+              "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service",
+              "duration": "10"
+            }
+          }
+        ],
+        [
+          "detects slow growing memory pressure",
+          {
+            "name": "pressure_rising_beyond",
+            "args": {
+              "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service",
+              "resource": "memory",
+              "threshold": "60",
+              "duration": "90"
+            }
+          },
+          {
+            "name": "memory_reclaim",
+            "args": {
+              "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service",
+              "duration": "10"
+            }
+          }
+        ]
+      ],
+      "actions": [
+        {
+          "name": "kill_by_memory_size_or_growth",
+          "args": {
+            "cgroup": "system.slice/*",
+            "dry": "true"
+          }
+        }
+      ]
+    },
+    {
+      "name": "restart smc_proxy.service on memory threshold",
+      "detectors": [
+        [
+          "memory usage above",
+          {
+            "name": "memory_above",
+            "args": {
+              "cgroup": "smc_proxy.service",
+              "threshold_anon": "15G",
+              "duration": "10"
+            }
+          }
+        ]
+      ],
+      "actions": [
+        {
+          "name": "systemd_restart",
+          "args": {
+            "service": "smc_proxy.service",
+            "post_action_delay": "20",
+            "dry": "false"
+          }
+        }
+      ]
+    },
+    {
+      "name": "senpai drop-in ruleset",
+      "silence-logs": "engine,plugins",
+      "drop-in": {
+        "actions": true,
+        "disable-on-drop-in": true
+      },
+      "detectors": [
+        [
+          "stop detector group",
+          {
+            "name": "exists",
+            "args": {
+              "cgroup": "/",
+              "negate": true
+            }
+          }
+        ]
+      ],
+      "actions": [
+        {
+          "name": "continue",
+          "args": {}
+        }
+      ]
+    },
+    {
+      "name": "protection against low swap",
+      "drop-in": {
+        "detectors": true,
+        "actions": true,
+        "disable-on-drop-in": true
+      },
+      "detectors": [
+        [
+          "free swap goes below 5 percent",
+          {
+            "name": "swap_free",
+            "args": {
+              "threshold_pct": "5"
+            }
+          }
+        ]
+      ],
+      "actions": [
+        {
+          "name": "kill_by_swap_usage",
+          "args": {
+            "cgroup": "system.slice/*",
+            "dry": "true"
+          }
+        }
+      ]
+    }
+  ],
+  "version": "1.0.0"
+}
\ No newline at end of file