From 34f481369efa25c014965558fe2271c53d4fff48 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Fri, 6 Oct 2023 08:31:44 +0200 Subject: [PATCH] Change PDBs to maxUnavailable: 1 (#731) * bump: operator-rs 0.52.0 * changelog * Change PDBs to maxUnavailable: 1 * bump: operator-rs 0.52.1 * wording --- CHANGELOG.md | 5 +- Cargo.lock | 8 +- Cargo.toml | 2 +- deploy/helm/zookeeper-operator/crds/crds.yaml | 2 + .../operations/pod-disruptions.adoc | 84 +------------------ rust/crd/src/lib.rs | 13 +-- rust/operator-binary/src/operations/pdb.rs | 53 +----------- rust/operator-binary/src/zk_controller.rs | 4 +- 8 files changed, 20 insertions(+), 151 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5999e370..08aed71b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,12 @@ All notable changes to this project will be documented in this file. - Default stackableVersion to operator version ([#711]). - Configuration overrides for the JVM security properties, such as DNS caching ([#715]). -- Support PodDisruptionBudgets ([#730]). +- Support PodDisruptionBudgets ([#730], [#731]). ### Changed - `vector` `0.26.0` -> `0.31.0` ([#709]). -- `operator-rs` `0.44.0` -> `0.51.1` ([#711], [#730]). +- `operator-rs` `0.44.0` -> `0.52.1` ([#711], [#730], [#731]). - Let secret-operator handle certificate conversion ([#695]). [#695]: https://github.com/stackabletech/zookeeper-operator/pull/695 @@ -21,6 +21,7 @@ All notable changes to this project will be documented in this file. [#711]: https://github.com/stackabletech/zookeeper-operator/pull/711 [#715]: https://github.com/stackabletech/zookeeper-operator/pull/715 [#730]: https://github.com/stackabletech/zookeeper-operator/pull/730 +[#731]: https://github.com/stackabletech/zookeeper-operator/pull/731 ## [23.7.0] - 2023-07-14 diff --git a/Cargo.lock b/Cargo.lock index b4ef63d9..a03c4d3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1860,8 +1860,8 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "stackable-operator" -version = "0.51.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=0.51.1#042642c1e7dac1fdc616de66106e9190478994b8" +version = "0.52.1" +source = "git+https://github.com/stackabletech/operator-rs.git?tag=0.52.1#18af9be0473cd6c30d7426e9ade74c90e4abce22" dependencies = [ "chrono", "clap", @@ -1894,8 +1894,8 @@ dependencies = [ [[package]] name = "stackable-operator-derive" -version = "0.51.1" -source = "git+https://github.com/stackabletech/operator-rs.git?tag=0.51.1#042642c1e7dac1fdc616de66106e9190478994b8" +version = "0.52.1" +source = "git+https://github.com/stackabletech/operator-rs.git?tag=0.52.1#18af9be0473cd6c30d7426e9ade74c90e4abce22" dependencies = [ "darling 0.20.3", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 85f872af..a599cc78 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" serde_yaml = "0.9" snafu = "0.7" -stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag = "0.51.1" } +stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag = "0.52.1" } strum = { version = "0.25", features = ["derive"] } tokio = { version = "1.29", features = ["full"] } tokio-zookeeper = "0.2" diff --git a/deploy/helm/zookeeper-operator/crds/crds.yaml b/deploy/helm/zookeeper-operator/crds/crds.yaml index a168bf0a..97641ef8 100644 --- a/deploy/helm/zookeeper-operator/crds/crds.yaml +++ b/deploy/helm/zookeeper-operator/crds/crds.yaml @@ -3627,11 +3627,13 @@ spec: podDisruptionBudget: enabled: true maxUnavailable: null + description: This is a product-agnostic RoleConfig, which is sufficient for most of the products. properties: podDisruptionBudget: default: enabled: true maxUnavailable: null + description: 'This struct is used to configure: 1.) If PodDisruptionBudgets are created by the operator 2.) The allowed number of Pods to be unavailable (`maxUnavailable`)' properties: enabled: default: true diff --git a/docs/modules/zookeeper/pages/usage_guide/operations/pod-disruptions.adoc b/docs/modules/zookeeper/pages/usage_guide/operations/pod-disruptions.adoc index 204524a3..016f4689 100644 --- a/docs/modules/zookeeper/pages/usage_guide/operations/pod-disruptions.adoc +++ b/docs/modules/zookeeper/pages/usage_guide/operations/pod-disruptions.adoc @@ -1,86 +1,8 @@ = Allowed Pod disruptions -You can configure the allowed Pod disruptions for Zookeeper nodes as described in xref:concepts:operations/pod_disruptions.adoc[]. +You can configure the permitted Pod disruptions for Zookeeper nodes as described in xref:concepts:operations/pod_disruptions.adoc[]. -Unless you configure something else or disable our PodDisruptionBudgets (PDBs), we write the following PDBs: +Unless you configure something else or disable the provided PodDisruptionBudgets (PDBs), the following PDBs are written: == Servers - -Zookeeper servers need a certain number of nodes to be available to form a quorum to serve client requests. - -Taking this into consideration, our operator uses the following algorithm to determine the maximum number of servers allowed to be unavailable at the same time: - -`num_servers` is the number of server in the Zookeeper cluster, summed over all `roleGroups`. - -[source,rust] ----- -fn quorum_size(num_servers: u16) -> u16 { - // Same as max((num_servers / 2) + 1, 1), but without the need for floating point arithmetics, - // which are subject to rounding errors. - max((num_servers + 2) / 2, 1) -} - -// Minimum required amount of servers to form quorum. -let quorum_size = quorum_size(num_servers); - -// Subtract one to not cause a single point of failure -let max_unavailable = num_servers.saturating_sub(quorum_size).saturating_sub(1); - -// Clamp to at least a single node allowed to be offline, so we don't block Kubernetes nodes from draining. -let max_unavailable = max(max_unavailable, 1) ----- - -This results e.g. in the following numbers: - -TIP: It is strongly recommended to use an odd number of servers (e.g. 3, 5 or 7). - -[cols="1,1,1"] -|=== -|Number of servers -|Quorum size -|Maximum unavailable servers - -|1 -|1 -|1 - -|2 -|2 -|1 - -|3 -|2 -|1 - -|4 -|3 -|1 - -|5 -|3 -|1 - -|6 -|4 -|1 - -|7 -|4 -|2 - -|8 -|5 -|2 - -|9 -|5 -|3 - -|10 -|6 -|3 - -|20 -|11 -|8 -|=== +The provided PDBs only allow a single server to be offline at any given time, regardless of the number of replicas or `roleGroups`. diff --git a/rust/crd/src/lib.rs b/rust/crd/src/lib.rs index 507687f7..e7bd9239 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -29,7 +29,7 @@ use stackable_operator::{ memory::{BinaryMultiple, MemoryQuantity}, product_config_utils::{ConfigError, Configuration}, product_logging::{self, spec::Logging}, - role_utils::{Role, RoleConfig, RoleGroup, RoleGroupRef}, + role_utils::{GenericRoleConfig, Role, RoleGroup, RoleGroupRef}, schemars::{self, JsonSchema}, status::condition::{ClusterCondition, HasStatusCondition}, }; @@ -519,21 +519,12 @@ impl ZookeeperCluster { } } - pub fn role_config(&self, role: &ZookeeperRole) -> Option<&RoleConfig> { + pub fn role_config(&self, role: &ZookeeperRole) -> Option<&GenericRoleConfig> { match role { ZookeeperRole::Server => self.spec.servers.as_ref().map(|s| &s.role_config), } } - pub fn num_servers(&self) -> u16 { - self.spec - .servers - .iter() - .flat_map(|s| s.role_groups.values()) - .map(|rg| rg.replicas.unwrap_or(1)) - .sum() - } - /// List all pods expected to form the cluster /// /// We try to predict the pods here rather than looking at the current cluster state in order to diff --git a/rust/operator-binary/src/operations/pdb.rs b/rust/operator-binary/src/operations/pdb.rs index e7e3a658..a8a90044 100644 --- a/rust/operator-binary/src/operations/pdb.rs +++ b/rust/operator-binary/src/operations/pdb.rs @@ -1,5 +1,3 @@ -use std::cmp::max; - use snafu::{ResultExt, Snafu}; use stackable_operator::{ builder::pdb::PodDisruptionBudgetBuilder, client::Client, cluster_resources::ClusterResources, @@ -34,7 +32,7 @@ pub async fn add_pdbs( return Ok(()); } let max_unavailable = pdb.max_unavailable.unwrap_or(match role { - ZookeeperRole::Server => max_unavailable_servers(zookeeper.num_servers()), + ZookeeperRole::Server => max_unavailable_servers(), }); let pdb = PodDisruptionBudgetBuilder::new_with_role( zookeeper, @@ -57,51 +55,6 @@ pub async fn add_pdbs( Ok(()) } -fn max_unavailable_servers(num_servers: u16) -> u16 { - // Minimum required amount of servers to form quorum. - let quorum_size = quorum_size(num_servers); - - // Subtract once to not cause a single point of failure - let max_unavailable = num_servers.saturating_sub(quorum_size).saturating_sub(1); - - // Clamp to at least a single node allowed to be offline, so we don't block Kubernetes nodes from draining. - max(max_unavailable, 1) -} - -fn quorum_size(num_servers: u16) -> u16 { - // Same as max((num_servers / 2) + 1, 1), but without the need for floating point arithmetics, - // which are subject to rounding errors. - max((num_servers + 2) / 2, 1) -} - -#[cfg(test)] -mod test { - use super::*; - use rstest::rstest; - - #[rstest] - #[case(0, 1, 1)] - #[case(1, 1, 1)] - #[case(2, 2, 1)] - #[case(3, 2, 1)] - #[case(4, 3, 1)] - #[case(5, 3, 1)] - #[case(6, 4, 1)] - #[case(7, 4, 2)] - #[case(8, 5, 2)] - #[case(9, 5, 3)] - #[case(10, 6, 3)] - #[case(20, 11, 8)] - #[case(100, 51, 48)] - - fn test_max_unavailable_servers( - #[case] num_servers: u16, - #[case] expected_quorum_size: u16, - #[case] expected_max_unavailable: u16, - ) { - let quorum_size = quorum_size(num_servers); - let max_unavailable = max_unavailable_servers(num_servers); - assert_eq!(quorum_size, expected_quorum_size); - assert_eq!(max_unavailable, expected_max_unavailable); - } +fn max_unavailable_servers() -> u16 { + 1 } diff --git a/rust/operator-binary/src/zk_controller.rs b/rust/operator-binary/src/zk_controller.rs index 4ded8c05..baae6738 100644 --- a/rust/operator-binary/src/zk_controller.rs +++ b/rust/operator-binary/src/zk_controller.rs @@ -46,7 +46,7 @@ use stackable_operator::{ CustomContainerLogConfig, }, }, - role_utils::{RoleConfig, RoleGroupRef}, + role_utils::{GenericRoleConfig, RoleGroupRef}, status::condition::{ compute_conditions, operations::ClusterOperationsConditionBuilder, statefulset::StatefulSetConditionBuilder, @@ -369,7 +369,7 @@ pub async fn reconcile_zk(zk: Arc, ctx: Arc) -> Result