Skip to content

Commit

Permalink
Change PDBs to maxUnavailable: 1 (#731)
Browse files Browse the repository at this point in the history
* bump: operator-rs 0.52.0

* changelog

* Change PDBs to maxUnavailable: 1

* bump: operator-rs 0.52.1

* wording
  • Loading branch information
sbernauer authored Oct 6, 2023
1 parent b9eec48 commit 34f4813
Show file tree
Hide file tree
Showing 8 changed files with 20 additions and 151 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,20 @@ All notable changes to this project will be documented in this file.

- Default stackableVersion to operator version ([#711]).
- Configuration overrides for the JVM security properties, such as DNS caching ([#715]).
- Support PodDisruptionBudgets ([#730]).
- Support PodDisruptionBudgets ([#730], [#731]).

### Changed

- `vector` `0.26.0` -> `0.31.0` ([#709]).
- `operator-rs` `0.44.0` -> `0.51.1` ([#711], [#730]).
- `operator-rs` `0.44.0` -> `0.52.1` ([#711], [#730], [#731]).
- Let secret-operator handle certificate conversion ([#695]).

[#695]: https://github.com/stackabletech/zookeeper-operator/pull/695
[#709]: https://github.com/stackabletech/zookeeper-operator/pull/709
[#711]: https://github.com/stackabletech/zookeeper-operator/pull/711
[#715]: https://github.com/stackabletech/zookeeper-operator/pull/715
[#730]: https://github.com/stackabletech/zookeeper-operator/pull/730
[#731]: https://github.com/stackabletech/zookeeper-operator/pull/731

## [23.7.0] - 2023-07-14

Expand Down
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.9"
snafu = "0.7"
stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag = "0.51.1" }
stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag = "0.52.1" }
strum = { version = "0.25", features = ["derive"] }
tokio = { version = "1.29", features = ["full"] }
tokio-zookeeper = "0.2"
Expand Down
2 changes: 2 additions & 0 deletions deploy/helm/zookeeper-operator/crds/crds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3627,11 +3627,13 @@ spec:
podDisruptionBudget:
enabled: true
maxUnavailable: null
description: This is a product-agnostic RoleConfig, which is sufficient for most of the products.
properties:
podDisruptionBudget:
default:
enabled: true
maxUnavailable: null
description: 'This struct is used to configure: 1.) If PodDisruptionBudgets are created by the operator 2.) The allowed number of Pods to be unavailable (`maxUnavailable`)'
properties:
enabled:
default: true
Expand Down
Original file line number Diff line number Diff line change
@@ -1,86 +1,8 @@
= Allowed Pod disruptions

You can configure the allowed Pod disruptions for Zookeeper nodes as described in xref:concepts:operations/pod_disruptions.adoc[].
You can configure the permitted Pod disruptions for Zookeeper nodes as described in xref:concepts:operations/pod_disruptions.adoc[].

Unless you configure something else or disable our PodDisruptionBudgets (PDBs), we write the following PDBs:
Unless you configure something else or disable the provided PodDisruptionBudgets (PDBs), the following PDBs are written:

== Servers

Zookeeper servers need a certain number of nodes to be available to form a quorum to serve client requests.

Taking this into consideration, our operator uses the following algorithm to determine the maximum number of servers allowed to be unavailable at the same time:

`num_servers` is the number of server in the Zookeeper cluster, summed over all `roleGroups`.

[source,rust]
----
fn quorum_size(num_servers: u16) -> u16 {
// Same as max((num_servers / 2) + 1, 1), but without the need for floating point arithmetics,
// which are subject to rounding errors.
max((num_servers + 2) / 2, 1)
}
// Minimum required amount of servers to form quorum.
let quorum_size = quorum_size(num_servers);
// Subtract one to not cause a single point of failure
let max_unavailable = num_servers.saturating_sub(quorum_size).saturating_sub(1);
// Clamp to at least a single node allowed to be offline, so we don't block Kubernetes nodes from draining.
let max_unavailable = max(max_unavailable, 1)
----

This results e.g. in the following numbers:

TIP: It is strongly recommended to use an odd number of servers (e.g. 3, 5 or 7).

[cols="1,1,1"]
|===
|Number of servers
|Quorum size
|Maximum unavailable servers

|1
|1
|1

|2
|2
|1

|3
|2
|1

|4
|3
|1

|5
|3
|1

|6
|4
|1

|7
|4
|2

|8
|5
|2

|9
|5
|3

|10
|6
|3

|20
|11
|8
|===
The provided PDBs only allow a single server to be offline at any given time, regardless of the number of replicas or `roleGroups`.
13 changes: 2 additions & 11 deletions rust/crd/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use stackable_operator::{
memory::{BinaryMultiple, MemoryQuantity},
product_config_utils::{ConfigError, Configuration},
product_logging::{self, spec::Logging},
role_utils::{Role, RoleConfig, RoleGroup, RoleGroupRef},
role_utils::{GenericRoleConfig, Role, RoleGroup, RoleGroupRef},
schemars::{self, JsonSchema},
status::condition::{ClusterCondition, HasStatusCondition},
};
Expand Down Expand Up @@ -519,21 +519,12 @@ impl ZookeeperCluster {
}
}

pub fn role_config(&self, role: &ZookeeperRole) -> Option<&RoleConfig> {
pub fn role_config(&self, role: &ZookeeperRole) -> Option<&GenericRoleConfig> {
match role {
ZookeeperRole::Server => self.spec.servers.as_ref().map(|s| &s.role_config),
}
}

pub fn num_servers(&self) -> u16 {
self.spec
.servers
.iter()
.flat_map(|s| s.role_groups.values())
.map(|rg| rg.replicas.unwrap_or(1))
.sum()
}

/// List all pods expected to form the cluster
///
/// We try to predict the pods here rather than looking at the current cluster state in order to
Expand Down
53 changes: 3 additions & 50 deletions rust/operator-binary/src/operations/pdb.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::cmp::max;

use snafu::{ResultExt, Snafu};
use stackable_operator::{
builder::pdb::PodDisruptionBudgetBuilder, client::Client, cluster_resources::ClusterResources,
Expand Down Expand Up @@ -34,7 +32,7 @@ pub async fn add_pdbs(
return Ok(());
}
let max_unavailable = pdb.max_unavailable.unwrap_or(match role {
ZookeeperRole::Server => max_unavailable_servers(zookeeper.num_servers()),
ZookeeperRole::Server => max_unavailable_servers(),
});
let pdb = PodDisruptionBudgetBuilder::new_with_role(
zookeeper,
Expand All @@ -57,51 +55,6 @@ pub async fn add_pdbs(
Ok(())
}

fn max_unavailable_servers(num_servers: u16) -> u16 {
// Minimum required amount of servers to form quorum.
let quorum_size = quorum_size(num_servers);

// Subtract once to not cause a single point of failure
let max_unavailable = num_servers.saturating_sub(quorum_size).saturating_sub(1);

// Clamp to at least a single node allowed to be offline, so we don't block Kubernetes nodes from draining.
max(max_unavailable, 1)
}

fn quorum_size(num_servers: u16) -> u16 {
// Same as max((num_servers / 2) + 1, 1), but without the need for floating point arithmetics,
// which are subject to rounding errors.
max((num_servers + 2) / 2, 1)
}

#[cfg(test)]
mod test {
use super::*;
use rstest::rstest;

#[rstest]
#[case(0, 1, 1)]
#[case(1, 1, 1)]
#[case(2, 2, 1)]
#[case(3, 2, 1)]
#[case(4, 3, 1)]
#[case(5, 3, 1)]
#[case(6, 4, 1)]
#[case(7, 4, 2)]
#[case(8, 5, 2)]
#[case(9, 5, 3)]
#[case(10, 6, 3)]
#[case(20, 11, 8)]
#[case(100, 51, 48)]

fn test_max_unavailable_servers(
#[case] num_servers: u16,
#[case] expected_quorum_size: u16,
#[case] expected_max_unavailable: u16,
) {
let quorum_size = quorum_size(num_servers);
let max_unavailable = max_unavailable_servers(num_servers);
assert_eq!(quorum_size, expected_quorum_size);
assert_eq!(max_unavailable, expected_max_unavailable);
}
fn max_unavailable_servers() -> u16 {
1
}
4 changes: 2 additions & 2 deletions rust/operator-binary/src/zk_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ use stackable_operator::{
CustomContainerLogConfig,
},
},
role_utils::{RoleConfig, RoleGroupRef},
role_utils::{GenericRoleConfig, RoleGroupRef},
status::condition::{
compute_conditions, operations::ClusterOperationsConditionBuilder,
statefulset::StatefulSetConditionBuilder,
Expand Down Expand Up @@ -369,7 +369,7 @@ pub async fn reconcile_zk(zk: Arc<ZookeeperCluster>, ctx: Arc<Ctx>) -> Result<co
}

let role_config = zk.role_config(&zk_role);
if let Some(RoleConfig {
if let Some(GenericRoleConfig {
pod_disruption_budget: pdb,
}) = role_config
{
Expand Down

0 comments on commit 34f4813

Please sign in to comment.