From 169f9c0de98bcc7d47524d893d552309e18f652d Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Dec 2023 16:30:03 +0000 Subject: [PATCH 1/5] pageserver refactor mgr.rs shutdown into TenantManager --- pageserver/src/bin/pageserver.rs | 4 +++- pageserver/src/lib.rs | 9 ++++++-- pageserver/src/tenant/mgr.rs | 39 +++++++++++++++++++------------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 59750897ff7e..845c95a53601 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -314,6 +314,7 @@ fn start_pageserver( let http_listener = tcp_listener::bind(http_addr)?; let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; @@ -546,7 +547,7 @@ fn start_pageserver( let router_state = Arc::new( http::routes::State::new( conf, - tenant_manager, + tenant_manager.clone(), http_auth.clone(), remote_storage.clone(), broker_client.clone(), @@ -690,6 +691,7 @@ fn start_pageserver( let bg_remote_storage = remote_storage.clone(); let bg_deletion_queue = deletion_queue.clone(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( + &tenant_manager, bg_remote_storage.map(|_| bg_deletion_queue), 0, )); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 02a690d4e18b..3d5a87b6775f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -31,6 +31,7 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; +use tenant::mgr::TenantManager; use tracing::info; /// Current storage format version @@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] -pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { +pub async fn shutdown_pageserver( + tenant_manager: &TenantManager, + deletion_queue: Option, + exit_code: i32, +) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option, exit_cod // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. timed( - tenant::mgr::shutdown_all_tenants(), + tenant_manager.shutdown(), "shutdown all tenants", Duration::from_secs(5), ) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 7cf03d8fd65b..ff1192202b66 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -102,7 +102,7 @@ pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. /// New tenants can be added using [`tenant_map_acquire_slot`]. Open(BTreeMap), - /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`]. /// Existing tenants are still accessible, but no new tenants can be created. ShuttingDown(BTreeMap), } @@ -261,6 +261,10 @@ pub struct TenantManager { // See https://github.com/neondatabase/neon/issues/5796 tenants: &'static std::sync::RwLock, resources: TenantSharedResources, + + // This token is for use in edge cases like tenant deletion, where we are doing work on a + // tenant after it has been shut down, during deletion. + cancel: CancellationToken, } fn emergency_generations( @@ -620,6 +624,7 @@ pub async fn init_tenant_mgr( conf, tenants: &TENANTS, resources, + cancel: CancellationToken::new(), }) } @@ -680,21 +685,6 @@ pub(crate) fn tenant_spawn( Ok(tenant) } -/// -/// Shut down all tenants. This runs as part of pageserver shutdown. -/// -/// NB: We leave the tenants in the map, so that they remain accessible through -/// the management API until we shut it down. If we removed the shut-down tenants -/// from the tenants map, the management API would return 404 for these tenants, -/// because TenantsMap::get() now returns `None`. -/// That could be easily misinterpreted by control plane, the consumer of the -/// management API. For example, it could attach the tenant on a different pageserver. -/// We would then be in split-brain once this pageserver restarts. -#[instrument(skip_all)] -pub(crate) async fn shutdown_all_tenants() { - shutdown_all_tenants0(&TENANTS).await -} - async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { let mut join_set = JoinSet::new(); @@ -1817,6 +1807,23 @@ impl TenantManager { Ok(()) } + + /// + /// Shut down all tenants. This runs as part of pageserver shutdown. + /// + /// NB: We leave the tenants in the map, so that they remain accessible through + /// the management API until we shut it down. If we removed the shut-down tenants + /// from the tenants map, the management API would return 404 for these tenants, + /// because TenantsMap::get() now returns `None`. + /// That could be easily misinterpreted by control plane, the consumer of the + /// management API. For example, it could attach the tenant on a different pageserver. + /// We would then be in split-brain once this pageserver restarts. + #[instrument(skip_all)] + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + shutdown_all_tenants0(self.tenants).await + } } #[derive(Debug, thiserror::Error)] From eb8d63795a28362e4d5c382bf5f75fb12fc77a93 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Dec 2023 16:28:49 +0000 Subject: [PATCH 2/5] pageserver: handle task_mgr task failure with immediate shutdown This avoids needing to provide a hook for task_mgr to gracefully shut down the process. --- pageserver/src/task_mgr.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 275a72c0b012..69e163effaa3 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -50,8 +50,6 @@ use once_cell::sync::Lazy; use utils::id::TimelineId; -use crate::shutdown_pageserver; - // // There are four runtimes: // @@ -453,7 +451,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(None, 1).await; + std::process::exit(1); } } From 22d8794c871de4ecd0480b3db01f9c7e2d49e562 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Dec 2023 16:34:59 +0000 Subject: [PATCH 3/5] pageserver: use TenantManager::cancel in tenant deletion flow --- pageserver/src/tenant/delete.rs | 21 +++++++++------------ pageserver/src/tenant/mgr.rs | 1 + 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index ffb7206b1e72..cab60c3111ff 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -296,6 +296,7 @@ impl DeleteTenantFlow { remote_storage: Option, tenants: &'static std::sync::RwLock, tenant: Arc, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { span::debug_assert_current_span_has_tenant_id(); @@ -303,7 +304,9 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + if let Err(e) = + Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await + { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -322,6 +325,7 @@ impl DeleteTenantFlow { conf: &'static PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant: &Tenant, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { guard.mark_in_progress()?; @@ -335,15 +339,9 @@ impl DeleteTenantFlow { // Though sounds scary, different mark name? // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark( - conf, - remote_storage, - &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), - ) - .await - .context("remote_mark")? + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) + .await + .context("remote_mark")? } fail::fail_point!("tenant-delete-before-create-local-mark", |_| { @@ -546,8 +544,7 @@ impl DeleteTenantFlow { conf, remote_storage.as_ref(), &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), + &task_mgr::shutdown_token(), ) .await?; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index ff1192202b66..aa4550e29aa1 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1418,6 +1418,7 @@ impl TenantManager { self.resources.remote_storage.clone(), &TENANTS, tenant, + &self.cancel, ) .await; From ec81f9ae46830554e21ca5a1b806d2cb9610db18 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Dec 2023 17:43:50 +0000 Subject: [PATCH 4/5] tests: update allowed messages for refactor --- test_runner/regress/test_tenant_delete.py | 4 ++-- test_runner/regress/test_timeline_delete.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 52de8890843b..a164c7f60af0 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # We may leave some upload tasks in the queue. They're likely deletes. # For uploads we explicitly wait with `last_flush_lsn_upload` below. # So by ignoring these instead of waiting for empty upload queue @@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach( # From deletion polling f".*NotFound: tenant {env.initial_tenant}.*", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 96a5cc491a05..0eb1327c9e9c 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( [ f".*{timeline_id}.*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. ".*Ignoring new state, equal to the existing one: Stopping", @@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ".*failpoint: timeline-delete-before-rm", ".*Ignoring new state, equal to the existing one: Stopping", # this happens, because the stuck timeline is visible to shutdown - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] ) @@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this From 406317b0109c72ef426581ca7d71f530ccdc57de Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 15 Mar 2024 16:16:10 +0000 Subject: [PATCH 5/5] clarify a comment --- pageserver/src/tenant/mgr.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index aa4550e29aa1..3aaab6e4ef32 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -262,8 +262,10 @@ pub struct TenantManager { tenants: &'static std::sync::RwLock, resources: TenantSharedResources, - // This token is for use in edge cases like tenant deletion, where we are doing work on a - // tenant after it has been shut down, during deletion. + // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token. + // This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime), + // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or + // when the tenant detaches. cancel: CancellationToken, }