pageserver: cancellation for remote ops in tenant deletion on shutdown #6105

Merged 5 commits on Mar 15, 2024

Changes from all commits
4 changes: 3 additions & 1 deletion pageserver/src/bin/pageserver.rs
@@ -314,6 +314,7 @@ fn start_pageserver(
let http_listener = tcp_listener::bind(http_addr)?;

let pg_addr = &conf.listen_pg_addr;

info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;

@@ -546,7 +547,7 @@ fn start_pageserver(
let router_state = Arc::new(
http::routes::State::new(
conf,
tenant_manager,
tenant_manager.clone(),
http_auth.clone(),
remote_storage.clone(),
broker_client.clone(),
@@ -690,6 +691,7 @@ fn start_pageserver(
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
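A rough sketch of the ownership pattern these pageserver.rs hunks set up: the HTTP router state gets its own handle to the tenant manager (hence `tenant_manager.clone()`), while `start_pageserver` keeps one to pass into `shutdown_pageserver` at the very end. The `Arc` and the trimmed signatures below are assumptions made for illustration, not the real types in the pageserver.

```rust
use std::sync::Arc;

// Sketch only: one handle goes to the HTTP server state, one stays behind
// for the final shutdown call. The real wiring lives in pageserver.rs.
struct TenantManager;

async fn shutdown_pageserver(_tenant_manager: &TenantManager, _exit_code: i32) {
    // real shutdown work elided
}

#[tokio::main]
async fn main() {
    let tenant_manager = Arc::new(TenantManager);
    let router_state = tenant_manager.clone(); // handed to the HTTP server
    drop(router_state);
    // ... serve until a shutdown signal arrives, then:
    shutdown_pageserver(&tenant_manager, 0).await;
}
```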
9 changes: 7 additions & 2 deletions pageserver/src/lib.rs
@@ -31,6 +31,7 @@ pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::mgr::TenantManager;
use tracing::info;

/// Current storage format version
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;

#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
pub async fn shutdown_pageserver(
tenant_manager: &TenantManager,
deletion_queue: Option<DeletionQueue>,
exit_code: i32,
) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
timed(
tenant::mgr::shutdown_all_tenants(),
tenant_manager.shutdown(),
"shutdown all tenants",
Duration::from_secs(5),
)
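`shutdown_pageserver` now hands `tenant_manager.shutdown()` to the `timed` wrapper with a 5-second budget. Below is a minimal sketch of a wrapper in that spirit, assuming log-and-keep-waiting semantics rather than a hard timeout; the actual helper in `pageserver/src/lib.rs` may differ.

```rust
use std::time::Duration;

// Hypothetical stand-in for the `timed` helper used above: await the future,
// but log once if it has not finished within `warn_after`, then keep waiting.
async fn timed<Fut: std::future::Future>(fut: Fut, name: &str, warn_after: Duration) -> Fut::Output {
    tokio::pin!(fut);
    match tokio::time::timeout(warn_after, &mut fut).await {
        Ok(output) => output,
        Err(_elapsed) => {
            tracing::info!("still waiting for `{name}` after {warn_after:?}");
            // Shutdown should be slow-but-correct rather than cut short.
            fut.await
        }
    }
}

// Usage, mirroring the call above (illustrative):
// timed(tenant_manager.shutdown(), "shutdown all tenants", Duration::from_secs(5)).await;
```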
4 changes: 1 addition & 3 deletions pageserver/src/task_mgr.rs
@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;

use utils::id::TimelineId;

use crate::shutdown_pageserver;

//
// There are four runtimes:
//
@@ -453,7 +451,7 @@ async fn task_finish(
}

if shutdown_process {
shutdown_pageserver(None, 1).await;
std::process::exit(1);
}
}

21 changes: 9 additions & 12 deletions pageserver/src/tenant/delete.rs
@@ -296,14 +296,17 @@ impl DeleteTenantFlow {
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();

pausable_failpoint!("tenant-delete-before-run");

let mut guard = Self::prepare(&tenant).await?;

if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
if let Err(e) =
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
{
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
@@ -322,6 +325,7 @@ impl DeleteTenantFlow {
conf: &'static PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
guard.mark_in_progress()?;

@@ -335,15 +339,9 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?
}

fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -546,8 +544,7 @@ impl DeleteTenantFlow {
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
&task_mgr::shutdown_token(),
)
.await?;

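The delete.rs hunks replace the throwaway `CancellationToken::new()`, which nothing could ever cancel, with tokens that actually fire on shutdown: the caller-supplied `cancel` and `task_mgr::shutdown_token()`. For illustration only, here is a sketch of the select-on-cancel pattern a remote call guarded by such a token typically follows; the function name and error text are invented, not `create_remote_delete_mark`'s actual signature.

```rust
use tokio_util::sync::CancellationToken;

// Illustrative only: race the remote operation against a cancellation token so
// a pageserver shutdown does not hang behind a slow or retrying remote call.
async fn remote_op_with_cancel<T>(
    op: impl std::future::Future<Output = anyhow::Result<T>>,
    cancel: &CancellationToken,
) -> anyhow::Result<T> {
    tokio::select! {
        res = op => res,
        _ = cancel.cancelled() => Err(anyhow::anyhow!("cancelled while waiting for remote storage")),
    }
}
```

In the diff itself the token is threaded in as a `&CancellationToken` parameter rather than wrapped this way, but the intent is the same: remote waits end promptly once shutdown begins.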
42 changes: 26 additions & 16 deletions pageserver/src/tenant/mgr.rs
@@ -102,7 +102,7 @@ pub(crate) enum TenantsMap {
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
/// New tenants can be added using [`tenant_map_acquire_slot`].
Open(BTreeMap<TenantShardId, TenantSlot>),
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
/// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
/// Existing tenants are still accessible, but no new tenants can be created.
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
}
@@ -261,6 +261,12 @@ pub struct TenantManager {
// See https://github.com/neondatabase/neon/issues/5796
tenants: &'static std::sync::RwLock<TenantsMap>,
resources: TenantSharedResources,

// Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
// This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
// when the tenant detaches.
cancel: CancellationToken,
}

fn emergency_generations(
@@ -620,6 +626,7 @@ pub async fn init_tenant_mgr(
conf,
tenants: &TENANTS,
resources,
cancel: CancellationToken::new(),
})
}

@@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn(
Ok(tenant)
}

///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() now returns `None`.
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown_all_tenants() {
shutdown_all_tenants0(&TENANTS).await
}

async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
let mut join_set = JoinSet::new();

@@ -1428,6 +1420,7 @@ impl TenantManager {
self.resources.remote_storage.clone(),
&TENANTS,
tenant,
&self.cancel,
)
.await;

@@ -1817,6 +1810,23 @@ impl TenantManager {

Ok(())
}

///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() now returns `None`.
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();

shutdown_all_tenants0(self.tenants).await
}
}

#[derive(Debug, thiserror::Error)]
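The mgr.rs hunks give `TenantManager` its own `cancel` token and fire it at the top of `TenantManager::shutdown`, before awaiting the tenants themselves. A simplified sketch of that ordering follows, with `Manager` and `delete_tail` as stand-ins rather than the pageserver's own types.

```rust
use tokio_util::sync::CancellationToken;

// Simplified sketch of the shutdown ordering enabled by the new `cancel` field.
// The real code keeps the tenants map and calls shutdown_all_tenants0 as shown above.
struct Manager {
    cancel: CancellationToken,
}

impl Manager {
    // A long-running operation that lives outside any single Tenant, such as
    // the tail of tenant deletion, checks the manager-level token.
    async fn delete_tail(&self) -> anyhow::Result<()> {
        if self.cancel.is_cancelled() {
            anyhow::bail!("shutting down");
        }
        // ... remote deletions would go here, each given `&self.cancel` ...
        Ok(())
    }

    async fn shutdown(&self) {
        // Fire the manager-level token first so deletion-style work returns
        // promptly, then wait for per-tenant shutdown (which uses each
        // tenant's own token).
        self.cancel.cancel();
        // shutdown_all_tenants0(self.tenants).await;
    }
}
```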
4 changes: 2 additions & 2 deletions test_runner/regress/test_tenant_delete.py
@@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# We may leave some upload tasks in the queue. They're likely deletes.
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
# So by ignoring these instead of waiting for empty upload queue
@@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach(
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
6 changes: 3 additions & 3 deletions test_runner/regress/test_timeline_delete.py
@@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
[
f".*{timeline_id}.*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# This happens when we fail before scheduling background operation.
# Timeline is left in stopping state and retry tries to stop it again.
".*Ignoring new state, equal to the existing one: Stopping",
@@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
".*failpoint: timeline-delete-before-rm",
".*Ignoring new state, equal to the existing one: Stopping",
# this happens, because the stuck timeline is visible to shutdown
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
]
)

@@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
# Polling after attach may fail with this