Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow creating config for attached tenant #3446

Merged
merged 15 commits into from
Jan 27, 2023
Merged
128 changes: 61 additions & 67 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1800,69 +1800,70 @@ impl Tenant {
}

pub(super) fn persist_tenant_config(
tenant_id: &TenantId,
target_config_path: &Path,
tenant_conf: TenantConfOpt,
first_save: bool,
creating_tenant: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving tenantconf").entered();
info!("persisting tenantconf to {}", target_config_path.display());

// TODO this will prepend comments endlessly ?
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.

[tenant_config]
"#
.to_string();

// Convert the config to a toml file.
conf_content += &toml_edit::easy::to_string(&tenant_conf)?;

let mut target_config_file = VirtualFile::open_with_options(
target_config_path,
OpenOptions::new()
.truncate(true) // This needed for overwriting with small config files
.write(true)
.create_new(first_save),
)?;

target_config_file
.write(conf_content.as_bytes())
.context("Failed to write toml bytes into file")
.and_then(|_| {
target_config_file
.sync_all()
.context("Faile to fsync config file")
})
.with_context(|| {
// imitate a try-block with a closure
let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
let target_config_parent = target_config_path.parent().with_context(|| {
format!(
"Failed to write config file into path '{}'",
"Config path does not have a parent: {}",
target_config_path.display()
)
})?;

// fsync the parent directory to ensure the directory entry is durable
if first_save {
target_config_path
.parent()
.context("Config file does not have a parent")
.and_then(|target_config_parent| {
File::open(target_config_parent).context("Failed to open config parent")
})
.and_then(|tenant_dir| {
tenant_dir
.sync_all()
.context("Failed to fsync config parent")
})
.with_context(|| {
format!(
"Failed to fsync on first save for config {}",
target_config_path.display()
)
})?;
}
info!("persisting tenantconf to {}", target_config_path.display());

Ok(())
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.

[tenant_config]
"#
.to_string();

// Convert the config to a toml file.
conf_content += &toml_edit::easy::to_string(&tenant_conf)?;

let mut target_config_file = VirtualFile::open_with_options(
target_config_path,
OpenOptions::new()
.truncate(true) // This needed for overwriting with small config files
.write(true)
.create_new(creating_tenant)
// when creating a new tenant, first_save will be true and `.create(true)` will be
// ignored (per rust std docs).
//
// later when updating the config of created tenant, or persisting config for the
// first time for attached tenant, the `.create(true)` is used.
.create(true),
)?;

target_config_file
.write(conf_content.as_bytes())
.context("write toml bytes into file")
.and_then(|_| target_config_file.sync_all().context("fsync config file"))
.context("write config file")?;

// fsync the parent directory to ensure the directory entry is durable.
// before this was done conditionally on creating_tenant, but these management actions are rare
// enough to just fsync it always.

crashsafe::fsync(target_config_parent)?;
Ok(())
};

// this function is called from creating the tenant and updating the tenant config, which
// would otherwise share this context, so keep it here in one place.
do_persist(target_config_path).with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})
}

//
Expand Down Expand Up @@ -2512,26 +2513,19 @@ fn try_create_target_tenant_dir(
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?;
.with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?;
let temporary_tenant_config_path = rebase_directory(
&conf.tenant_config_path(tenant_id),
target_tenant_directory,
temporary_tenant_dir,
)
.with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?;
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;

Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;

Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context(
|| {
format!(
"Failed to write tenant {} config to {}",
tenant_id,
temporary_tenant_config_path.display()
)
},
)?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
"could not create tenant {} temporary timelines directory {}",
"create tenant {} temporary timelines directory {}",
tenant_id,
temporary_tenant_timelines_dir.display()
)
Expand All @@ -2542,22 +2536,22 @@ fn try_create_target_tenant_dir(

fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
format!(
"failed to move tenant {} temporary directory {} into the permanent one {}",
"move tenant {} temporary directory {} into the permanent one {}",
tenant_id,
temporary_tenant_dir.display(),
target_tenant_directory.display()
)
})?;
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
format!(
"Failed to get tenant {} dir parent for {}",
"get tenant {} dir parent for {}",
tenant_id,
target_tenant_directory.display()
)
})?;
crashsafe::fsync(target_dir_parent).with_context(|| {
format!(
"Failed to fsync renamed directory's parent {} for tenant {}",
"fsync renamed directory's parent {} for tenant {}",
target_dir_parent.display(),
tenant_id,
)
Expand Down
9 changes: 5 additions & 4 deletions pageserver/src/tenant/mgr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,10 +242,11 @@ pub async fn update_tenant_config(
tenant_id: TenantId,
) -> anyhow::Result<()> {
info!("configuring tenant {tenant_id}");
get_tenant(tenant_id, true)
.await?
.update_tenant_config(tenant_conf);
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
let tenant = get_tenant(tenant_id, true).await?;

tenant.update_tenant_config(tenant_conf);
let tenant_config_path = conf.tenant_config_path(tenant_id);
Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're changing the in-memory config and then fail to persist it, we report an error to the client but still run with the new config. Is this intentional?

Copy link
Member Author

@koivunej koivunej Jan 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That part was not explicitly changed on this PR but yeah it seems quite suboptimal. The update_tenant_config does a merge all of the given values ... So actually the value we end up writing to the disk is not the active tenant config. Raah, I'll open an issue. Well, perhaps investigate a bit more before calling it :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Modifying the added test in this PR to instead of changing gc_horizon twice to in different http PUT requests:

  1. set gc_horizon
  2. set pitr_interval

Will write to disk as last version:

# This file contains a specific per-tenant's config.
#  It is read in case of pageserver restart.

[tenant_config]
pitr_interval = "0s"

But adding logging to Tenant::update_tenant_config shows that the tenant config is first updated to:

2023-01-27T15:27:39.514582Z  INFO tenant_config{tenant=db6238f481f8459ce03705f1fa96ab42}: updated config TenantConfOpt {
    checkpoint_distance: None,
    checkpoint_timeout: None,
    compaction_target_size: None,
    compaction_period: None,
    compaction_threshold: None,
    gc_horizon: Some(
        1000000,
    ),
    gc_period: None,
    image_creation_threshold: None,
    pitr_interval: None,
    walreceiver_connect_timeout: None,
    lagging_wal_timeout: None,
    max_lsn_wal_lag: None,
    trace_read_requests: None,
}

Then by the second request:

2023-01-27T15:27:39.521233Z  INFO tenant_config{tenant=db6238f481f8459ce03705f1fa96ab42}: updated config TenantConfOpt {
    checkpoint_distance: None,
    checkpoint_timeout: None,
    compaction_target_size: None,
    compaction_period: None,
    compaction_threshold: None,
    gc_horizon: Some(
        1000000,
    ),
    gc_period: None,
    image_creation_threshold: None,
    pitr_interval: Some(
        0ns,
    ),
    walreceiver_connect_timeout: None,
    lagging_wal_timeout: None,
    max_lsn_wal_lag: None,
    trace_read_requests: None,
}

Creating a new issue, including your comment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created two new issues, linked below.

Ok(())
}

Expand Down
53 changes: 52 additions & 1 deletion test_runner/regress/test_tenant_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,15 @@

import psycopg2.extras
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import (
LocalFsStorage,
NeonEnvBuilder,
RemoteStorageKind,
assert_tenant_status,
wait_for_upload,
)
from fixtures.types import Lsn
from fixtures.utils import wait_until


def test_tenant_config(neon_env_builder: NeonEnvBuilder):
Expand Down Expand Up @@ -158,3 +166,46 @@ def test_tenant_config(neon_env_builder: NeonEnvBuilder):
"pitr_interval": 60,
}.items()
)


def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_creating_tenant_conf_after_attach",
)

env = neon_env_builder.init_start()
assert isinstance(env.remote_storage, LocalFsStorage)

# tenant is created with defaults, as in without config file
(tenant_id, timeline_id) = env.neon_cli.create_tenant()
config_path = env.repo_dir / "tenants" / str(tenant_id) / "config"
assert config_path.exists(), "config file is always initially created"

http_client = env.pageserver.http_client()

detail = http_client.timeline_detail(tenant_id, timeline_id)
last_record_lsn = Lsn(detail["last_record_lsn"])
assert last_record_lsn.lsn_int != 0, "initdb must have executed"

wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn)

http_client.tenant_detach(tenant_id)

assert not config_path.exists(), "detach did not remove config file"

http_client.tenant_attach(tenant_id)
wait_until(
number_of_iterations=5,
interval=1,
func=lambda: assert_tenant_status(http_client, tenant_id, "Active"),
)

env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"})
contents_first = config_path.read_text()
env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"})
contents_later = config_path.read_text()

# dont test applying the setting here, we have that another test case to show it
# we just care about being able to create the file
assert len(contents_first) > len(contents_later)