From 317a09df4a82e729c1ccb8702b12f496620bb494 Mon Sep 17 00:00:00 2001 From: Stu Hood Date: Thu, 6 Apr 2023 15:38:29 -0700 Subject: [PATCH] Add timeout to remote store calls, and adjust name of cache timeout. (#18695) #16196 moved the cache read timeout down to the network layer, which made it much more accurate. But cache lookups also involve a number of calls to a remote `Store`, which did not have their own timeout. This change adds an RPC timeout for `Store` accesses to allow for retries of tar-pitted remote store RPCs, and adjusts the naming of the `--remote-cache-rpc-timeout-millis` option to make it clear that it applies to all cache operations (including writes). --- .../pants/engine/internals/scheduler.py | 4 +- src/python/pants/option/global_options.py | 40 ++++++++++++++----- src/rust/engine/fs/fs_util/src/main.rs | 5 +-- src/rust/engine/fs/store/src/lib.rs | 4 +- src/rust/engine/fs/store/src/remote.rs | 8 ++-- .../remote/src/remote_cache.rs | 4 +- src/rust/engine/src/context.rs | 8 ++-- src/rust/engine/src/externs/interface.rs | 8 ++-- src/rust/engine/workunit_store/src/metrics.rs | 1 + 9 files changed, 50 insertions(+), 32 deletions(-) diff --git a/src/python/pants/engine/internals/scheduler.py b/src/python/pants/engine/internals/scheduler.py index 57716e6f276..7d0d8117dd3 100644 --- a/src/python/pants/engine/internals/scheduler.py +++ b/src/python/pants/engine/internals/scheduler.py @@ -180,14 +180,14 @@ def __init__( root_ca_certs_path=execution_options.remote_ca_certs_path, store_headers=execution_options.remote_store_headers, store_chunk_bytes=execution_options.remote_store_chunk_bytes, - store_chunk_upload_timeout=execution_options.remote_store_chunk_upload_timeout_seconds, store_rpc_retries=execution_options.remote_store_rpc_retries, store_rpc_concurrency=execution_options.remote_store_rpc_concurrency, + store_rpc_timeout_millis=execution_options.remote_store_rpc_timeout_millis, store_batch_api_size_limit=execution_options.remote_store_batch_api_size_limit, cache_warnings_behavior=execution_options.remote_cache_warnings.value, cache_content_behavior=execution_options.cache_content_behavior.value, cache_rpc_concurrency=execution_options.remote_cache_rpc_concurrency, - cache_read_timeout_millis=execution_options.remote_cache_read_timeout_millis, + cache_rpc_timeout_millis=execution_options.remote_cache_rpc_timeout_millis, execution_headers=execution_options.remote_execution_headers, execution_overall_deadline_secs=execution_options.remote_execution_overall_deadline_secs, execution_rpc_concurrency=execution_options.remote_execution_rpc_concurrency, diff --git a/src/python/pants/option/global_options.py b/src/python/pants/option/global_options.py index 56ab9493199..e7c4aca861c 100644 --- a/src/python/pants/option/global_options.py +++ b/src/python/pants/option/global_options.py @@ -501,14 +501,14 @@ class ExecutionOptions: remote_store_address: str | None remote_store_headers: dict[str, str] remote_store_chunk_bytes: Any - remote_store_chunk_upload_timeout_seconds: int remote_store_rpc_retries: int remote_store_rpc_concurrency: int remote_store_batch_api_size_limit: int + remote_store_rpc_timeout_millis: int remote_cache_warnings: RemoteCacheWarningsBehavior remote_cache_rpc_concurrency: int - remote_cache_read_timeout_millis: int + remote_cache_rpc_timeout_millis: int remote_execution_address: str | None remote_execution_headers: dict[str, str] @@ -523,6 +523,15 @@ def from_options( bootstrap_options: OptionValueContainer, dynamic_remote_options: DynamicRemoteOptions, ) -> ExecutionOptions: + remote_cache_rpc_timeout_millis = resolve_conflicting_options( + old_option="remote_cache_read_timeout_millis", + new_option="remote_cache_rpc_timeout_millis", + old_scope="", + new_scope="", + old_container=bootstrap_options, + new_container=bootstrap_options, + ) + return cls( # Remote execution strategy. remote_execution=dynamic_remote_options.execution, @@ -546,14 +555,14 @@ def from_options( remote_store_address=dynamic_remote_options.store_address, remote_store_headers=dynamic_remote_options.store_headers, remote_store_chunk_bytes=bootstrap_options.remote_store_chunk_bytes, - remote_store_chunk_upload_timeout_seconds=bootstrap_options.remote_store_chunk_upload_timeout_seconds, remote_store_rpc_retries=bootstrap_options.remote_store_rpc_retries, remote_store_rpc_concurrency=dynamic_remote_options.store_rpc_concurrency, remote_store_batch_api_size_limit=bootstrap_options.remote_store_batch_api_size_limit, + remote_store_rpc_timeout_millis=bootstrap_options.remote_store_rpc_timeout_millis, # Remote cache setup. remote_cache_warnings=bootstrap_options.remote_cache_warnings, remote_cache_rpc_concurrency=dynamic_remote_options.cache_rpc_concurrency, - remote_cache_read_timeout_millis=bootstrap_options.remote_cache_read_timeout_millis, + remote_cache_rpc_timeout_millis=remote_cache_rpc_timeout_millis, # Remote execution setup. remote_execution_address=dynamic_remote_options.execution_address, remote_execution_headers=dynamic_remote_options.execution_headers, @@ -632,14 +641,14 @@ def from_options(cls, options: OptionValueContainer) -> LocalStoreOptions: "user-agent": f"pants/{VERSION}", }, remote_store_chunk_bytes=1024 * 1024, - remote_store_chunk_upload_timeout_seconds=60, remote_store_rpc_retries=2, remote_store_rpc_concurrency=128, remote_store_batch_api_size_limit=4194304, + remote_store_rpc_timeout_millis=30000, # Remote cache setup. remote_cache_warnings=RemoteCacheWarningsBehavior.backoff, remote_cache_rpc_concurrency=128, - remote_cache_read_timeout_millis=1500, + remote_cache_rpc_timeout_millis=1500, # Remote execution setup. remote_execution_address=None, remote_execution_headers={ @@ -1172,7 +1181,6 @@ class BootstrapOptions: ) process_cleanup = BoolOption( default=(DEFAULT_EXECUTION_OPTIONS.keep_sandboxes == KeepSandboxes.never), - deprecation_start_version="2.15.0.dev1", removal_version="3.0.0.dev0", removal_hint="Use the `keep_sandboxes` option instead.", help=softwrap( @@ -1443,7 +1451,9 @@ class BootstrapOptions: ) remote_store_chunk_upload_timeout_seconds = IntOption( advanced=True, - default=DEFAULT_EXECUTION_OPTIONS.remote_store_chunk_upload_timeout_seconds, + default=60, + removal_version="2.19.0.dev0", + removal_hint="Unused: use the `remote_store_rpc_timeout_millis` option instead.", help="Timeout (in seconds) for uploads of individual chunks to the remote file store.", ) remote_store_rpc_retries = IntOption( @@ -1456,6 +1466,11 @@ class BootstrapOptions: default=DEFAULT_EXECUTION_OPTIONS.remote_store_rpc_concurrency, help="The number of concurrent requests allowed to the remote store service.", ) + remote_store_rpc_timeout_millis = IntOption( + advanced=True, + default=DEFAULT_EXECUTION_OPTIONS.remote_store_rpc_timeout_millis, + help="Timeout value for remote store RPCs (not including streaming requests) in milliseconds.", + ) remote_store_batch_api_size_limit = IntOption( advanced=True, default=DEFAULT_EXECUTION_OPTIONS.remote_store_batch_api_size_limit, @@ -1480,9 +1495,16 @@ class BootstrapOptions: ) remote_cache_read_timeout_millis = IntOption( advanced=True, - default=DEFAULT_EXECUTION_OPTIONS.remote_cache_read_timeout_millis, + default=DEFAULT_EXECUTION_OPTIONS.remote_cache_rpc_timeout_millis, + removal_version="2.19.0.dev0", + removal_hint="Use the `remote_cache_rpc_timeout_millis` option instead.", help="Timeout value for remote cache lookups in milliseconds.", ) + remote_cache_rpc_timeout_millis = IntOption( + advanced=True, + default=DEFAULT_EXECUTION_OPTIONS.remote_cache_rpc_timeout_millis, + help="Timeout value for remote cache RPCs in milliseconds.", + ) remote_execution_address = StrOption( advanced=True, default=cast(str, DEFAULT_EXECUTION_OPTIONS.remote_execution_address), diff --git a/src/rust/engine/fs/fs_util/src/main.rs b/src/rust/engine/fs/fs_util/src/main.rs index 0bd5f173712..941a5076a47 100644 --- a/src/rust/engine/fs/fs_util/src/main.rs +++ b/src/rust/engine/fs/fs_util/src/main.rs @@ -423,10 +423,7 @@ async fn execute(top_match: &clap::ArgMatches) -> Result<(), ExitError> { // leave this hanging forever. // // Make fs_util have a very long deadline (because it's not configurable, - // like it is inside pants) until we switch to Tower (where we can more - // carefully control specific components of timeouts). - // - // See https://github.com/pantsbuild/pants/pull/6433 for more context. + // like it is inside pants). Duration::from_secs(30 * 60), top_match .value_of_t::("rpc-attempts") diff --git a/src/rust/engine/fs/store/src/lib.rs b/src/rust/engine/fs/store/src/lib.rs index 4650164a651..8873add892c 100644 --- a/src/rust/engine/fs/store/src/lib.rs +++ b/src/rust/engine/fs/store/src/lib.rs @@ -385,7 +385,7 @@ impl Store { tls_config: grpc_util::tls::Config, headers: BTreeMap, chunk_size_bytes: usize, - upload_timeout: Duration, + rpc_timeout: Duration, rpc_retries: usize, rpc_concurrency_limit: usize, capabilities_cell_opt: Option>>, @@ -399,7 +399,7 @@ impl Store { tls_config, headers, chunk_size_bytes, - upload_timeout, + rpc_timeout, rpc_retries, rpc_concurrency_limit, capabilities_cell_opt, diff --git a/src/rust/engine/fs/store/src/remote.rs b/src/rust/engine/fs/store/src/remote.rs index 6cb6257d43f..4a1c2d18a75 100644 --- a/src/rust/engine/fs/store/src/remote.rs +++ b/src/rust/engine/fs/store/src/remote.rs @@ -29,7 +29,7 @@ use remexec::{ use tokio::io::{AsyncSeekExt, AsyncWrite, AsyncWriteExt}; use tokio::sync::Mutex; use tonic::{Code, Request, Status}; -use workunit_store::{in_workunit, ObservationMetric}; +use workunit_store::{in_workunit, Metric, ObservationMetric}; use crate::StoreError; @@ -37,7 +37,6 @@ use crate::StoreError; pub struct ByteStore { instance_name: Option, chunk_size_bytes: usize, - _upload_timeout: Duration, _rpc_attempts: usize, byte_stream_client: Arc>, cas_client: Arc>, @@ -131,7 +130,7 @@ impl ByteStore { tls_config: grpc_util::tls::Config, mut headers: BTreeMap, chunk_size_bytes: usize, - upload_timeout: Duration, + rpc_timeout: Duration, rpc_retries: usize, rpc_concurrency_limit: usize, capabilities_cell_opt: Option>>, @@ -150,7 +149,7 @@ impl ByteStore { tonic::transport::Channel::balance_list(vec![endpoint].into_iter()), rpc_concurrency_limit, http_headers, - None, + Some((rpc_timeout, Metric::RemoteStoreRequestTimeouts)), ); let byte_stream_client = Arc::new(ByteStreamClient::new(channel.clone())); @@ -162,7 +161,6 @@ impl ByteStore { Ok(ByteStore { instance_name, chunk_size_bytes, - _upload_timeout: upload_timeout, _rpc_attempts: rpc_retries + 1, byte_stream_client, cas_client, diff --git a/src/rust/engine/process_execution/remote/src/remote_cache.rs b/src/rust/engine/process_execution/remote/src/remote_cache.rs index f19baec4e4f..d09aa35b6b9 100644 --- a/src/rust/engine/process_execution/remote/src/remote_cache.rs +++ b/src/rust/engine/process_execution/remote/src/remote_cache.rs @@ -80,7 +80,7 @@ impl CommandRunner { warnings_behavior: RemoteCacheWarningsBehavior, cache_content_behavior: CacheContentBehavior, concurrency_limit: usize, - read_timeout: Duration, + rpc_timeout: Duration, append_only_caches_base_path: Option, ) -> Result { let tls_client_config = if action_cache_address.starts_with("https://") { @@ -99,7 +99,7 @@ impl CommandRunner { tonic::transport::Channel::balance_list(vec![endpoint].into_iter()), concurrency_limit, http_headers, - Some((read_timeout, Metric::RemoteCacheRequestTimeouts)), + Some((rpc_timeout, Metric::RemoteCacheRequestTimeouts)), ); let action_cache_client = Arc::new(ActionCacheClient::new(channel)); diff --git a/src/rust/engine/src/context.rs b/src/rust/engine/src/context.rs index ba593091abc..2fcc231a42d 100644 --- a/src/rust/engine/src/context.rs +++ b/src/rust/engine/src/context.rs @@ -93,14 +93,14 @@ pub struct RemotingOptions { pub root_ca_certs_path: Option, pub store_headers: BTreeMap, pub store_chunk_bytes: usize, - pub store_chunk_upload_timeout: Duration, pub store_rpc_retries: usize, pub store_rpc_concurrency: usize, + pub store_rpc_timeout: Duration, pub store_batch_api_size_limit: usize, pub cache_warnings_behavior: RemoteCacheWarningsBehavior, pub cache_content_behavior: CacheContentBehavior, pub cache_rpc_concurrency: usize, - pub cache_read_timeout: Duration, + pub cache_rpc_timeout: Duration, pub execution_headers: BTreeMap, pub execution_overall_deadline: Duration, pub execution_rpc_concurrency: usize, @@ -169,7 +169,7 @@ impl Core { grpc_util::tls::Config::new_without_mtls(root_ca_certs.clone()), remoting_opts.store_headers.clone(), remoting_opts.store_chunk_bytes, - remoting_opts.store_chunk_upload_timeout, + remoting_opts.store_rpc_timeout, remoting_opts.store_rpc_retries, remoting_opts.store_rpc_concurrency, capabilities_cell_opt, @@ -337,7 +337,7 @@ impl Core { remoting_opts.cache_warnings_behavior, remoting_opts.cache_content_behavior, remoting_opts.cache_rpc_concurrency, - remoting_opts.cache_read_timeout, + remoting_opts.cache_rpc_timeout, remoting_opts.append_only_caches_base_path.clone(), )?); } diff --git a/src/rust/engine/src/externs/interface.rs b/src/rust/engine/src/externs/interface.rs index 4810b03992a..81a6de5fb7a 100644 --- a/src/rust/engine/src/externs/interface.rs +++ b/src/rust/engine/src/externs/interface.rs @@ -299,14 +299,14 @@ impl PyRemotingOptions { root_ca_certs_path: Option, store_headers: BTreeMap, store_chunk_bytes: usize, - store_chunk_upload_timeout: u64, store_rpc_retries: usize, store_rpc_concurrency: usize, + store_rpc_timeout_millis: u64, store_batch_api_size_limit: usize, cache_warnings_behavior: String, cache_content_behavior: String, cache_rpc_concurrency: usize, - cache_read_timeout_millis: u64, + cache_rpc_timeout_millis: u64, execution_headers: BTreeMap, execution_overall_deadline_secs: u64, execution_rpc_concurrency: usize, @@ -321,15 +321,15 @@ impl PyRemotingOptions { root_ca_certs_path, store_headers, store_chunk_bytes, - store_chunk_upload_timeout: Duration::from_secs(store_chunk_upload_timeout), store_rpc_retries, store_rpc_concurrency, + store_rpc_timeout: Duration::from_millis(store_rpc_timeout_millis), store_batch_api_size_limit, cache_warnings_behavior: RemoteCacheWarningsBehavior::from_str(&cache_warnings_behavior) .unwrap(), cache_content_behavior: CacheContentBehavior::from_str(&cache_content_behavior).unwrap(), cache_rpc_concurrency, - cache_read_timeout: Duration::from_millis(cache_read_timeout_millis), + cache_rpc_timeout: Duration::from_millis(cache_rpc_timeout_millis), execution_headers, execution_overall_deadline: Duration::from_secs(execution_overall_deadline_secs), execution_rpc_concurrency, diff --git a/src/rust/engine/workunit_store/src/metrics.rs b/src/rust/engine/workunit_store/src/metrics.rs index 3e2725fc79c..ba9df309698 100644 --- a/src/rust/engine/workunit_store/src/metrics.rs +++ b/src/rust/engine/workunit_store/src/metrics.rs @@ -50,6 +50,7 @@ pub enum Metric { RemoteExecutionSuccess, RemoteExecutionTimeouts, RemoteStoreMissingDigest, + RemoteStoreRequestTimeouts, /// Number of times that we backtracked due to missing digests. BacktrackAttempts, DockerExecutionRequests,