Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ci): fix hangs in lightwalletd tests by checking concurrent process output in different threads #4828

Merged
merged 16 commits into from
Jul 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions .github/workflows/deploy-gcp-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ jobs:
# - To ${{ inputs.zebra_state_dir || inputs.disk_prefix }} if not
#
# If there are multiple disks:
# - prefer images generated from the `main` branch, then any other branch
# - prefer images generated from this branch, then the `main` branch, then any other branch
# - prefer newer images to older images
#
# Passes the disk name to subsequent steps using $CACHED_DISK_NAME env variable
Expand All @@ -273,20 +273,28 @@ jobs:
DISK_PREFIX=${{ inputs.zebra_state_dir || inputs.disk_prefix }}
fi

# Try to find an image generated from the main branch
# Try to find an image generated from this branch and commit
# Fields are listed in the "Create image from state disk" step
CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-main-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
echo "main Disk: $CACHED_DISK_NAME"
BRANCH_DISK_NAME="${DISK_PREFIX}-${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT}-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}"
CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${BRANCH_DISK_NAME}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
echo "${GITHUB_REF_SLUG_URL}-${GITHUB_SHA_SHORT} Disk: $CACHED_DISK_NAME"

if [[ -z "$CACHED_DISK_NAME" ]]; then
# Try to find an image generated from the main branch
CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-main-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
echo "main Disk: $CACHED_DISK_NAME"
fi

if [[ -z "$CACHED_DISK_NAME" ]]; then
# Try to find an image generated from any other branch
CACHED_DISK_NAME=$(gcloud compute images list --filter="name~${DISK_PREFIX}-.+-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
echo "Disk: $CACHED_DISK_NAME"
echo "any branch Disk: $CACHED_DISK_NAME"
fi

if [[ -z "$CACHED_DISK_NAME" ]]; then
echo "No cached state disk available"
echo "Expected ${DISK_PREFIX}-(branch)-[0-9a-f]+-v${LOCAL_STATE_VERSION}-${NETWORK}-${{ inputs.disk_suffix }}"
echo "Expected ${BRANCH_DISK_NAME}"
echo "Also searched for any commit on main, and any commit on any branch"
echo "Cached state test jobs must depend on the cached state rebuild job"
exit 1
fi
Expand Down
48 changes: 28 additions & 20 deletions zebra-chain/src/diagnostic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ impl CodeTimer {
#[track_caller]
pub fn start() -> Self {
let start = Instant::now();
trace!(?start, "starting code timer");
trace!(
target: "run time",
?start,
"started code timer",
);

Self {
start,
Expand All @@ -47,7 +51,6 @@ impl CodeTimer {
}

/// Finish timing the execution of a function, method, or other code region.
#[track_caller]
pub fn finish<S>(
mut self,
module_path: &'static str,
Expand All @@ -62,7 +65,6 @@ impl CodeTimer {
/// Finish timing the execution of a function, method, or other code region.
///
/// This private method can be called from [`CodeTimer::finish()`] or `drop()`.
#[track_caller]
fn finish_inner<S>(
&mut self,
module_path: impl Into<Option<&'static str>>,
Expand All @@ -78,42 +80,48 @@ impl CodeTimer {
self.has_finished = true;

let execution = self.start.elapsed();
let execution_time = duration_short(execution);
let time = duration_short(execution);
let time = time.as_str();

let module = module_path.into().unwrap_or_default();

let line = line.into().map(|line| line.to_string()).unwrap_or_default();
let line = line.as_str();

let module_path = module_path.into();
let line = line.into();
let description = description
.into()
.map(|desc| desc.to_string() + " ")
.map(|desc| desc.to_string())
.unwrap_or_default();

if execution >= self.min_warn_time {
warn!(
?execution_time,
?module_path,
?line,
"{description}code took a long time to execute",
target: "run time",
%time,
%module,
%line,
"very long {description}",
);
} else if execution >= self.min_info_time {
info!(
?execution_time,
?module_path,
?line,
"{description}code took longer than expected to execute",
target: "run time",
%time,
%module,
%line,
"long {description}",
);
} else {
trace!(
?execution_time,
?module_path,
?line,
"{description}code timer finished",
target: "run time",
%time,
%module,
%line,
"finished {description} code timer",
);
}
}
}

impl Drop for CodeTimer {
#[track_caller]
fn drop(&mut self) {
self.finish_inner(None, None, "(dropped, cancelled, or aborted)")
}
Expand Down
8 changes: 4 additions & 4 deletions zebra-test/src/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,10 @@ pub struct TestChild<T> {
/// The standard output stream of the child process.
///
/// TODO: replace with `Option<ChildOutput { stdout, stderr }>.
pub stdout: Option<Box<dyn IteratorDebug<Item = std::io::Result<String>>>>,
pub stdout: Option<Box<dyn IteratorDebug<Item = std::io::Result<String>> + Send>>,

/// The standard error stream of the child process.
pub stderr: Option<Box<dyn IteratorDebug<Item = std::io::Result<String>>>>,
pub stderr: Option<Box<dyn IteratorDebug<Item = std::io::Result<String>> + Send>>,

/// Command outputs which indicate test failure.
///
Expand Down Expand Up @@ -434,9 +434,9 @@ impl<T> TestChild<T> {
fn map_into_string_lines<R>(
&self,
reader: R,
) -> Box<dyn IteratorDebug<Item = std::io::Result<String>>>
) -> Box<dyn IteratorDebug<Item = std::io::Result<String>> + Send>
where
R: Read + Debug + 'static,
R: Read + Debug + Send + 'static,
{
let failure_regexes = self.failure_regexes.clone();
let ignore_regexes = self.ignore_regexes.clone();
Expand Down
104 changes: 54 additions & 50 deletions zebrad/tests/acceptance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@
//!
//! Please refer to the documentation of each test for more information.

use std::{collections::HashSet, env, fs, path::PathBuf, time::Duration};
use std::{collections::HashSet, env, fs, panic, path::PathBuf, time::Duration};

use color_eyre::{
eyre::{eyre, Result, WrapErr},
Expand Down Expand Up @@ -1350,7 +1350,7 @@ fn lightwalletd_integration_test(test_type: LightwalletdTestType) -> Result<()>
}

// Launch lightwalletd, if needed
let mut lightwalletd = if test_type.launches_lightwalletd() {
let lightwalletd = if test_type.launches_lightwalletd() {
// Wait until `zebrad` has opened the RPC endpoint
zebrad.expect_stdout_line_matches(regex::escape(
format!("Opened RPC endpoint at {}", config.rpc.listen_addr.unwrap()).as_str(),
Expand Down Expand Up @@ -1432,60 +1432,64 @@ fn lightwalletd_integration_test(test_type: LightwalletdTestType) -> Result<()>
None
};

if test_type.needs_zebra_cached_state() {
// Wait for Zebra to sync its cached state to the chain tip
zebrad.expect_stdout_line_matches(SYNC_FINISHED_REGEX)?;

// Wait for lightwalletd to sync some blocks
if let Some(ref mut lightwalletd) = lightwalletd {
lightwalletd
.expect_stdout_line_matches("([Aa]dding block to cache)|([Ww]aiting for block)")?;

let (mut zebrad, lightwalletd) = if test_type.needs_zebra_cached_state() {
if let Some(mut lightwalletd) = lightwalletd {
// Wait for lightwalletd to sync to Zebra's tip.
//
// TODO: after the lightwalletd hangs are fixed, fail the test on errors or timeouts
if cfg!(lightwalletd_hang_fix) {
lightwalletd.expect_stdout_line_matches("[Ww]aiting for block")?;
} else {
// To work around a hang bug, we run the test until:
// - lightwalletd starts waiting for blocks (best case scenario)
// - lightwalletd syncs to near the tip (workaround, cached state image is usable)
// - the test times out with an error, but we ignore it
// (workaround, cached state might be usable, slow, or might fail other tests)
//
// TODO: update the regex to `1[8-9][0-9]{5}` when mainnet reaches block 1_800_000
let log_result = lightwalletd.expect_stdout_line_matches(
"([Aa]dding block to cache 1[7-9][0-9]{5})|([Ww]aiting for block)",
);
if log_result.is_err() {
// This error takes up about 100 lines, and looks like a panic message
tracing::warn!(
multi_line_error = ?log_result,
"ignoring a lightwalletd test failure, to work around a lightwalletd hang bug",
);
// "Adding block" and "Waiting for block" logs stop when `lightwalletd` reaches the tip.
// But if the logs just stop, we can't tell the difference between a hang and fully synced.
// So we assume `lightwalletd` will sync and log large groups of blocks,
// and check for logs with heights near the mainnet tip height.
//
// TODO: update the regex to `1[8-9][0-9]{5}` when mainnet reaches block 1_800_000
let lightwalletd_thread = std::thread::spawn(move || -> Result<_> {
tracing::info!(?test_type, "waiting for lightwalletd to sync to the tip");

lightwalletd.expect_stdout_line_matches(
"([Aa]dding block to cache 1[7-9][0-9]{5})|([Ww]aiting for block: 1[7-9][0-9]{5})",
)?;

Ok(lightwalletd)
});

// `lightwalletd` syncs can take a long time,
// so we need to check that `zebrad` has synced to the tip in parallel.
let lightwalletd_thread_and_zebrad = std::thread::spawn(move || -> Result<_> {
tracing::info!(?test_type, "waiting for zebrad to sync to the tip");

while !lightwalletd_thread.is_finished() {
zebrad.expect_stdout_line_matches(SYNC_FINISHED_REGEX)?;
}
}
}

// Check Zebra is still at the tip (also clears and prints Zebra's logs)
zebrad.expect_stdout_line_matches(SYNC_FINISHED_REGEX)?;
Ok((lightwalletd_thread, zebrad))
});

// lightwalletd doesn't log anything when we've reached the tip.
// But when it gets near the tip, it starts using the mempool.
//
// adityapk00/lightwalletd logs mempool changes, but zcash/lightwalletd doesn't.
//
// TODO: re-enable this code when lightwalletd hangs are fixed
if cfg!(lightwalletd_hang_fix) {
if let Some(ref mut lightwalletd) = lightwalletd {
lightwalletd.expect_stdout_line_matches(regex::escape(
"Block hash changed, clearing mempool clients",
))?;
lightwalletd
.expect_stdout_line_matches(regex::escape("Adding new mempool txid"))?;
}
// Retrieve the child process handles from the threads
let (lightwalletd_thread, zebrad) = lightwalletd_thread_and_zebrad
.join()
.unwrap_or_else(|panic_object| panic::resume_unwind(panic_object))?;

let lightwalletd = lightwalletd_thread
.join()
.unwrap_or_else(|panic_object| panic::resume_unwind(panic_object))?;

(zebrad, Some(lightwalletd))
} else {
// We're just syncing Zebra, so there's no lightwalletd to check
tracing::info!(?test_type, "waiting for zebrad to sync to the tip");
zebrad.expect_stdout_line_matches(SYNC_FINISHED_REGEX)?;

(zebrad, None)
}
}
} else {
// We don't have a cached state, so we don't do any tip checks for Zebra or lightwalletd
(zebrad, lightwalletd)
};

tracing::info!(
?test_type,
"cleaning up child processes and checking for errors",
);

// Cleanup both processes
//
Expand Down
19 changes: 10 additions & 9 deletions zebrad/tests/common/launch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,26 +44,27 @@ pub const BETWEEN_NODES_DELAY: Duration = Duration::from_secs(2);

/// The amount of time we wait for lightwalletd to update to the tip.
///
/// The cached tip can be a few days old, and Zebra needs time to activate its mempool.
///
/// Currently, `zebrad` syncs are slower than `lightwalletd` syncs, so we re-use its timeout.
/// `lightwalletd` takes about 90 minutes to fully sync,
/// and `zebrad` takes about 30 minutes to update to the tip.
///
/// TODO: reduce to 20 minutes when `zebrad` sync performance improves
pub const LIGHTWALLETD_UPDATE_TIP_DELAY: Duration = LIGHTWALLETD_FULL_SYNC_TIP_DELAY;
pub const LIGHTWALLETD_UPDATE_TIP_DELAY: Duration = Duration::from_secs(60 * 60);

/// The amount of time we wait for lightwalletd to do a full sync to the tip.
///
/// `lightwalletd` takes about an hour to fully sync,
/// and Zebra needs time to activate its mempool.
pub const LIGHTWALLETD_FULL_SYNC_TIP_DELAY: Duration = Duration::from_secs(90 * 60);
/// See [`LIGHTWALLETD_UPDATE_TIP_DELAY`] for details.
pub const LIGHTWALLETD_FULL_SYNC_TIP_DELAY: Duration = Duration::from_secs(150 * 60);

/// The amount of extra time we wait for Zebra to sync to the tip,
/// after we ignore a lightwalletd failure.
///
/// Zebra logs a status entry every minute, so there should be at least 4 in this interval.
/// Since we restart `lightwalletd` after a hang, we allow time for another full `lightwalletd` sync.
///
/// See [`LIGHTWALLETD_UPDATE_TIP_DELAY`] for details.
///
/// TODO: remove this extra time when lightwalletd hangs are fixed
pub const ZEBRAD_EXTRA_DELAY_FOR_LIGHTWALLETD_WORKAROUND: Duration = Duration::from_secs(5 * 60);
pub const ZEBRAD_EXTRA_DELAY_FOR_LIGHTWALLETD_WORKAROUND: Duration =
LIGHTWALLETD_FULL_SYNC_TIP_DELAY;

/// Extension trait for methods on `tempfile::TempDir` for using it as a test
/// directory for `zebrad`.
Expand Down
16 changes: 10 additions & 6 deletions zebrad/tests/common/lightwalletd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,17 +311,21 @@ impl LightwalletdTestType {
default_test_config()
};

if !self.needs_zebra_cached_state() {
return Some(config);
}

let zebra_state_path = self.zebrad_state_path(test_name)?;

let mut config = match config {
Ok(config) => config,
Err(error) => return Some(Err(error)),
};

// We want to preload the consensus parameters,
// except when we're doing the quick empty state test
config.consensus.debug_skip_parameter_preload = !self.needs_zebra_cached_state();

if !self.needs_zebra_cached_state() {
return Some(Ok(config));
}

let zebra_state_path = self.zebrad_state_path(test_name)?;

config.sync.checkpoint_verify_concurrency_limit =
zebrad::components::sync::DEFAULT_CHECKPOINT_CONCURRENCY_LIMIT;

Expand Down
Loading