Skip to content

Commit

Permalink
fix(nayduck) - fix flaky state_sync_routed test (#12821)
Browse files Browse the repository at this point in the history
The test was failing when the node was slow to catch up and was still at
height 0 when entering this loop:

https://github.com/near/nearcore/blob/ba1ed9f7b26cd6bc20115a9d5350bef4cfbb18bb/pytest/tests/sanity/state_sync_routed.py#L106-L107

In this case the `hash_` here would be the genesis block hash:

https://github.com/near/nearcore/blob/ba1ed9f7b26cd6bc20115a9d5350bef4cfbb18bb/pytest/tests/sanity/state_sync_routed.py#L118-L120

The `ctx.send_moar_txs` would send it to a node that is fully caught it
and has already garbage collected the genesis block. In this case the
transaction would be invalid and the balance check would fail. This
would in turn lead to the test being stuck forever in the final loop:

https://github.com/near/nearcore/blob/ba1ed9f7b26cd6bc20115a9d5350bef4cfbb18bb/pytest/tests/sanity/state_sync_routed.py#L142-L149

This PR contains a few fixes and improvement:
* Fixing the issue described above by using the `boot_node`s head's
block hash when creating transactions.
* Decreasing the timeout of the test to 100s + 2s per block to make it
fail faster (5min instead of 20min)
* Always execute the "catch up" loop at least once to make the test more
interesting.
* More assertions and logs to fail faster
* Removed the pool_time overwrite (now the default 0.25 is used). It
doesn't make any difference but the code is nicer and shorter.
* Random mini refactorings.
  • Loading branch information
wacban authored Jan 28, 2025
1 parent ba1ed9f commit e6bb098
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 43 deletions.
1 change: 1 addition & 0 deletions cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
"locustfile",
"locustfiles",
"Logunov",
"manytx",
"Mbps",
"memtable",
"memtrie",
Expand Down
2 changes: 1 addition & 1 deletion nightly/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ In the test specification path to the file needs to be given
passed as arguments to the script. For example:

pytest sanity/lightclient.py
pytest sanity/state_sync_routed.py shreyan 115
pytest sanity/state_sync_routed.py manytx 115

Note: NayDuck also handles `mocknet` test category. It is now
deprecated and is treated like `pytest` with `--skip-build` flag
Expand Down
4 changes: 2 additions & 2 deletions nightly/pytest-sanity.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ pytest --timeout=240 sanity/state_sync5.py --features nightly
#pytest --timeout=300 sanity/state_sync_late.py notx --features nightly
pytest sanity/state_sync_missing_chunks.py
pytest sanity/state_sync_missing_chunks.py --features nightly
pytest --timeout=600 sanity/state_sync_routed.py manytx 115
pytest --timeout=600 sanity/state_sync_routed.py manytx 115 --features nightly
pytest --timeout=300 sanity/state_sync_routed.py manytx 115
pytest --timeout=300 sanity/state_sync_routed.py manytx 115 --features nightly

pytest --timeout=270 sanity/single_shard_tracking.py
pytest --timeout=270 sanity/single_shard_tracking.py --features nightly
Expand Down
17 changes: 14 additions & 3 deletions pytest/lib/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,14 +300,25 @@ def get_latest_block(self, **kw) -> BlockId:
hash=sync_info['latest_block_hash'])

def get_all_heights(self):

# Helper function to check if the block response is a "block not found" error.
def block_not_found(block) -> bool:
error = block.get('error')
if error is None:
return False

data = error.get('data')
if data is None:
return False

return 'DB Not Found Error: BLOCK:' in data

hash_ = self.get_latest_block().hash
heights = []

while True:
block = self.get_block(hash_)
if 'error' in block and 'data' in block[
'error'] and 'DB Not Found Error: BLOCK:' in block['error'][
'data']:
if block_not_found(block):
break
elif 'result' not in block:
logger.info(block)
Expand Down
7 changes: 3 additions & 4 deletions pytest/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,16 @@
from rc import gcloud

import cluster
from configured_logger import logger
import key
import transaction
from configured_logger import logger


class TxContext:

def __init__(self, act_to_val, nodes):
def __init__(self, act_to_val, nodes: typing.List[cluster.BaseNode]):
self.next_nonce = 2
self.num_nodes = len(nodes)
self.nodes = nodes
self.nodes: typing.List[cluster.BaseNode] = nodes
self.act_to_val = act_to_val
self.expected_balances = self.get_balances()
assert len(act_to_val) == self.num_nodes
Expand Down
81 changes: 48 additions & 33 deletions pytest/tests/sanity/state_sync_routed.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
assert mode in ['notx', 'onetx', 'manytx']

START_AT_BLOCK = int(sys.argv[2])
TIMEOUT = 150 + START_AT_BLOCK * 10
TIMEOUT = 100 + START_AT_BLOCK * 2

config = load_config()
node_config = state_sync_lib.get_state_sync_config_combined()
Expand Down Expand Up @@ -69,73 +69,88 @@

ctx = utils.TxContext([0, 0], [node0, node1])

sent_txs = False

sent_txs = 0
observed_height = 0
for observed_height, hash_ in utils.poll_blocks(boot_node,
timeout=TIMEOUT,
poll_interval=0.1):
for observed_height, hash_ in utils.poll_blocks(boot_node, timeout=TIMEOUT):
if observed_height >= START_AT_BLOCK:
break
if mode == 'onetx' and not sent_txs:

if mode == 'onetx' and sent_txs == 0:
ctx.send_moar_txs(hash_, 3, False)
sent_txs = True
elif mode == 'manytx' and ctx.get_balances() == ctx.expected_balances:
sent_txs += 1

if mode == 'manytx' and ctx.get_balances() == ctx.expected_balances:
logger.info(f'Sending moar warm up txs at height {observed_height}')
ctx.send_moar_txs(hash_, 3, False)
logger.info(f'Sending moar txs at height {observed_height}')
sent_txs += 1

# Wait for a few blocks to make sure all transactions are processed.
utils.wait_for_blocks(boot_node, count=5)

assert ctx.get_balances() == ctx.expected_balances, "The balances are incorrect"

if mode == 'onetx':
assert ctx.get_balances() == ctx.expected_balances
assert sent_txs == 1, "No transactions were sent"

if mode == 'manytx':
assert sent_txs > 1, "No transactions were sent"

logger.info("Warm up finished")

node4 = spin_up_node(config,
near_root,
node_dirs[4],
4,
boot_node=boot_node,
blacklist=[0, 1])

# State Sync makes the storage seem inconsistent.
node4.stop_checking_store()

metrics4 = utils.MetricsTracker(node4)
time.sleep(3)

catch_up_height = 0
for catch_up_height, hash_ in utils.poll_blocks(node4,
timeout=TIMEOUT,
poll_interval=0.1):
if catch_up_height >= observed_height:
break
for block_height, _ in utils.poll_blocks(node4, timeout=TIMEOUT):
assert time.time() - started < TIMEOUT, "Waiting for node 4 to catch up"
new_height = node4.get_latest_block().height
logger.info(f"Latest block at: {new_height}")
if new_height > catch_up_height:
catch_up_height = new_height
logger.info(f"Last observer got to height {new_height}")

boot_height = boot_node.get_latest_block().height

if mode == 'manytx':
if ctx.get_balances() == ctx.expected_balances:
ctx.send_moar_txs(hash_, 3, False)
logger.info(f"Sending moar txs at height {boot_height}")

if mode == 'manytx' and ctx.get_balances() == ctx.expected_balances:
# Use the boot node head to send more txs. The new node may be behind
# and its head may already be garbage collected on the boot node. This
# would cause the transaction to be rejected and the balance check to fail.
(boot_height, boot_hash) = boot_node.get_latest_block()
ctx.send_moar_txs(boot_hash, 3, False)
logger.info(
f"Sending moar catch up txs at height {boot_height} hash {boot_hash}"
)

if block_height >= observed_height:
break

time.sleep(0.1)

# The boot heights are the heights of blocks that the node has in its storage.
# It does not contain any blocks that were garbage collected.
boot_heights = boot_node.get_all_heights()
catch_up_height = node4.get_latest_block().height

assert catch_up_height in boot_heights, "%s not in %s" % (catch_up_height,
boot_heights)

logger.info("Catch Up finished")

while True:
assert time.time(
) - started < TIMEOUT, "Waiting for node 4 to connect to two peers"
timeout_expired = time.time() - started < TIMEOUT
assert timeout_expired, "Waiting for node 4 to connect to two peers"

num_connections = 0
for (conn_type,
count) in metrics4.get_metric_all_values("near_peer_connections"):
peer_connections = metrics4.get_metric_all_values("near_peer_connections")
for (conn_type, count) in peer_connections:
if conn_type['tier'] == 'T2':
num_connections += count
if num_connections == 2:
break
time.sleep(0.1)

logger.info("New node connected to observers")

if mode == 'manytx':
Expand Down

0 comments on commit e6bb098

Please sign in to comment.