Merge pull request #72 from AntelopeIO/GH-13-disaster-test2
IF: Disaster_recovery scenario 2 test
heifner authored May 1, 2024
2 parents 8a5716c + fc9c9b9 commit d2ec18f
Showing 4 changed files with 292 additions and 10 deletions.
4 changes: 4 additions & 0 deletions tests/CMakeLists.txt
@@ -68,6 +68,8 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/nodeos_high_transaction_test.py ${CMA
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/nodeos_retry_transaction_test.py ${CMAKE_CURRENT_BINARY_DIR}/nodeos_retry_transaction_test.py COPYONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/transition_to_if.py ${CMAKE_CURRENT_BINARY_DIR}/transition_to_if.py COPYONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/disaster_recovery.py ${CMAKE_CURRENT_BINARY_DIR}/disaster_recovery.py COPYONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/disaster_recovery_2.py ${CMAKE_CURRENT_BINARY_DIR}/disaster_recovery_2.py COPYONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/disaster_recovery_2_test_shape.json ${CMAKE_CURRENT_BINARY_DIR}/disaster_recovery_2_test_shape.json COPYONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/trx_finality_status_test.py ${CMAKE_CURRENT_BINARY_DIR}/trx_finality_status_test.py COPYONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/trx_finality_status_forked_test.py ${CMAKE_CURRENT_BINARY_DIR}/trx_finality_status_forked_test.py COPYONLY)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/plugin_http_api_test.py ${CMAKE_CURRENT_BINARY_DIR}/plugin_http_api_test.py COPYONLY)
@@ -149,6 +151,8 @@ set_property(TEST transition_to_if_lr PROPERTY LABELS long_running_tests)

 add_test(NAME disaster_recovery COMMAND tests/disaster_recovery.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 set_property(TEST disaster_recovery PROPERTY LABELS nonparallelizable_tests)
+add_test(NAME disaster_recovery_2 COMMAND tests/disaster_recovery_2.py -v ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+set_property(TEST disaster_recovery_2 PROPERTY LABELS nonparallelizable_tests)

 add_test(NAME ship_test COMMAND tests/ship_test.py -v --num-clients 10 --num-requests 5000 ${UNSHARE} WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
 set_property(TEST ship_test PROPERTY LABELS nonparallelizable_tests)
25 changes: 15 additions & 10 deletions tests/disaster_recovery.py
@@ -7,18 +7,22 @@
 from TestHarness.Node import BlockType
 
 ###############################################################
-# disaster_recovery
+# disaster_recovery - Scenario 1
 #
+# Verify that if one node in the network has locked blocks then consensus can continue.
+#
 # Integration test with 4 finalizers (A, B, C, and D).
 #
 # The 4 nodes are cleanly shutdown in the following state:
 # - A has LIB N. A has a finalizer safety information file that locks on a block after N.
-# - B, C, and D have LIB less than N. They have finalizer safety information files that lock on N.
+# - B, C, and D have LIB less than or equal to N. They have finalizer safety information files that lock on N
+#   or a block after N.
 #
-# All nodes lose their reversible blocks and restart from an earlier snapshot.
+# Nodes B, C, and D lose their reversible blocks. All nodes restart from an earlier snapshot.
 #
-# A is restarted and replays up to block N after restarting from snapshot. Block N is sent to the other
-# nodes B, C, and D after they are also started up again.
+# A is restarted and replays up to its last reversible block (which is a block number greater than N) after
+# restarting from the snapshot. Blocks N and later are sent to the other nodes B, C, and D after they are
+# also started up again.
 #
 # Verify that LIB advances and that A, B, C, and D are eventually voting strong on new blocks.
 #
@@ -70,8 +74,8 @@
     Print(f"Snapshot head block number {ret_head_block_num}")
 
     Print("Wait for snapshot node lib to advance")
-    node0.waitForBlock(ret_head_block_num+1, blockType=BlockType.lib)
-    assert node1.waitForLibToAdvance(), "Ndoe1 did not advance LIB after snapshot of Node0"
+    assert node0.waitForBlock(ret_head_block_num+1, blockType=BlockType.lib), "Node0 did not advance to make snapshot block LIB"
+    assert node1.waitForLibToAdvance(), "Node1 did not advance LIB after snapshot of Node0"
 
     assert node0.waitForLibToAdvance(), "Node0 did not advance LIB after snapshot"
     currentLIB = node0.getIrreversibleBlockNum()
@@ -82,9 +86,10 @@
     for node in [node1, node2, node3]:
         assert not node.verifyAlive(), "Node did not shutdown"
 
-    # node0 will have higher lib than 1,2,3 since it can incorporate QCs in blocks
-    Print("Wait for node 0 LIB to advance")
-    assert node0.waitForBlock(currentLIB, blockType=BlockType.lib), "Node0 did not advance LIB" # uses getBlockNum(blockType=blockType) > blockNum
+    # node0 is likely to have a higher LIB than nodes 1, 2, and 3 since it can incorporate QCs in blocks
+    Print("Wait for node 0 to advance")
+    # 4 producers, 3 of which are not producing; wait for 4 rounds to make sure node0's defproducera has time to produce
+    assert node0.waitForHeadToAdvance(blocksToAdvance=2, timeout=4*6), "Node0 did not advance"
     node0.kill(signal.SIGTERM)
     assert not node0.verifyAlive(), "Node0 did not shutdown"
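For orientation, the recovery sequence this scenario drives can be condensed as follows, reusing the TestHarness calls that appear elsewhere in this diff. This is an illustrative sketch against the node0..node3 and cluster handles defined in the test, not the full test logic:

    # Cleanly stop all four finalizer nodes (A=node0, B=node1, C=node2, D=node3).
    for node in [node0, node1, node2, node3]:
        node.kill(signal.SIGTERM)

    # B, C, and D lose their reversible blocks; every node drops its state so it
    # restarts from the earlier snapshot. A keeps its reversible blocks and
    # replays past block N on startup.
    for node in [node1, node2, node3]:
        node.removeReversibleBlks()
    for node in [node0, node1, node2, node3]:
        node.removeState()

    for i in range(4):
        assert cluster.getNode(i).relaunch(chainArg=" -e --snapshot {}".format(node0.getLatestSnapshot()))

    # Consensus resumes: LIB advances and all nodes eventually vote strong on new blocks.
    for node in [node0, node1, node2, node3]:
        assert node.waitForLibToAdvance(), "LIB did not advance after recovery"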
149 changes: 149 additions & 0 deletions tests/disaster_recovery_2.py
@@ -0,0 +1,149 @@
#!/usr/bin/env python3
import os
import shutil
import signal
import time
from TestHarness import Cluster, TestHelper, Utils, WalletMgr
from TestHarness.Node import BlockType

###############################################################
# disaster_recovery - Scenario 2
#
# Verify that if finalizers are only locked on LIB blocks then all reversible blocks in the network can be lost
# and consensus can continue.
#
# Integration test with 5 nodes (A, B, C, D, and P). Nodes A, B, C, and D each have one finalizer but no proposers.
# Node P has a proposer but no finalizers. The finalizer policy consists of the four finalizers with a threshold of 3.
# The proposer policy involves just the single proposer P.
#
# A, B, C, and D may be connected to one another in any topology, as long as blocks sent to node A can reach the
# other nodes B, C, and D. Node P, however, should only be connected to node A.
#
# At some point after the IF transition has completed and LIB is advancing, block production on node P should be
# paused. Enough time should be given to allow any in-flight votes on the latest produced blocks to be delivered
# to node P. Then the connection between node P and node A should be severed, and block production on node P
# resumed. The LIB on node P should advance to block N and then stall. Shortly after that, node P should be
# cleanly shut down.
#
# Verify that the LIB on A, B, C, and D has stalled and is less than block N. Then, nodes A, B, C, and D can all be
# cleanly shut down.
#
# Then, reversible blocks from all nodes should be removed. All nodes are restarted from an earlier
# snapshot (prior to block N).
#
# P is restarted and replays up to block N after restarting from snapshot. Blocks up to and including block N are sent
# to the other nodes A, B, C, and D after they are also started up again.
#
# Verify that LIB advances and that A, B, C, and D are eventually voting strong on new blocks.
###############################################################

Print=Utils.Print
errorExit=Utils.errorExit

args=TestHelper.parse_args({"-d","--keep-logs","--dump-error-details","-v","--leave-running","--unshared"})
pnodes=1
delay=args.d
debug=args.v
prod_count = 1 # per node prod count
total_nodes=pnodes+4
dumpErrorDetails=args.dump_error_details

Utils.Debug=debug
testSuccessful=False

cluster=Cluster(unshared=args.unshared, keepRunning=args.leave_running, keepLogs=args.keep_logs)
walletMgr=WalletMgr(True, keepRunning=args.leave_running, keepLogs=args.keep_logs)

try:
TestHelper.printSystemInfo("BEGIN")

cluster.setWalletMgr(walletMgr)

Print(f'producing nodes: {pnodes}, delay between nodes launch: {delay} second{"s" if delay != 1 else ""}')

Print("Stand up cluster")
specificExtraNodeosArgs={}
specificExtraNodeosArgs[0]="--plugin eosio::net_api_plugin --plugin eosio::producer_api_plugin "

if cluster.launch(pnodes=pnodes, totalNodes=total_nodes, totalProducers=pnodes, specificExtraNodeosArgs=specificExtraNodeosArgs,
topo="./tests/disaster_recovery_2_test_shape.json", delay=delay, loadSystemContract=False,
activateIF=True, signatureProviderForNonProducer=True) is False:
errorExit("Failed to stand up eos cluster.")

assert cluster.biosNode.getInfo(exitOnError=True)["head_block_producer"] != "eosio", "launch should have waited for production to change"

cluster.biosNode.kill(signal.SIGTERM)
cluster.waitOnClusterSync(blockAdvancing=5)

node0 = cluster.getNode(0) # P
node1 = cluster.getNode(1) # A
node2 = cluster.getNode(2) # B
node3 = cluster.getNode(3) # C
node4 = cluster.getNode(4) # D

Print("Create snapshot (node 0)")
ret = node0.createSnapshot()
assert ret is not None, "Snapshot creation failed"
ret_head_block_num = ret["payload"]["head_block_num"]
Print(f"Snapshot head block number {ret_head_block_num}")

Print("Wait for snapshot node lib to advance")
assert node0.waitForBlock(ret_head_block_num+1, blockType=BlockType.lib), "Node0 did not advance to make snapshot block LIB"
assert node1.waitForLibToAdvance(), "Node1 did not advance LIB after snapshot of Node0"

assert node0.waitForLibToAdvance(), "Node0 did not advance LIB after snapshot"

Print("Pause production on Node0")
    # Loop until LIB advances past the pause point: the pause can land between
    # blocks, so the current block still needs to be produced and become final.
    retries = 10
    while retries > 0:
        lib = node0.getIrreversibleBlockNum()
        node0.processUrllibRequest("producer", "pause")
        # wait for LIB to pass `lib` (waitForBlock uses > not >=)
        if node0.waitForBlock(lib, blockType=BlockType.lib, timeout=10):
            break
        node0.processUrllibRequest("producer", "resume")
        time.sleep(0.25)
        retries -= 1
    assert retries > 0, "Node0 did not advance LIB after pause"
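    # Give any in-flight votes on the just-produced blocks time to reach node P before severing the connection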
time.sleep(1)

Print("Disconnect the producing node (Node0) from peer Node1")
node0.processUrllibRequest("net", "disconnect", "localhost:9877")
assert not node0.waitForLibToAdvance(timeout=10), "Node0 LIB still advancing after disconnect"

Print("Resume production on Node0")
node0.processUrllibRequest("producer", "resume")
assert node0.waitForHeadToAdvance(blocksToAdvance=2)
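    # libN below is "block N" from the scenario description: the highest LIB node0 reaches while isolated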
libN = node0.getIrreversibleBlockNum()

assert not node1.waitForHeadToAdvance(timeout=5), "Node1 head still advancing after disconnect"

for node in [node1, node2, node3, node4]:
lib = node.getIrreversibleBlockNum()
        assert lib < libN, f"Node LIB {lib} >= LIB N {libN}"

for node in [node0, node1, node2, node3, node4]:
node.kill(signal.SIGTERM)

for node in [node0, node1, node2, node3, node4]:
assert not node.verifyAlive(), "Node did not shutdown"

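    # Simulate the disaster: every node loses its reversible blocks and chain state, forcing a restart from the snapshot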
for node in [node0, node1, node2, node3, node4]:
node.removeReversibleBlks()
node.removeState()

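    # Relaunch all five nodes from node0's snapshot; -e (--enable-stale-production) allows node0 to resume producing on the restored chain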
for i in range(5):
isRelaunchSuccess = cluster.getNode(i).relaunch(chainArg=" -e --snapshot {}".format(node0.getLatestSnapshot()))
assert isRelaunchSuccess, f"node {i} relaunch from snapshot failed"

for node in [node0, node1, node2, node3, node4]:
assert node.waitForLibToAdvance(), "Node did not advance LIB after relaunch"
lib = node.getIrreversibleBlockNum()
        assert lib > libN, f"Node LIB {lib} <= LIB N {libN}"

testSuccessful=True
finally:
TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails)

exitCode = 0 if testSuccessful else 1
exit(exitCode)
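The finalizer policy described at the top of this test (four finalizers with a threshold of 3) is what the scenario leans on: any two quorums of three out of four overlap in at least two finalizers, so finalizers that only lock on LIB blocks can never contribute to finalizing two conflicting forks. A toy illustration of that quorum arithmetic, using hypothetical helper names rather than the nodeos implementation:

    # Toy model of the 4-finalizer, threshold-3 policy used by this test.
    FINALIZERS = {"A", "B", "C", "D"}
    THRESHOLD = 3

    def has_quorum(votes):
        return len(votes & FINALIZERS) >= THRESHOLD

    # Two quorums of size 3 out of 4 share at least 3 + 3 - 4 = 2 members,
    # so conflicting blocks cannot both gather quorums of honest votes.
    assert has_quorum({"A", "B", "C"})
    assert not has_quorum({"A", "B"})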
124 changes: 124 additions & 0 deletions tests/disaster_recovery_2_test_shape.json
@@ -0,0 +1,124 @@
{
"name": "testnet_",
"ssh_helper": {
"ssh_cmd": "/usr/bin/ssh",
"scp_cmd": "/usr/bin/scp",
"ssh_identity": "",
"ssh_args": ""
},
"nodes": {
"bios":{
"name": "bios",
"keys": [
{
"privkey":"5KQwrPbwdL6PhXujxW37FSSQZ1JiwsST4cqQzDeyXtP79zkvFD3",
"pubkey":"EOS6MRyAjQq8ud7hVNYcfnVPJqcVpscN5So8BhtHuGYqET5GDW5CV"
}
],
"peers": [],
"producers": [
"eosio"
],
"dont_start": false
},
"testnet_00":{
"name": "testnet_00",
"keys": [
{
"privkey":"5Jf4sTk7vwX1MYpLJ2eQFanVvKYXFqGBrCyANPukuP2BJ5WAAKZ",
"pubkey":"EOS58B33q9S7oNkgeFfcoW3VJYu4obfDiqn5RHGE2ige6jVjUhymR"
}
],
"peers": [
"bios",
"testnet_01"
],
"producers": [
"defproducera"
],
"dont_start": false
},
"testnet_01":{
"name": "testnet_01",
"keys": [
{
"privkey":"5HviUPkTEtvF2B1nm8aZUnjma2TzgpKRjuXjwHyy3FME4xDbkZF",
"pubkey":"EOS5CbcTDgbks2ptTxvyCbT9HFbzX7PDHUY2wN4DDnVBhhQr2ZNDE",
"blspubkey":"PUB_BLS_Y8ndNvnrEpnzJcNUg49ncWDiDGRgR7WUmRRDR9yMURoS6zF14sPnbb-DsTGp0cEM628a4CmG6KXMhPJMqGZvb7RM_MGIwgbEhVaENL8rXeYLOuFDS375KHFgXxs2P5sZuaN7aA",
"blsprivkey":"PVT_BLS_A1Mifu5xyaxiveyjnZ-qN2zOt-5_KLMpjTrDI9udcQNV1NBR",
"blspop":"SIG_BLS_7D0OUU1h7E0AKkAmqV4v3Ot9oSPWJBOss4yDejr2x1g5G31cSSAYIAtqZOYC-ioNzddY7zkvTcbhKgBzv5a-G1HmV1pOCXXPJ5TL0iqU8Ks5abeEWCdhArGATmRQiSMYNcj9rMQcm3H6Z0pOlOdbDdt8Cg-SY_H4jEGmAY2ZqudAH_U8gS19aydJU-2uQq0SPIr2Okl-WNbc-q3NVQw6Y0sAHAwN4BOIHup2MJyDDDIbpSEkBchRp3zna1XJf6oBuUzpqQ"
}
],
"peers": [
"bios",
"testnet_02",
"testnet_04"
],
"producers": [
],
"dont_start": false
},
"testnet_02":{
"name": "testnet_02",
"keys": [
{
"privkey":"5KkQbdxFHr8Pg1N3DEMDdU7emFgUTwQvh99FDJrodFhUbbsAtQT",
"pubkey":"EOS6Tkpf8kcDfa32WA9B4nTcEJ64ZdDMSNioDcaL6rzdMwnpzaWJB",
"blspubkey":"PUB_BLS_Wf_O_QeyVhekDXS5q3qBxTyj_qxSrX_uiCY4z8ClpW0X2jrAVgAVHOQ9IR2H40QTWveD8QIGhhSbmSFPa0zFbs5k3yfnjfuuwpA7T1O13_LSdtxT19ehYiE4chZX6SUMJ09JFA",
"blsprivkey":"PVT_BLS_1ZLWim0k80ssXswSZp1T3ydHO9U3gLnKKlEBIDy8927XDLLj",
"blspop":"SIG_BLS_EL09aI3w-qCgarLM2Z5-T6sisSHBN0J4vMZxtGQklkOcAxgnCaPPXe0roxY4W0gVe2y6T01YrklmT_qZu2tAwqiNrVJcScY8QKvRSeczGBBab1MgnHvaAOuf6bA4JPAELIu2iPWfsS6-oLyLbNP5xtZpMXPHu3yaSJssXNOb5rcVs1KXaIUEagJeAlBBQEcKmFWfeAsJ_R8JDw4i9gSNmROzUjm6LVBpvB7vrnPDPFRA0BQ19H4FED6PtuFPShwJGVz4dg"
}
],
"peers": [
"bios",
"testnet_01",
"testnet_04"
],
"producers": [
],
"dont_start": false
},
"testnet_03":{
"name": "testnet_03",
"keys": [
{
"privkey":"5JxTJJegQBpEL1p77TzkN1ompMB9gDwAfjM9chPzFCB4chxmwrE",
"pubkey":"EOS52ntDHqA2qj4xVo7KmxdezMRhvvBqpZBuKYJCsgihisxmywpAx",
"blspubkey":"PUB_BLS_C-FprIiry6X-8dlLYH7xUAhIuKXBQv56zJPgtcdmKeHf8AAy750eRrOYBtKG0-QEIN5l_yl9dTLvAYmOios6Q5t3ybWBUVVQ2WWcbZLVxzwBftLwYvo1zPXH7LHEE_sAgP1i7g",
"blsprivkey":"PVT_BLS_ubElmjajfsYP_9HRSpmV-Fi_IPWKTyJS4XFSWrU8ezMZ_mL_",
"blspop":"SIG_BLS_k3wrhVl2GUG_lGsPr9io-zoamPw7eiaxMDExk-yOqcpXtu0zALHoUWJRh0WOerAS1-_RQNhbi4q-BWO9IbiNWRKP9CYIhNIL6ochGHHy4aBmZ-IzEjfBrDt7inDtFTYY0Gl372e5OqPXAwi6J3GeHipXuzAiw7SV8XdWFefthxId4meKX6vw5_RWx4XQ4ScRYoCG7UQtIZkQPEsu1SfJGL6z-cfTTSq-naKbzp0QQYfqtQkFfmL7qQUH1iohnb0HbTbRbQ"
}
],
"peers": [
"bios",
"testnet_01",
"testnet_02",
"testnet_04"
],
"producers": [
],
"dont_start": false
},
"testnet_04":{
"name": "testnet_04",
"keys": [
{
"privkey":"5K3h9XiAmrx9EuqD8CRxHgQwEVDaWpqrhrnpdvwHtVzwJFMhNmE",
"pubkey":"EOS7K5pQCk22ojetRdyumrqp6nJX6eiQiTWWcGkZAMGhoBxgcsxhK",
"blspubkey":"PUB_BLS_kGOCEX1MM5Xl928OOvGLyNo3_GpV8av1HnoaCEGOD8bAu3MDvazu0gCZGA1G7msTh1ZTPMEMVdXMuRVS0tv_9bW9Ohz9XvgtjgbPpxxc_NaeENkGg4uDBOro0Rk8DCEW4ToLKA",
"blsprivkey":"PVT_BLS_EnQXObGKvYqfubrKjxpCqNkHeLlkQg7LERjDGm1RKjgyFZnk",
"blspop":"SIG_BLS_bXrzPVc-ahxOCWrcl-iWIMuS8ego54iz7vi38A8h_ViqtxklH9O3A2z0eiw5j40M08ejiTm7JbCY_GOwulv1oXb9SaLYQkCTZjzCVssDkghLBRTVCZW2oJmU9WbZXikNw6nkygTs5sUTtCda2a_M5jqY_Rw92_NWmbolgBNkFvMcAgSHexdETA-b7QgJX_oYBWkyP0Pt8LzO6bJueZSjH8wZ8VuPc9o8taY85mt_qgdOTbXVBG2m5ud0eAUps2UHAHt-Ig"
}
],
"peers": [
"bios",
"testnet_01",
"testnet_02",
"testnet_03"
],
"producers": [
],
"dont_start": false
}
}
}
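As a sanity check of the topology constraints this test depends on (node P, testnet_00, peers only with node A, testnet_01, besides the bios bootstrap node, and A can reach B, C, and D without going through P), the shape file can be validated with a short script. A sketch, assuming it runs from the repository root:

    import json

    with open("tests/disaster_recovery_2_test_shape.json") as f:
        shape = json.load(f)

    # Build an adjacency map, treating peering as bidirectional.
    peers = {name: set(node["peers"]) for name, node in shape["nodes"].items()}
    for name in list(peers):
        for p in list(peers[name]):
            peers.setdefault(p, set()).add(name)

    # Node P (testnet_00) must only be connected to bios and node A (testnet_01).
    assert peers["testnet_00"] <= {"bios", "testnet_01"}, "P has unexpected peers"

    # Blocks sent to A must reach B, C, and D without traversing P or bios,
    # both of which go offline during the test: breadth-first search from A.
    reachable, frontier = {"testnet_01"}, ["testnet_01"]
    while frontier:
        for nxt in peers[frontier.pop()] - {"testnet_00", "bios"}:
            if nxt not in reachable:
                reachable.add(nxt)
                frontier.append(nxt)
    assert {"testnet_02", "testnet_03", "testnet_04"} <= reachable, "A cannot reach B, C, D"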
