
Commit

GH-13 Add disaster recovery scenario 3
heifner committed May 17, 2024
1 parent 29c7626 commit 04ef5aa
Showing 1 changed file with 131 additions and 0 deletions.
131 changes: 131 additions & 0 deletions tests/disaster_recovery_3.py
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
import os
import shutil
import signal
import time
from TestHarness import Cluster, TestHelper, Utils, WalletMgr
from TestHarness.Node import BlockType

###############################################################
# disaster_recovery - Scenario 3
#
# Create an integration test with 4 nodes (A, B, C, and D), each of which has its own producer and finalizer. The
# finalizer policy consists of the four finalizers with a threshold of 3. The proposer policy involves all four
# proposers.
#
# - At least two of the four nodes should have a LIB of block N and a finalizer safety information file that locks on
#   a block after N. The other two nodes should have a LIB that is less than or equal to block N.
#
# All nodes are shut down. The reversible blocks on all nodes are deleted. Restart all nodes from an earlier snapshot.
#
# All nodes eventually sync up to block N. Some nodes will consider block N to be LIB, but others may not.
#
# Not enough finalizers should be voting because of the locks in their finalizer safety information files: with at
# least two of the four finalizers locked on a block after N, the threshold of 3 cannot be met. Verify that LIB does
# not advance on any node.
#
# Cleanly shut down all nodes and delete their finalizer safety information files. Then restart the nodes.
#
# Verify that LIB advances on all nodes and that they all agree on the LIB. In particular, verify that block N has the
# same ID on all nodes as it did before the nodes were first shut down.
#
###############################################################

Print=Utils.Print
errorExit=Utils.errorExit

args=TestHelper.parse_args({"-d","--keep-logs","--dump-error-details","-v","--leave-running","--unshared"})
pnodes=4
delay=args.d
debug=args.v
prod_count = 1 # per node prod count
total_nodes=pnodes
dumpErrorDetails=args.dump_error_details

Utils.Debug=debug
testSuccessful=False

cluster=Cluster(unshared=args.unshared, keepRunning=args.leave_running, keepLogs=args.keep_logs)
walletMgr=WalletMgr(True, keepRunning=args.leave_running, keepLogs=args.keep_logs)

try:
    TestHelper.printSystemInfo("BEGIN")

    cluster.setWalletMgr(walletMgr)

    Print(f'producing nodes: {pnodes}, delay between nodes launch: {delay} second{"s" if delay != 1 else ""}')

    Print("Stand up cluster")
    if cluster.launch(pnodes=pnodes, totalNodes=total_nodes, totalProducers=pnodes, delay=delay, loadSystemContract=False,
                      activateIF=True, biosFinalizer=False) is False:
        errorExit("Failed to stand up eos cluster.")

    assert cluster.biosNode.getInfo(exitOnError=True)["head_block_producer"] != "eosio", "launch should have waited for production to change"
    cluster.biosNode.kill(signal.SIGTERM)
    cluster.waitOnClusterSync(blockAdvancing=5)

    node0 = cluster.getNode(0) # A
    node1 = cluster.getNode(1) # B
    node2 = cluster.getNode(2) # C
    node3 = cluster.getNode(3) # D

    Print("Create snapshot (node 0)")
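    # This snapshot provides the earlier chain state that every node is later restarted from once its reversible
    # blocks and state have been removed.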
    ret = node0.createSnapshot()
    assert ret is not None, "Snapshot creation failed"
    ret_head_block_num = ret["payload"]["head_block_num"]
    Print(f"Snapshot head block number {ret_head_block_num}")

    Print("Wait for snapshot node lib to advance")
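    # Make sure the block captured in the snapshot becomes irreversible before the nodes are shut down.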
    assert node0.waitForBlock(ret_head_block_num+1, blockType=BlockType.lib), "Node0 did not advance to make snapshot block LIB"
    assert node1.waitForLibToAdvance(), "Node1 did not advance LIB after snapshot of Node0"

    assert node0.waitForLibToAdvance(), "Node0 did not advance LIB after snapshot"
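    # Record node 0's current LIB block; after recovery every node must report this same block ID at this height.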
    currentLIB = node0.getIrreversibleBlockNum()
    libBlock = node0.getBlock(currentLIB)
    Print(f"Lib Block: {libBlock}")

    Print("Shutdown all nodes")
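    # Stop all four nodes. Their finalizer safety files, and the vote locks recorded in them, remain on disk.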
    for node in [node0, node1, node2, node3]:
        node.kill(signal.SIGTERM)
    for node in [node0, node1, node2, node3]:
        assert not node.verifyAlive(), "Node did not shutdown"

    Print("Remove reversible blocks and state, but not finalizer safety data")
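    # Keeping the finalizer safety files preserves each finalizer's lock, which is what prevents LIB from advancing
    # after the restart below.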
    for node in [node0, node1, node2, node3]:
        node.removeReversibleBlks()
        node.removeState()

    Print("Restart nodes from snapshot")
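    # Relaunch every node from node 0's snapshot, a state earlier than the blocks the locked finalizers have locked on.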
    for i in range(4):
        isRelaunchSuccess = cluster.getNode(i).relaunch(chainArg=" -e --snapshot {}".format(node0.getLatestSnapshot()))
        assert isRelaunchSuccess, f"node {i} relaunch from snapshot failed"

    Print("Verify LIB does not advance on any node")
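    # With the old safety files still present, the locked finalizers refuse to vote, so the 3-of-4 finalizer
    # threshold cannot be met and LIB cannot advance.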
    for node in [node0, node1, node2, node3]:
        assert not node.waitForLibToAdvance(), "Node advanced LIB after relaunch when it should not"

    Print("Shutdown all nodes to remove finalizer safety data")
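    # Shut down cleanly again so the finalizer safety files can be deleted while the nodes are stopped.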
    for node in [node0, node1, node2, node3]:
        node.kill(signal.SIGTERM)
    for node in [node0, node1, node2, node3]:
        assert not node.verifyAlive(), "Node did not shutdown"

    for node in [node0, node1, node2, node3]:
        node.removeFinalizersSafetyFile()

    Print("Restart nodes")
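    # Relaunch without the snapshot option; the nodes now have state on disk from the previous relaunch, so the
    # snapshot is not applied again.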
    for node in [node0, node1, node2, node3]:
        node.relaunch(rmArgs=" --snapshot {}".format(node0.getLatestSnapshot()))

    Print("Verify LIB advances on all nodes")
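    # With the stale locks removed, all four finalizers can vote again and LIB resumes advancing on every node.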
    for node in [node0, node1, node2, node3]:
        assert node.waitForLibToAdvance(), "Node did not advance LIB after restart"

    for node in [node0, node1, node2, node3]:
        nodeId = node.getBlock(currentLIB)["id"]
        assert nodeId == libBlock["id"], "Node lib block id does not match prior lib block id"

    testSuccessful=True
finally:
    TestHelper.shutdown(cluster, walletMgr, testSuccessful=testSuccessful, dumpErrorDetails=dumpErrorDetails)

exitCode = 0 if testSuccessful else 1
exit(exitCode)
