[4.0] SHiP: fix intermittent empty get_block_result #1276

Merged · 3 commits · Jun 13, 2023
@@ -491,7 +491,10 @@ struct session : session_base, std::enable_shared_from_this<session<Plugin, Sock
return;
}

auto block_id = plugin->get_block_id(to_send_block_num);
// Not just an optimization: on the accepted_block signal, block_num may not be found in forkdb
// because the block is not validated until after the accepted_block signal.
std::optional<chain::block_id_type> block_id =
(block_state && block_state->block_num == to_send_block_num) ? block_state->id : plugin->get_block_id(to_send_block_num);

if (block_id && position_it && (*position_it)->block_num == to_send_block_num) {
// This branch happens when the head block of nodeos is behind the head block of the connecting client.
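For illustration only (not part of the diff), the decision the hunk above introduces can be restated as a short Python sketch; the names BlockState and forkdb_get_block_id are hypothetical stand-ins for the plugin's C++ types and lookup:

from dataclasses import dataclass
from typing import Callable, Optional

@dataclass
class BlockState:                      # hypothetical stand-in for chain::block_state
    block_num: int
    id: str

def choose_block_id(block_state: Optional[BlockState],
                    to_send_block_num: int,
                    forkdb_get_block_id: Callable[[int], Optional[str]]) -> Optional[str]:
    # Prefer the id carried by the accepted_block signal itself: during that signal
    # the block is not yet validated, so a forkdb lookup can come back empty and
    # produce the intermittent empty get_block_result this PR fixes.
    if block_state is not None and block_state.block_num == to_send_block_num:
        return block_state.id
    # Otherwise fall back to the existing lookup, unchanged from before.
    return forkdb_get_block_id(to_send_block_num)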
tests/nodeos_forked_chain_test.py (57 changes: 55 additions & 2 deletions)
@@ -5,8 +5,10 @@
import time
import json
import signal
import os

from TestHarness import Cluster, Node, TestHelper, Utils, WalletMgr, CORE_SYMBOL
from TestHarness.TestHelper import AppArgs

###############################################################
# nodeos_forked_chain_test
@@ -27,6 +29,8 @@
# Time is allowed to progress so that the "bridge" node can catch up and both producer nodes can come to consensus
# The block log is then checked for both producer nodes to verify that the 10 producer fork is selected and that
# both nodes are in agreement on the block log.
# This test also runs a state_history_plugin (SHiP) on node 0 and uses ship_streamer to verify all blocks are received
# across the fork.
#
###############################################################

@@ -121,9 +125,10 @@ def getMinHeadAndLib(prodNodes):
return (headBlockNum, libNum)



appArgs = AppArgs()
extraArgs = appArgs.add(flag="--num-ship-clients", type=int, help="How many ship_streamers should be started", default=2)
args = TestHelper.parse_args({"--prod-count","--dump-error-details","--keep-logs","-v","--leave-running","--clean-run",
"--wallet-port","--unshared"})
"--wallet-port","--unshared"}, applicationSpecificArgs=appArgs)
Utils.Debug=args.v
totalProducerNodes=2
totalNonProducerNodes=1
@@ -137,6 +142,7 @@ def getMinHeadAndLib(prodNodes):
prodCount=args.prod_count
killAll=args.clean_run
walletPort=args.wallet_port
num_clients=args.num_ship_clients

walletMgr=WalletMgr(True, port=walletPort)
testSuccessful=False
@@ -154,6 +160,9 @@ def getMinHeadAndLib(prodNodes):
cluster.cleanup()
Print("Stand up cluster")
specificExtraNodeosArgs={}
shipNodeNum = 0
specificExtraNodeosArgs[shipNodeNum]="--plugin eosio::state_history_plugin --disable-replay-opts"

# producer nodes will be mapped to 0 through totalProducerNodes-1, so the number totalProducerNodes will be the non-producing node
specificExtraNodeosArgs[totalProducerNodes]="--plugin eosio::test_control_api_plugin"

@@ -286,6 +295,31 @@ def getBlock(self, blockNum):
timestampStr=Node.getBlockAttribute(block, "timestamp", blockNum)
timestamp=datetime.strptime(timestampStr, Utils.TimeFmt)

shipNode = cluster.getNode(0)
block_range = 800
start_block_num = blockNum
end_block_num = start_block_num + block_range

shipClient = "tests/ship_streamer"
cmd = f"{shipClient} --start-block-num {start_block_num} --end-block-num {end_block_num} --fetch-block --fetch-traces --fetch-deltas"
if Utils.Debug: Utils.Print(f"cmd: {cmd}")
clients = []
files = []
shipTempDir = os.path.join(Utils.DataDir, "ship")
os.makedirs(shipTempDir, exist_ok = True)
shipClientFilePrefix = os.path.join(shipTempDir, "client")

starts = []
for i in range(0, num_clients):
start = time.perf_counter()
outFile = open(f"{shipClientFilePrefix}{i}.out", "w")
errFile = open(f"{shipClientFilePrefix}{i}.err", "w")
Print(f"Start client {i}")
popen=Utils.delayedCheckOutput(cmd, stdout=outFile, stderr=errFile)
starts.append(time.perf_counter())
clients.append((popen, cmd))
files.append((outFile, errFile))
Utils.Print(f"Client {i} started, Ship node head is: {shipNode.getBlockNum()}")

# *** Identify what the production cycle is ***

@@ -559,6 +593,25 @@ def getBlock(self, blockNum):
Utils.errorExit("Did not find find block %s (the original divergent block) in blockProducers0, test setup is wrong. blockProducers0: %s" % (killBlockNum, ", ".join(blockProducers0)))
Print("Fork resolved and determined producer %s for block %s" % (resolvedKillBlockProducer, killBlockNum))

Print(f"Stopping all {num_clients} clients")
for index, (popen, _), (out, err), start in zip(range(len(clients)), clients, files, starts):
popen.wait()
Print(f"Stopped client {index}. Ran for {time.perf_counter() - start:.3f} seconds.")
out.close()
err.close()
outFile = open(f"{shipClientFilePrefix}{index}.out", "r")
data = json.load(outFile)
block_num = start_block_num
for i in data:
# fork can cause block numbers to be repeated
this_block_num = i['get_blocks_result_v0']['this_block']['block_num']
if this_block_num < block_num:
block_num = this_block_num
assert block_num == this_block_num, f"{block_num} != {this_block_num}"
assert isinstance(i['get_blocks_result_v0']['block'], str) # verify block in result
block_num += 1
assert block_num-1 == end_block_num, f"{block_num-1} != {end_block_num}"
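As a worked illustration of the check above (not part of the test): across a fork the streamed block numbers can step backwards and repeat, so the expected counter resets to the lower number and then resumes the strict +1 walk. A minimal, self-contained sketch with a made-up sequence:

def verify_block_sequence(block_nums, start_block_num, end_block_num):
    # Mirrors the loop above: block numbers may repeat across a fork, but must
    # otherwise advance by exactly one and finish at end_block_num.
    expected = start_block_num
    for this_block_num in block_nums:
        if this_block_num < expected:      # the fork rewound the stream
            expected = this_block_num
        assert expected == this_block_num, f"{expected} != {this_block_num}"
        expected += 1
    assert expected - 1 == end_block_num, f"{expected - 1} != {end_block_num}"

# e.g. a fork rewinds the stream to block 101 before it continues on to 104
verify_block_sequence([100, 101, 102, 101, 102, 103, 104], 100, 104)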

blockProducers0=[]
blockProducers1=[]
