Skip to content

Commit

Permalink
fix a bug where timeout does not work for stalls
Browse files Browse the repository at this point in the history
Summary:
As discovered in T170192224, cachebench's timeout handler only triggers a graceful shutdown,
which does not work when the entire cachebench process is stalled due to a bug such as the livelock in
T169248917. This change fixes the bug by forcibly aborting the process if the graceful shutdown has
not completed within an additional shutdown timeout of 30s.

This change also reduces the number of keys and ops for navy-with-reinsertion[-inmem-buf].json.
According to the cogwheel test runs, cachebench appears to run ~21 times slower with reinsertion; i.e.,
256M OPS vs 7.4M OPS for 100s, which is expected because of the contention on the clean region. The
runtime of navy.json is already over 49m, which is close to the 1hr timeout.

navy.json https://fburl.com/servicelab/qrjrqn9g
navy-with-reinsertion.json https://fburl.com/servicelab/wyg9poo8

This change also reduced the number of ops for several more cases with marginal timeouts.

feature_stress/free_list.json: https://fburl.com/servicelab/pa1noczk
integration_tests/cachelib_map.json: https://fburl.com/servicelab/ahgiefnv

Reviewed By: therealgymmy

Differential Revision: D51417010

fbshipit-source-id: 1f32f1ec3641cafdd26591845d69ca889bf3a808
  • Loading branch information
Jaesoo Lee authored and facebook-github-bot committed Nov 22, 2023
1 parent b7ead24 commit 3c0ef49
Show file tree
Hide file tree
Showing 8 changed files with 42 additions and 22 deletions.
11 changes: 10 additions & 1 deletion cachelib/cachebench/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,20 @@ void setupTimeoutHandler() {
stopperThread = std::make_unique<std::thread>([] {
folly::EventBase eb;
eb.runAfterDelay(
[]() {
[&eb]() {
XLOGF(INFO,
"Stopping due to timeout {} seconds",
FLAGS_timeout_seconds);
if (runnerInstance) {
runnerInstance->abort();
}
eb.terminateLoopSoon();
},
FLAGS_timeout_seconds * 1000);
eb.loopForever();
// We give another few seconds for the graceful shutdown to complete
eb.runAfterDelay([]() { XCHECK(false); }, 30 * 1000);
eb.loopForever();
});
stopperThread->detach();
}
Expand Down Expand Up @@ -157,4 +164,6 @@ int main(int argc, char** argv) {
std::cout << "Invalid configuration. Exception: " << e.what() << std::endl;
return 1;
}

return 0;
}
16 changes: 14 additions & 2 deletions cachelib/cachebench/runner/Runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,16 @@ bool Runner::run(std::chrono::seconds progressInterval,
std::cout << std::endl;

stressor_.reset();
return cacheStats.renderIsTestPassed(std::cout);

bool passed = cacheStats.renderIsTestPassed(std::cout);
if (aborted_) {
std::cerr << "Test aborted.\n";
passed = false;
}
return passed;
}

void Runner::run(folly::UserCounters& counters) {
bool Runner::run(folly::UserCounters& counters) {
stressor_->start();
stressor_->finish();

Expand All @@ -77,6 +83,12 @@ void Runner::run(folly::UserCounters& counters) {

stressor_.reset();
}

if (aborted_) {
std::cerr << "Test aborted.\n";
return false;
}
return true;
}

} // namespace cachebench
Expand Down
5 changes: 4 additions & 1 deletion cachelib/cachebench/runner/Runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,10 @@ class Runner {
// in addition to running time, cachebench has several metrics
// (hit rate, throughput, etc.) to be compared, use BENCHMARK_COUNTER
// and put metrics into folly::UserCounters to show metrics in output results.
void run(folly::UserCounters&);
bool run(folly::UserCounters&);

void abort() {
aborted_ = true;
if (stressor_) {
stressor_->abort();
}
Expand All @@ -60,6 +61,8 @@ class Runner {
private:
// instance of the stressor.
std::unique_ptr<Stressor> stressor_;

bool aborted_{false};
};
} // namespace cachebench
} // namespace cachelib
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@
},
"test_config" :
{

"checkConsistency" : true,

"numOps" : 30000000,
"numOps" : 1000000,
"numThreads" : 40,
"numKeys" : 10000000,
"numKeys" : 1000000,


"keySizeRange" : [1, 8],
"keySizeRangeProbability" : [1.0],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
},
"test_config" :
{

"checkConsistency" : true,

"numOps" : 30000000,
"numOps" : 1000000,
"numThreads" : 40,
"numKeys" : 10000000,
"numKeys" : 1000000,


"keySizeRange" : [1, 8],
"keySizeRangeProbability" : [1.0],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@
"moveOnSlabRelease" : false,
"rebalanceStrategy" : "tail-age"
},
"test_config" :
"test_config" :
{
"numOps" : 50000000,
"numOps" : 20000000,
"numThreads" : 20,
"numKeys" : 10000000,



"keySizeRange" : [32, 33],
"keySizeRangeProbability" : [1.0],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
"rebalanceMinSlabs": 1,
"allocFactor": 2.0
},
"test_config":
"test_config":
{
"name": "cachelib_map",
"numOps" : 200000
"numOps" : 150000
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
"rebalanceMinSlabs": 1,
"allocFactor": 2.0
},
"test_config":
"test_config":
{
"name": "cachelib_range_map",
"numOps" : 200000
"numOps" : 150000
}

}

0 comments on commit 3c0ef49

Please sign in to comment.