Skip to content

Commit

Permalink
build speed tweak for CacheLocality: no constinit table of relaxed_atomic
Browse files Browse the repository at this point in the history

Summary:
The compiler must evaluate the constructor for each element of the table, which is observed to cost compile time. Switch to a constinit array of raw bytes instead and always use `atomic_ref`s to access the table in order to avoid evaluating all of the `relaxed_atomic` constructors at compile time.

```name=master
$ foundation/scripts/run-perf-compile --quiet --syntax --platform platform010 --which perf folly/concurrency/test/CacheLocalityTest.cpp
     9,275,503,791      instructions:uP
```
```name=branch
$ foundation/scripts/run-perf-compile --quiet --syntax --platform platform010 --which perf folly/concurrency/test/CacheLocalityTest.cpp
     4,577,023,094      instructions:uP
```

Reviewed By: Gownta

Differential Revision: D40504017

fbshipit-source-id: b4cb22b27ed8f31787b0f860736be0e4b56e38da
  • Loading branch information
yfeldblum authored and facebook-github-bot committed Nov 3, 2022
1 parent 30abf02 commit 82d3251
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 11 deletions.
13 changes: 9 additions & 4 deletions folly/concurrency/CacheLocality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ bool AccessSpreaderBase::initialize(
Getcpu::Func (&pickGetcpuFunc)(),
const CacheLocality& (&system)()) {
(void)AccessSpreaderStaticInit::instance; // ODR-use it so it is not dropped
constexpr auto relaxed = std::memory_order_relaxed;
auto& cacheLocality = system();
auto n = cacheLocality.numCpus;
for (size_t width = 0; width <= kMaxCpus; ++width) {
Expand All @@ -374,19 +375,23 @@ bool AccessSpreaderBase::initialize(
assert(index < n);
// as index goes from 0..n, post-transform value goes from
// 0..numStripes
row[cpu] = static_cast<CompactStripe>((index * numStripes) / n);
assert(row[cpu] < numStripes);
make_atomic_ref(row[cpu]).store(
static_cast<CompactStripe>((index * numStripes) / n), relaxed);
assert(make_atomic_ref(row[cpu]).load(relaxed) < numStripes);
}
size_t filled = n;
while (filled < kMaxCpus) {
size_t len = std::min(filled, kMaxCpus - filled);
for (size_t i = 0; i < len; ++i) {
row[filled + i] = row[i].load();
make_atomic_ref(row[filled + i])
.store(make_atomic_ref(row[i]).load(relaxed), relaxed);
}
filled += len;
}
for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
assert(row[cpu] == row[cpu - n]);
assert(
make_atomic_ref(row[cpu]).load(relaxed) ==
make_atomic_ref(row[cpu - n]).load(relaxed));
}
}
state.getcpu.exchange(pickGetcpuFunc(), std::memory_order_acq_rel);
Expand Down
17 changes: 10 additions & 7 deletions folly/concurrency/CacheLocality.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include <folly/detail/StaticSingletonManager.h>
#include <folly/lang/Align.h>
#include <folly/lang/Exception.h>
#include <folly/synchronization/RelaxedAtomic.h>
#include <folly/synchronization/AtomicRef.h>

namespace folly {

Expand Down Expand Up @@ -192,16 +192,15 @@ class AccessSpreaderBase {
kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
"stripeByCpu element type isn't wide enough");

using CompactStripeTable =
relaxed_atomic<CompactStripe>[kMaxCpus + 1][kMaxCpus];
using CompactStripeTable = CompactStripe[kMaxCpus + 1][kMaxCpus];

struct GlobalState {
/// For each level of splitting up to kMaxCpus, maps the cpu (mod
/// kMaxCpus) to the stripe. Rather than performing any inequalities
/// or modulo on the actual number of cpus, we just fill in the entire
/// array.
/// Keep as the first field to avoid extra + in the fastest path.
CompactStripeTable table;
mutable CompactStripeTable table;

/// Points to the getcpu-like function we are using to obtain the
/// current cpu. It should not be assumed that the returned cpu value
Expand Down Expand Up @@ -262,7 +261,7 @@ struct AccessSpreader : private detail::AccessSpreaderBase {

public:
FOLLY_EXPORT static GlobalState& state() {
static FOLLY_CONSTINIT GlobalState state;
static FOLLY_CONSTINIT GlobalState state{};
if (FOLLY_UNLIKELY(!state.getcpu.load(std::memory_order_acquire))) {
initialize(state);
}
Expand All @@ -278,7 +277,9 @@ struct AccessSpreader : private detail::AccessSpreaderBase {

unsigned cpu;
s.getcpu.load(std::memory_order_relaxed)(&cpu, nullptr, nullptr);
return s.table[std::min(size_t(kMaxCpus), numStripes)][cpu % kMaxCpus];
cpu = cpu % kMaxCpus;
auto& ref = s.table[std::min(size_t(kMaxCpus), numStripes)][cpu];
return make_atomic_ref(ref).load(std::memory_order_relaxed);
}

/// Returns the stripe associated with the current CPU. The returned
Expand All @@ -292,7 +293,9 @@ struct AccessSpreader : private detail::AccessSpreaderBase {
if (kIsMobile) {
return current(numStripes, s);
}
return s.table[std::min(size_t(kMaxCpus), numStripes)][cpuCache().cpu(s)];
unsigned cpu = cpuCache().cpu(s);
auto& ref = s.table[std::min(size_t(kMaxCpus), numStripes)][cpu];
return make_atomic_ref(ref).load(std::memory_order_relaxed);
}

/// Forces the next cachedCurrent() call in this thread to re-probe the
Expand Down

0 comments on commit 82d3251

Please sign in to comment.