Added GRO policy HOMA_GRO_IDLE_NEW
(Fixed a load-balancing problem with HOMA_GRO_IDLE: if SoftIRQ
processing was slow to start on a core, GRO would keep assigning that
core more batches of packets instead of switching to a different core.)
johnousterhout committed Jul 18, 2022
1 parent 3de988b commit 79df0f4
Showing 8 changed files with 165 additions and 12 deletions.
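In outline, the new HOMA_GRO_IDLE_NEW policy chooses a SoftIRQ core as follows. This is a condensed, illustrative sketch of the logic added to homa_gro_complete in the homa_offload.c diff below; CORES_TO_CHECK, softirq_busy, last_gro, and gro_busy_cycles all come from this commit, but the wrapper function name is hypothetical:

static int homa_choose_softirq_core(struct homa *homa, int this_core,
                __u64 now)
{
        int i, offset, candidate = this_core;

        /* First choice: a nearby core that is neither mid-SoftIRQ nor
         * recently active at GRO level.
         */
        for (i = 0; i < CORES_TO_CHECK; i++) {
                candidate++;
                if (candidate >= nr_cpu_ids)
                        candidate = 0;
                if (!homa_cores[candidate]->softirq_busy
                                && (homa_cores[candidate]->last_gro
                                + homa->gro_busy_cycles) < now)
                        return candidate;
        }

        /* Fallback: every candidate looked busy, so rotate among the
         * next CORES_TO_CHECK cores using a remembered offset (stored
         * on the last candidate examined, as in the diff below).
         */
        offset = homa_cores[candidate]->softirq_offset + 1;
        if (offset > CORES_TO_CHECK)
                offset = 1;
        homa_cores[candidate]->softirq_offset = offset;
        candidate = this_core + offset;
        while (candidate >= nr_cpu_ids)
                candidate -= nr_cpu_ids;
        return candidate;
}

The caller then marks the chosen core busy, records GRO activity on its own core (last_gro = now), and steers the batch there with homa_set_softirq_cpu().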
45 changes: 41 additions & 4 deletions homa_impl.h
@@ -1707,10 +1707,25 @@ struct homa {
* want to know what they mean, read the code of homa_offload.c
*/
#define HOMA_GRO_BYPASS 1
-#define HOMA_GRO_SAME_CORE 2
-#define HOMA_GRO_IDLE 4
-#define HOMA_GRO_NEXT 8
-#define HOMA_GRO_NORMAL HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE
+#define HOMA_GRO_SAME_CORE 2
+#define HOMA_GRO_IDLE 4
+#define HOMA_GRO_NEXT 8
+#define HOMA_GRO_IDLE_NEW 16
+#define HOMA_GRO_NORMAL HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE_NEW

/**
* @gro_busy_usecs: try not to schedule SoftIRQ processing on a core
* if it has handled Homa packets at GRO level in the last
* gro_busy_usecs microseconds (this improves load balancing by
* avoiding hot spots). Set externally via the gro_busy_us sysctl.
*/
int gro_busy_usecs;

/**
* @gro_busy_cycles: Same as gro_busy_usecs, except in units
* of get_cycles().
*/
int gro_busy_cycles;

/**
* @timer_ticks: number of times that homa_timer has been invoked
@@ -2281,6 +2296,28 @@ struct homa_core {
* handlers.
*/
__u64 last_active;

/**
* @last_gro: the last time (in get_cycles() units) that Homa
* processed packets at GRO (NAPI) level on this core. Used to
* avoid assigning SoftIRQ handlers to this core when it has
* been used recently for GRO.
*/
__u64 last_gro;

/**
* @softirq_busy: nonzero means that packets have been assigned
* to this core for SoftIRQ processing, but the processing is not
* yet complete.
*/
__s8 softirq_busy;

/**
* @softirq_offset: used when rotating SoftIRQ assignment among
* the next cores; contains an offset to add to the current core
* to produce the core for SoftIRQ.
*/
__s8 softirq_offset;

/**
* held_skb: last packet buffer known to be available for
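Note that gro_policy is a bit mask, so this commit changes the value of the default: HOMA_GRO_NORMAL was HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE = 2|4 = 6 and is now HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE_NEW = 2|16 = 18. For illustration (not part of the commit):

/* Illustration only: the mask arithmetic for the default policy. */
int normal_old = HOMA_GRO_SAME_CORE | HOMA_GRO_IDLE;     /* 2 | 4  == 6  */
int normal_new = HOMA_GRO_SAME_CORE | HOMA_GRO_IDLE_NEW; /* 2 | 16 == 18 */

Writing 16 to the gro_policy sysctl would select the new policy by itself.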
4 changes: 4 additions & 0 deletions homa_incoming.c
@@ -1650,6 +1650,10 @@ void homa_incoming_sysctl_changed(struct homa *homa)
tmp = (tmp*cpu_khz)/1000;
homa->poll_cycles = tmp;

tmp = homa->gro_busy_usecs;
tmp = (tmp*cpu_khz)/1000;
homa->gro_busy_cycles = tmp;

tmp = homa->rtt_bytes * homa->duty_cycle;
homa->grant_threshold = tmp/1000;
if (homa->grant_threshold > homa->rtt_bytes)
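This conversion follows the existing poll_usecs/poll_cycles pattern just above it: cpu_khz counts cycles per millisecond, so cycles = usecs * cpu_khz / 1000. A worked example with illustrative numbers (not from the commit):

/* On a 3 GHz machine cpu_khz == 3000000, so the default
 * gro_busy_usecs of 10 gives:
 *     gro_busy_cycles = (10 * 3000000) / 1000 = 30000 cycles
 */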
63 changes: 56 additions & 7 deletions homa_offload.c
@@ -234,8 +234,56 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
// tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d",
// h->type, h->sender_id, ntohl(d->seg.offset),
// NAPI_GRO_CB(skb)->count);

-if (homa->gro_policy & HOMA_GRO_IDLE) {
+#define CORES_TO_CHECK 4
+if (homa->gro_policy & HOMA_GRO_IDLE_NEW) {
/* Pick a specific core to handle SoftIRQ processing for this
* group of packets. This policy scans the next several cores
* in order after this one, trying to find one that is not
* already busy with SoftIRQ processing, and that doesn't appear
* to be active with NAPI/GRO processing either. If there
* is no such core, just rotate among the next cores.
*/
int i;
int candidate = raw_smp_processor_id();
int this_core = candidate;
__u64 now = get_cycles();
struct homa_core *core;
for (i = CORES_TO_CHECK; i > 0; i--) {
candidate++;
if (unlikely(candidate >= nr_cpu_ids))
candidate = 0;
core = homa_cores[candidate];
if (!core->softirq_busy && ((core->last_gro
+ homa->gro_busy_cycles) < now)) {
tt_record1("homa_gro_complete chose core %d "
"with IDLE_NEW policy",
candidate);
break;
}
}
if (i <= 0) {
/* All of the candidates appear to be busy; just
* rotate among them.
*/
int offset = homa_cores[candidate]->softirq_offset;
offset += 1;
if (offset > CORES_TO_CHECK)
offset = 1;
homa_cores[candidate]->softirq_offset = offset;
candidate = this_core
+ homa_cores[candidate]->softirq_offset;
while (candidate >= nr_cpu_ids) {
candidate -= nr_cpu_ids;
}
tt_record1("homa_gro_complete chose core %d with "
"IDLE_NEW policy (all cores busy)",
candidate);
}
homa_cores[candidate]->softirq_busy = 1;
homa_cores[this_core]->last_gro = now;
homa_set_softirq_cpu(skb, candidate);
} else if (homa->gro_policy & HOMA_GRO_IDLE) {
int i, core, best;
__u64 best_time = ~0;
__u64 last_active;
@@ -245,13 +293,10 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
* core gets overloaded. We do that by checking the next several
* cores in order after this one, and choosing the one that
* hasn't done NAPI or SoftIRQ processing for Homa in the
-* longest time. Also, if HOMA_GRO_NO_TASK is set, compute
-* a second "best" core where we only consider cores that have
-* no runnable user tasks; if there is such a core, use this
-* in preference to the first "best".
+* longest time.
*/
core = best = raw_smp_processor_id();
-for (i = 0; i < 4; i++) {
+for (i = 0; i < CORES_TO_CHECK; i++) {
core++;
if (unlikely(core >= nr_cpu_ids))
core = 0;
@@ -262,6 +307,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
}
}
homa_set_softirq_cpu(skb, best);
tt_record1("homa_gro_complete chose core %d with IDLE policy",
best);
} else if (homa->gro_policy & HOMA_GRO_NEXT) {
/* Use the next core (in circular order) to handle the
* SoftIRQ processing.
@@ -270,6 +317,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
if (unlikely(target >= nr_cpu_ids))
target = 0;
homa_set_softirq_cpu(skb, target);
tt_record1("homa_gro_complete chose core %d with NEXT policy",
target);
}

return 0;
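Together with the homa_plumbing.c change below, the new softirq_busy flag forms a simple handshake between GRO and SoftIRQ. A summary sketch (descriptive comments, not literal commit code):

/* Lifecycle of softirq_busy:
 *
 * GRO side (homa_gro_complete, above):
 *     homa_cores[candidate]->softirq_busy = 1;    // claim the core
 *     homa_cores[this_core]->last_gro = now;      // record GRO activity
 *
 * SoftIRQ side (homa_softirq, homa_plumbing.c below), once a single
 * packet remains in the batch:
 *     homa_cores[raw_smp_processor_id()]->softirq_busy = 0;
 *
 * This closes the window described in the commit message: under the
 * old HOMA_GRO_IDLE policy, a core that had been handed packets but
 * had not yet started SoftIRQ processing still looked idle, so GRO
 * kept piling work onto it.
 */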
14 changes: 14 additions & 0 deletions homa_plumbing.c
@@ -199,6 +199,13 @@ static struct ctl_table homa_ctl_table[] = {
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_busy_us",
.data = &homa_data.gro_busy_usecs,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_policy",
.data = &homa_data.gro_policy,
@@ -1156,6 +1163,13 @@ int homa_softirq(struct sk_buff *skb) {

for (skb = packets; skb != NULL; skb = next) {
next = skb->next;
if (next == NULL) {
/* Once we're down to a single packet to process,
* it's OK for GRO to start assigning us more
* work.
*/
homa_cores[raw_smp_processor_id()]->softirq_busy = 0;
}
saddr = ip_hdr(skb)->saddr;
num_packets++;

4 changes: 4 additions & 0 deletions homa_utils.c
@@ -60,6 +60,9 @@ int homa_init(struct homa *homa)
core = (struct homa_core *) (first + i*aligned_size);
homa_cores[i] = core;
core->last_active = 0;
core->last_gro = 0;
core->softirq_busy = 0;
core->softirq_offset = 0;
core->held_skb = NULL;
core->held_bucket = 0;
core->thread = NULL;
@@ -143,6 +146,7 @@ int homa_init(struct homa *homa)
homa->max_gso_size = 10000;
homa->max_gro_skbs = 10;
homa->gro_policy = HOMA_GRO_NORMAL;
homa->gro_busy_usecs = 10;
homa->timer_ticks = 0;
spin_lock_init(&homa->metrics_lock);
homa->metrics = NULL;
8 changes: 8 additions & 0 deletions man/homa.7
@@ -206,6 +206,14 @@ transmits full-size packets.
An integer value that determines how Homa processes incoming packets
at the GRO level. See code in homa_offload.c for more details.
.TP
.IR gro_busy_us
An integer value. Under some
.IR gro_policy
settings, Homa will try not to assign SoftIRQ processing to a core if
it has had GRO-level activity in the last
.IR gro_busy_us
microseconds (in order to avoid hot spots that degrade load balancing).
.TP
.IR link_mbps
An integer value specifying the bandwidth of this machine's uplink to
the top-of-rack switch, in units of 1e06 bits per second.
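Like Homa's other parameters, these live in the net/homa sysctl tree that homa_plumbing.c registers, so (with illustrative values) they can be adjusted at runtime with commands such as "echo 20 > /proc/sys/net/homa/gro_busy_us" or "echo 16 > /proc/sys/net/homa/gro_policy".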
37 changes: 37 additions & 0 deletions test/unit_homa_offload.c
@@ -192,6 +192,43 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs)
kfree_skb(self->skb);
}

TEST_F(homa_offload, homa_gro_complete__GRO_IDLE_NEW)
{
homa->gro_policy = HOMA_GRO_IDLE_NEW;
mock_cycles = 1000;
homa->gro_busy_cycles = 100;
cpu_number = 5;
homa_cores[6]->softirq_busy = 1;
homa_cores[6]->last_gro = 0;
homa_cores[7]->softirq_busy = 0;
homa_cores[7]->last_gro = 901;
homa_cores[0]->softirq_busy = 1;
homa_cores[0]->last_gro = 0;
homa_cores[1]->softirq_busy = 0;
homa_cores[1]->last_gro = 899;
homa_cores[2]->softirq_busy = 0;
homa_cores[2]->last_gro = 0;

// Avoid busy cores.
homa_gro_complete(self->skb, 0);
EXPECT_EQ(1, self->skb->hash - 32);
EXPECT_EQ(1, homa_cores[1]->softirq_busy);

// All cores busy; must rotate.
homa_gro_complete(self->skb, 0);
EXPECT_EQ(6, self->skb->hash - 32);
EXPECT_EQ(1, homa_cores[1]->softirq_offset);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(7, self->skb->hash - 32);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(0, self->skb->hash - 32);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(1, self->skb->hash - 32);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(6, self->skb->hash - 32);
EXPECT_EQ(1, homa_cores[1]->softirq_offset);
}

TEST_F(homa_offload, homa_gro_complete__GRO_IDLE)
{
homa->gro_policy = HOMA_GRO_IDLE;
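A short trace of the IDLE_NEW test above, assuming the mock environment provides 8 cores and encodes the chosen core as skb->hash - 32 (which the expectations imply):

/* Why the expected cores are 1, 6, 7, 0, 1, 6:
 *
 * Call 1: scanning cores 6, 7, 0, 1 from core 5: cores 6 and 0 have
 *         softirq_busy set; core 7 fails the idleness check
 *         (901 + 100 >= 1000); core 1 passes (899 + 100 < 1000), is
 *         chosen, and is marked busy.
 * Call 2: all four candidates now look busy; the scan ends on core 1,
 *         whose softirq_offset advances to 1, giving 5 + 1 = core 6.
 * Calls 3-5: the offset advances to 2, 3, and 4, giving core 7,
 *         core 0 (5 + 3 = 8 wraps to 0), and core 1 (9 wraps to 1).
 * Call 6: the offset wraps from 5 back to 1, giving core 6 again.
 */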
2 changes: 1 addition & 1 deletion timetrace.c
@@ -20,7 +20,7 @@
* timetrace stubs; we will then connect the timetrace mechanism here with
* those stubs to allow the rest of the kernel to log in our buffers.
*/
-// #define TT_KERNEL 1
+#define TT_KERNEL 1
#endif
#ifdef TT_KERNEL
extern int tt_linux_buffer_mask;
