Skip to content

Commit

Permalink
Added gro_busy_usecs configuration parameter
Browse files Browse the repository at this point in the history
* Also, removed HOMA_GRO_BYPASS option (it's never appropriate)
* Added metrics for grant and data bypass, updated metrics.py
  • Loading branch information
johnousterhout committed Dec 7, 2023
1 parent e1dbc38 commit 8b3d5d5
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 87 deletions.
49 changes: 36 additions & 13 deletions homa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1945,10 +1945,6 @@ struct homa {
/* Bits that can be specified for gro_policy. These were created for
* testing, in order to evaluate various possible policies; you almost
* certainly should not use any value other than HOMA_GRO_NORMAL.
* HOMA_GRO_BYPASS: Pass all incoming packets directly to
* homa_softirq during GRO; this bypasses
* the SoftIRQ dispatching mechanism as well
* as the network and IP stack layers.
* HOMA_GRO_SAME_CORE If isolated packets arrive (not part of
* a batch) use the GRO core for SoftIRQ also.
* HOMA_GRO_IDLE Use old mechanism for selecting an idle
Expand All @@ -1958,13 +1954,14 @@ struct homa {
* HOMA_GRO_GEN2 Use the new mechanism for selecting an
* idle core for SoftIRQ.
* HOMA_GRO_FAST_GRANTS Pass all grants immediately to
* homa_softirq during GRO.
* HOMA_GRO_SHORT_BYPASS Pass all short packets directly to
* homa_softirq during GRO.
* homa_softirq during GRO (only if the
* core isn't overloaded).
* HOMA_GRO_SHORT_BYPASS Pass all single-packet messages directly
* to homa_softirq during GRO (only if the
* core isn't overloaded).
* HOMA_GRO_GEN3 Use the "Gen3" mechanisms for load
* balancing.
*/
#define HOMA_GRO_BYPASS 1
#define HOMA_GRO_SAME_CORE 2
#define HOMA_GRO_IDLE 4
#define HOMA_GRO_NEXT 8
Expand All @@ -1973,7 +1970,7 @@ struct homa {
#define HOMA_GRO_SHORT_BYPASS 64
#define HOMA_GRO_GEN3 128
#define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE|HOMA_GRO_GEN2 \
|HOMA_GRO_SHORT_BYPASS)
|HOMA_GRO_SHORT_BYPASS|HOMA_GRO_FAST_GRANTS)

/*
* @busy_usecs: if there has been activity on a core within the
Expand All @@ -1986,6 +1983,19 @@ struct homa {
/** @busy_cycles: Same as busy_usecs except in get_cycles() units. */
int busy_cycles;

/*
* @gro_busy_usecs: if the gap between the completion of
* homa_gro_receive and the next call to homa_gro_receive on the same
* core is less than this, then GRO on that core is considered to be
* "busy", and optimizations such as HOMA_GRO_SHORT_BYPASS will not be
* done because they risk overloading the core. Set externally via
* sysctl.
*/
int gro_busy_usecs;

/** @gro_busy_cycles: Same as gro_busy_usecs except in get_cycles() units. */
int gro_busy_cycles;

/**
* @timer_ticks: number of times that homa_timer has been invoked
* (may wraparound, which is safe).
Expand Down Expand Up @@ -2629,6 +2639,20 @@ struct homa_metrics {
*/
__u64 gen3_alt_handoffs;

/**
* @gro_grant_bypasses: total number of GRANT packets passed directly
* to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
* mechanism (triggered by HOMA_GRO_FAST_GRANTS).
*/
__u64 gro_grant_bypasses;

/**
* @gro_data_bypasses: total number of DATA packets passed directly
* to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
* mechanism (triggered by HOMA_GRO_SHORT_BYPASS).
*/
__u64 gro_data_bypasses;

/** @temp: For temporary use during testing. */
#define NUM_TEMP_METRICS 10
__u64 temp[NUM_TEMP_METRICS];
Expand All @@ -2648,10 +2672,9 @@ struct homa_core {
__u64 last_active;

/**
* @last_gro: the last time (in get_cycle() units) that Homa
* processed packets at GRO(NAPI) level on this core. Used to
* avoid assigning SoftIRQ handlers to this core when it has
* been used recently for GRO.
* @last_gro: the last time (in get_cycles() units) that
* homa_gro_receive returned on this core. Used to determine
* whether GRO is keeping a core busy.
*/
__u64 last_gro;

Expand Down
4 changes: 4 additions & 0 deletions homa_incoming.c
Original file line number Diff line number Diff line change
Expand Up @@ -1933,6 +1933,10 @@ void homa_incoming_sysctl_changed(struct homa *homa)
tmp = (tmp*cpu_khz)/1000;
homa->busy_cycles = tmp;

tmp = homa->gro_busy_usecs;
tmp = (tmp*cpu_khz)/1000;
homa->gro_busy_cycles = tmp;

tmp = homa->bpage_lease_usecs;
tmp = (tmp*cpu_khz)/1000;
homa->bpage_lease_cycles = tmp;
Expand Down
40 changes: 21 additions & 19 deletions homa_offload.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright (c) 2019-2022 Stanford University
/* Copyright (c) 2019-2023 Stanford University
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
Expand Down Expand Up @@ -151,12 +151,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
struct sk_buff *held_skb;
struct sk_buff *result = NULL;
struct homa_core *core = homa_cores[raw_smp_processor_id()];
__u64 now = get_cycles();
int busy = (now - core->last_gro) < homa->gro_busy_cycles;
__u32 hash;
__u64 saved_softirq_metric, softirq_cycles;
struct data_header *h_new = (struct data_header *)
skb_transport_header(skb);
int priority;
__u32 saddr;

core->last_active = now;
if (skb_is_ipv6(skb)) {
priority = ipv6_hdr(skb)->priority;
saddr = ntohl(ipv6_hdr(skb)->saddr.in6_u.u6_addr32[3]);
Expand All @@ -169,12 +173,18 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
// if (!pskb_may_pull(skb, 64))
// tt_record("homa_gro_receive can't pull enough data "
// "from packet for trace");
if (h_new->common.type == DATA)
if (h_new->common.type == DATA) {
tt_record4("homa_gro_receive got packet from 0x%x "
"id %llu, offset %d, priority %d",
saddr, homa_local_id(h_new->common.sender_id),
ntohl(h_new->seg.offset), priority);
else if (h_new->common.type == GRANT) {
if ((h_new->seg.segment_length == h_new->message_length)
&& (homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
&& !busy) {
INC_METRIC(gro_data_bypasses, 1);
goto bypass;
}
} else if (h_new->common.type == GRANT) {
tt_record4("homa_gro_receive got grant from 0x%x "
"id %llu, offset %d, priority %d",
saddr, homa_local_id(h_new->common.sender_id),
Expand All @@ -186,21 +196,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
* a significant difference in throughput for large
* messages, especially when the system is loaded.
*/
if (homa->gro_policy & HOMA_GRO_FAST_GRANTS)
if ((homa->gro_policy & HOMA_GRO_FAST_GRANTS) && !busy) {
INC_METRIC(gro_grant_bypasses, 1);
goto bypass;
}
} else
tt_record4("homa_gro_receive got packet from 0x%x "
"id %llu, type 0x%x, priority %d",
saddr, homa_local_id(h_new->common.sender_id),
h_new->common.type, priority);

core->last_active = get_cycles();

if ((homa->gro_policy & HOMA_GRO_BYPASS)
|| ((homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
&& (skb->len < 1400)))
goto bypass;

/* The GRO mechanism tries to separate packets onto different
* gro_lists by hash. This is bad for us, because we want to batch
* packets together regardless of their RPCs. So, instead of
Expand Down Expand Up @@ -273,24 +278,22 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,

done:
homa_check_pacer(homa, 1);
core->last_gro = get_cycles();
return result;

bypass:
/* Record SoftIRQ cycles in a different metric to reflect that
* they happened during bypass.
*/
saved_softirq_metric = homa_cores[raw_smp_processor_id()]
->metrics.softirq_cycles;
saved_softirq_metric = core->metrics.softirq_cycles;
homa_softirq(skb);
softirq_cycles = homa_cores[raw_smp_processor_id()]
->metrics.softirq_cycles - saved_softirq_metric;
homa_cores[raw_smp_processor_id()]->metrics.softirq_cycles
= saved_softirq_metric;
softirq_cycles = core->metrics.softirq_cycles - saved_softirq_metric;
core->metrics.softirq_cycles = saved_softirq_metric;
INC_METRIC(bypass_softirq_cycles, softirq_cycles);
core->last_gro = get_cycles();

/* This return value indicates that we have freed skb. */
return ERR_PTR(-EINPROGRESS);

}

/**
Expand Down Expand Up @@ -350,7 +353,6 @@ void homa_gro_gen2(struct sk_buff *skb)
ntohl(h->seg.offset));
}
atomic_inc(&homa_cores[candidate]->softirq_backlog);
homa_cores[this_core]->last_gro = now;
homa_set_softirq_cpu(skb, candidate);
}

Expand Down
7 changes: 7 additions & 0 deletions homa_plumbing.c
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,13 @@ static struct ctl_table homa_ctl_table[] = {
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_busy_usecs",
.data = &homa_data.gro_busy_usecs,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_policy",
.data = &homa_data.gro_policy,
Expand Down
11 changes: 11 additions & 0 deletions homa_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ int homa_init(struct homa *homa)
homa->gso_force_software = 0;
homa->gro_policy = HOMA_GRO_NORMAL;
homa->busy_usecs = 100;
homa->gro_busy_usecs = 5;
homa->timer_ticks = 0;
spin_lock_init(&homa->metrics_lock);
homa->metrics = NULL;
Expand Down Expand Up @@ -1826,6 +1827,16 @@ char *homa_print_metrics(struct homa *homa)
"Gen3 handoffs to secondary core (primary was "
"busy)\n",
m->gen3_alt_handoffs);
homa_append_metric(homa,
"gro_grant_bypasses %15llu "
"Grant packets passed directly to homa_softirq "
"by homa_gro_receive\n",
m->gro_grant_bypasses);
homa_append_metric(homa,
"gro_data_bypasses %15llu "
"Data packets passed directly to homa_softirq "
"by homa_gro_receive\n",
m->gro_data_bypasses);
for (i = 0; i < NUM_TEMP_METRICS; i++)
homa_append_metric(homa,
"temp%-2d %15llu "
Expand Down
16 changes: 13 additions & 3 deletions man/homa.7
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,6 @@ in
The amount of time (in microseconds) that a given core can own a page in
a receive buffer pool before its ownership can be revoked by a different
core.
.IR gro_policy
An integer value that determines how Homa processes incoming packets
at the GRO level. See code in homa_offload.c for more details.
.TP
.IR busy_usecs
An integer value in microsecond units; if a core has been active in
Expand Down Expand Up @@ -339,6 +336,19 @@ of the bandwidth is for FIFO and 90% for SRPT). As of October 2020, a small
value can provide significant benefits for the largest messages under very high
loads, but for most loads its effect is negligible.
.TP
.I gro_busy_usecs
An integer value used to determine whether or not to perform some
optimizations specified by
.IR gro_policy .
If the gap between the completion of one call to homa_gro_receive and
the invocation of the next call on the same core is less than this many
microseconds, the core is considered to be "busy", so optimizations
that add to the load of the core will not be performed.
.TP
.I gro_policy
An integer value that determines how Homa processes incoming packets
at the GRO level. See code in homa_offload.c for more details.
.TP
.IR gso_force_software
If this value is nonzero, Homa will perform GSO in software instead of
asking the NIC to perform TSO in hardware. This can be useful when running
Expand Down
Loading

0 comments on commit 8b3d5d5

Please sign in to comment.