Added GRO policy HOMA_GRO_IDLE_NEW
(Fixed a load-balancing problem with HOMA_GRO_IDLE: if SoftIRQ
processing was slow to start on a core, GRO would keep assigning that
core more batches of packets instead of switching to a different core.)
johnousterhout committed Jul 18, 2022
1 parent 3de988b commit 79df0f4
Showing 8 changed files with 165 additions and 12 deletions.
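In outline, the new HOMA_GRO_IDLE_NEW policy chooses a SoftIRQ core as follows. This is a condensed, illustrative sketch of the logic added to homa_gro_complete in the homa_offload.c diff below; CORES_TO_CHECK, softirq_busy, last_gro, and gro_busy_cycles all come from this commit, but the wrapper function name is hypothetical:

static int homa_choose_softirq_core(struct homa *homa, int this_core,
                __u64 now)
{
        int i, offset, candidate = this_core;

        /* First choice: a nearby core that is neither mid-SoftIRQ nor
         * recently active at GRO level.
         */
        for (i = 0; i < CORES_TO_CHECK; i++) {
                candidate++;
                if (candidate >= nr_cpu_ids)
                        candidate = 0;
                if (!homa_cores[candidate]->softirq_busy
                                && (homa_cores[candidate]->last_gro
                                + homa->gro_busy_cycles) < now)
                        return candidate;
        }

        /* Fallback: every candidate looked busy, so rotate among the
         * next CORES_TO_CHECK cores using a remembered offset (stored
         * on the last candidate examined, as in the diff below).
         */
        offset = homa_cores[candidate]->softirq_offset + 1;
        if (offset > CORES_TO_CHECK)
                offset = 1;
        homa_cores[candidate]->softirq_offset = offset;
        candidate = this_core + offset;
        while (candidate >= nr_cpu_ids)
                candidate -= nr_cpu_ids;
        return candidate;
}

The caller then marks the chosen core busy, records GRO activity on its own core (last_gro = now), and steers the batch there with homa_set_softirq_cpu().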
45 changes: 41 additions & 4 deletions homa_impl.h
@@ -1707,10 +1707,25 @@ struct homa {
* want to know what they mean, read the code of homa_offload.c
*/
#define HOMA_GRO_BYPASS 1
-#define HOMA_GRO_SAME_CORE 2
-#define HOMA_GRO_IDLE 4
-#define HOMA_GRO_NEXT 8
-#define HOMA_GRO_NORMAL HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE
+#define HOMA_GRO_SAME_CORE 2
+#define HOMA_GRO_IDLE 4
+#define HOMA_GRO_NEXT 8
+#define HOMA_GRO_IDLE_NEW 16
+#define HOMA_GRO_NORMAL HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE_NEW

/**
* @gro_busy_usecs: try not to schedule SoftIRQ processing on a core
* if it has handled Homa packets at GRO level in the last
* gro_busy_usecs microseconds (this improves load balancing by
* avoiding hot spots). Set externally via the gro_busy_us sysctl.
*/
int gro_busy_usecs;

/**
* @gro_busy_cycles: Same as gro_busy_usecs, except in units
* of get_cycles().
*/
int gro_busy_cycles;

/**
* @timer_ticks: number of times that homa_timer has been invoked
@@ -2281,6 +2296,28 @@ struct homa_core {
* handlers.
*/
__u64 last_active;

/**
* @last_gro: the last time (in get_cycles() units) that Homa
* processed packets at GRO (NAPI) level on this core. Used to
* avoid assigning SoftIRQ handlers to this core when it has
* been used recently for GRO.
*/
__u64 last_gro;

/**
* @softirq_busy: nonzero means that packets have been assigned
* to this core for SoftIRQ processing, but the processing is not
* yet complete.
*/
__s8 softirq_busy;

/**
* @softirq_offset: used when rotating SoftIRQ assignment among
* the next cores; contains an offset to add to the current core
* to produce the core for SoftIRQ.
*/
__s8 softirq_offset;

/**
* held_skb: last packet buffer known to be available for
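Note that gro_policy is a bit mask, so this commit changes the value of the default: HOMA_GRO_NORMAL was HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE = 2|4 = 6 and is now HOMA_GRO_SAME_CORE|HOMA_GRO_IDLE_NEW = 2|16 = 18. For illustration (not part of the commit):

/* Illustration only: the mask arithmetic for the default policy. */
int normal_old = HOMA_GRO_SAME_CORE | HOMA_GRO_IDLE;     /* 2 | 4  == 6  */
int normal_new = HOMA_GRO_SAME_CORE | HOMA_GRO_IDLE_NEW; /* 2 | 16 == 18 */

Writing 16 to the gro_policy sysctl would select the new policy by itself.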
4 changes: 4 additions & 0 deletions homa_incoming.c
@@ -1650,6 +1650,10 @@ void homa_incoming_sysctl_changed(struct homa *homa)
tmp = (tmp*cpu_khz)/1000;
homa->poll_cycles = tmp;

tmp = homa->gro_busy_usecs;
tmp = (tmp*cpu_khz)/1000;
homa->gro_busy_cycles = tmp;

tmp = homa->rtt_bytes * homa->duty_cycle;
homa->grant_threshold = tmp/1000;
if (homa->grant_threshold > homa->rtt_bytes)
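This conversion follows the existing poll_usecs/poll_cycles pattern just above it: cpu_khz counts cycles per millisecond, so cycles = usecs * cpu_khz / 1000. A worked example with illustrative numbers (not from the commit):

/* On a 3 GHz machine cpu_khz == 3000000, so the default
 * gro_busy_usecs of 10 gives:
 *     gro_busy_cycles = (10 * 3000000) / 1000 = 30000 cycles
 */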
63 changes: 56 additions & 7 deletions homa_offload.c
@@ -234,8 +234,56 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
// tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d",
// h->type, h->sender_id, ntohl(d->seg.offset),
// NAPI_GRO_CB(skb)->count);

-if (homa->gro_policy & HOMA_GRO_IDLE) {
+#define CORES_TO_CHECK 4
+if (homa->gro_policy & HOMA_GRO_IDLE_NEW) {
/* Pick a specific core to handle SoftIRQ processing for this
* group of packets. This policy scans the next several cores
* in order after this one, trying to find one that is not
* already busy with SoftIRQ processing, and that doesn't appear
* to be active with NAPI/GRO processing either. If there
* is no such core, just rotate among the next cores.
*/
int i;
int candidate = raw_smp_processor_id();
int this_core = candidate;
__u64 now = get_cycles();
struct homa_core *core;
for (i = CORES_TO_CHECK; i > 0; i--) {
candidate++;
if (unlikely(candidate >= nr_cpu_ids))
candidate = 0;
core = homa_cores[candidate];
if (!core->softirq_busy && ((core->last_gro
+ homa->gro_busy_cycles) < now)) {
tt_record1("homa_gro_complete chose core %d "
"with IDLE_NEW policy",
candidate);
break;
}
}
if (i <= 0) {
/* All of the candidates appear to be busy; just
* rotate among them.
*/
int offset = homa_cores[candidate]->softirq_offset;
offset += 1;
if (offset > CORES_TO_CHECK)
offset = 1;
homa_cores[candidate]->softirq_offset = offset;
candidate = this_core
+ homa_cores[candidate]->softirq_offset;
while (candidate >= nr_cpu_ids) {
candidate -= nr_cpu_ids;
}
tt_record1("homa_gro_complete chose core %d with "
"IDLE_NEW policy (all cores busy)",
candidate);
}
homa_cores[candidate]->softirq_busy = 1;
homa_cores[this_core]->last_gro = now;
homa_set_softirq_cpu(skb, candidate);
} else if (homa->gro_policy & HOMA_GRO_IDLE) {
int i, core, best;
__u64 best_time = ~0;
__u64 last_active;
@@ -245,13 +293,10 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
* core gets overloaded. We do that by checking the next several
* cores in order after this one, and choosing the one that
* hasn't done NAPI or SoftIRQ processing for Homa in the
-* longest time. Also, if HOMA_GRO_NO_TASK is set, compute
-* a second "best" core where we only consider cores that have
-* no runnable user tasks; if there is such a core, use this
-* in preference to the first "best".
+* longest time.
*/
core = best = raw_smp_processor_id();
-for (i = 0; i < 4; i++) {
+for (i = 0; i < CORES_TO_CHECK; i++) {
core++;
if (unlikely(core >= nr_cpu_ids))
core = 0;
@@ -262,6 +307,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
}
}
homa_set_softirq_cpu(skb, best);
tt_record1("homa_gro_complete chose core %d with IDLE policy",
best);
} else if (homa->gro_policy & HOMA_GRO_NEXT) {
/* Use the next core (in circular order) to handle the
* SoftIRQ processing.
@@ -270,6 +317,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset)
if (unlikely(target >= nr_cpu_ids))
target = 0;
homa_set_softirq_cpu(skb, target);
tt_record1("homa_gro_complete chose core %d with NEXT policy",
target);
}

return 0;
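Together with the homa_plumbing.c change below, the new softirq_busy flag forms a simple handshake between GRO and SoftIRQ. A summary sketch (descriptive comments, not literal commit code):

/* Lifecycle of softirq_busy:
 *
 * GRO side (homa_gro_complete, above):
 *     homa_cores[candidate]->softirq_busy = 1;    // claim the core
 *     homa_cores[this_core]->last_gro = now;      // record GRO activity
 *
 * SoftIRQ side (homa_softirq, homa_plumbing.c below), once a single
 * packet remains in the batch:
 *     homa_cores[raw_smp_processor_id()]->softirq_busy = 0;
 *
 * This closes the window described in the commit message: under the
 * old HOMA_GRO_IDLE policy, a core that had been handed packets but
 * had not yet started SoftIRQ processing still looked idle, so GRO
 * kept piling work onto it.
 */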
14 changes: 14 additions & 0 deletions homa_plumbing.c
@@ -199,6 +199,13 @@ static struct ctl_table homa_ctl_table[] = {
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_busy_us",
.data = &homa_data.gro_busy_usecs,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_policy",
.data = &homa_data.gro_policy,
@@ -1156,6 +1163,13 @@ int homa_softirq(struct sk_buff *skb) {

for (skb = packets; skb != NULL; skb = next) {
next = skb->next;
if (next == NULL) {
/* Once we're down to a single packet to process,
* it's OK for GRO to start assigning us more
* work.
*/
homa_cores[raw_smp_processor_id()]->softirq_busy = 0;
}
saddr = ip_hdr(skb)->saddr;
num_packets++;

4 changes: 4 additions & 0 deletions homa_utils.c
@@ -60,6 +60,9 @@ int homa_init(struct homa *homa)
core = (struct homa_core *) (first + i*aligned_size);
homa_cores[i] = core;
core->last_active = 0;
core->last_gro = 0;
core->softirq_busy = 0;
core->softirq_offset = 0;
core->held_skb = NULL;
core->held_bucket = 0;
core->thread = NULL;
@@ -143,6 +146,7 @@ int homa_init(struct homa *homa)
homa->max_gso_size = 10000;
homa->max_gro_skbs = 10;
homa->gro_policy = HOMA_GRO_NORMAL;
homa->gro_busy_usecs = 10;
homa->timer_ticks = 0;
spin_lock_init(&homa->metrics_lock);
homa->metrics = NULL;
8 changes: 8 additions & 0 deletions man/homa.7
@@ -206,6 +206,14 @@ transmits full-size packets.
An integer value that determines how Homa processes incoming packets
at the GRO level. See code in homa_offload.c for more details.
.TP
.IR gro_busy_us
An integer value. Under some
.IR gro_policy
settings, Homa will try not to assign SoftIRQ processing to a core if
it has had GRO-level activity in the last
.IR gro_busy_us
microseconds (in order to avoid hot spots that degrade load balancing).
.TP
.IR link_mbps
An integer value specifying the bandwidth of this machine's uplink to
the top-of-rack switch, in units of 1e06 bits per second.
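Like Homa's other parameters, these live in the net/homa sysctl tree that homa_plumbing.c registers, so (with illustrative values) they can be adjusted at runtime with commands such as "echo 20 > /proc/sys/net/homa/gro_busy_us" or "echo 16 > /proc/sys/net/homa/gro_policy".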
37 changes: 37 additions & 0 deletions test/unit_homa_offload.c
@@ -192,6 +192,43 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs)
kfree_skb(self->skb);
}

TEST_F(homa_offload, homa_gro_complete__GRO_IDLE_NEW)
{
homa->gro_policy = HOMA_GRO_IDLE_NEW;
mock_cycles = 1000;
homa->gro_busy_cycles = 100;
cpu_number = 5;
homa_cores[6]->softirq_busy = 1;
homa_cores[6]->last_gro = 0;
homa_cores[7]->softirq_busy = 0;
homa_cores[7]->last_gro = 901;
homa_cores[0]->softirq_busy = 1;
homa_cores[0]->last_gro = 0;
homa_cores[1]->softirq_busy = 0;
homa_cores[1]->last_gro = 899;
homa_cores[2]->softirq_busy = 0;
homa_cores[2]->last_gro = 0;

// Avoid busy cores.
homa_gro_complete(self->skb, 0);
EXPECT_EQ(1, self->skb->hash - 32);
EXPECT_EQ(1, homa_cores[1]->softirq_busy);

// All cores busy; must rotate.
homa_gro_complete(self->skb, 0);
EXPECT_EQ(6, self->skb->hash - 32);
EXPECT_EQ(1, homa_cores[1]->softirq_offset);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(7, self->skb->hash - 32);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(0, self->skb->hash - 32);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(1, self->skb->hash - 32);
homa_gro_complete(self->skb, 0);
EXPECT_EQ(6, self->skb->hash - 32);
EXPECT_EQ(1, homa_cores[1]->softirq_offset);
}

TEST_F(homa_offload, homa_gro_complete__GRO_IDLE)
{
homa->gro_policy = HOMA_GRO_IDLE;
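A short trace of the IDLE_NEW test above, assuming the mock environment provides 8 cores and encodes the chosen core as skb->hash - 32 (which the expectations imply):

/* Why the expected cores are 1, 6, 7, 0, 1, 6:
 *
 * Call 1: scanning cores 6, 7, 0, 1 from core 5: cores 6 and 0 have
 *         softirq_busy set; core 7 fails the idleness check
 *         (901 + 100 >= 1000); core 1 passes (899 + 100 < 1000), is
 *         chosen, and is marked busy.
 * Call 2: all four candidates now look busy; the scan ends on core 1,
 *         whose softirq_offset advances to 1, giving 5 + 1 = core 6.
 * Calls 3-5: the offset advances to 2, 3, and 4, giving core 7,
 *         core 0 (5 + 3 = 8 wraps to 0), and core 1 (9 wraps to 1).
 * Call 6: the offset wraps from 5 back to 1, giving core 6 again.
 */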
2 changes: 1 addition & 1 deletion timetrace.c
@@ -20,7 +20,7 @@
* timetrace stubs; we will then connect the timetrace mechanism here with
* those stubs to allow the rest of the kernel to log in our buffers.
*/
-// #define TT_KERNEL 1
+#define TT_KERNEL 1
#endif
#ifdef TT_KERNEL
extern int tt_linux_buffer_mask;
