Skip to content

Commit

Permalink
Added gro_busy_usecs configuration parameter
Browse files Browse the repository at this point in the history
* Also, removed HOMA_GRO_BYPASS option (it's never appropriate)
* Added metrics for grant and data bypass, updated metrics.py
  • Loading branch information
johnousterhout committed Dec 7, 2023
1 parent e1dbc38 commit 8b3d5d5
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 87 deletions.
49 changes: 36 additions & 13 deletions homa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1945,10 +1945,6 @@ struct homa {
/* Bits that can be specified for gro_policy. These were created for
* testing, in order to evaluate various possible policies; you almost
* certainly should not use any value other than HOMA_GRO_NORMAL.
* HOMA_GRO_BYPASS: Pass all incoming packets directly to
* homa_softirq during GRO; this bypasses
* the SoftIRQ dispatching mechanism as well
* as the network and IP stack layers.
* HOMA_GRO_SAME_CORE If isolated packets arrive (not part of
* a batch) use the GRO core for SoftIRQ also.
* HOMA_GRO_IDLE Use old mechanism for selecting an idle
Expand All @@ -1958,13 +1954,14 @@ struct homa {
* HOMA_GRO_GEN2 Use the new mechanism for selecting an
* idle core for SoftIRQ.
* HOMA_GRO_FAST_GRANTS Pass all grants immediately to
* homa_softirq during GRO.
* HOMA_GRO_SHORT_BYPASS Pass all short packets directly to
* homa_softirq during GRO.
* homa_softirq during GRO (only if the
* core isn't overloaded).
* HOMA_GRO_SHORT_BYPASS Pass all single-packet messages directly
* to homa_softirq during GRO (only if the
* core isn't overloaded).
* HOMA_GRO_GEN3 Use the "Gen3" mechanisms for load
* balancing.
*/
#define HOMA_GRO_BYPASS 1
#define HOMA_GRO_SAME_CORE 2
#define HOMA_GRO_IDLE 4
#define HOMA_GRO_NEXT 8
Expand All @@ -1973,7 +1970,7 @@ struct homa {
#define HOMA_GRO_SHORT_BYPASS 64
#define HOMA_GRO_GEN3 128
#define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE|HOMA_GRO_GEN2 \
|HOMA_GRO_SHORT_BYPASS)
|HOMA_GRO_SHORT_BYPASS|HOMA_GRO_FAST_GRANTS)

/*
* @busy_usecs: if there has been activity on a core within the
Expand All @@ -1986,6 +1983,19 @@ struct homa {
/** @busy_cycles: Same as busy_usecs except in get_cycles() units. */
int busy_cycles;

/*
* @gro_busy_usecs: if the gap between the completion of
* homa_gro_receive and the next call to homa_gro_receive on the same
* core is less than this, then GRO on that core is considered to be
* "busy", and optimizations such as HOMA_GRO_SHORT_BYPASS will not be
* done because they risk overloading the core. Set externally via
* sysctl.
*/
int gro_busy_usecs;

/** @gro_busy_cycles: Same as gro_busy_usecs except in get_cycles() units. */
int gro_busy_cycles;

/**
* @timer_ticks: number of times that homa_timer has been invoked
* (may wraparound, which is safe).
Expand Down Expand Up @@ -2629,6 +2639,20 @@ struct homa_metrics {
*/
__u64 gen3_alt_handoffs;

/**
* @gro_grant_bypasses: total number of GRANT packets passed directly
* to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
* mechanism (triggered by HOMA_GRO_FAST_GRANTS).
*/
__u64 gro_grant_bypasses;

/**
* @gro_data_bypasses: total number of DATA packets passed directly
* to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
* mechanism (triggered by HOMA_GRO_SHORT_BYPASS).
*/
__u64 gro_data_bypasses;

/** @temp: For temporary use during testing. */
#define NUM_TEMP_METRICS 10
__u64 temp[NUM_TEMP_METRICS];
Expand All @@ -2648,10 +2672,9 @@ struct homa_core {
__u64 last_active;

/**
* @last_gro: the last time (in get_cycle() units) that Homa
* processed packets at GRO(NAPI) level on this core. Used to
* avoid assigning SoftIRQ handlers to this core when it has
* been used recently for GRO.
* @last_gro: the last time (in get_cycles() units) that
* homa_gro_receive returned on this core. Used to determine
* whether GRO is keeping a core busy.
*/
__u64 last_gro;

Expand Down
4 changes: 4 additions & 0 deletions homa_incoming.c
Original file line number Diff line number Diff line change
Expand Up @@ -1933,6 +1933,10 @@ void homa_incoming_sysctl_changed(struct homa *homa)
tmp = (tmp*cpu_khz)/1000;
homa->busy_cycles = tmp;

tmp = homa->gro_busy_usecs;
tmp = (tmp*cpu_khz)/1000;
homa->gro_busy_cycles = tmp;

tmp = homa->bpage_lease_usecs;
tmp = (tmp*cpu_khz)/1000;
homa->bpage_lease_cycles = tmp;
Expand Down
40 changes: 21 additions & 19 deletions homa_offload.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright (c) 2019-2022 Stanford University
/* Copyright (c) 2019-2023 Stanford University
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
Expand Down Expand Up @@ -151,12 +151,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
struct sk_buff *held_skb;
struct sk_buff *result = NULL;
struct homa_core *core = homa_cores[raw_smp_processor_id()];
__u64 now = get_cycles();
int busy = (now - core->last_gro) < homa->gro_busy_cycles;
__u32 hash;
__u64 saved_softirq_metric, softirq_cycles;
struct data_header *h_new = (struct data_header *)
skb_transport_header(skb);
int priority;
__u32 saddr;

core->last_active = now;
if (skb_is_ipv6(skb)) {
priority = ipv6_hdr(skb)->priority;
saddr = ntohl(ipv6_hdr(skb)->saddr.in6_u.u6_addr32[3]);
Expand All @@ -169,12 +173,18 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
// if (!pskb_may_pull(skb, 64))
// tt_record("homa_gro_receive can't pull enough data "
// "from packet for trace");
if (h_new->common.type == DATA)
if (h_new->common.type == DATA) {
tt_record4("homa_gro_receive got packet from 0x%x "
"id %llu, offset %d, priority %d",
saddr, homa_local_id(h_new->common.sender_id),
ntohl(h_new->seg.offset), priority);
else if (h_new->common.type == GRANT) {
if ((h_new->seg.segment_length == h_new->message_length)
&& (homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
&& !busy) {
INC_METRIC(gro_data_bypasses, 1);
goto bypass;
}
} else if (h_new->common.type == GRANT) {
tt_record4("homa_gro_receive got grant from 0x%x "
"id %llu, offset %d, priority %d",
saddr, homa_local_id(h_new->common.sender_id),
Expand All @@ -186,21 +196,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
* a significant difference in throughput for large
* messages, especially when the system is loaded.
*/
if (homa->gro_policy & HOMA_GRO_FAST_GRANTS)
if ((homa->gro_policy & HOMA_GRO_FAST_GRANTS) && !busy) {
INC_METRIC(gro_grant_bypasses, 1);
goto bypass;
}
} else
tt_record4("homa_gro_receive got packet from 0x%x "
"id %llu, type 0x%x, priority %d",
saddr, homa_local_id(h_new->common.sender_id),
h_new->common.type, priority);

core->last_active = get_cycles();

if ((homa->gro_policy & HOMA_GRO_BYPASS)
|| ((homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
&& (skb->len < 1400)))
goto bypass;

/* The GRO mechanism tries to separate packets onto different
* gro_lists by hash. This is bad for us, because we want to batch
* packets together regardless of their RPCs. So, instead of
Expand Down Expand Up @@ -273,24 +278,22 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,

done:
homa_check_pacer(homa, 1);
core->last_gro = get_cycles();
return result;

bypass:
/* Record SoftIRQ cycles in a different metric to reflect that
* they happened during bypass.
*/
saved_softirq_metric = homa_cores[raw_smp_processor_id()]
->metrics.softirq_cycles;
saved_softirq_metric = core->metrics.softirq_cycles;
homa_softirq(skb);
softirq_cycles = homa_cores[raw_smp_processor_id()]
->metrics.softirq_cycles - saved_softirq_metric;
homa_cores[raw_smp_processor_id()]->metrics.softirq_cycles
= saved_softirq_metric;
softirq_cycles = core->metrics.softirq_cycles - saved_softirq_metric;
core->metrics.softirq_cycles = saved_softirq_metric;
INC_METRIC(bypass_softirq_cycles, softirq_cycles);
core->last_gro = get_cycles();

/* This return value indicates that we have freed skb. */
return ERR_PTR(-EINPROGRESS);

}

/**
Expand Down Expand Up @@ -350,7 +353,6 @@ void homa_gro_gen2(struct sk_buff *skb)
ntohl(h->seg.offset));
}
atomic_inc(&homa_cores[candidate]->softirq_backlog);
homa_cores[this_core]->last_gro = now;
homa_set_softirq_cpu(skb, candidate);
}

Expand Down
7 changes: 7 additions & 0 deletions homa_plumbing.c
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,13 @@ static struct ctl_table homa_ctl_table[] = {
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_busy_usecs",
.data = &homa_data.gro_busy_usecs,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = homa_dointvec
},
{
.procname = "gro_policy",
.data = &homa_data.gro_policy,
Expand Down
11 changes: 11 additions & 0 deletions homa_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ int homa_init(struct homa *homa)
homa->gso_force_software = 0;
homa->gro_policy = HOMA_GRO_NORMAL;
homa->busy_usecs = 100;
homa->gro_busy_usecs = 5;
homa->timer_ticks = 0;
spin_lock_init(&homa->metrics_lock);
homa->metrics = NULL;
Expand Down Expand Up @@ -1826,6 +1827,16 @@ char *homa_print_metrics(struct homa *homa)
"Gen3 handoffs to secondary core (primary was "
"busy)\n",
m->gen3_alt_handoffs);
homa_append_metric(homa,
"gro_grant_bypasses %15llu "
"Grant packets passed directly to homa_softirq "
"by homa_gro_receive\n",
m->gro_grant_bypasses);
homa_append_metric(homa,
"gro_data_bypasses %15llu "
"Data packets passed directly to homa_softirq "
"by homa_gro_receive\n",
m->gro_data_bypasses);
for (i = 0; i < NUM_TEMP_METRICS; i++)
homa_append_metric(homa,
"temp%-2d %15llu "
Expand Down
16 changes: 13 additions & 3 deletions man/homa.7
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,6 @@ in
The amount of time (in microseconds) that a given core can own a page in
a receive buffer pool before its ownership can be revoked by a different
core.
.IR gro_policy
An integer value that determines how Homa processes incoming packets
at the GRO level. See code in homa_offload.c for more details.
.TP
.IR busy_usecs
An integer value in microsecond units; if a core has been active in
Expand Down Expand Up @@ -339,6 +336,19 @@ of the bandwidth is for FIFO and 90% for SRPT). As of October 2020, a small
value can provide significant benefits for the largest messages under very high
loads, but for most loads its effect is negligible.
.TP
.I gro_busy_usecs
An integer value used to determine whether or not to perform some
optimizations specified by
.IR gro_policy .
If the gap between the completion of one call to homa_gro_receive and
the invocation of the next call on the same core is less than this many
microseconds, the core is considered to be "busy", so optimizations
that add to the load of the core will not be performed.
.TP
.I gro_policy
An integer value that determines how Homa processes incoming packets
at the GRO level. See code in homa_offload.c for more details.
.TP
.IR gso_force_software
If this value is nonzero, Homa will perform GSO in software instead of
asking the NIC to perform TSO in hardware. This can be useful when running
Expand Down
Loading

0 comments on commit 8b3d5d5

Please sign in to comment.