From 8b3d5d5fa4f59cf0eba96ec18818edeec83f297d Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 7 Dec 2023 11:23:57 -0800
Subject: [PATCH] Added gro_busy_usecs configuration parameter

* Also, removed HOMA_GRO_BYPASS option (it's never appropriate)
* Added metrics for grant and data bypass, updated metrics.py
---
 homa_impl.h              |  49 +++++++++++-----
 homa_incoming.c          |   4 ++
 homa_offload.c           |  40 +++++++------
 homa_plumbing.c          |   7 +++
 homa_utils.c             |  11 ++++
 man/homa.7               |  16 ++++-
 test/unit_homa_offload.c | 122 ++++++++++++++++++++++++---------------
 util/metrics.py          |  22 +++++--
 8 files changed, 184 insertions(+), 87 deletions(-)

diff --git a/homa_impl.h b/homa_impl.h
index 8f5bdc2f..1aea8737 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -1945,10 +1945,6 @@ struct homa {
 	/* Bits that can be specified for gro_policy. These were created for
 	 * testing, in order to evaluate various possible policies; you almost
 	 * certainly should not use any value other than HOMA_GRO_NORMAL.
-	 * HOMA_GRO_BYPASS:        Pass all incoming packets directly to
-	 *                         homa_softirq during GRO; this bypasses
-	 *                         the SoftIRQ dispatching mechanism as well
-	 *                         as the network and IP stack layers.
 	 * HOMA_GRO_SAME_CORE      If isolated packets arrive (not part of
 	 *                         a batch) use the GRO core for SoftIRQ also.
 	 * HOMA_GRO_IDLE           Use old mechanism for selecting an idle
@@ -1958,13 +1954,14 @@ struct homa {
 	 * HOMA_GRO_GEN2           Use the new mechanism for selecting an
 	 *                         idle core for SoftIRQ.
 	 * HOMA_GRO_FAST_GRANTS    Pass all grants immediately to
-	 *                         homa_softirq during GRO.
-	 * HOMA_GRO_SHORT_BYPASS   Pass all short packets directly to
-	 *                         homa_softirq during GRO.
+	 *                         homa_softirq during GRO (only if the
+	 *                         core isn't overloaded).
+	 * HOMA_GRO_SHORT_BYPASS   Pass all single-packet messages directly
+	 *                         to homa_softirq during GRO (only if the
+	 *                         core isn't overloaded).
 	 * HOMA_GRO_GEN3           Use the "Gen3" mechanisms for load
 	 *                         balancing.
 	 */
-#define HOMA_GRO_BYPASS       1
 #define HOMA_GRO_SAME_CORE    2
 #define HOMA_GRO_IDLE         4
 #define HOMA_GRO_NEXT         8
@@ -1973,7 +1970,7 @@ struct homa {
 #define HOMA_GRO_SHORT_BYPASS 64
 #define HOMA_GRO_GEN3         128
 #define HOMA_GRO_NORMAL       (HOMA_GRO_SAME_CORE|HOMA_GRO_GEN2 \
-		|HOMA_GRO_SHORT_BYPASS)
+		|HOMA_GRO_SHORT_BYPASS|HOMA_GRO_FAST_GRANTS)
 
 	/*
 	 * @busy_usecs: if there has been activity on a core within the
@@ -1986,6 +1983,19 @@ struct homa {
 	/** @busy_cycles: Same as busy_usecs except in get_cycles() units. */
 	int busy_cycles;
 
+	/*
+	 * @gro_busy_usecs: if the gap between the completion of
+	 * homa_gro_receive and the next call to homa_gro_receive on the same
+	 * core is less than this, then GRO on that core is considered to be
+	 * "busy", and optimizations such as HOMA_GRO_SHORT_BYPASS will not be
+	 * done because they risk overloading the core. Set externally via
+	 * sysctl.
+	 */
+	int gro_busy_usecs;
+
+	/** @gro_busy_cycles: Same as gro_busy_usecs except in get_cycles() units. */
+	int gro_busy_cycles;
+
 	/**
 	 * @timer_ticks: number of times that homa_timer has been invoked
 	 * (may wraparound, which is safe).
@@ -2629,6 +2639,20 @@ struct homa_metrics {
 	 */
 	__u64 gen3_alt_handoffs;
 
+	/**
+	 * @gro_grant_bypasses: total number of GRANT packets passed directly
+	 * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
+	 * mechanism (triggered by HOMA_GRO_FAST_GRANTS).
+	 */
+	__u64 gro_grant_bypasses;
+
+	/**
+	 * @gro_data_bypasses: total number of DATA packets passed directly
+	 * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
+	 * mechanism (triggered by HOMA_GRO_SHORT_BYPASS).
+	 */
+	__u64 gro_data_bypasses;
+
 	/** @temp: For temporary use during testing. */
 #define NUM_TEMP_METRICS 10
 	__u64 temp[NUM_TEMP_METRICS];
@@ -2648,10 +2672,9 @@ struct homa_core {
 	__u64 last_active;
 
 	/**
-	 * @last_gro: the last time (in get_cycle() units) that Homa
-	 * processed packets at GRO(NAPI) level on this core. Used to
-	 * avoid assigning SoftIRQ handlers to this core when it has
-	 * been used recently for GRO.
+	 * @last_gro: the last time (in get_cycles() units) that
+	 * homa_gro_receive returned on this core. Used to determine
+	 * whether GRO is keeping a core busy.
 	 */
 	__u64 last_gro;
 
diff --git a/homa_incoming.c b/homa_incoming.c
index 4d43c3fd..66bcb0ac 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -1933,6 +1933,10 @@ void homa_incoming_sysctl_changed(struct homa *homa)
 	tmp = (tmp*cpu_khz)/1000;
 	homa->busy_cycles = tmp;
 
+	tmp = homa->gro_busy_usecs;
+	tmp = (tmp*cpu_khz)/1000;
+	homa->gro_busy_cycles = tmp;
+
 	tmp = homa->bpage_lease_usecs;
 	tmp = (tmp*cpu_khz)/1000;
 	homa->bpage_lease_cycles = tmp;
diff --git a/homa_offload.c b/homa_offload.c
index e217535c..589eb005 100644
--- a/homa_offload.c
+++ b/homa_offload.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019-2022 Stanford University
+/* Copyright (c) 2019-2023 Stanford University
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -151,12 +151,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 	struct sk_buff *held_skb;
 	struct sk_buff *result = NULL;
 	struct homa_core *core = homa_cores[raw_smp_processor_id()];
+	__u64 now = get_cycles();
+	int busy = (now - core->last_gro) < homa->gro_busy_cycles;
 	__u32 hash;
 	__u64 saved_softirq_metric, softirq_cycles;
 	struct data_header *h_new = (struct data_header *)
 			skb_transport_header(skb);
 	int priority;
 	__u32 saddr;
+
+	core->last_active = now;
 	if (skb_is_ipv6(skb)) {
 		priority = ipv6_hdr(skb)->priority;
 		saddr = ntohl(ipv6_hdr(skb)->saddr.in6_u.u6_addr32[3]);
@@ -169,12 +173,18 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 //	if (!pskb_may_pull(skb, 64))
 //		tt_record("homa_gro_receive can't pull enough data "
 //				"from packet for trace");
-	if (h_new->common.type == DATA)
+	if (h_new->common.type == DATA) {
 		tt_record4("homa_gro_receive got packet from 0x%x "
 				"id %llu, offset %d, priority %d",
 				saddr, homa_local_id(h_new->common.sender_id),
 				ntohl(h_new->seg.offset), priority);
-	else if (h_new->common.type == GRANT) {
+		if ((h_new->seg.segment_length == h_new->message_length)
+				&& (homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
+				&& !busy) {
+			INC_METRIC(gro_data_bypasses, 1);
+			goto bypass;
+		}
+	} else if (h_new->common.type == GRANT) {
 		tt_record4("homa_gro_receive got grant from 0x%x "
 				"id %llu, offset %d, priority %d",
 				saddr, homa_local_id(h_new->common.sender_id),
@@ -186,21 +196,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 		 * a significant difference in throughput for large
 		 * messages, especially when the system is loaded.
 		 */
-		if (homa->gro_policy & HOMA_GRO_FAST_GRANTS)
+		if ((homa->gro_policy & HOMA_GRO_FAST_GRANTS) && !busy) {
+			INC_METRIC(gro_grant_bypasses, 1);
 			goto bypass;
+		}
 	} else
 		tt_record4("homa_gro_receive got packet from 0x%x "
 				"id %llu, type 0x%x, priority %d",
 				saddr, homa_local_id(h_new->common.sender_id),
 				h_new->common.type, priority);
 
-	core->last_active = get_cycles();
-
-	if ((homa->gro_policy & HOMA_GRO_BYPASS)
-			|| ((homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
-			&& (skb->len < 1400)))
-		goto bypass;
-
 	/* The GRO mechanism tries to separate packets onto different
 	 * gro_lists by hash. This is bad for us, because we want to batch
 	 * packets together regardless of their RPCs. So, instead of
@@ -273,24 +278,22 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 
     done:
 	homa_check_pacer(homa, 1);
+	core->last_gro = get_cycles();
 	return result;
 
     bypass:
 	/* Record SoftIRQ cycles in a different metric to reflect that
 	 * they happened during bypass.
 	 */
-	saved_softirq_metric = homa_cores[raw_smp_processor_id()]
-			->metrics.softirq_cycles;
+	saved_softirq_metric = core->metrics.softirq_cycles;
 	homa_softirq(skb);
-	softirq_cycles = homa_cores[raw_smp_processor_id()]
-			->metrics.softirq_cycles - saved_softirq_metric;
-	homa_cores[raw_smp_processor_id()]->metrics.softirq_cycles
-			= saved_softirq_metric;
+	softirq_cycles = core->metrics.softirq_cycles - saved_softirq_metric;
+	core->metrics.softirq_cycles = saved_softirq_metric;
 	INC_METRIC(bypass_softirq_cycles, softirq_cycles);
+	core->last_gro = get_cycles();
 
 	/* This return value indicates that we have freed skb. */
 	return ERR_PTR(-EINPROGRESS);
-
 }
 
 /**
@@ -350,7 +353,6 @@ void homa_gro_gen2(struct sk_buff *skb)
 				ntohl(h->seg.offset));
 	}
 	atomic_inc(&homa_cores[candidate]->softirq_backlog);
-	homa_cores[this_core]->last_gro = now;
 	homa_set_softirq_cpu(skb, candidate);
 }
 
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 99b6670e..1ef73229 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -280,6 +280,13 @@ static struct ctl_table homa_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= homa_dointvec
 	},
+	{
+		.procname	= "gro_busy_usecs",
+		.data		= &homa_data.gro_busy_usecs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= homa_dointvec
+	},
 	{
 		.procname	= "gro_policy",
 		.data		= &homa_data.gro_policy,
diff --git a/homa_utils.c b/homa_utils.c
index 29ab42b5..76c6eb62 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -155,6 +155,7 @@ int homa_init(struct homa *homa)
 	homa->gso_force_software = 0;
 	homa->gro_policy = HOMA_GRO_NORMAL;
 	homa->busy_usecs = 100;
+	homa->gro_busy_usecs = 5;
 	homa->timer_ticks = 0;
 	spin_lock_init(&homa->metrics_lock);
 	homa->metrics = NULL;
@@ -1826,6 +1827,16 @@ char *homa_print_metrics(struct homa *homa)
 			"Gen3 handoffs to secondary core (primary was "
 			"busy)\n",
 			m->gen3_alt_handoffs);
+	homa_append_metric(homa,
+			"gro_grant_bypasses %15llu "
+			"Grant packets passed directly to homa_softirq "
+			"by homa_gro_receive\n",
+			m->gro_grant_bypasses);
+	homa_append_metric(homa,
+			"gro_data_bypasses %15llu "
+			"Data packets passed directly to homa_softirq "
+			"by homa_gro_receive\n",
+			m->gro_data_bypasses);
 	for (i = 0; i < NUM_TEMP_METRICS; i++)
 		homa_append_metric(homa,
 				"temp%-2d %15llu "
diff --git a/man/homa.7 b/man/homa.7
index ab7aa155..e83daf60 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -268,9 +268,6 @@ in
 The amount of time (in microseconds) that a given core can own a page in
 a receive buffer pool before its ownership can be revoked by a different
 core.
-.IR gro_policy
-An integer value that determines how Homa processes incoming packets
-at the GRO level. See code in homa_offload.c for more details.
 .TP
 .IR busy_usecs
 An integer value in microsecond units; if a core has been active in
@@ -339,6 +336,19 @@ of the bandwidth is for FIFO and 90% for SRPT). As of October 2020, a
 small value can provide significant benefits for the largest messages
 under very high loads, but for most loads its effect is negligible.
 .TP
+.I gro_busy_usecs
+An integer value used to determine whether or not to perform some
+optimizations specified by
+.IR gro_policy .
+If the gap between the completion of one call to homa_gro_receive and
+the invocation of the next call on the same core is less than this many
+microseconds, the core is considered to be "busy", so optimizations
+that add to the load of the core will not be performed.
+.TP
+.I gro_policy
+An integer value that determines how Homa processes incoming packets
+at the GRO level. See code in homa_offload.c for more details.
+.TP
 .IR gso_force_software
 If this value is nonzero, Homa will perform GSO in software instead of
 asking the NIC to perform TSO in hardware. This can be useful when running
diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c
index 425c2572..42a624a8 100644
--- a/test/unit_homa_offload.c
+++ b/test/unit_homa_offload.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019-2022 Stanford University
+/* Copyright (c) 2019-2023 Stanford University
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -73,6 +73,11 @@ FIXTURE_SETUP(homa_offload)
 	list_add_tail(&self->skb2->list, &self->napi.gro_hash[2].list);
 	INIT_LIST_HEAD(&self->empty_list);
 	unit_log_clear();
+
+	/* Configure so core isn't considered too busy for bypasses. */
+	mock_cycles = 1000;
+	self->homa.gro_busy_cycles = 500;
+	homa_cores[cpu_number]->last_gro = 400;
 }
 FIXTURE_TEARDOWN(homa_offload)
 {
@@ -103,44 +108,6 @@ TEST_F(homa_offload, homa_gso_segment_set_ip_ids)
 	kfree_skb(segs);
 }
 
-TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
-{
-	struct in6_addr client_ip = unit_get_in_addr("196.168.0.1");
-	struct in6_addr server_ip = unit_get_in_addr("1.2.3.4");
-	int client_port = 40000;
-	__u64 client_id = 1234;
-	__u64 server_id = 1235;
-	struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING,
-			&client_ip, &server_ip, client_port, server_id, 100,
-			20000);
-	ASSERT_NE(NULL, srpc);
-	homa_xmit_data(srpc, false);
-	unit_log_clear();
-
-	struct grant_header h = {{.sport = htons(srpc->dport),
-			.dport = htons(self->hsk.port),
-			.sender_id = cpu_to_be64(client_id),
-			.type = GRANT},
-			.offset = htonl(11000),
-			.priority = 3,
-			.resend_all = 0};
-	self->homa.gro_policy = HOMA_GRO_FAST_GRANTS;
-	struct sk_buff *result = homa_gro_receive(&self->empty_list,
-			mock_skb_new(&client_ip, &h.common, 0, 0));
-	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
-	EXPECT_EQ(11000, srpc->msgout.granted);
-	EXPECT_STREQ("xmit DATA 1400@10000", unit_log_get());
-
-	unit_log_clear();
-	h.offset = htonl(14000);
-	self->homa.gro_policy = 0;
-	struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0);
-	result = homa_gro_receive(&self->empty_list, skb);
-	EXPECT_EQ(NULL, result);
-	EXPECT_EQ(11000, srpc->msgout.granted);
-	EXPECT_STREQ("", unit_log_get());
-	kfree_skb(skb);
-}
 TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS)
 {
 	struct in6_addr client_ip = unit_get_in_addr("196.168.0.1");
@@ -159,7 +126,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS)
 			.seg = {.offset = htonl(2000),
 			.segment_length = htonl(1400),
 			.ack = {0, 0, 0}}};
-	struct sk_buff *skb, *skb2, *skb3;
+	struct sk_buff *skb, *skb2, *skb3, *skb4;
 	struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT,
 			&client_ip, &server_ip, client_port, server_id, 10000,
@@ -171,24 +138,85 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS)
 	skb = mock_skb_new(&self->ip, &h.common, 1400, 2000);
 	struct sk_buff *result = homa_gro_receive(&self->empty_list, skb);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(8600, srpc->msgin.bytes_remaining);
+	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses);
 
-	/* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but packet too long. */
+	/* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but message longer
+	 * than one packet.
+	 */
 	self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS;
-	skb2 = mock_skb_new(&self->ip, &h.common, 1400, 3000);
+	homa_cores[cpu_number]->last_gro = 400;
+	skb2 = mock_skb_new(&self->ip, &h.common, 1400, 2000);
 	result = homa_gro_receive(&self->empty_list, skb2);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(8600, srpc->msgin.bytes_remaining);
+	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses);
 
 	/* Third attempt: bypass should happen. */
-	h.seg.segment_length = htonl(100);
-	skb3 = mock_skb_new(&self->ip, &h.common, 100, 4000);
+	h.message_length = h.seg.segment_length;
+	h.incoming = h.seg.segment_length;
+	homa_cores[cpu_number]->last_gro = 400;
+	skb3 = mock_skb_new(&self->ip, &h.common, 1400, 4000);
 	result = homa_gro_receive(&self->empty_list, skb3);
 	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
-	EXPECT_EQ(8500, srpc->msgin.bytes_remaining);
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses);
+
+	/* Fourth attempt: no bypass because core busy. */
+	homa_cores[cpu_number]->last_gro = 600;
+	skb4 = mock_skb_new(&self->ip, &h.common, 1400, 4000);
+	result = homa_gro_receive(&self->empty_list, skb4);
+	EXPECT_EQ(0, -PTR_ERR(result));
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses);
 
 	kfree_skb(skb);
 	kfree_skb(skb2);
+	kfree_skb(skb4);
+}
+TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
+{
+	struct in6_addr client_ip = unit_get_in_addr("196.168.0.1");
+	struct in6_addr server_ip = unit_get_in_addr("1.2.3.4");
+	int client_port = 40000;
+	__u64 client_id = 1234;
+	__u64 server_id = 1235;
+	struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING,
+			&client_ip, &server_ip, client_port, server_id, 100,
+			20000);
+	ASSERT_NE(NULL, srpc);
+	homa_xmit_data(srpc, false);
+	unit_log_clear();
+
+	struct grant_header h = {{.sport = htons(srpc->dport),
+			.dport = htons(self->hsk.port),
+			.sender_id = cpu_to_be64(client_id),
+			.type = GRANT},
+			.offset = htonl(11000),
+			.priority = 3,
+			.resend_all = 0};
+
+	/* First attempt: HOMA_GRO_FAST_GRANTS not enabled. */
+	self->homa.gro_policy = 0;
+	struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0);
+	struct sk_buff *result = homa_gro_receive(&self->empty_list, skb);
+	EXPECT_EQ(0, -PTR_ERR(result));
+	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+	EXPECT_STREQ("", unit_log_get());
+
+	/* Second attempt: HOMA_GRO_FAST_GRANTS is enabled. */
+	self->homa.gro_policy = HOMA_GRO_FAST_GRANTS;
+	homa_cores[cpu_number]->last_gro = 400;
+	struct sk_buff *skb2 = mock_skb_new(&client_ip, &h.common, 0, 0);
+	result = homa_gro_receive(&self->empty_list, skb2);
+	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+	EXPECT_STREQ("xmit DATA 1400@10000", unit_log_get());
+
+	/* Third attempt: core is too busy for fast grants. */
+	homa_cores[cpu_number]->last_gro = 600;
+	struct sk_buff *skb3 = mock_skb_new(&client_ip, &h.common, 0, 0);
+	result = homa_gro_receive(&self->empty_list, skb3);
+	EXPECT_EQ(0, -PTR_ERR(result));
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+
+	kfree_skb(skb);
+	kfree_skb(skb3);
 }
 TEST_F(homa_offload, homa_gro_receive__no_held_skb)
 {
diff --git a/util/metrics.py b/util/metrics.py
index bd2d0ce1..aaa9b2df 100755
--- a/util/metrics.py
+++ b/util/metrics.py
@@ -390,12 +390,24 @@ def scale_number(number):
                 /deltas["handoffs_thread_waiting"])
     else:
         alt_thread_percent = 0.0
-    print("Available immediately:      %4.1f%%" % (100.0 - poll_percent
+    if deltas["packets_rcvd_DATA"]:
+        data_bypass_percent = (100.0*deltas["gro_data_bypasses"]
+                /deltas["packets_rcvd_DATA"])
+    else:
+        data_bypass_percent = 0.0
+    if deltas["packets_rcvd_GRANT"]:
+        grant_bypass_percent = (100.0*deltas["gro_grant_bypasses"]
+                /deltas["packets_rcvd_GRANT"])
+    else:
+        grant_bypass_percent = 0.0
+    print("Available immediately:        %5.1f%%" % (100.0 - poll_percent
             - sleep_percent))
-    print("Arrived while polling:      %4.1f%%" % (poll_percent))
-    print("Blocked at least once:      %4.1f%%" % (sleep_percent))
-    print("Alternate GRO handoffs:     %4.1f%%" % (gen3_alt_percent))
-    print("Alternate thread handoffs:  %4.1f%%" % (alt_thread_percent))
+    print("Arrived while polling:        %5.1f%%" % (poll_percent))
+    print("Blocked at least once:        %5.1f%%" % (sleep_percent))
+    print("Alternate GRO handoffs:       %5.1f%%" % (gen3_alt_percent))
+    print("Alternate thread handoffs:    %5.1f%%" % (alt_thread_percent))
+    print("GRO bypass for data packets:  %5.1f%%" % (data_bypass_percent))
+    print("GRO bypass for grant packets: %5.1f%%" % (grant_bypass_percent))
 
     print("\nMiscellaneous:")
     print("--------------")
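
A quick worked example of what the new parameter means in practice (values
here are assumptions for illustration, not part of the patch): homa_init sets
gro_busy_usecs to 5, and homa_incoming_sysctl_changed converts it to cycles as
tmp*cpu_khz/1000, so on an assumed 3 GHz TSC (cpu_khz = 3000000) the threshold
is 5*3000000/1000 = 15000 cycles between successive homa_gro_receive calls on
a core. The same arithmetic as a minimal standalone sketch:

    /* Standalone illustration of the gro_busy_usecs -> gro_busy_cycles
     * conversion performed in homa_incoming_sysctl_changed(); cpu_khz is
     * an assumed 3 GHz clock here, not read from the kernel.
     */
    #include <stdio.h>

    int main(void)
    {
            long long cpu_khz = 3000000;    /* assumed 3 GHz TSC */
            long long gro_busy_usecs = 5;   /* default from homa_init() */
            long long gro_busy_cycles = (gro_busy_usecs*cpu_khz)/1000;

            printf("gro_busy_usecs %lld -> gro_busy_cycles %lld\n",
                            gro_busy_usecs, gro_busy_cycles);
            return 0;
    }

Because the patch registers gro_busy_usecs in homa_ctl_table, the value can be
read or changed at runtime via sysctl (on a typical installation it should
appear as /proc/sys/net/homa/gro_busy_usecs; the exact path depends on how the
module is installed).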