From 7aaaeae5c7b3aaafd50c4700bc46a5810142c168 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 7 Nov 2022 10:04:18 -0800
Subject: [PATCH] Implemented software GSO for Homa

---
 README.md       | 14 ++++++++------
 homa_impl.h     |  3 +++
 homa_offload.c  | 26 +++++++++++++++++++++++++-
 homa_outgoing.c |  1 +
 notes.txt       |  7 +++++++
 perf.txt        |  5 +++++
 test/mock.c     |  6 ++++++
 7 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 07423139..8582b588 100644
--- a/README.md
+++ b/README.md
@@ -60,18 +60,19 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
   It is not clear that this approach will work with all NICs, but
   the following NICs are known to work:
   - Mellanox ConnectX-4, ConnectX-5, and ConnectX-6
-
+
   There have been reports of problems with the following NICs (these have
   not yet been explored thoroughly enough to know whether the problems
   are insurmountable):
   - Intel E810 (ice), XXV710 (i40e), XL710
   Please let me know if you find other NICs that work (or NICs that
   don't work).
-  If the NIC doesn't support TSO for Homa, then you'll need to use `sysctl` to
-  ensure that `max_gso_size` is the same as the maximum packet size (if it is
-  larger, then messages larger than the packet size will hang: outgoing packets
-  will be dropped by the NIC, but smaller retry packets get through, so Homa
-  will keep retrying over and over).
+  If the NIC doesn't support TSO for Homa, then Homa will perform segmentation
+  in software, but that's quite a bit slower. If for some reason software
+  GSO doesn't work (it's fairly new in Homa), then messages larger than the
+  maximum packet size may hang or result in very poor performance. If this
+  happens, you'll need to use `sysctl` to ensure that `max_gso_size` is the
+  same as the maximum packet size.
 
 - A collection of man pages is available in the "man" subdirectory. The
   API for Homa is different from TCP sockets.
@@ -96,6 +97,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
   sysctl mechanism. For details, see the man page `homa.7`.
 
 ## Significant recent improvements
+- November 2022: Implemented software GSO for Homa.
 - September 2022: Added support for IPv6, as well as completion
   cookies. This required small but incompatible changes to the API.
   Many thanks to Dan Manjarres for contributing these
diff --git a/homa_impl.h b/homa_impl.h
index 404d85c8..410664d1 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -2794,6 +2794,9 @@
 extern int homa_gro_complete(struct sk_buff *skb, int thoff);
 extern struct sk_buff
 		*homa_gro_receive(struct list_head *gro_list,
 		struct sk_buff *skb);
+extern struct sk_buff
+		*homa_gso_segment(struct sk_buff *skb,
+		netdev_features_t features);
 extern int homa_hash(struct sock *sk);
 extern enum hrtimer_restart homa_hrtimer(struct hrtimer *timer);
diff --git a/homa_offload.c b/homa_offload.c
index a373cbbd..09a17b09 100644
--- a/homa_offload.c
+++ b/homa_offload.c
@@ -21,7 +21,7 @@
 static const struct net_offload homa_offload = {
 	.callbacks = {
-		.gso_segment = NULL,
+		.gso_segment = homa_gso_segment,
 		.gro_receive = homa_gro_receive,
 		.gro_complete = homa_gro_complete,
 	},
 };
@@ -80,6 +80,30 @@ static inline void homa_set_softirq_cpu(struct sk_buff *skb, int cpu)
 	__skb_set_sw_hash(skb, hash, false);
 }
 
+/**
+ * homa_gso_segment() - Split up a large outgoing Homa packet (larger than MTU)
+ * into multiple smaller packets.
+ * @skb:       Packet to split.
+ * @features:  Passed through to skb_segment.
+ * Return: A list of packets, or NULL if the packet couldn't be split.
+ */
+struct sk_buff *homa_gso_segment(struct sk_buff *skb,
+		netdev_features_t features)
+{
+	struct sk_buff *segs;
+	tt_record2("homa_gso_segment invoked, frags %d, headlen %d",
+			skb_shinfo(skb)->nr_frags, skb_headlen(skb));
+
+	/* This is needed to separate header info (which is replicated
+	 * in each segment) from data, which is divided among the segments.
+	 */
+	__skb_pull(skb, sizeof(struct data_header)
+			- sizeof(struct data_segment));
+	segs = skb_segment(skb, features);
+	tt_record("homa_gso_segment returning");
+	return segs;
+}
+
 /**
  * homa_gro_receive() - Invoked for each input packet at a very low
  * level in the stack to perform GRO. However, this code does GRO in an
diff --git a/homa_outgoing.c b/homa_outgoing.c
index 40760930..2b22e5e5 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -129,6 +129,7 @@ struct sk_buff *homa_fill_packets(struct homa_sock *hsk, struct homa_peer *peer,
 		skb_shinfo(skb)->gso_size = sizeof(struct data_segment)
 				+ max_pkt_data;
 		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+//		skb_shinfo(skb)->gso_type = 0xd; // Force software GSO
 	}
 	skb_shinfo(skb)->gso_segs = 0;
 
diff --git a/notes.txt b/notes.txt
index 2c15f213..7979433e 100755
--- a/notes.txt
+++ b/notes.txt
@@ -18,6 +18,13 @@ Notes for Homa implementation in Linux:
   * pin_user_page (not sure the difference from get_user_page)
 
 * Performance-related tasks:
+  * Implement sk_buff caching for output buffers:
+    * Allocation is slow (2-10 us on AMD processors; check on Intel?)
+    * Large buffers exceed KMALLOC_MAX_CACHE_SIZE, so they aren't cached
+      in slabs
+    * Keep free lists in Homa for different sizes (e.g. pre-GSO and GSO),
+      append output buffers there
+    * Can recycle an sk_buff by calling build_skb_around() (see sketch below).
   * Rework FIFO granting so that it doesn't consider homa->max_overcommit
     (just find the oldest message that doesn't have a pity grant)? Also,
     it doesn't look like homa_grant_fifo is keeping track of pity grants
diff --git a/perf.txt b/perf.txt
index 566b366d..644aca75 100644
--- a/perf.txt
+++ b/perf.txt
@@ -2,6 +2,11 @@ This file contains various notes and lessons learned concerning performance of
 the Homa Linux kernel module. The notes are in reverse chronological
 order.
 
+* (November 2022) Software GSO is very slow (17 usec on AMD EPYC processors,
+  breaking 64K into 9K jumbo frames). The main problem appears to be sk_buff
+  allocation, which takes multiple usecs because the packet buffers are too
+  large to be cached in the slab allocator.
+
 * (November 2022) Compared "cp_node client --workload 500000" performance
   on c6525-100g cluster (24-core AMD 7402P processors @ 2.8 GHz, 100 Gbps
   networking) vs. xl170 cluster (10-core Intel E5-2640v4 @ 2.4 GHz, 25 Gbps
diff --git a/test/mock.c b/test/mock.c
index f4cf06d2..dae3c2c2 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -931,6 +931,12 @@ void *skb_put(struct sk_buff *skb, unsigned int len)
 	return result;
 }
 
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+		netdev_features_t features)
+{
+	return NULL;
+}
+
 int sock_common_getsockopt(struct socket *sock, int level, int optname,
 		char __user *optval, int __user *optlen)
 {
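
The following is a hypothetical sketch of the sk_buff caching idea from the
notes.txt change above; it is not part of this patch. Recycled output skbs sit
on a per-size free list, and build_skb_around() re-initializes each recycled
skb around its own data buffer, skipping the slow large allocation. The names
homa_skb_cache, homa_skb_cache_get, and homa_skb_cache_put are invented for
illustration; the sketch assumes each recycled skb is fully owned by Homa
(no frags, dst, or other references still attached) and that frag_size
includes room for skb_shared_info.

#include <linux/skbuff.h>

struct homa_skb_cache {
	struct sk_buff_head free_skbs;  /* Recycled skbs, all with data
	                                 * buffers of the same size; must be
	                                 * set up with skb_queue_head_init. */
	unsigned int frag_size;         /* Size of each skb's data buffer,
	                                 * including skb_shared_info room. */
};

static struct sk_buff *homa_skb_cache_get(struct homa_skb_cache *cache)
{
	struct sk_buff *skb = skb_dequeue(&cache->free_skbs);
	void *data;

	if (skb) {
		/* Fast path: reuse both the sk_buff struct and its data
		 * buffer. Clear stale state (as __napi_build_skb does),
		 * then rebuild the skb around its own buffer.
		 */
		data = skb->head;
		memset(skb, 0, offsetof(struct sk_buff, tail));
		return build_skb_around(skb, data, cache->frag_size);
	}

	/* Slow path: allocate a page-fragment buffer plus a fresh skb. */
	data = netdev_alloc_frag(cache->frag_size);
	if (!data)
		return NULL;
	skb = build_skb(data, cache->frag_size);
	if (!skb)
		skb_free_frag(data);
	return skb;
}

/* Caller must hold the only reference to skb. */
static void homa_skb_cache_put(struct homa_skb_cache *cache,
		struct sk_buff *skb)
{
	skb_queue_tail(&cache->free_skbs, skb);
}

A cache along these lines could sit in front of the skb allocation in
homa_fill_packets(), with one instance per common buffer size (e.g. pre-GSO
and GSO, as the notes suggest).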