From 7aaaeae5c7b3aaafd50c4700bc46a5810142c168 Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Mon, 7 Nov 2022 10:04:18 -0800
Subject: [PATCH] Implemented software GSO for Homa

---
 README.md       | 14 ++++++++------
 homa_impl.h     |  3 +++
 homa_offload.c  | 26 +++++++++++++++++++++++++-
 homa_outgoing.c |  1 +
 notes.txt       |  7 +++++++
 perf.txt        |  5 +++++
 test/mock.c     |  6 ++++++
 7 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 07423139..8582b588 100644
--- a/README.md
+++ b/README.md
@@ -60,18 +60,19 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
   It is not clear that this approach will work with all NICs, but
   the following NICs are known to work:
   - Mellanox ConnectX-4, ConnectX-5, and ConnectX-6
-
+
   There have been reports of problems with the following NICs (these have
   not yet been explored thoroughly enough to know whether the problems
   are insurmountable):
   - Intel E810 (ice), XXV710 (i40e), XL710
   Please let me know if you find other NICs that work (or NICs that
   don't work).
-  If the NIC doesn't support TSO for Homa, then you'll need to use `sysctl` to
-  ensure that `max_gso_size` is the same as the maximum packet size (if it is
-  larger, then messages larger than the packet size will hang: outgoing packets
-  will be dropped by the NIC, but smaller retry packets get through, so Homa
-  will keep retrying over and over).
+  If the NIC doesn't support TSO for Homa, then Homa will perform segmentation
+  in software, but that's quite a bit slower. If for some reason software
+  GSO doesn't work (it's fairly new in Homa), then messages larger than the
+  maximum packet size may hang or result in very poor performance. If this
+  happens, you'll need to use `sysctl` to ensure that `max_gso_size` is the
+  same as the maximum packet size.
 
 - A collection of man pages is available in the "man" subdirectory. The
   API for Homa is different from TCP sockets.
@@ -96,6 +97,7 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
   sysctl mechanism. For details, see the man page `homa.7`.
 
 ## Significant recent improvements
+- November 2022: Implemented software GSO for Homa.
 - September 2022: Added support for IPv6, as well as completion
   cookies. This required small but incompatible changes to the API.
   Many thanks to Dan Manjarres for contributing these
diff --git a/homa_impl.h b/homa_impl.h
index 404d85c8..410664d1 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -2794,6 +2794,9 @@
 extern int homa_gro_complete(struct sk_buff *skb, int thoff);
 extern struct sk_buff
 		*homa_gro_receive(struct list_head *gro_list,
 		struct sk_buff *skb);
+extern struct sk_buff
+		*homa_gso_segment(struct sk_buff *skb,
+		netdev_features_t features);
 extern int homa_hash(struct sock *sk);
 extern enum hrtimer_restart homa_hrtimer(struct hrtimer *timer);
diff --git a/homa_offload.c b/homa_offload.c
index a373cbbd..09a17b09 100644
--- a/homa_offload.c
+++ b/homa_offload.c
@@ -21,7 +21,7 @@
 static const struct net_offload homa_offload = {
 	.callbacks = {
-		.gso_segment = NULL,
+		.gso_segment = homa_gso_segment,
 		.gro_receive = homa_gro_receive,
 		.gro_complete = homa_gro_complete,
 	},
 };
@@ -80,6 +80,30 @@ static inline void homa_set_softirq_cpu(struct sk_buff *skb, int cpu)
 	__skb_set_sw_hash(skb, hash, false);
 }
 
+/**
+ * homa_gso_segment() - Split up a large outgoing Homa packet (larger than MTU)
+ * into multiple smaller packets.
+ * @skb:       Packet to split.
+ * @features:  Passed through to skb_segment.
+ * Return: A list of packets, or NULL if the packet couldn't be split.
+ */
+struct sk_buff *homa_gso_segment(struct sk_buff *skb,
+		netdev_features_t features)
+{
+	struct sk_buff *segs;
+	tt_record2("homa_gso_segment invoked, frags %d, headlen %d",
+			skb_shinfo(skb)->nr_frags, skb_headlen(skb));
+
+	/* This is needed to separate header info (which is replicated
+	 * in each segment) from data, which is divided among the segments.
+	 */
+	__skb_pull(skb, sizeof(struct data_header)
+			- sizeof(struct data_segment));
+	segs = skb_segment(skb, features);
+	tt_record("homa_gso_segment returning");
+	return segs;
+}
+
 /**
  * homa_gro_receive() - Invoked for each input packet at a very low
  * level in the stack to perform GRO. However, this code does GRO in an
diff --git a/homa_outgoing.c b/homa_outgoing.c
index 40760930..2b22e5e5 100644
--- a/homa_outgoing.c
+++ b/homa_outgoing.c
@@ -129,6 +129,7 @@ struct sk_buff *homa_fill_packets(struct homa_sock *hsk, struct homa_peer *peer,
 		skb_shinfo(skb)->gso_size = sizeof(struct data_segment)
 				+ max_pkt_data;
 		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+//		skb_shinfo(skb)->gso_type = 0xd; // Force software GSO
 	}
 	skb_shinfo(skb)->gso_segs = 0;
 
diff --git a/notes.txt b/notes.txt
index 2c15f213..7979433e 100755
--- a/notes.txt
+++ b/notes.txt
@@ -18,6 +18,13 @@ Notes for Homa implementation in Linux:
   * pin_user_page (not sure the difference from get_user_page)
 
 * Performance-related tasks:
+  * Implement sk_buff caching for output buffers:
+    * Allocation is slow (2-10 us on AMD processors; check on Intel?)
+    * Large buffers exceed KMALLOC_MAX_CACHE_SIZE, so they aren't cached
+      in slabs
+    * Keep free lists in Homa for different sizes (e.g. pre-GSO and GSO),
+      append output buffers there
+    * Can recycle an sk_buff by calling build_skb_around() (see sketch below).
   * Rework FIFO granting so that it doesn't consider homa->max_overcommit
     (just find the oldest message that doesn't have a pity grant)? Also,
     it doesn't look like homa_grant_fifo is keeping track of pity grants
diff --git a/perf.txt b/perf.txt
index 566b366d..644aca75 100644
--- a/perf.txt
+++ b/perf.txt
@@ -2,6 +2,11 @@ This file contains various notes and lessons learned concerning performance of
 the Homa Linux kernel module. The notes are in reverse chronological
 order.
 
+* (November 2022) Software GSO is very slow (17 usec on AMD EPYC processors,
+  breaking 64K into 9K jumbo frames). The main problem appears to be sk_buff
+  allocation, which takes multiple usecs because the packet buffers are too
+  large to be cached in the slab allocator.
+
 * (November 2022) Compared "cp_node client --workload 500000" performance
   on c6525-100g cluster (24-core AMD 7402P processors @ 2.8 GHz, 100 Gbps
   networking) vs. xl170 cluster (10-core Intel E5-2640v4 @ 2.4 GHz, 25 Gbps
diff --git a/test/mock.c b/test/mock.c
index f4cf06d2..dae3c2c2 100644
--- a/test/mock.c
+++ b/test/mock.c
@@ -931,6 +931,12 @@ void *skb_put(struct sk_buff *skb, unsigned int len)
 	return result;
 }
 
+struct sk_buff *skb_segment(struct sk_buff *head_skb,
+		netdev_features_t features)
+{
+	return NULL;
+}
+
 int sock_common_getsockopt(struct socket *sock, int level, int optname,
 		char __user *optval, int __user *optlen)
 {
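
The following is a hypothetical sketch of the sk_buff caching idea from the
notes.txt change above; it is not part of this patch. Recycled output skbs sit
on a per-size free list, and build_skb_around() re-initializes each recycled
skb around its own data buffer, skipping the slow large allocation. The names
homa_skb_cache, homa_skb_cache_get, and homa_skb_cache_put are invented for
illustration; the sketch assumes each recycled skb is fully owned by Homa
(no frags, dst, or other references still attached) and that frag_size
includes room for skb_shared_info.

#include <linux/skbuff.h>

struct homa_skb_cache {
	struct sk_buff_head free_skbs;  /* Recycled skbs, all with data
	                                 * buffers of the same size; must be
	                                 * set up with skb_queue_head_init. */
	unsigned int frag_size;         /* Size of each skb's data buffer,
	                                 * including skb_shared_info room. */
};

static struct sk_buff *homa_skb_cache_get(struct homa_skb_cache *cache)
{
	struct sk_buff *skb = skb_dequeue(&cache->free_skbs);
	void *data;

	if (skb) {
		/* Fast path: reuse both the sk_buff struct and its data
		 * buffer. Clear stale state (as __napi_build_skb does),
		 * then rebuild the skb around its own buffer.
		 */
		data = skb->head;
		memset(skb, 0, offsetof(struct sk_buff, tail));
		return build_skb_around(skb, data, cache->frag_size);
	}

	/* Slow path: allocate a page-fragment buffer plus a fresh skb. */
	data = netdev_alloc_frag(cache->frag_size);
	if (!data)
		return NULL;
	skb = build_skb(data, cache->frag_size);
	if (!skb)
		skb_free_frag(data);
	return skb;
}

/* Caller must hold the only reference to skb. */
static void homa_skb_cache_put(struct homa_skb_cache *cache,
		struct sk_buff *skb)
{
	skb_queue_tail(&cache->free_skbs, skb);
}

A cache along these lines could sit in front of the skb allocation in
homa_fill_packets(), with one instance per common buffer size (e.g. pre-GSO
and GSO, as the notes suggest).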