Skip to content

Commit

Permalink
Introduce new packet format for TCP hijacking
Browse files Browse the repository at this point in the history
(Don't interleave seg_headers with GSO data; let TSO increment
the sequence field instead)
  • Loading branch information
johnousterhout committed Jul 30, 2024
1 parent 0484a97 commit 7780160
Show file tree
Hide file tree
Showing 10 changed files with 342 additions and 147 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ This repo contains an implementation of the Homa transport protocol as a Linux k
sysctl mechanism. For details, see the man page `homa.7`.
## Significant recent improvements
- July 2024: introduced "TCP hijacking", where Homa packets are sent as
legitimate TCP segments (using TCP as the IP protocol) and then reclaimed
from TCP on the destination. This allows Homa to make better use of
TSO and RSS.
- June 2024: refactored sk_buff management to use frags; improves
efficiency significantly.
- April 2024: replaced `master` branch with `main`
Expand Down
103 changes: 73 additions & 30 deletions homa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,8 +258,9 @@ struct common_header {

/**
* @sequence: corresponds to the sequence number field in TCP headers;
* must not be used by Homa, in case it gets incremented during TCP
* offload.
* used in DATA packets to hold the offset in the message of the first
* byte of data. However, when TSO is used without TCP hijacking, this
* value will only be correct in the first segment of a GSO packet.
*/
__be32 sequence;

Expand All @@ -277,6 +278,7 @@ struct common_header {
* @doff: High order 4 bits holds the number of 4-byte chunks in a
* data_header (low-order bits unused). Used only for DATA packets;
* must be in the same position as the data offset in a TCP header.
* Used by TSO to determine where the replicated header portion ends.
*/
__u8 doff;

Expand Down Expand Up @@ -342,33 +344,66 @@ struct homa_ack {
__be16 server_port;
} __attribute__((packed));

/**
* struct data_segment - Wire format for a chunk of data that is part of
* a DATA packet. A single sk_buff can hold multiple data_segments in order
* to enable send and receive offload (the idea is to carry many network
* packets of info in a single traversal of the Linux networking stack).
* A DATA sk_buff contains a data_header followed by any number of
* data_segments.
/* struct data_header - Contains data for part or all of a Homa message.
* An incoming packet consists of a data_header followed by message data.
* An outgoing packet can have this simple format as well, or it can be
* structured as a GSO packet. Homa supports two different formats for GSO
* packets, depending on whether TCP hijacking is enabled:
*
* No hijacking: TCP hijacking:
*
* |-----------------------| |-----------------------|
* | | | |
* | data_header | | data_header |
* | | | |
* |-----------------------| |-----------------------|
* | | | |
* | | | |
* | segment data | | segment data |
* | | | |
* | | | |
* |-----------------------| |-----------------------|
* | seg_header | | |
* |-----------------------| | |
* | | | segment data |
* | | | |
* | segment data | | |
* | | |-----------------------|
* | | | |
* |-----------------------| | |
* | seg_header | | segment data |
* |-----------------------| | |
* | | | |
* | | |-----------------------|
* | segment data |
* | |
* | |
* |-----------------------|
*
* With TCP hijacking, TSO will automatically adjust @common.sequence in
* the segments, so that value can be used as the offset of the data within
* the message. Without TCP hijacking, TSO will not adjust @common.sequence
* in the segments, so Homa sprinkles correct offsets (in seg_headers)
* throughout the segment data; TSO/GSO will include a different seg_header
* in each generated packet.
*/
struct data_segment {

struct seg_header {
/**
* @offset: Offset within message of the first byte of data in
* this segment. Segments within an sk_buff are not guaranteed
* to be in order.
* this segment. If this field is -1 it means that the packet was
* generated by GSO with TCP hijacking. In this case the true offset
* is in @common.sequence. homa_gro_receive detects this situation
* and updates this value from @common.sequence if needed, so the
* value will always be valid once the packet reaches homa_softirq.
*/
__be32 offset;

/** @data: the payload of this segment. */
char data[0];
} __attribute__((packed));

/* struct data_header - Overall header format for a DATA sk_buff, which
* contains this header followed by any number of data_segments.
*/
struct data_header {
struct common_header common;

/** @message_length: Total #bytes in the *message* */
/** @message_length: Total #bytes in the message. */
__be32 message_length;

/**
Expand Down Expand Up @@ -405,26 +440,26 @@ struct data_header {

__u8 pad;

/** @seg: First of possibly many segments */
struct data_segment seg;
/** @seg: First of possibly many segments. */
struct seg_header seg;
} __attribute__((packed));
/* Compile-time guard: a data_header must fit within HOMA_MAX_HEADER. */
_Static_assert(HOMA_MAX_HEADER >= sizeof(struct data_header),
		"data_header too large for HOMA_MAX_HEADER; "
		"must adjust HOMA_MAX_HEADER");
/* Compile-time guard: a data_header must be at least HOMA_MIN_PKT_LENGTH
 * bytes, because (per the diagnostic) Homa has no code to pad data packets.
 */
_Static_assert(sizeof(struct data_header) >= HOMA_MIN_PKT_LENGTH,
		"data_header too small: Homa doesn't currently have code "
		"to pad data packets");
_Static_assert(((sizeof(struct data_header) - sizeof(struct data_segment))
/* Compile-time guard: the part of data_header that precedes the trailing
 * seg_header must be a whole number of 4-byte words (the diagnostic states
 * this is required for TCP/TSO compatibility).
 */
_Static_assert(((sizeof(struct data_header) - sizeof(struct seg_header))
		& 0x3) == 0,
		"data_header length not a multiple of 4 bytes (required "
		"for TCP/TSO compatibility)");

/**
* homa_rx_data_len() - Returns the total amount of message data contained
* in an incoming DATA packet. This function works only for incoming
* packets and outgoing packets that don't use GSO.
* homa_data_len() - Returns the total number of bytes in a DATA packet
* after the data_header. Note: if the packet is a GSO packet, the result
* may include metadata as well as packet data.
*/
static inline int homa_rx_data_len(struct sk_buff *skb)
static inline int homa_data_len(struct sk_buff *skb)
{
	/* Bytes occupied by the data_header at the transport offset; kept
	 * as size_t so the subtraction below has the same operand types as
	 * a direct use of sizeof.
	 */
	const size_t header_bytes = sizeof(struct data_header);

	/* Whatever remains in the skb after the transport offset and the
	 * data_header.
	 */
	return skb->len - skb_transport_offset(skb) - header_bytes;
}
Expand Down Expand Up @@ -3116,6 +3151,9 @@ struct homa_skb_info {
*/
int data_bytes;

/** @seg_length: maximum number of data bytes in each GSO segment. */
int seg_length;

/**
* @offset: offset within the message of the first byte of data in
* this packet.
Expand Down Expand Up @@ -3306,12 +3344,15 @@ static inline struct homa_rpc_bucket *homa_server_rpc_bucket(

/**
* homa_set_doff() - Fills in the doff TCP header field for a Homa packet.
* @h: Packet header whose doff field is to be set.
* @h: Packet header whose doff field is to be set.
* @size: Size of the "header", bytes (must be a multiple of 4). This
* information is used only for TSO; it's the number of bytes
* that should be replicated in each segment. The bytes after
* this will be distributed among segments.
*/
static inline void homa_set_doff(struct data_header *h)
static inline void homa_set_doff(struct data_header *h, int size)
{
h->common.doff = (sizeof(struct data_header)
- sizeof(struct data_segment)) << 2;
h->common.doff = size << 2;
}

static inline struct homa_sock *homa_sk(const struct sock *sk)
Expand Down Expand Up @@ -3594,6 +3635,8 @@ extern void homa_dst_refresh(struct homa_peertab *peertab,
extern int homa_err_handler_v4(struct sk_buff *skb, u32 info);
extern int homa_err_handler_v6(struct sk_buff *skb, struct inet6_skb_parm *
, u8, u8, int, __be32);
extern int homa_fill_data_interleaved(struct homa_rpc *rpc,
struct sk_buff *skb, struct iov_iter *iter);
extern struct homa_rpc
*homa_find_client_rpc(struct homa_sock *hsk, __u64 id);
extern struct homa_rpc
Expand Down
8 changes: 4 additions & 4 deletions homa_incoming.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ void homa_add_packet(struct homa_rpc *rpc, struct sk_buff *skb)
{
struct data_header *h = (struct data_header *) skb->data;
int start = ntohl(h->seg.offset);
int length = homa_rx_data_len(skb);
int length = homa_data_len(skb);
int end = start + length;
struct homa_gap *gap, *dummy, *gap2;

Expand Down Expand Up @@ -253,7 +253,7 @@ int homa_copy_to_user(struct homa_rpc *rpc)
struct data_header *h = (struct data_header *)
skbs[i]->data;
int offset = ntohl(h->seg.offset);
int pkt_length = homa_rx_data_len(skbs[i]);
int pkt_length = homa_data_len(skbs[i]);
int copied = 0;
char *dst;
struct iovec iov;
Expand Down Expand Up @@ -574,9 +574,9 @@ void homa_data_pkt(struct sk_buff *skb, struct homa_rpc *rpc)
tt_record4("Dropping packet because no buffer space available: "
"id %d, offset %d, length %d, old incoming %d",
rpc->id, ntohl(h->seg.offset),
homa_rx_data_len(skb),
homa_data_len(skb),
rpc->msgin.granted);
INC_METRIC(dropped_data_no_bufs, homa_rx_data_len(skb));
INC_METRIC(dropped_data_no_bufs, homa_data_len(skb));
goto discard;
}

Expand Down
10 changes: 8 additions & 2 deletions homa_offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ struct sk_buff *homa_gso_segment(struct sk_buff *skb,
* in each segment) from data, which is divided among the segments.
*/
__skb_pull(skb, sizeof(struct data_header)
- sizeof(struct data_segment));
- sizeof(struct seg_header));
segs = skb_segment(skb, features);

/* Set incrementing ids in each of the segments (mimics behavior
Expand Down Expand Up @@ -274,11 +274,17 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
// tt_record("homa_gro_receive can't pull enough data "
// "from packet for trace");
if (h_new->common.type == DATA) {
if (h_new->seg.offset == -1) {
tt_record2("homa_gro_receive replaced offset %d with %d",
ntohl(h_new->seg.offset),
ntohl(h_new->common.sequence));
h_new->seg.offset = h_new->common.sequence;
}
tt_record4("homa_gro_receive got packet from 0x%x "
"id %llu, offset %d, priority %d",
saddr, homa_local_id(h_new->common.sender_id),
ntohl(h_new->seg.offset), priority);
if ((homa_rx_data_len(skb) == ntohl(h_new->message_length))
if ((homa_data_len(skb) == ntohl(h_new->message_length))
&& (homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
&& !busy) {
INC_METRIC(gro_data_bypasses, 1);
Expand Down
Loading

0 comments on commit 7780160

Please sign in to comment.