From 8b3d5d5fa4f59cf0eba96ec18818edeec83f297d Mon Sep 17 00:00:00 2001
From: John Ousterhout
Date: Thu, 7 Dec 2023 11:23:57 -0800
Subject: [PATCH] Added gro_busy_usecs configuration parameter

* Also, removed HOMA_GRO_BYPASS option (it's never appropriate)
* Added metrics for grant and data bypass, updated metrics.py
---
 homa_impl.h              |  49 +++++++++++-----
 homa_incoming.c          |   4 ++
 homa_offload.c           |  40 +++++++------
 homa_plumbing.c          |   7 +++
 homa_utils.c             |  11 ++++
 man/homa.7               |  16 ++++-
 test/unit_homa_offload.c | 122 ++++++++++++++++++++++++---------------
 util/metrics.py          |  22 +++++--
 8 files changed, 184 insertions(+), 87 deletions(-)

diff --git a/homa_impl.h b/homa_impl.h
index 8f5bdc2f..1aea8737 100644
--- a/homa_impl.h
+++ b/homa_impl.h
@@ -1945,10 +1945,6 @@ struct homa {
 	/* Bits that can be specified for gro_policy. These were created for
 	 * testing, in order to evaluate various possible policies; you almost
 	 * certainly should not use any value other than HOMA_GRO_NORMAL.
-	 * HOMA_GRO_BYPASS:        Pass all incoming packets directly to
-	 *                         homa_softirq during GRO; this bypasses
-	 *                         the SoftIRQ dispatching mechanism as well
-	 *                         as the network and IP stack layers.
 	 * HOMA_GRO_SAME_CORE      If isolated packets arrive (not part of
 	 *                         a batch) use the GRO core for SoftIRQ also.
 	 * HOMA_GRO_IDLE           Use old mechanism for selecting an idle
@@ -1958,13 +1954,14 @@ struct homa {
 	 * HOMA_GRO_GEN2           Use the new mechanism for selecting an
 	 *                         idle core for SoftIRQ.
 	 * HOMA_GRO_FAST_GRANTS    Pass all grants immediately to
-	 *                         homa_softirq during GRO.
-	 * HOMA_GRO_SHORT_BYPASS   Pass all short packets directly to
-	 *                         homa_softirq during GRO.
+	 *                         homa_softirq during GRO (only if the
+	 *                         core isn't overloaded).
+	 * HOMA_GRO_SHORT_BYPASS   Pass all single-packet messages directly
+	 *                         to homa_softirq during GRO (only if the
+	 *                         core isn't overloaded).
 	 * HOMA_GRO_GEN3           Use the "Gen3" mechanisms for load
 	 *                         balancing.
 	 */
-#define HOMA_GRO_BYPASS       1
 #define HOMA_GRO_SAME_CORE    2
 #define HOMA_GRO_IDLE         4
 #define HOMA_GRO_NEXT         8
@@ -1973,7 +1970,7 @@ struct homa {
 #define HOMA_GRO_SHORT_BYPASS 64
 #define HOMA_GRO_GEN3         128
 #define HOMA_GRO_NORMAL       (HOMA_GRO_SAME_CORE|HOMA_GRO_GEN2 \
-		|HOMA_GRO_SHORT_BYPASS)
+		|HOMA_GRO_SHORT_BYPASS|HOMA_GRO_FAST_GRANTS)
 
 	/*
 	 * @busy_usecs: if there has been activity on a core within the
@@ -1986,6 +1983,19 @@ struct homa {
 	/** @busy_cycles: Same as busy_usecs except in get_cycles() units. */
 	int busy_cycles;
 
+	/*
+	 * @gro_busy_usecs: if the gap between the completion of
+	 * homa_gro_receive and the next call to homa_gro_receive on the same
+	 * core is less than this, then GRO on that core is considered to be
+	 * "busy", and optimizations such as HOMA_GRO_SHORT_BYPASS will not be
+	 * done because they risk overloading the core. Set externally via
+	 * sysctl.
+	 */
+	int gro_busy_usecs;
+
+	/** @gro_busy_cycles: Same as gro_busy_usecs except in get_cycles() units. */
+	int gro_busy_cycles;
+
 	/**
 	 * @timer_ticks: number of times that homa_timer has been invoked
 	 * (may wraparound, which is safe).
@@ -2629,6 +2639,20 @@ struct homa_metrics {
 	 */
 	__u64 gen3_alt_handoffs;
 
+	/**
+	 * @gro_grant_bypasses: total number of GRANT packets passed directly
+	 * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
+	 * mechanism (triggered by HOMA_GRO_FAST_GRANTS).
+	 */
+	__u64 gro_grant_bypasses;
+
+	/**
+	 * @gro_data_bypasses: total number of DATA packets passed directly
+	 * to homa_softirq by homa_gro_receive, bypassing the normal SoftIRQ
+	 * mechanism (triggered by HOMA_GRO_SHORT_BYPASS).
+	 */
+	__u64 gro_data_bypasses;
+
 	/** @temp: For temporary use during testing. */
 #define NUM_TEMP_METRICS 10
 	__u64 temp[NUM_TEMP_METRICS];
@@ -2648,10 +2672,9 @@ struct homa_core {
 	__u64 last_active;
 
 	/**
-	 * @last_gro: the last time (in get_cycle() units) that Homa
-	 * processed packets at GRO(NAPI) level on this core. Used to
-	 * avoid assigning SoftIRQ handlers to this core when it has
-	 * been used recently for GRO.
+	 * @last_gro: the last time (in get_cycles() units) that
+	 * homa_gro_receive returned on this core. Used to determine
+	 * whether GRO is keeping a core busy.
 	 */
 	__u64 last_gro;
 
diff --git a/homa_incoming.c b/homa_incoming.c
index 4d43c3fd..66bcb0ac 100644
--- a/homa_incoming.c
+++ b/homa_incoming.c
@@ -1933,6 +1933,10 @@ void homa_incoming_sysctl_changed(struct homa *homa)
 	tmp = (tmp*cpu_khz)/1000;
 	homa->busy_cycles = tmp;
 
+	tmp = homa->gro_busy_usecs;
+	tmp = (tmp*cpu_khz)/1000;
+	homa->gro_busy_cycles = tmp;
+
 	tmp = homa->bpage_lease_usecs;
 	tmp = (tmp*cpu_khz)/1000;
 	homa->bpage_lease_cycles = tmp;
diff --git a/homa_offload.c b/homa_offload.c
index e217535c..589eb005 100644
--- a/homa_offload.c
+++ b/homa_offload.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019-2022 Stanford University
+/* Copyright (c) 2019-2023 Stanford University
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -151,12 +151,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 	struct sk_buff *held_skb;
 	struct sk_buff *result = NULL;
 	struct homa_core *core = homa_cores[raw_smp_processor_id()];
+	__u64 now = get_cycles();
+	int busy = (now - core->last_gro) < homa->gro_busy_cycles;
 	__u32 hash;
 	__u64 saved_softirq_metric, softirq_cycles;
 	struct data_header *h_new = (struct data_header *)
 			skb_transport_header(skb);
 	int priority;
 	__u32 saddr;
+
+	core->last_active = now;
 	if (skb_is_ipv6(skb)) {
 		priority = ipv6_hdr(skb)->priority;
 		saddr = ntohl(ipv6_hdr(skb)->saddr.in6_u.u6_addr32[3]);
@@ -169,12 +173,18 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 //	if (!pskb_may_pull(skb, 64))
 //		tt_record("homa_gro_receive can't pull enough data "
 //				"from packet for trace");
-	if (h_new->common.type == DATA)
+	if (h_new->common.type == DATA) {
 		tt_record4("homa_gro_receive got packet from 0x%x "
 				"id %llu, offset %d, priority %d",
 				saddr, homa_local_id(h_new->common.sender_id),
 				ntohl(h_new->seg.offset), priority);
-	else if (h_new->common.type == GRANT) {
+		if ((h_new->seg.segment_length == h_new->message_length)
+				&& (homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
+				&& !busy) {
+			INC_METRIC(gro_data_bypasses, 1);
+			goto bypass;
+		}
+	} else if (h_new->common.type == GRANT) {
 		tt_record4("homa_gro_receive got grant from 0x%x "
 				"id %llu, offset %d, priority %d",
 				saddr, homa_local_id(h_new->common.sender_id),
@@ -186,21 +196,16 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 		 * a significant difference in throughput for large
 		 * messages, especially when the system is loaded.
 		 */
-		if (homa->gro_policy & HOMA_GRO_FAST_GRANTS)
+		if ((homa->gro_policy & HOMA_GRO_FAST_GRANTS) && !busy) {
+			INC_METRIC(gro_grant_bypasses, 1);
 			goto bypass;
+		}
 	} else
 		tt_record4("homa_gro_receive got packet from 0x%x "
 				"id %llu, type 0x%x, priority %d",
 				saddr, homa_local_id(h_new->common.sender_id),
 				h_new->common.type, priority);
 
-	core->last_active = get_cycles();
-
-	if ((homa->gro_policy & HOMA_GRO_BYPASS)
-			|| ((homa->gro_policy & HOMA_GRO_SHORT_BYPASS)
-			&& (skb->len < 1400)))
-		goto bypass;
-
 	/* The GRO mechanism tries to separate packets onto different
 	 * gro_lists by hash. This is bad for us, because we want to batch
 	 * packets together regardless of their RPCs. So, instead of
@@ -273,24 +278,22 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list,
 
     done:
 	homa_check_pacer(homa, 1);
+	core->last_gro = get_cycles();
 	return result;
 
     bypass:
 	/* Record SoftIRQ cycles in a different metric to reflect that
 	 * they happened during bypass.
 	 */
-	saved_softirq_metric = homa_cores[raw_smp_processor_id()]
-			->metrics.softirq_cycles;
+	saved_softirq_metric = core->metrics.softirq_cycles;
 	homa_softirq(skb);
-	softirq_cycles = homa_cores[raw_smp_processor_id()]
-			->metrics.softirq_cycles - saved_softirq_metric;
-	homa_cores[raw_smp_processor_id()]->metrics.softirq_cycles
-			= saved_softirq_metric;
+	softirq_cycles = core->metrics.softirq_cycles - saved_softirq_metric;
+	core->metrics.softirq_cycles = saved_softirq_metric;
 	INC_METRIC(bypass_softirq_cycles, softirq_cycles);
+	core->last_gro = get_cycles();
 
 	/* This return value indicates that we have freed skb. */
 	return ERR_PTR(-EINPROGRESS);
-
 }
 
 /**
@@ -350,7 +353,6 @@ void homa_gro_gen2(struct sk_buff *skb)
 				ntohl(h->seg.offset));
 	}
 	atomic_inc(&homa_cores[candidate]->softirq_backlog);
-	homa_cores[this_core]->last_gro = now;
 	homa_set_softirq_cpu(skb, candidate);
 }
 
diff --git a/homa_plumbing.c b/homa_plumbing.c
index 99b6670e..1ef73229 100644
--- a/homa_plumbing.c
+++ b/homa_plumbing.c
@@ -280,6 +280,13 @@ static struct ctl_table homa_ctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= homa_dointvec
 	},
+	{
+		.procname	= "gro_busy_usecs",
+		.data		= &homa_data.gro_busy_usecs,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= homa_dointvec
+	},
 	{
 		.procname	= "gro_policy",
 		.data		= &homa_data.gro_policy,
diff --git a/homa_utils.c b/homa_utils.c
index 29ab42b5..76c6eb62 100644
--- a/homa_utils.c
+++ b/homa_utils.c
@@ -155,6 +155,7 @@ int homa_init(struct homa *homa)
 	homa->gso_force_software = 0;
 	homa->gro_policy = HOMA_GRO_NORMAL;
 	homa->busy_usecs = 100;
+	homa->gro_busy_usecs = 5;
 	homa->timer_ticks = 0;
 	spin_lock_init(&homa->metrics_lock);
 	homa->metrics = NULL;
@@ -1826,6 +1827,16 @@ char *homa_print_metrics(struct homa *homa)
 			"Gen3 handoffs to secondary core (primary was "
 			"busy)\n",
 			m->gen3_alt_handoffs);
+	homa_append_metric(homa,
+			"gro_grant_bypasses %15llu "
+			"Grant packets passed directly to homa_softirq "
+			"by homa_gro_receive\n",
+			m->gro_grant_bypasses);
+	homa_append_metric(homa,
+			"gro_data_bypasses %15llu "
+			"Data packets passed directly to homa_softirq "
+			"by homa_gro_receive\n",
+			m->gro_data_bypasses);
 	for (i = 0; i < NUM_TEMP_METRICS; i++)
 		homa_append_metric(homa,
 				"temp%-2d %15llu "
diff --git a/man/homa.7 b/man/homa.7
index ab7aa155..e83daf60 100644
--- a/man/homa.7
+++ b/man/homa.7
@@ -268,9 +268,6 @@ in
 The amount of time (in microseconds) that a given core can own a page in
 a receive buffer pool before its ownership can be revoked by a different
 core.
-.IR gro_policy
-An integer value that determines how Homa processes incoming packets
-at the GRO level. See code in homa_offload.c for more details.
 .TP
 .IR busy_usecs
 An integer value in microsecond units; if a core has been active in
@@ -339,6 +336,19 @@ of the bandwidth is for FIFO and 90% for SRPT). As of October 2020, a
 small value can provide significant benefits for the largest messages
 under very high loads, but for most loads its effect is negligible.
 .TP
+.I gro_busy_usecs
+An integer value used to determine whether or not to perform some
+optimizations specified by
+.IR gro_policy .
+If the gap between the completion of one call to homa_gro_receive and
+the invocation of the next call on the same core is less than this many
+microseconds, the core is considered to be "busy", so optimizations
+that add to the load of the core will not be performed.
+.TP
+.I gro_policy
+An integer value that determines how Homa processes incoming packets
+at the GRO level. See code in homa_offload.c for more details.
+.TP
 .IR gso_force_software
 If this value is nonzero, Homa will perform GSO in software instead of
 asking the NIC to perform TSO in hardware. This can be useful when running
diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c
index 425c2572..42a624a8 100644
--- a/test/unit_homa_offload.c
+++ b/test/unit_homa_offload.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019-2022 Stanford University
+/* Copyright (c) 2019-2023 Stanford University
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -73,6 +73,11 @@ FIXTURE_SETUP(homa_offload)
 	list_add_tail(&self->skb2->list, &self->napi.gro_hash[2].list);
 	INIT_LIST_HEAD(&self->empty_list);
 	unit_log_clear();
+
+	/* Configure so core isn't considered too busy for bypasses. */
+	mock_cycles = 1000;
+	self->homa.gro_busy_cycles = 500;
+	homa_cores[cpu_number]->last_gro = 400;
 }
 FIXTURE_TEARDOWN(homa_offload)
 {
@@ -103,44 +108,6 @@ TEST_F(homa_offload, homa_gso_segment_set_ip_ids)
 	kfree_skb(segs);
 }
 
-TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
-{
-	struct in6_addr client_ip = unit_get_in_addr("196.168.0.1");
-	struct in6_addr server_ip = unit_get_in_addr("1.2.3.4");
-	int client_port = 40000;
-	__u64 client_id = 1234;
-	__u64 server_id = 1235;
-	struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING,
-			&client_ip, &server_ip, client_port, server_id, 100,
-			20000);
-	ASSERT_NE(NULL, srpc);
-	homa_xmit_data(srpc, false);
-	unit_log_clear();
-
-	struct grant_header h = {{.sport = htons(srpc->dport),
-			.dport = htons(self->hsk.port),
-			.sender_id = cpu_to_be64(client_id),
-			.type = GRANT},
-			.offset = htonl(11000),
-			.priority = 3,
-			.resend_all = 0};
-	self->homa.gro_policy = HOMA_GRO_FAST_GRANTS;
-	struct sk_buff *result = homa_gro_receive(&self->empty_list,
-			mock_skb_new(&client_ip, &h.common, 0, 0));
-	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
-	EXPECT_EQ(11000, srpc->msgout.granted);
-	EXPECT_STREQ("xmit DATA 1400@10000", unit_log_get());
-
-	unit_log_clear();
-	h.offset = htonl(14000);
-	self->homa.gro_policy = 0;
-	struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0);
-	result = homa_gro_receive(&self->empty_list, skb);
-	EXPECT_EQ(NULL, result);
-	EXPECT_EQ(11000, srpc->msgout.granted);
-	EXPECT_STREQ("", unit_log_get());
-	kfree_skb(skb);
-}
 TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS)
 {
 	struct in6_addr client_ip = unit_get_in_addr("196.168.0.1");
@@ -159,7 +126,7 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS)
 			.seg = {.offset = htonl(2000),
 			.segment_length = htonl(1400),
 			.ack = {0, 0, 0}}};
-	struct sk_buff *skb, *skb2, *skb3;
+	struct sk_buff *skb, *skb2, *skb3, *skb4;
 	struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_RCVD_ONE_PKT,
 			&client_ip, &server_ip, client_port, server_id, 10000,
@@ -171,24 +138,85 @@ TEST_F(homa_offload, homa_gro_receive__HOMA_GRO_SHORT_BYPASS)
 	skb = mock_skb_new(&self->ip, &h.common, 1400, 2000);
 	struct sk_buff *result = homa_gro_receive(&self->empty_list, skb);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(8600, srpc->msgin.bytes_remaining);
+	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses);
 
-	/* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but packet too long. */
+	/* Second attempt: HOMA_GRO_SHORT_BYPASS enabled but message longer
+	 * than one packet.
+	 */
 	self->homa.gro_policy |= HOMA_GRO_SHORT_BYPASS;
-	skb2 = mock_skb_new(&self->ip, &h.common, 1400, 3000);
+	homa_cores[cpu_number]->last_gro = 400;
+	skb2 = mock_skb_new(&self->ip, &h.common, 1400, 2000);
 	result = homa_gro_receive(&self->empty_list, skb2);
 	EXPECT_EQ(0, -PTR_ERR(result));
-	EXPECT_EQ(8600, srpc->msgin.bytes_remaining);
+	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_data_bypasses);
 
 	/* Third attempt: bypass should happen. */
-	h.seg.segment_length = htonl(100);
-	skb3 = mock_skb_new(&self->ip, &h.common, 100, 4000);
+	h.message_length = h.seg.segment_length;
+	h.incoming = h.seg.segment_length;
+	homa_cores[cpu_number]->last_gro = 400;
+	skb3 = mock_skb_new(&self->ip, &h.common, 1400, 4000);
 	result = homa_gro_receive(&self->empty_list, skb3);
 	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
-	EXPECT_EQ(8500, srpc->msgin.bytes_remaining);
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses);
+
+	/* Fourth attempt: no bypass because core busy. */
+	homa_cores[cpu_number]->last_gro = 600;
+	skb4 = mock_skb_new(&self->ip, &h.common, 1400, 4000);
+	result = homa_gro_receive(&self->empty_list, skb4);
+	EXPECT_EQ(0, -PTR_ERR(result));
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_data_bypasses);
 
 	kfree_skb(skb);
 	kfree_skb(skb2);
+	kfree_skb(skb4);
+}
+TEST_F(homa_offload, homa_gro_receive__fast_grant_optimization)
+{
+	struct in6_addr client_ip = unit_get_in_addr("196.168.0.1");
+	struct in6_addr server_ip = unit_get_in_addr("1.2.3.4");
+	int client_port = 40000;
+	__u64 client_id = 1234;
+	__u64 server_id = 1235;
+	struct homa_rpc *srpc = unit_server_rpc(&self->hsk, UNIT_OUTGOING,
+			&client_ip, &server_ip, client_port, server_id, 100,
+			20000);
+	ASSERT_NE(NULL, srpc);
+	homa_xmit_data(srpc, false);
+	unit_log_clear();
+
+	struct grant_header h = {{.sport = htons(srpc->dport),
+			.dport = htons(self->hsk.port),
+			.sender_id = cpu_to_be64(client_id),
+			.type = GRANT},
+			.offset = htonl(11000),
+			.priority = 3,
+			.resend_all = 0};
+
+	/* First attempt: HOMA_GRO_FAST_GRANTS not enabled. */
+	self->homa.gro_policy = 0;
+	struct sk_buff *skb = mock_skb_new(&client_ip, &h.common, 0, 0);
+	struct sk_buff *result = homa_gro_receive(&self->empty_list, skb);
+	EXPECT_EQ(0, -PTR_ERR(result));
+	EXPECT_EQ(0, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+	EXPECT_STREQ("", unit_log_get());
+
+	/* Second attempt: HOMA_GRO_FAST_GRANTS is enabled. */
+	self->homa.gro_policy = HOMA_GRO_FAST_GRANTS;
+	homa_cores[cpu_number]->last_gro = 400;
+	struct sk_buff *skb2 = mock_skb_new(&client_ip, &h.common, 0, 0);
+	result = homa_gro_receive(&self->empty_list, skb2);
+	EXPECT_EQ(EINPROGRESS, -PTR_ERR(result));
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+	EXPECT_STREQ("xmit DATA 1400@10000", unit_log_get());
+
+	/* Third attempt: core is too busy for fast grants. */
+	homa_cores[cpu_number]->last_gro = 600;
+	struct sk_buff *skb3 = mock_skb_new(&client_ip, &h.common, 0, 0);
+	result = homa_gro_receive(&self->empty_list, skb3);
+	EXPECT_EQ(0, -PTR_ERR(result));
+	EXPECT_EQ(1, homa_cores[cpu_number]->metrics.gro_grant_bypasses);
+
+	kfree_skb(skb);
+	kfree_skb(skb3);
 }
 TEST_F(homa_offload, homa_gro_receive__no_held_skb)
 {
diff --git a/util/metrics.py b/util/metrics.py
index bd2d0ce1..aaa9b2df 100755
--- a/util/metrics.py
+++ b/util/metrics.py
@@ -390,12 +390,24 @@ def scale_number(number):
                 /deltas["handoffs_thread_waiting"])
     else:
         alt_thread_percent = 0.0
-    print("Available immediately:      %4.1f%%" % (100.0 - poll_percent
+    if deltas["packets_rcvd_DATA"]:
+        data_bypass_percent = (100.0*deltas["gro_data_bypasses"]
+                /deltas["packets_rcvd_DATA"])
+    else:
+        data_bypass_percent = 0.0
+    if deltas["packets_rcvd_GRANT"]:
+        grant_bypass_percent = (100.0*deltas["gro_grant_bypasses"]
+                /deltas["packets_rcvd_GRANT"])
+    else:
+        grant_bypass_percent = 0.0
+    print("Available immediately:        %5.1f%%" % (100.0 - poll_percent
             - sleep_percent))
-    print("Arrived while polling:      %4.1f%%" % (poll_percent))
-    print("Blocked at least once:      %4.1f%%" % (sleep_percent))
-    print("Alternate GRO handoffs:     %4.1f%%" % (gen3_alt_percent))
-    print("Alternate thread handoffs:  %4.1f%%" % (alt_thread_percent))
+    print("Arrived while polling:        %5.1f%%" % (poll_percent))
+    print("Blocked at least once:        %5.1f%%" % (sleep_percent))
+    print("Alternate GRO handoffs:       %5.1f%%" % (gen3_alt_percent))
+    print("Alternate thread handoffs:    %5.1f%%" % (alt_thread_percent))
+    print("GRO bypass for data packets:  %5.1f%%" % (data_bypass_percent))
+    print("GRO bypass for grant packets: %5.1f%%" % (grant_bypass_percent))
 
     print("\nMiscellaneous:")
     print("--------------")
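
A quick worked example of what the new parameter means in practice (values
here are assumptions for illustration, not part of the patch): homa_init sets
gro_busy_usecs to 5, and homa_incoming_sysctl_changed converts it to cycles as
tmp*cpu_khz/1000, so on an assumed 3 GHz TSC (cpu_khz = 3000000) the threshold
is 5*3000000/1000 = 15000 cycles between successive homa_gro_receive calls on
a core. The same arithmetic as a minimal standalone sketch:

    /* Standalone illustration of the gro_busy_usecs -> gro_busy_cycles
     * conversion performed in homa_incoming_sysctl_changed(); cpu_khz is
     * an assumed 3 GHz clock here, not read from the kernel.
     */
    #include <stdio.h>

    int main(void)
    {
            long long cpu_khz = 3000000;    /* assumed 3 GHz TSC */
            long long gro_busy_usecs = 5;   /* default from homa_init() */
            long long gro_busy_cycles = (gro_busy_usecs*cpu_khz)/1000;

            printf("gro_busy_usecs %lld -> gro_busy_cycles %lld\n",
                            gro_busy_usecs, gro_busy_cycles);
            return 0;
    }

Because the patch registers gro_busy_usecs in homa_ctl_table, the value can be
read or changed at runtime via sysctl (on a typical installation it should
appear as /proc/sys/net/homa/gro_busy_usecs; the exact path depends on how the
module is installed).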