From bdd763345de9b362de945940d7891ad6ba337fd1 Mon Sep 17 00:00:00 2001 From: John Ousterhout Date: Fri, 10 Nov 2023 14:20:03 -0800 Subject: [PATCH] Added Gen3 load balancing --- balance.txt | 114 ++++++++++++++++++++++++ homa_impl.h | 71 +++++++++++++-- homa_incoming.c | 59 +++++++++++-- homa_offload.c | 179 +++++++++++++++++++++++++------------- homa_plumbing.c | 93 ++++++++++++++++++-- homa_utils.c | 31 ++++++- man/homa.7 | 32 ++++--- perf.txt | 6 +- test/unit_homa_incoming.c | 82 +++++++++++++++++ test/unit_homa_offload.c | 54 +++++++++++- util/metrics.py | 18 +++- 11 files changed, 638 insertions(+), 101 deletions(-) create mode 100644 balance.txt diff --git a/balance.txt b/balance.txt new file mode 100644 index 00000000..84a44437 --- /dev/null +++ b/balance.txt @@ -0,0 +1,114 @@ +This file discusses the issue of load-balancing in Homa. + +In order to keep up with fast networks, transport protocols must distribute +their processing across multiple cores. For outgoing packets this happens +naturally: sending threads run on different cores and packet processing +for outbound packets happens on the same core as the sending thread. Things +are more difficult for incoming packets. In general, an incoming packet +will pass through 3 cores: +* NAPI/GRO: the NIC distributes incoming packets across cores using RSS. + The number of incoming channels, and their association with cores, can + be configured in software. The NIC will then distribute packets across + those channels using a hash based on packet header fields. The device + driver receives packets as part of NAPI, then packets are collected into + batches using GRO and handed off to SoftIRQ. +* SoftIRQ processing occurs on a (potentially) different core from NAPI/GRO; + the network stack runs here, including Homa's main handlers for incoming + packets. 
The system default is to compute another hash function on packet + headers to select a SoftIRQ core for a batch, but it is possible for GRO + to make its own choice of core, and Homa does this. +* Once a complete message is received, it is handed off to an application + thread, which typically runs on a different core. + +The load balancing challenge is to distribute load across multiple cores +without overloading any individual core ("hotspots"). This has proven +quite difficult, and hotspots are the primary source of tail latency in Homa. +The most common cause of hotspots is when 2 or more of the above tasks +are assigned to the same core. For example: +* Two batches from different NAPI/GRO cores might get assigned to the same + SoftIRQ core. +* A particular core might be very busy handling NAPI/GRO for a stream of + packets in a large message; this will prevent application threads from + making progress on that core. A short message might pass through other + cores for NAPI/GRO and SoftIRQ, but if its application is running on + the busy core, then it will not be able to process the short message. + +Part of the problem is that core assignments are made independently by +3 different schedulers (RSS for the NAPI/GRO core, GRO or the system for +the SoftIRQ core, and the Linux scheduler for the application core), +so conflicts are likely to occur. Only one of these schedulers is under +control of the transport protocol. + +It's also important to note that using more cores isn't always the best +approach. For example, if a node is lightly loaded, it would be best to +do all RX processing on a single core: using multiple cores causes extra +cache misses as data migrates from core to core, and it also adds latency +to pass control between cores. In an ideal world, the number of cores used for +protocol processing would be just enough to keep any of them from getting +overloaded. 
However, it appears to be hard to vary the number of cores +without risking overloads; except in a few special cases, Homa doesn't do +this. + +Homa tries to use its control over SoftIRQ scheduling to minimize hotspots. +Several different approaches have been tried over time; this document +focuses on the two most recent ones, which are called "Gen2" and "Gen3". + +Gen2 Load Balancing +------------------- +* Gen2 assumes that NAPI/GRO processing is occurring on all cores. +* When GRO chooses where to assign a batch of packets for SoftIRQ, it + considers the next several cores (in ascending circular core order + after the GRO core). +* GRO uses several criteria to try to find a "good" core for SoftIRQ, such + as avoiding a core that has done recent GRO processing, or one for which + there is already pending SoftIRQ work. +* Selection stops as soon as it finds a "good" core. +* If no "good" core is found, then GRO will rotate among the successor + cores on a batch-by-batch basis. +* In some cases, Gen2 will bypass the SoftIRQ handoff mechanism and simply + run SoftIRQ immediately on its core. This is done in two cases: short + packets and grant packets. Bypass is particularly useful for grants + because it eliminates the latency associated with a handoff, and grant + turnaround time is important for overall performance. + +Gen2 has several problems: +* It doesn't do anything about the problem of application threads conflicting + with NAPI/GRO or SoftIRQ. +* A single core may be assigned both SoftIRQ and NAPI/GRO work at the + same time. +* The SoftIRQ core groups for different NAPI/GRO cores overlap, so it's + possible for multiple GROs to schedule batches to the same SoftIRQ core. +* When receiving packets from a large message, Gen2 tends to alternate between + 2 or more SoftIRQ cores, which results in unnecessary cache coherency + traffic. 
+* If the NAPI/GRO core is overloaded, bypass can make things worse (especially + since grant processing results in transmitting additional packets, which + is fairly expensive). + +Gen3 Load Balancing +------------------- +The Gen3 load-balancing mechanism is an attempt to solve the problems +associated with Gen2. +* The number of channels is reduced, so that only 1/4 of the cores do + NAPI/GRO processing. This appears to be sufficient capacity to avoid + overloads on any of the NAPI/GRO cores. +* Each NAPI/GRO core has 3 other cores (statically assigned) that it can use + for SoftIRQ processing. The SoftIRQ core groups for different NAPI/GRO + cores do not overlap. This means that SoftIRQ and GRO will never happen + simultaneously on the same core, and there will be no conflicts between + the SoftIRQ groups of different NAPI/GRO cores. +* Gen3 takes steps to avoid core conflicts between application threads and + NAPI/GRO and SoftIRQ processing, as described below. +* When an application thread is using Homa actively on a core, the core + is marked as "busy". When GRO selects a SoftIRQ core, it attempts to + avoid cores that are busy with application threads. If there is a choice + of un-busy cores, GRO will try to reuse a single SoftIRQ core over and over. +* Homa also keeps track of recent NAPI/GRO and SoftIRQ processing on each + core. When an incoming message becomes ready and there are multiple threads + waiting for messages, Homa tries to pick a thread whose core has not had + recent Homa activity. +* Between these two mechanisms, the hope is that SoftIRQ and application + work will adjust their core assignments to avoid conflicts. + +Gen3 was implemented in November of 2023; so far its performance appears to be +about the same as Gen2. 
We \ No newline at end of file diff --git a/homa_impl.h b/homa_impl.h index 11aa70ee..8f5bdc2f 100644 --- a/homa_impl.h +++ b/homa_impl.h @@ -716,6 +716,12 @@ struct homa_interest { */ int locked; + /** + * @core: Core on which @thread was executing when it registered + * its interest. Used for load balancing (see balance.txt). + */ + int core; + /** * @reg_rpc: RPC whose @interest field points here, or * NULL if none. @@ -746,6 +752,7 @@ static void inline homa_interest_init(struct homa_interest *interest) interest->thread = current; atomic_long_set(&interest->ready_rpc, 0); interest->locked = 0; + interest->core = raw_smp_processor_id(); interest->reg_rpc = NULL; interest->request_links.next = LIST_POISON1; interest->response_links.next = LIST_POISON1; @@ -1950,10 +1957,12 @@ struct homa { * order for SoftIRQ (deprecated). * HOMA_GRO_GEN2 Use the new mechanism for selecting an * idle core for SoftIRQ. - * HOMA_GRO_FAST_GRANTS Pass all grant I can see immediately to + * HOMA_GRO_FAST_GRANTS Pass all grants immediately to * homa_softirq during GRO. * HOMA_GRO_SHORT_BYPASS Pass all short packets directly to - * homa_softirq during GR). + * homa_softirq during GRO. + * HOMA_GRO_GEN3 Use the "Gen3" mechanisms for load + * balancing. */ #define HOMA_GRO_BYPASS 1 #define HOMA_GRO_SAME_CORE 2 @@ -1962,14 +1971,15 @@ struct homa { #define HOMA_GRO_GEN2 16 #define HOMA_GRO_FAST_GRANTS 32 #define HOMA_GRO_SHORT_BYPASS 64 + #define HOMA_GRO_GEN3 128 #define HOMA_GRO_NORMAL (HOMA_GRO_SAME_CORE|HOMA_GRO_GEN2 \ |HOMA_GRO_SHORT_BYPASS) /* * @busy_usecs: if there has been activity on a core within the * last @busy_usecs, it is considered to be busy and Homa will - * try to avoid scheduling other activities on the core. Set - * externally via sysctl. + * try to avoid scheduling other activities on the core. See + * balance.txt for more on load balancing. Set externally via sysctl. 
*/ int busy_usecs; @@ -2157,6 +2167,19 @@ struct homa_metrics { */ __u64 slow_wakeups; + /** + * @handoffs_thread_waiting: total number of times that an RPC + * was handed off to a waiting thread (vs. being queued). + */ + __u64 handoffs_thread_waiting; + + /** + * @handoffs_alt_thread: total number of times that a thread other + * than the first on the list was chosen for a handoff (because the + * first thread was on a busy core). + */ + __u64 handoffs_alt_thread; + /** * @poll_cycles: total time spent in the polling loop in * homa_wait_for_message, as measured with get_cycles(). @@ -2593,6 +2616,19 @@ struct homa_metrics { */ __u64 dropped_data_no_bufs; + /** + * @gen3_handoffs: total number of handoffs from GRO to SoftIRQ made + * by Gen3 load balancer. + */ + __u64 gen3_handoffs; + + /** + * @gen3_alt_handoffs: total number of GRO->SoftIRQ handoffs that + * didn't choose the primary SoftIRQ core because it was busy with + * app threads. + */ + __u64 gen3_alt_handoffs; + /** @temp: For temporary use during testing. */ #define NUM_TEMP_METRICS 10 __u64 temp[NUM_TEMP_METRICS]; @@ -2607,8 +2643,7 @@ struct homa_core { /** * @last_active: the last time (in get_cycle() units) that * there was system activity, such NAPI or SoftIRQ, on this - core. Used to pick a less-busy core for assigning SoftIRQ - handlers. + core. Used for load balancing. */ __u64 last_active; @@ -2634,6 +2669,23 @@ struct homa_core { */ int softirq_offset; + /** + * @gen3_softirq_cores: when the Gen3 load balancer is in use, + * GRO will arrange for SoftIRQ processing to occur on one of + * these cores; -1 values are ignored (see balance.txt for more + * on load balancing). This information is filled in via sysctl. + */ +#define NUM_GEN3_SOFTIRQ_CORES 3 + int gen3_softirq_cores[NUM_GEN3_SOFTIRQ_CORES]; + + /** + * @last_app_active: the most recent time (get_cycles() units) + * when an application was actively using Homa on this core (e.g., + * by sending or receiving messages). 
Used for load balancing + * (see balance.txt). + */ + __u64 last_app_active; + /** * held_skb: last packet buffer known to be available for * merging other packets into on this core (note: may not still @@ -3058,6 +3110,9 @@ extern int homa_check_nic_queue(struct homa *homa, struct sk_buff *skb, bool force); extern struct homa_rpc *homa_choose_fifo_grant(struct homa *homa); +extern struct homa_interest + *homa_choose_interest(struct homa *homa, struct list_head *head, + int offset); extern int homa_choose_rpcs_to_grant(struct homa *homa, struct homa_rpc **rpcs, int max_rpcs); extern void homa_close(struct sock *sock, long timeout); @@ -3095,6 +3150,8 @@ extern int homa_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); extern void homa_grant_pkt(struct sk_buff *skb, struct homa_rpc *rpc); extern int homa_gro_complete(struct sk_buff *skb, int thoff); +extern void homa_gro_gen2(struct sk_buff *skb); +extern void homa_gro_gen3(struct sk_buff *skb); extern struct sk_buff *homa_gro_receive(struct list_head *gro_list, struct sk_buff *skb); @@ -3229,6 +3286,8 @@ extern int homa_softirq(struct sk_buff *skb); extern void homa_spin(int usecs); extern char *homa_symbol_for_state(struct homa_rpc *rpc); extern char *homa_symbol_for_type(uint8_t type); +extern int homa_sysctl_softirq_cores(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); extern void homa_timer(struct homa *homa); extern int homa_timer_main(void *transportInfo); extern void homa_unhash(struct sock *sk); diff --git a/homa_incoming.c b/homa_incoming.c index 9ff62827..4d43c3fd 100644 --- a/homa_incoming.c +++ b/homa_incoming.c @@ -1694,6 +1694,7 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, INC_METRIC(poll_cycles, now - poll_start); /* Now it's time to sleep. 
*/ + homa_cores[interest.core]->last_app_active = now; set_current_state(TASK_INTERRUPTIBLE); rpc = (struct homa_rpc *) atomic_long_read(&interest.ready_rpc); if (!rpc && !hsk->shutdown) { @@ -1779,7 +1780,43 @@ struct homa_rpc *homa_wait_for_message(struct homa_sock *hsk, int flags, } /** - * @homa_rpc_handoff: This function is called when the input message for + * @homa_choose_interest() - Given a list of interests for an incoming + * message, choose the best one to handle it (if any). + * @homa: Overall information about the Homa transport. + * @head: Head pointers for the list of interest: either + * hsk->request_interests or hsk->response_interests. + * @offset: Offset of "next" pointers in the list elements (either + * offsetof(request_links) or offsetof(response_links). + * Return: An interest to use for the incoming message, or NULL if none + * is available. If possible, this function tries to pick an + * interest whose thread is running on a core that isn't + * currently busy doing Homa transport work. + */ +struct homa_interest *homa_choose_interest(struct homa *homa, + struct list_head *head, int offset) +{ + struct homa_interest *backup = NULL; + struct list_head *pos; + struct homa_interest *interest; + __u64 busy_time = get_cycles() - homa->busy_cycles; + + list_for_each(pos, head) { + interest = (struct homa_interest *) (((char *) pos) - offset); + if (homa_cores[interest->core]->last_active < busy_time) { + if (backup != NULL) + INC_METRIC(handoffs_alt_thread, 1); + return interest; + } + if (backup == NULL) + backup = interest; + } + + /* All interested threads are on busy cores; return the first. */ + return backup; +} + +/** + * @homa_rpc_handoff() - This function is called when the input message for * an RPC is ready for attention from a user thread. It either notifies * a waiting reader or queues the RPC. * @rpc: RPC to handoff; must be locked. 
The caller must @@ -1803,17 +1840,17 @@ void homa_rpc_handoff(struct homa_rpc *rpc) /* Second, check the interest list for this type of RPC. */ if (homa_is_client(rpc->id)) { - interest = list_first_entry_or_null( + interest = homa_choose_interest(hsk->homa, &hsk->response_interests, - struct homa_interest, response_links); + offsetof(struct homa_interest, response_links)); if (interest) goto thread_waiting; list_add_tail(&rpc->ready_links, &hsk->ready_responses); INC_METRIC(responses_queued, 1); } else { - interest = list_first_entry_or_null( + interest = homa_choose_interest(hsk->homa, &hsk->request_interests, - struct homa_interest, request_links); + offsetof(struct homa_interest, request_links)); if (interest) goto thread_waiting; list_add_tail(&rpc->ready_links, &hsk->ready_requests); @@ -1838,8 +1875,17 @@ void homa_rpc_handoff(struct homa_rpc *rpc) */ atomic_or(RPC_HANDING_OFF, &rpc->flags); interest->locked = 0; + INC_METRIC(handoffs_thread_waiting, 1); + tt_record3("homa_rpc_handoff handing off id %d to pid %d on core %d", + rpc->id, interest->thread->pid, + task_cpu(interest->thread)); atomic_long_set_release(&interest->ready_rpc, (long) rpc); + /* Update the last_app_active time for the thread's core, so Homa + * will try to avoid doing any work there. + */ + homa_cores[interest->core]->last_app_active = get_cycles(); + /* Clear the interest. This serves two purposes. First, it saves * the waking thread from acquiring the socket lock again, which * reduces contention on that lock). 
Second, it ensures that @@ -1854,9 +1900,6 @@ void homa_rpc_handoff(struct homa_rpc *rpc) if (interest->response_links.next != LIST_POISON1) list_del(&interest->response_links); wake_up_process(interest->thread); - tt_record3("homa_rpc_handoff handed off id %d to pid %d on core %d", - rpc->id, interest->thread->pid, - task_cpu(interest->thread)); } /** diff --git a/homa_offload.c b/homa_offload.c index fdc9319c..e217535c 100644 --- a/homa_offload.c +++ b/homa_offload.c @@ -19,6 +19,8 @@ #include "homa_impl.h" +#define CORES_TO_CHECK 4 + static const struct net_offload homa_offload = { .callbacks = { .gso_segment = homa_gso_segment, @@ -274,6 +276,9 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, return result; bypass: + /* Record SoftIRQ cycles in a different metric to reflect that + * they happened during bypass. + */ saved_softirq_metric = homa_cores[raw_smp_processor_id()] ->metrics.softirq_cycles; homa_softirq(skb); @@ -283,14 +288,113 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, = saved_softirq_metric; INC_METRIC(bypass_softirq_cycles, softirq_cycles); - /* Record SoftIRQ cycles in a different metric to reflect that - * they happened during bypass. + /* This return value indicates that we have freed skb. */ + return ERR_PTR(-EINPROGRESS); + +} + +/** + * homa_gro_gen2() - When the Gen2 load balancer is being used this function + * is invoked by homa_gro_complete to choose a core to handle SoftIRQ for a + * batch of packets + * @skb: First in a group of packets that are ready to be passed to SoftIRQ. + * Information will be updated in the packet so that Linux will + * direct it to the chosen core. + */ +void homa_gro_gen2(struct sk_buff *skb) +{ + /* Scan the next several cores in order after the current core, + * trying to find one that is not already busy with SoftIRQ processing, + * and that doesn't appear to be active with NAPI/GRO processing + * either. If there is no such core, just rotate among the next + * cores. 
See balance.txt for overall design information on load + * balancing. */ + struct data_header *h = (struct data_header *) skb_transport_header(skb); + int i; + int this_core = raw_smp_processor_id(); + int candidate = this_core; + __u64 now = get_cycles(); + struct homa_core *core; + for (i = CORES_TO_CHECK; i > 0; i--) { + candidate++; + if (unlikely(candidate >= nr_cpu_ids)) + candidate = 0; + core = homa_cores[candidate]; + if (atomic_read(&core->softirq_backlog) > 0) + continue; + if ((core->last_gro + homa->busy_cycles) > now) + continue; + tt_record3("homa_gro_gen2 chose core %d for id %d " + "offset %d", + candidate, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); + break; + } + if (i <= 0) { + /* All of the candidates appear to be busy; just + * rotate among them. + */ + int offset = homa_cores[this_core]->softirq_offset; + offset += 1; + if (offset > CORES_TO_CHECK) + offset = 1; + homa_cores[this_core]->softirq_offset = offset; + candidate = this_core + offset; + while (candidate >= nr_cpu_ids) { + candidate -= nr_cpu_ids; + } + tt_record3("homa_gro_gen2 chose core %d for id %d " + "offset %d (all cores busy)", + candidate, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); + } + atomic_inc(&homa_cores[candidate]->softirq_backlog); + homa_cores[this_core]->last_gro = now; + homa_set_softirq_cpu(skb, candidate); +} +/** + * homa_gro_gen3() - When the Gen3 load balancer is being used this function + * is invoked by homa_gro_complete to choose a core to handle SoftIRQ for a + * batch of packets + * @skb: First in a group of packets that are ready to be passed to SoftIRQ. + * Information will be updated in the packet so that Linux will + * direct it to the chosen core. + */ +void homa_gro_gen3(struct sk_buff *skb) +{ + /* See balance.txt for overall design information on the Gen3 + * load balancer. 
+ */ + struct data_header *h = (struct data_header *) skb_transport_header(skb); + int i, core; + __u64 now, busy_time; + int *candidates = homa_cores[raw_smp_processor_id()]->gen3_softirq_cores; - /* This return value indicates that we have freed skb. */ - return ERR_PTR(-EINPROGRESS); + now = get_cycles(); + busy_time = now - homa->busy_cycles; + core = candidates[0]; + for (i = 0; i < NUM_GEN3_SOFTIRQ_CORES; i++) { + int candidate = candidates[i]; + if (candidate < 0) { + break; + } + if (homa_cores[candidate]->last_app_active < busy_time) { + core = candidate; + break; + } + } + homa_set_softirq_cpu(skb, core); + homa_cores[core]->last_active = now; + tt_record4("homa_gro_gen3 chose core %d for id %d, offset %d, delta %d", + core, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset), + now - homa_cores[core]->last_app_active); + INC_METRIC(gen3_handoffs, 1); + if (core != candidates[0]) + INC_METRIC(gen3_alt_handoffs, 1); } /** @@ -305,64 +409,15 @@ struct sk_buff *homa_gro_receive(struct list_head *held_list, */ int homa_gro_complete(struct sk_buff *skb, int hoffset) { - struct common_header *h = (struct common_header *) - skb_transport_header(skb); - struct data_header *d = (struct data_header *) h; + struct data_header *h = (struct data_header *) skb_transport_header(skb); // tt_record4("homa_gro_complete type %d, id %d, offset %d, count %d", // h->type, homa_local_id(h->sender_id), ntohl(d->seg.offset), // NAPI_GRO_CB(skb)->count); -#define CORES_TO_CHECK 4 - if (homa->gro_policy & HOMA_GRO_GEN2) { - /* Pick a specific core to handle SoftIRQ processing for this - * group of packets. This policy scans the next several cores - * in order after this, trying to find one that is not - * already busy with SoftIRQ processing, and that doesn't appear - * to be active with NAPI/GRO processing either. If there - * is no such core, just rotate among the next cores. 
- */ - int i; - int this_core = raw_smp_processor_id(); - int candidate = this_core; - __u64 now = get_cycles(); - struct homa_core *core; - for (i = CORES_TO_CHECK; i > 0; i--) { - candidate++; - if (unlikely(candidate >= nr_cpu_ids)) - candidate = 0; - core = homa_cores[candidate]; - if (atomic_read(&core->softirq_backlog) > 0) - continue; - if ((core->last_gro + homa->busy_cycles) > now) - continue; - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with IDLE_NEW policy", - candidate, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); - break; - } - if (i <= 0) { - /* All of the candidates appear to be busy; just - * rotate among them. - */ - int offset = homa_cores[this_core]->softirq_offset; - offset += 1; - if (offset > CORES_TO_CHECK) - offset = 1; - homa_cores[this_core]->softirq_offset = offset; - candidate = this_core + offset; - while (candidate >= nr_cpu_ids) { - candidate -= nr_cpu_ids; - } - tt_record3("homa_gro_complete chose core %d for id %d " - "offset %d with IDLE_NEW policy " - "(all cores busy)", - candidate, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); - } - atomic_inc(&homa_cores[candidate]->softirq_backlog); - homa_cores[this_core]->last_gro = now; - homa_set_softirq_cpu(skb, candidate); + if (homa->gro_policy & HOMA_GRO_GEN3) { + homa_gro_gen3(skb); + } else if (homa->gro_policy & HOMA_GRO_GEN2) { + homa_gro_gen2(skb); } else if (homa->gro_policy & HOMA_GRO_IDLE) { int i, core, best; __u64 best_time = ~0; @@ -389,8 +444,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) homa_set_softirq_cpu(skb, best); tt_record3("homa_gro_complete chose core %d for id %d " "offset %d with IDLE policy", - best, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); + best, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); } else if (homa->gro_policy & HOMA_GRO_NEXT) { /* Use the next core (in circular order) to handle the * SoftIRQ processing. 
@@ -401,8 +456,8 @@ int homa_gro_complete(struct sk_buff *skb, int hoffset) homa_set_softirq_cpu(skb, target); tt_record3("homa_gro_complete chose core %d for id %d " "offset %d with NEXT policy", - target, homa_local_id(h->sender_id), - ntohl(d->seg.offset)); + target, homa_local_id(h->common.sender_id), + ntohl(h->seg.offset)); } return 0; diff --git a/homa_plumbing.c b/homa_plumbing.c index 37ca1964..99b6670e 100644 --- a/homa_plumbing.c +++ b/homa_plumbing.c @@ -224,6 +224,13 @@ static struct ctl_table homa_ctl_table[] = { .mode = 0644, .proc_handler = homa_dointvec }, + { + .procname = "busy_usecs", + .data = &homa_data.busy_usecs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = homa_dointvec + }, { .procname = "cutoff_version", .data = &homa_data.cutoff_version, @@ -260,15 +267,15 @@ static struct ctl_table homa_ctl_table[] = { .proc_handler = proc_dointvec }, { - .procname = "grant_fifo_fraction", - .data = &homa_data.grant_fifo_fraction, - .maxlen = sizeof(int), + .procname = "gen3_softirq_cores", + .data = NULL, + .maxlen = 0, .mode = 0644, - .proc_handler = homa_dointvec + .proc_handler = homa_sysctl_softirq_cores }, { - .procname = "gro_busy_us", - .data = &homa_data.busy_usecs, + .procname = "grant_fifo_fraction", + .data = &homa_data.grant_fifo_fraction, .maxlen = sizeof(int), .mode = 0644, .proc_handler = homa_dointvec @@ -871,6 +878,7 @@ int homa_sendmsg(struct sock *sk, struct msghdr *msg, size_t length) { struct homa_rpc *rpc = NULL; sockaddr_in_union *addr = (sockaddr_in_union *) msg->msg_name; + homa_cores[raw_smp_processor_id()]->last_app_active = start; if (unlikely(!msg->msg_control_is_user)) { result = -EINVAL; goto error; @@ -997,6 +1005,7 @@ int homa_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int result; INC_METRIC(recv_calls, 1); + homa_cores[raw_smp_processor_id()]->last_app_active = start; if (unlikely(!msg->msg_control)) { /* This test isn't strictly necessary, but it provides a * hook for testing 
kernel call times. @@ -1656,6 +1665,78 @@ int homa_dointvec(struct ctl_table *table, int write, return result; } +/** + * homa_sysctl_softirq_cores() - This function is invoked to handle sysctl + * requests for the "gen3_softirq_cores" target, which requires special + * processing. + * @table: sysctl table describing value to be read or written. + * @write: Nonzero means value is being written, 0 means read. + * @buffer: Address in user space of the input/output data. + * @lenp: Pointer to the size of the data at @buffer; updated to the + * number of bytes actually transferred. + * @ppos: Pointer to the current file offset; updated as data is consumed. + * + * Return: 0 for success, nonzero for error. + */ +int homa_sysctl_softirq_cores(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int result, i; + struct ctl_table table_copy; + struct homa_core *core; + int max_values, *values; + + max_values = (NUM_GEN3_SOFTIRQ_CORES + 1) * nr_cpu_ids; + values = (int *) kmalloc(max_values * sizeof(int), GFP_KERNEL); + if (values == NULL) + return -ENOMEM; + + table_copy = *table; + table_copy.data = values; + if (write) { + /* First value is core id, others are contents of its + * gen3_softirq_cores. + */ + for (i = 0; i < max_values ; i++) + values[i] = -1; + table_copy.maxlen = max_values; + result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); + if (result != 0) + goto done; + for (i = 0; i < max_values; + i += NUM_GEN3_SOFTIRQ_CORES + 1) { + int j; + if (values[i] < 0) + break; + core = homa_cores[values[i]]; + for (j = 0; j < NUM_GEN3_SOFTIRQ_CORES; j++) + core->gen3_softirq_cores[j] = values[i+j+1]; + } + } else { + /* Read: return values from all of the cores. 
*/ + int *dst; + + table_copy.maxlen = 0; + dst = values; + for (i = 0; i < nr_cpu_ids; i++) { + int j; + + *dst = i; + dst++; + table_copy.maxlen += sizeof(int); + core = homa_cores[i]; + for (j = 0; j < NUM_GEN3_SOFTIRQ_CORES; j++) { + *dst = core->gen3_softirq_cores[j]; + dst++; + table_copy.maxlen += sizeof(int); + } + } + result = proc_dointvec(&table_copy, write, buffer, lenp, ppos); + } +done: + kfree(values); + return result; +} + /** * homa_hrtimer() - This function is invoked by the hrtimer mechanism to * wake up the timer thread. Runs at IRQ level. diff --git a/homa_utils.c b/homa_utils.c index 17e35b6f..29ab42b5 100644 --- a/homa_utils.c +++ b/homa_utils.c @@ -57,12 +57,18 @@ int homa_init(struct homa *homa) first = (char *) (((__u64) core_memory + 0x3f) & ~0x3f); for (i = 0; i < nr_cpu_ids; i++) { struct homa_core *core; + int j; + core = (struct homa_core *) (first + i*aligned_size); homa_cores[i] = core; core->last_active = 0; core->last_gro = 0; atomic_set(&core->softirq_backlog, 0); core->softirq_offset = 0; + core->gen3_softirq_cores[0] = i^1; + for (j = 1; j < NUM_GEN3_SOFTIRQ_CORES; j++) + core->gen3_softirq_cores[j] = -1; + core->last_app_active = 0; core->held_skb = NULL; core->held_bucket = 0; memset(&core->metrics, 0, sizeof(core->metrics)); @@ -148,7 +154,7 @@ int homa_init(struct homa *homa) homa->max_gro_skbs = 20; homa->gso_force_software = 0; homa->gro_policy = HOMA_GRO_NORMAL; - homa->busy_usecs = 10; + homa->busy_usecs = 100; homa->timer_ticks = 0; spin_lock_init(&homa->metrics_lock); homa->metrics = NULL; @@ -1481,6 +1487,15 @@ char *homa_print_metrics(struct homa *homa) "slow_wakeups %15llu " "Messages received after thread went to sleep\n", m->slow_wakeups); + homa_append_metric(homa, + "handoffs_thread_waiting %15llu " + "RPC handoffs to waiting threads (vs. 
queue)\n", + m->handoffs_thread_waiting); + homa_append_metric(homa, + "handoffs_alt_thread %15llu " + "RPC handoffs not to first on list (avoid busy " + "core)\n", + m->handoffs_alt_thread); homa_append_metric(homa, "poll_cycles %15llu " "Time spent polling for incoming messages\n", @@ -1802,6 +1817,15 @@ char *homa_print_metrics(struct homa *homa) "dropped_data_no_bufs %15llu " "Data bytes dropped because app buffers full\n", m->dropped_data_no_bufs); + homa_append_metric(homa, + "gen3_handoffs %15llu " + "GRO->SoftIRQ handoffs made by Gen3 balancer\n", + m->gen3_handoffs); + homa_append_metric(homa, + "gen3_alt_handoffs %15llu " + "Gen3 handoffs to secondary core (primary was " + "busy)\n", + m->gen3_alt_handoffs); for (i = 0; i < NUM_TEMP_METRICS; i++) homa_append_metric(homa, "temp%-2d %15llu " @@ -1926,7 +1950,7 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) return; rpc->hsk->homa->freeze_type = 0; if (!tt_frozen) { - struct freeze_header freeze; +// struct freeze_header freeze; printk(KERN_NOTICE "freezing in homa_freeze with freeze_type %d\n", type); tt_record1("homa_freeze calling homa_rpc_log_active with freeze_type %d", type); homa_rpc_log_active_tt(rpc->hsk->homa, 0); @@ -1934,6 +1958,7 @@ void homa_freeze(struct homa_rpc *rpc, enum homa_freeze_type type, char *format) printk(KERN_NOTICE "%s\n", format); tt_record2(format, rpc->id, tt_addr(rpc->peer->addr)); tt_freeze(); - homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); +// homa_xmit_control(FREEZE, &freeze, sizeof(freeze), rpc); + homa_freeze_peers(rpc->hsk->homa); } } diff --git a/man/homa.7 b/man/homa.7 index 66aedb61..ab7aa155 100644 --- a/man/homa.7 +++ b/man/homa.7 @@ -268,6 +268,17 @@ in The amount of time (in microseconds) that a given core can own a page in a receive buffer pool before its ownership can be revoked by a different core. +.IR gro_policy +An integer value that determines how Homa processes incoming packets +at the GRO level. 
See code in homa_offload.c for more details. +.TP +.IR busy_usecs +An integer value in microsecond units; if a core has been active in +the last +.IR busy_usecs +time, Homa will consider it to be "busy": in some situations Homa +will try to avoid scheduling conflicting activities on that core, in order to +avoid hot spots and achieve better load balancing. .TP .I cutoff_version (Read-only) The current version for unscheduled cutoffs; incremented @@ -307,6 +318,16 @@ Homa will freeze its internal timetrace. This is used for debugging and performance analysis; see the source code for the values currently supported. .TP +.IR gen3_softirq_cores +Used to query and change the set of SoftIRQ cores associated with each +GRO core. When written, the value contains 4 integers. The first is the number +of a core on which GRO processing occurs. The others are core numbers for +up to three other cores; the GRO core will choose from among these cores +when deciding where to direct batches of packets for SoftIRQ processing. +SoftIRQ core numbers of -1 can be used to reduce the number of SoftIRQ +choices. When read, the value contains 4 integers for each core, with the +same format as described above. +.TP .IR grant_fifo_fraction When sending grants, Homa normally uses an SRPT policy, granting to the message(s) with the fewest remaining bytes. This parameter can be @@ -323,17 +344,6 @@ If this value is nonzero, Homa will perform GSO in software instead of asking the NIC to perform TSO in hardware. This can be useful when running with NICs that refuse to perform TSO on Homa packets. .TP -.IR gro_policy -An integer value that determines how Homa processes incoming packets -at the GRO level. See code in homa_offload.c for more details. -.TP -.IR gro_busy_usecs -An integer value. 
Under some -.IR gro_policy -settings, Homa will try not to assign SoftIRQ processing to a core if -it has had GRO-level activity in the last -.IR gro_busy_usecs -microseconds (in order to avoid hot spots that degrade load balancing). .TP .IR link_mbps An integer value specifying the bandwidth of this machine's uplink to diff --git a/perf.txt b/perf.txt index 8011ee71..a3a5b9ef 100644 --- a/perf.txt +++ b/perf.txt @@ -2,6 +2,10 @@ This file contains various notes and lessons learned concerning performance of the Homa Linux kernel module. The notes are in reverse chronological order. +49. (November 2023) Implemented "Gen3" load balancing scheme, renamed the + old scheme "Gen2". For details on load balancing, see balance.txt. + So far, Gen3 doesn't appear any better than Gen2. + 48. (August 2023) Unexpected packet loss on c6525-100g cluster (AMD processors, 100 Gbps links). Under some conditions (such as "cp_node client --one-way --workload 1000000" with dynamic_windows=1 and unsched_bytes=50000) @@ -497,7 +501,7 @@ Event Median so as not to interfere with user threads. This sometimes means it has to wait for a full time slice for other threads, which seems to be 5-7 ms. I tried disabling this feature of __do_softirq, so that all requests get - processed in the high-priority thread, and the P999 latency improved by + processed in the high-priority thread, and the P9999 latency improved by about 10x (< 1 ms worst case). 18. (July 2020) Small-message latency. 
The best-case RTT for small messages diff --git a/test/unit_homa_incoming.c b/test/unit_homa_incoming.c index 42851076..e10905ee 100644 --- a/test/unit_homa_incoming.c +++ b/test/unit_homa_incoming.c @@ -2817,6 +2817,66 @@ TEST_F(homa_incoming, homa_wait_for_message__signal) EXPECT_EQ(EINTR, -PTR_ERR(rpc)); } +TEST_F(homa_incoming, homa_choose_interest__empty_list) +{ + struct homa_interest *result = homa_choose_interest(&self->homa, + &self->hsk.request_interests, + offsetof(struct homa_interest, request_links)); + EXPECT_EQ(NULL, result); +} +TEST_F(homa_incoming, homa_choose_interest__find_idle_core) +{ + struct homa_interest interest1, interest2, interest3; + homa_interest_init(&interest1); + interest1.core = 1; + list_add_tail(&interest1.request_links, &self->hsk.request_interests); + homa_interest_init(&interest2); + interest2.core = 2; + list_add_tail(&interest2.request_links, &self->hsk.request_interests); + homa_interest_init(&interest3); + interest3.core = 3; + list_add_tail(&interest3.request_links, &self->hsk.request_interests); + + mock_cycles = 5000; + self->homa.busy_cycles = 1000; + homa_cores[1]->last_active = 4100; + homa_cores[2]->last_active = 3500; + homa_cores[3]->last_active = 2000; + + struct homa_interest *result = homa_choose_interest(&self->homa, + &self->hsk.request_interests, + offsetof(struct homa_interest, request_links)); + ASSERT_NE(NULL, result); + EXPECT_EQ(2, result->core); + INIT_LIST_HEAD(&self->hsk.request_interests); +} +TEST_F(homa_incoming, homa_choose_interest__all_cores_busy) +{ + struct homa_interest interest1, interest2, interest3; + homa_interest_init(&interest1); + interest1.core = 1; + list_add_tail(&interest1.request_links, &self->hsk.request_interests); + homa_interest_init(&interest2); + interest2.core = 2; + list_add_tail(&interest2.request_links, &self->hsk.request_interests); + homa_interest_init(&interest3); + interest3.core = 3; + list_add_tail(&interest3.request_links, &self->hsk.request_interests); + + 
mock_cycles = 5000; + self->homa.busy_cycles = 1000; + homa_cores[1]->last_active = 4100; + homa_cores[2]->last_active = 4001; + homa_cores[3]->last_active = 4800; + + struct homa_interest *result = homa_choose_interest(&self->homa, + &self->hsk.request_interests, + offsetof(struct homa_interest, request_links)); + ASSERT_NE(NULL, result); + EXPECT_EQ(1, result->core); + INIT_LIST_HEAD(&self->hsk.request_interests); +} + TEST_F(homa_incoming, homa_rpc_handoff__handoff_already_in_progress) { struct homa_interest interest; @@ -2983,6 +3043,28 @@ TEST_F(homa_incoming, homa_rpc_handoff__detach_interest) EXPECT_EQ(0, unit_list_length(&self->hsk.request_interests)); atomic_andnot(RPC_HANDING_OFF, &crpc->flags); } +TEST_F(homa_incoming, homa_rpc_handoff__update_last_app_active) +{ + struct homa_interest interest; + struct homa_rpc *crpc = unit_client_rpc(&self->hsk, + UNIT_OUTGOING, self->client_ip, self->server_ip, + self->server_port, self->client_id, 20000, 1600); + ASSERT_NE(NULL, crpc); + EXPECT_EQ(NULL, crpc->interest); + unit_log_clear(); + + homa_interest_init(&interest); + interest.thread = &mock_task; + interest.reg_rpc = crpc; + interest.core = 2; + crpc->interest = &interest; + mock_cycles = 10000; + homa_cores[2]->last_app_active = 444; + homa_rpc_handoff(crpc); + EXPECT_STREQ("wake_up_process pid 0", unit_log_get()); + EXPECT_EQ(10000, homa_cores[2]->last_app_active); + atomic_andnot(RPC_HANDING_OFF, &crpc->flags); +} TEST_F(homa_incoming, homa_incoming_sysctl_changed__grant_nonfifo) { diff --git a/test/unit_homa_offload.c b/test/unit_homa_offload.c index 93347fdd..425c2572 100644 --- a/test/unit_homa_offload.c +++ b/test/unit_homa_offload.c @@ -300,7 +300,7 @@ TEST_F(homa_offload, homa_gro_receive__max_gro_skbs) kfree_skb(self->skb); } -TEST_F(homa_offload, homa_gro_complete__GRO_IDLE_NEW) +TEST_F(homa_offload, homa_gro_gen2) { homa->gro_policy = HOMA_GRO_GEN2; mock_cycles = 1000; @@ -337,6 +337,58 @@ TEST_F(homa_offload, homa_gro_complete__GRO_IDLE_NEW) 
EXPECT_EQ(1, homa_cores[5]->softirq_offset); } +TEST_F(homa_offload, homa_gro_gen3__basics) +{ + homa->gro_policy = HOMA_GRO_GEN3; + struct homa_core *core = homa_cores[cpu_number]; + core->gen3_softirq_cores[0] = 3; + core->gen3_softirq_cores[1] = 7; + core->gen3_softirq_cores[2] = 5; + homa_cores[3]->last_app_active = 4100; + homa_cores[7]->last_app_active = 3900; + homa_cores[5]->last_app_active = 2000; + mock_cycles = 5000; + self->homa.busy_cycles = 1000; + + homa_gro_complete(self->skb, 0); + EXPECT_EQ(7, self->skb->hash - 32); + EXPECT_EQ(0, homa_cores[3]->last_active); + EXPECT_EQ(5000, homa_cores[7]->last_active); +} +TEST_F(homa_offload, homa_gro_gen3__stop_on_negative_core_id) +{ + homa->gro_policy = HOMA_GRO_GEN3; + struct homa_core *core = homa_cores[cpu_number]; + core->gen3_softirq_cores[0] = 3; + core->gen3_softirq_cores[1] = -1; + core->gen3_softirq_cores[2] = 5; + homa_cores[3]->last_app_active = 4100; + homa_cores[5]->last_app_active = 2000; + mock_cycles = 5000; + self->homa.busy_cycles = 1000; + + homa_gro_complete(self->skb, 0); + EXPECT_EQ(3, self->skb->hash - 32); + EXPECT_EQ(5000, homa_cores[3]->last_active); +} +TEST_F(homa_offload, homa_gro_gen3__all_cores_busy_so_pick_first) +{ + homa->gro_policy = HOMA_GRO_GEN3; + struct homa_core *core = homa_cores[cpu_number]; + core->gen3_softirq_cores[0] = 3; + core->gen3_softirq_cores[1] = 7; + core->gen3_softirq_cores[2] = 5; + homa_cores[3]->last_app_active = 4100; + homa_cores[7]->last_app_active = 4001; + homa_cores[5]->last_app_active = 4500; + mock_cycles = 5000; + self->homa.busy_cycles = 1000; + + homa_gro_complete(self->skb, 0); + EXPECT_EQ(3, self->skb->hash - 32); + EXPECT_EQ(5000, homa_cores[3]->last_active); +} + TEST_F(homa_offload, homa_gro_complete__GRO_IDLE) { homa->gro_policy = HOMA_GRO_IDLE; diff --git a/util/metrics.py b/util/metrics.py index f4d5c73f..bd2d0ce1 100755 --- a/util/metrics.py +++ b/util/metrics.py @@ -380,10 +380,22 @@ def scale_number(number): 
print("-------------------") poll_percent = 100.0*float(deltas["fast_wakeups"])/total_messages sleep_percent = 100.0*float(deltas["slow_wakeups"])/total_messages - print("Available immediately: %4.1f%%" % (100.0 - poll_percent + if deltas["gen3_alt_handoffs"]: + gen3_alt_percent = (100.0*deltas["gen3_alt_handoffs"] + /deltas["gen3_handoffs"]) + else: + gen3_alt_percent = 0.0 + if deltas["handoffs_alt_thread"]: + alt_thread_percent = (100.0*deltas["handoffs_alt_thread"] + /deltas["handoffs_thread_waiting"]) + else: + alt_thread_percent = 0.0 + print("Available immediately: %4.1f%%" % (100.0 - poll_percent - sleep_percent)) - print("Arrived while polling: %4.1f%%" % (poll_percent)) - print("Blocked at least once: %4.1f%%" % (sleep_percent)) + print("Arrived while polling: %4.1f%%" % (poll_percent)) + print("Blocked at least once: %4.1f%%" % (sleep_percent)) + print("Alternate GRO handoffs: %4.1f%%" % (gen3_alt_percent)) + print("Alternate thread handoffs: %4.1f%%" % (alt_thread_percent)) print("\nMiscellaneous:") print("--------------")