From 5d045ce28525388fa2c9c6fdb189e7289ccc6768 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 7 Jan 2025 22:46:04 +0100 Subject: [PATCH] controller: Fix IPv6 dp flow explosion by setting flow table prefixes. OVS allows enabling prefix match optimizations per flow table. This enables masked matches whenever possible on fields that otherwise would be exact matched in the datapath flow. By default, however, only nw_src and nw_dst are enabled (L4 ports are also always enabled, but this is not configurable). OVN is using mixed flow tables that match on both IPv4 and IPv6 addresses, meaning that IPv6 traffic generates exact match datapath flows where IPv4 generates masked matches, causing datapath flow explosion under heavy IPv6 load. OVN owns the "br-int" bridge and the flow tables, so it should enable appropriate fields per flow table to avoid flow explosion and achieve better performance overall. Example on how the prefixes can be configured manually: for i in $(seq 0 254); do ovs-vsctl set Bridge br-int flow_tables:${i}=@N -- \ --id=@N create Flow_Table name=t${i} \ prefixes=nw_src,nw_dst,ipv6_dst,ipv6_src; done Until recently, OVS only supported up to 3 prefixes per flow table, but now the limit will be increased to 4 in OVS 3.5 and some newer minor releases of older versions down to 3.3. Unfortunately, that means that ovn-controller needs to check and choose the appropriate number of prefixes. For the 3 we may just add ipv6_src and leave ipv6_dst unoptimized. OVS 3.5 will have all 4 prefixes enabled by default, but OVN will be paired with older versions of OVS for a long time, so it's better to set these config options to better support older setups. Unfortunately, IDL doesn't provide a way today to get the type of the column from the server side, so it's hard to tell how many prefixes are actually supported. A few approaches: 1. Try 3 and 4 and check if transaction fails. 2. Try to get and parse the schema from the server. 3. Enhance IDL to provide server column type information. While the first approach seems simpler, it's actually not trivial to figure out why exactly the transaction failed from the application level. IDL only has the string representation of the error and doesn't provide it to the application. The third approach is the most clean one, but it requires modifications of the IDL and CS layers in order to get this information. This would significantly complicate the process of getting this change backported to OVN 24.03 LTS, for example. The second approach is taken by this commit with intention to replace the schema parsing with the enhanced IDL API, once it is available. This allows for easier backports today with a cleaner solution in the future. IMO, the backportability is important due to increasing importance of IPv6 in OVN clusters and the cloud environments in general. NOTE: when the patch was backported to branch-24.09 the OVS submodule version was also bumped to the latest OVS/branch-3.3 to include the following OVS commit: 6fed6a7d3133 ("classifier: Increase the maximum number of prefixes (tries).") Reported-at: https://issues.redhat.com/browse/FDP-1024 Signed-off-by: Ilya Maximets Reviewed-by: Ales Musil Signed-off-by: Dumitru Ceara (cherry picked from commit 89e43f7528b067b1bc9f6c5fd67857b39ebc518d) --- NEWS | 3 ++ TODO.rst | 3 ++ controller/ovn-controller.c | 78 ++++++++++++++++++++++++++- include/ovn/features.h | 4 +- lib/features.c | 105 +++++++++++++++++++++++++++++++++++- lib/test-ovn-features.c | 6 +-- ovs | 2 +- tests/ovn-controller.at | 50 +++++++++++++++++ 8 files changed, 244 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index cec5513889..87b4cf93f7 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,8 @@ OVN v24.03.6 - xx xxx xxxx -------------------------- + - Improved handling of IPv6 traffic by enabling address prefix tracking + in OVS for both IPv4 and IPv6 addresses, whenever possible, reducing + the amount of IPv6 datapath flows. OVN v24.03.5 - 21 Jan 2025 -------------------------- diff --git a/TODO.rst b/TODO.rst index 17d539ad68..01370e38bc 100644 --- a/TODO.rst +++ b/TODO.rst @@ -130,3 +130,6 @@ OVN To-do List * Move the lflow build parallel processing from northd.c to lflow-mgr.c This would also ensure that the variables declared in northd.c (eg. thread_lflow_counter) are not externed in lflow-mgr.c. + +* Remove OVS database schema parsing from features.c once server side column + types are available through IDL. diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c index 10b4c5573f..81d4e70793 100644 --- a/controller/ovn-controller.c +++ b/controller/ovn-controller.c @@ -487,6 +487,76 @@ create_br_datapath(struct ovsdb_idl_txn *ovs_idl_txn, return dp; } +#define N_FLOW_TABLES 255 + +static void +update_flow_table_prefixes(struct ovsdb_idl_txn *ovs_idl_txn, + const struct ovsrec_bridge *br_int) +{ + size_t max_prefixes = ovs_features_max_flow_table_prefixes_get(); + struct ds ds = DS_EMPTY_INITIALIZER; + const char *prefixes[] = { + "ip_src", "ip_dst", "ipv6_src", "ipv6_dst", + }; + struct ovsrec_flow_table *ft; + size_t i; + + /* We must not attempt setting more prefixes than our IDL supports. + * Note: This should be a build time assertion, but IDL structures + * are not defined as constants. */ + ovs_assert( + ARRAY_SIZE(prefixes) <= + ovsrec_flow_table_columns[OVSREC_FLOW_TABLE_COL_PREFIXES].type.n_max); + + if (!max_prefixes) { + /* Not discovered yet. */ + return; + } + + max_prefixes = MIN(max_prefixes, ARRAY_SIZE(prefixes)); + if (br_int->n_flow_tables == N_FLOW_TABLES && + br_int->value_flow_tables[0]->n_prefixes == max_prefixes) { + /* Already up to date. Ideally, we would check every table, + * but it seems excessive. */ + return; + } + + for (i = 1; i < br_int->n_flow_tables; i++) { + if (br_int->value_flow_tables[i] != br_int->value_flow_tables[0]) { + break; + } + } + if (i == N_FLOW_TABLES) { + /* Correct number of flow tables and all pointing to the same row. */ + ft = br_int->value_flow_tables[0]; + } else { + /* Unexpected configuration. Let's create a new flow table row. + * Old ones will be garbage collected by the database. */ + struct ovsrec_flow_table *values[N_FLOW_TABLES]; + int64_t keys[N_FLOW_TABLES]; + + ft = ovsrec_flow_table_insert(ovs_idl_txn); + for (i = 0; i < ARRAY_SIZE(values); i++) { + keys[i] = i; + values[i] = ft; + } + ovsrec_bridge_set_flow_tables(br_int, keys, values, + ARRAY_SIZE(values)); + } + + ds_put_cstr(&ds, "Setting flow table prefixes:"); + for (i = 0 ; i < max_prefixes; i++) { + ds_put_char(&ds, ' '); + ds_put_cstr(&ds, prefixes[i]); + ds_put_char(&ds, ','); + } + ds_chomp(&ds, ','); + VLOG_INFO("%s.", ds_cstr_ro(&ds)); + ds_destroy(&ds); + + ovsrec_flow_table_set_prefixes(ft, prefixes, max_prefixes); +} + static const struct ovsrec_bridge * get_br_int(const struct ovsrec_bridge_table *bridge_table, const struct ovsrec_open_vswitch_table *ovs_table) @@ -563,6 +633,8 @@ process_br_int(struct ovsdb_idl_txn *ovs_idl_txn, datapath_type); } } + + update_flow_table_prefixes(ovs_idl_txn, br_int); } } *br_int_ = br_int; @@ -1112,8 +1184,11 @@ ctrl_register_ovs_idl(struct ovsdb_idl *ovs_idl) ovsdb_idl_add_column(ovs_idl, &ovsrec_bridge_col_ports); ovsdb_idl_add_column(ovs_idl, &ovsrec_bridge_col_name); ovsdb_idl_add_column(ovs_idl, &ovsrec_bridge_col_fail_mode); + ovsdb_idl_add_column(ovs_idl, &ovsrec_bridge_col_flow_tables); ovsdb_idl_add_column(ovs_idl, &ovsrec_bridge_col_other_config); ovsdb_idl_add_column(ovs_idl, &ovsrec_bridge_col_external_ids); + ovsdb_idl_add_table(ovs_idl, &ovsrec_table_flow_table); + ovsdb_idl_add_column(ovs_idl, &ovsrec_flow_table_col_prefixes); ovsdb_idl_add_table(ovs_idl, &ovsrec_table_ssl); ovsdb_idl_add_column(ovs_idl, &ovsrec_ssl_col_bootstrap_ca_cert); ovsdb_idl_add_column(ovs_idl, &ovsrec_ssl_col_ca_cert); @@ -5726,7 +5801,8 @@ main(int argc, char *argv[]) if (ovs_idl_txn && ovs_feature_support_run(br_int_dp ? &br_int_dp->capabilities : NULL, - br_int ? br_int->name : NULL)) { + br_int ? br_int->name : NULL, + ovs_remote)) { VLOG_INFO("OVS feature set changed, force recompute."); engine_set_force_recompute(true); diff --git a/include/ovn/features.h b/include/ovn/features.h index 935db5c49d..4738604011 100644 --- a/include/ovn/features.h +++ b/include/ovn/features.h @@ -51,9 +51,11 @@ enum ovs_feature_value { void ovs_feature_support_destroy(void); bool ovs_feature_is_supported(enum ovs_feature_value feature); bool ovs_feature_support_run(const struct smap *ovs_capabilities, - const char *br_name); + const char *br_name, + const char *db_target); bool ovs_feature_set_discovered(void); uint32_t ovs_feature_max_meters_get(void); uint32_t ovs_feature_max_select_groups_get(void); +size_t ovs_features_max_flow_table_prefixes_get(void); #endif diff --git a/lib/features.c b/lib/features.c index b044372352..57d96fe3e7 100644 --- a/lib/features.c +++ b/lib/features.c @@ -20,6 +20,9 @@ #include "lib/util.h" #include "lib/dirs.h" #include "socket-util.h" +#include "lib/ovsdb-cs.h" +#include "lib/ovsdb-error.h" +#include "lib/ovsdb-types.h" #include "lib/vswitch-idl.h" #include "odp-netlink.h" #include "openvswitch/vlog.h" @@ -221,17 +224,99 @@ ovs_feature_get_openflow_cap(const char *br_name) return ret; } +/* Monitoring how many prefixes flow tables can have. + * TODO: Remove this code once the server column type is available via IDL. */ +static struct ovsdb_cs *vswitch_cs; +static size_t max_flow_table_prefixes; + +static struct json * +vswitch_schema_prefixes_find(const struct json *schema_json, + void *ctx OVS_UNUSED) +{ + /* We must return a monitor request object, but we do not actually want + * to monitor the data, so returning an empty one. */ + struct json *monitor_request = json_object_create(); + struct ovsdb_idl_column *prefixes_col; + + prefixes_col = &ovsrec_flow_table_columns[OVSREC_FLOW_TABLE_COL_PREFIXES]; + + if (schema_json->type != JSON_OBJECT) { + VLOG_WARN_RL(&rl, "OVS database schema is not a JSON object"); + return monitor_request; + } + + const char *ft_name = ovsrec_table_classes[OVSREC_TABLE_FLOW_TABLE].name; + const char *nested_objects[] = { + "tables", ft_name, "columns", prefixes_col->name, "type", + }; + const struct json *nested = schema_json; + + for (size_t i = 0; i < ARRAY_SIZE(nested_objects); i++) { + nested = shash_find_data(json_object(nested), nested_objects[i]); + if (!nested || nested->type != JSON_OBJECT) { + VLOG_WARN_RL(&rl, "OVS database schema has no valid '%s'.", + nested_objects[i]); + return monitor_request; + } + } + + struct ovsdb_type server_type; + struct ovsdb_error *error; + + error = ovsdb_type_from_json(&server_type, nested); + if (error) { + char *msg = ovsdb_error_to_string_free(error); + + VLOG_WARN_RL(&rl, "Failed to parse type of %s column in %s table: %s", + prefixes_col->name, ft_name, msg); + free(msg); + return monitor_request; + } + + if (max_flow_table_prefixes != server_type.n_max) { + VLOG_INFO_RL(&rl, "OVS DB schema supports %d flow table prefixes, " + "our IDL supports: %d", + server_type.n_max, prefixes_col->type.n_max); + max_flow_table_prefixes = server_type.n_max; + } + + return monitor_request; +} + +static struct ovsdb_cs_ops feature_cs_ops = { + .compose_monitor_requests = vswitch_schema_prefixes_find, +}; + +static void +vswitch_schema_prefixes_run(void) +{ + struct ovsdb_cs_event *event; + struct ovs_list events; + + ovsdb_cs_run(vswitch_cs, &events); + LIST_FOR_EACH_POP (event, list_node, &events) { + /* We're not really interested in events. We only care about the + * monitor request callback being invoked on re-connections or + * schema changes. */ + ovsdb_cs_event_destroy(event); + } + ovsdb_cs_wait(vswitch_cs); +} + void ovs_feature_support_destroy(void) { rconn_destroy(swconn); swconn = NULL; + ovsdb_cs_destroy(vswitch_cs); + vswitch_cs = NULL; } /* Returns 'true' if the set of tracked OVS features has been updated. */ bool ovs_feature_support_run(const struct smap *ovs_capabilities, - const char *br_name) + const char *br_name, + const char *db_target) { static struct smap empty_caps = SMAP_INITIALIZER(&empty_caps); bool updated = false; @@ -259,6 +344,16 @@ ovs_feature_support_run(const struct smap *ovs_capabilities, new_state ? "supported" : "not supported"); } } + + if (!vswitch_cs) { + vswitch_cs = ovsdb_cs_create(ovsrec_idl_class.database, 3, + &feature_cs_ops, NULL); + } + ovsdb_cs_set_remote(vswitch_cs, db_target, true); + /* This feature doesn't affect OpenFlow rules, so not touching 'updated' + * even if changed. */ + vswitch_schema_prefixes_run(); + return updated; } @@ -284,3 +379,11 @@ ovs_feature_max_select_groups_get(void) { return ovs_group_features.max_groups[OFPGT11_SELECT]; } + +/* Returns the number of flow table prefixes that can be configured in + * the OVS database. */ +size_t +ovs_features_max_flow_table_prefixes_get(void) +{ + return max_flow_table_prefixes; +} diff --git a/lib/test-ovn-features.c b/lib/test-ovn-features.c index aabc547e68..5ff904c12e 100644 --- a/lib/test-ovn-features.c +++ b/lib/test-ovn-features.c @@ -26,15 +26,15 @@ test_ovn_features(struct ovs_cmdl_context *ctx OVS_UNUSED) struct smap features = SMAP_INITIALIZER(&features); smap_add(&features, "ct_zero_snat", "false"); - ovs_assert(!ovs_feature_support_run(&features, NULL)); + ovs_assert(!ovs_feature_support_run(&features, NULL, NULL)); ovs_assert(!ovs_feature_is_supported(OVS_CT_ZERO_SNAT_SUPPORT)); smap_replace(&features, "ct_zero_snat", "true"); - ovs_assert(ovs_feature_support_run(&features, NULL)); + ovs_assert(ovs_feature_support_run(&features, NULL, NULL)); ovs_assert(ovs_feature_is_supported(OVS_CT_ZERO_SNAT_SUPPORT)); smap_add(&features, "unknown_feature", "true"); - ovs_assert(!ovs_feature_support_run(&features, NULL)); + ovs_assert(!ovs_feature_support_run(&features, NULL, NULL)); smap_destroy(&features); } diff --git a/ovs b/ovs index dfe601bbc1..f078a551f9 160000 --- a/ovs +++ b/ovs @@ -1 +1 @@ -Subproject commit dfe601bbc154c836e6ec3526a1eb331c1c09a06e +Subproject commit f078a551f99726e146d423944c8e4ae3633603eb diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at index ccffabace5..4000d82618 100644 --- a/tests/ovn-controller.at +++ b/tests/ovn-controller.at @@ -3002,6 +3002,56 @@ OVN_CLEANUP([hv1 /no response to inactivity probe after .* seconds, disconnecting/d]) AT_CLEANUP +AT_SETUP([ovn-controller - br-int flow table prefixes]) +AT_KEYWORDS([ovn-controller prefixes]) +ovn_start + +net_add n1 +sim_add hv1 +ovs-vsctl add-br br-phys + +dnl Stop the database server before starting ovn-controller and convert the +dnl schema to support only 3 prefixes. +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +AT_CHECK([sed 's/"max": 4}/"max": 3}/' $ovs_srcdir/vswitchd/vswitch.ovsschema > new_schema]) +AT_CHECK([ovsdb-tool convert hv1/conf.db new_schema]) +start_daemon ovsdb-server --remote=punix:db.sock + +dnl Start ovn-controller. +ovn_attach n1 br-phys 192.168.0.20 + +dnl Wait for exactly 3 prefixes to be configured. +OVS_WAIT_UNTIL( + [grep -q 'Setting flow table prefixes: ip_src, ip_dst, ipv6_src.' \ + hv1/ovn-controller.log]) +OVS_WAIT_FOR_OUTPUT([ovs-vsctl --columns=prefixes --bare list Flow_Table], [0], [dnl +ip_dst ip_src ipv6_src +]) + +dnl Remember the row UUID. +uuid=$(ovs-vsctl --columns=_uuid --bare list Flow_Table) + +dnl Stop the database server again and convert the schema back to support +dnl up to 4 prefixes. +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +AT_CHECK([cat $ovs_srcdir/vswitchd/vswitch.ovsschema > new_schema]) +AT_CHECK([ovsdb-tool convert hv1/conf.db new_schema]) +start_daemon ovsdb-server --remote=punix:db.sock + +dnl Wait for exactly 4 prefixes to be configured. +OVS_WAIT_FOR_OUTPUT( + [grep -q 'Setting flow table prefixes: ip_src, ip_dst, ipv6_src, ipv6_dst.' \ + hv1/ovn-controller.log]) +OVS_WAIT_FOR_OUTPUT([ovs-vsctl --columns=prefixes --bare list Flow_Table], [0], [dnl +ip_dst ip_src ipv6_dst ipv6_src +]) + +dnl Check that the record was updated and not replaced. +AT_CHECK([test "$(ovs-vsctl --columns=_uuid --bare list Flow_Table)" = "${uuid}"]) + +OVN_CLEANUP([hv1]) +AT_CLEANUP + AT_SETUP([ovn-controller - Start controller with empty db]) AT_KEYWORDS([ovn])