From 67100f0ca81934836d4d58e8fd769c33032ca8cc Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 6 Feb 2025 09:56:53 +0100 Subject: [PATCH] ic: Add support for spine-leaf topology for transit switches. Previous commits added support for spine-leaf topology for local and distributed logical switches. Now adding support for the same to transit switches. Transit switches are a little different from normal distributed switches in a way that we must have ports of a transit switch claimed by some chassis in order to tunnel packets to correct availability zones. In a usual distributed switch we can rely on VIFs being claimed by chassis to find the destination, but OVN setup of one availability zone is not aware of ports of the other, except the ports of the transit switch itself. So, transit switch ports have to be claimed. OVN IC is also built on the assumption that all the transit switch ports are router ports. Let's allow 'switch' LSPs to be claimed by ovn-controller using the 'requested-chassis' column. And make ovn-ic daemon aware of 'switch' type ports as well, so it can build remote ports from them and know to which chassis the port traffic should be tunneled to. On top of previous benefits for distributed switches, Spine-Leaf topology with a transit spine switch allows: - Full locality of changes: With a current L2 topology spread across availability zones, a single transit switch has all the ports: all the VIFs, local to the current zone, and all the remote ports from all other zones. So, if the user wants to add a new port, they have to add it to every zone. To one as a VIF and to all the other as a remote port. ovn-ic daemon will do that, but it's still a lot of movements and a lot of updates in each zone for changes happening in only one. With the transit spine switch, every zone will only have their own VIFs on a node switch and one remote port per other AZ. 
Addition of a new port in such a setup doesn't require changing anything in non-local availability zones. This would be similar to node switches with localnet ports, but without the need for a dedicated provider network. - Better handling of multicast traffic: Pipeline will be split across nodes, so every node will process multicast for its own ports and forward traffic to appropriate other chassis when necessary. This should be particularly useful for L2 topologies of ovn-kubernetes in the AZ per node configuration. In addition to the previous configuration of switch ports, transit switch ports will need the 'requested-chassis' set like this: ovn-nbctl lsp-set-options spine-to-ls1 requested-chassis=hv1 Reported-at: https://issues.redhat.com/browse/FDP-874 Signed-off-by: Ilya Maximets Acked-by: Mark Michelson Signed-off-by: Numan Siddique --- NEWS | 1 + controller/binding.c | 45 +++++++++--- ic/ovn-ic.c | 56 +++++++++++---- tests/ovn-ic.at | 161 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 240 insertions(+), 23 deletions(-) diff --git a/NEWS b/NEWS index 9e5d9f0742..d5ce31e496 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,7 @@ Post v24.09.0 Router Ports. - Added support for Spine-Leaf topology of logical switches by adding a new LSP type 'switch' that can directly connect two logical switches. + Supported for both distributed and transit switches. - Bump python version required for building OVN to 3.7. 
- SSL/TLS: * TLSv1 and TLSv1.1 protocols are deprecated and disabled by default diff --git a/controller/binding.c b/controller/binding.c index 04214cf896..316b2c36be 100644 --- a/controller/binding.c +++ b/controller/binding.c @@ -2112,6 +2112,10 @@ build_local_bindings(struct binding_ctx_in *b_ctx_in, } } +static bool consider_patch_port_for_local_datapaths( + const struct sbrec_port_binding *, + struct binding_ctx_in *, struct binding_ctx_out *); + void binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out) { @@ -2155,7 +2159,9 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out) switch (lport_type) { case LP_PATCH: update_related_lport(pb, b_ctx_out); + consider_patch_port_for_local_datapaths(pb, b_ctx_in, b_ctx_out); break; + case LP_VTEP: update_related_lport(pb, b_ctx_out); struct lport *vtep_lport = xmalloc(sizeof *vtep_lport); @@ -2925,7 +2931,7 @@ handle_updated_vif_lport(const struct sbrec_port_binding *pb, return true; } -static void +static bool consider_patch_port_for_local_datapaths(const struct sbrec_port_binding *pb, struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out) @@ -2976,15 +2982,30 @@ consider_patch_port_for_local_datapaths(const struct sbrec_port_binding *pb, } } - if (sbrec_port_binding_is_updated(pb, SBREC_PORT_BINDING_COL_TYPE) && - (pb->chassis == b_ctx_in->chassis_rec || - is_additional_chassis(pb, b_ctx_in->chassis_rec))) { + /* If this chassis is requested - try to claim. */ + if (pb->requested_chassis == b_ctx_in->chassis_rec) { + return claim_lport(pb, NULL, b_ctx_in->chassis_rec, NULL, + !b_ctx_in->ovnsb_idl_txn, false, + b_ctx_out->tracked_dp_bindings, + b_ctx_out->if_mgr, + b_ctx_out->postponed_ports); + } + /* If this chassis is claimed, but not requested to be; or requested for + * some other chassis, but claimed by us - release. 
*/ + if ((!pb->requested_chassis && !pb->n_additional_chassis && pb->chassis) + || pb->chassis == b_ctx_in->chassis_rec + || is_additional_chassis(pb, b_ctx_in->chassis_rec) + || if_status_is_port_claimed(b_ctx_out->if_mgr, pb->logical_port)) { + remove_local_lports(pb->logical_port, b_ctx_out); - release_lport(pb, b_ctx_in->chassis_rec, - !b_ctx_in->ovnsb_idl_txn, - b_ctx_out->tracked_dp_bindings, - b_ctx_out->if_mgr); + if (!release_lport(pb, b_ctx_in->chassis_rec, + !b_ctx_in->ovnsb_idl_txn, + b_ctx_out->tracked_dp_bindings, + b_ctx_out->if_mgr)) { + return false; + } } + return true; } static bool @@ -3042,7 +3063,8 @@ handle_updated_port(struct binding_ctx_in *b_ctx_in, case LP_PATCH: update_related_lport(pb, b_ctx_out); - consider_patch_port_for_local_datapaths(pb, b_ctx_in, b_ctx_out); + handled = consider_patch_port_for_local_datapaths(pb, b_ctx_in, + b_ctx_out); break; case LP_VTEP: @@ -3084,8 +3106,9 @@ handle_updated_port(struct binding_ctx_in *b_ctx_in, distributed_port, pb->logical_port); break; } - consider_patch_port_for_local_datapaths(distributed_pb, b_ctx_in, - b_ctx_out); + handled = consider_patch_port_for_local_datapaths(distributed_pb, + b_ctx_in, + b_ctx_out); break; case LP_EXTERNAL: diff --git a/ic/ovn-ic.c b/ic/ovn-ic.c index 75b5d17874..ede961a169 100644 --- a/ic/ovn-ic.c +++ b/ic/ovn-ic.c @@ -503,10 +503,32 @@ find_crp_for_sb_pb(struct ic_context *ctx, return find_crp_from_lrp(ctx, peer); } +static const struct nbrec_logical_switch_port * +get_lsp_by_ts_port_name(struct ic_context *ctx, const char *ts_port_name) +{ + const struct nbrec_logical_switch_port *lsp, *key; + + key = nbrec_logical_switch_port_index_init_row(ctx->nbrec_port_by_name); + nbrec_logical_switch_port_index_set_name(key, ts_port_name); + lsp = nbrec_logical_switch_port_index_find(ctx->nbrec_port_by_name, key); + nbrec_logical_switch_port_index_destroy_row(key); + + return lsp; +} + static const char * -get_lrp_address_for_sb_pb(struct ic_context *ctx, - const struct 
sbrec_port_binding *sb_pb) +get_lp_address_for_sb_pb(struct ic_context *ctx, + const struct sbrec_port_binding *sb_pb) { + const struct nbrec_logical_switch_port *nb_lsp; + + nb_lsp = get_lsp_by_ts_port_name(ctx, sb_pb->logical_port); + if (!strcmp(nb_lsp->type, "switch")) { + /* Switches always have implicit "unknown" address, and IC-SB port + * binding can only have one address specified. */ + return "unknown"; + } + const struct sbrec_port_binding *peer = find_peer_port(ctx, sb_pb); if (!peer) { return NULL; @@ -597,9 +619,9 @@ sync_local_port(struct ic_context *ctx, const struct nbrec_logical_switch_port *lsp) { /* Sync address from NB to ISB */ - const char *address = get_lrp_address_for_sb_pb(ctx, sb_pb); + const char *address = get_lp_address_for_sb_pb(ctx, sb_pb); if (!address) { - VLOG_DBG("Can't get logical router port address for logical" + VLOG_DBG("Can't get router/switch port address for logical" " switch port %s", sb_pb->logical_port); if (isb_pb->address[0]) { icsbrec_port_binding_set_address(isb_pb, ""); @@ -616,6 +638,10 @@ sync_local_port(struct ic_context *ctx, if (strcmp(crp->chassis->name, isb_pb->gateway)) { icsbrec_port_binding_set_gateway(isb_pb, crp->chassis->name); } + } else if (!strcmp(lsp->type, "switch") && sb_pb->chassis) { + if (strcmp(sb_pb->chassis->name, isb_pb->gateway)) { + icsbrec_port_binding_set_gateway(isb_pb, sb_pb->chassis->name); + } } else { if (isb_pb->gateway[0]) { icsbrec_port_binding_set_gateway(isb_pb, ""); @@ -715,7 +741,7 @@ create_isb_pb(struct ic_context *ctx, icsbrec_port_binding_set_logical_port(isb_pb, sb_pb->logical_port); icsbrec_port_binding_set_tunnel_key(isb_pb, pb_tnl_key); - const char *address = get_lrp_address_for_sb_pb(ctx, sb_pb); + const char *address = get_lp_address_for_sb_pb(ctx, sb_pb); if (address) { icsbrec_port_binding_set_address(isb_pb, address); } @@ -804,7 +830,8 @@ port_binding_run(struct ic_context *ctx, for (int i = 0; i < ls->n_ports; i++) { lsp = ls->ports[i]; - if 
(!strcmp(lsp->type, "router")) { + if (!strcmp(lsp->type, "router") + || !strcmp(lsp->type, "switch")) { /* The port is local. */ sb_pb = find_lsp_in_sb(ctx, lsp); if (!sb_pb) { @@ -1314,13 +1341,8 @@ static const char * get_lrp_name_by_ts_port_name(struct ic_context *ctx, const char *ts_port_name) { const struct nbrec_logical_switch_port *nb_lsp; - const struct nbrec_logical_switch_port *nb_lsp_key = - nbrec_logical_switch_port_index_init_row(ctx->nbrec_port_by_name); - nbrec_logical_switch_port_index_set_name(nb_lsp_key, ts_port_name); - nb_lsp = nbrec_logical_switch_port_index_find(ctx->nbrec_port_by_name, - nb_lsp_key); - nbrec_logical_switch_port_index_destroy_row(nb_lsp_key); + nb_lsp = get_lsp_by_ts_port_name(ctx, ts_port_name); if (!nb_lsp) { return NULL; } @@ -1781,6 +1803,16 @@ route_run(struct ic_context *ctx, ICSBREC_PORT_BINDING_FOR_EACH_EQUAL (isb_pb, isb_pb_key, ctx->icsbrec_port_binding_by_az) { + const struct nbrec_logical_switch_port *nb_lsp; + + nb_lsp = get_lsp_by_ts_port_name(ctx, isb_pb->logical_port); + if (!strcmp(nb_lsp->type, "switch")) { + VLOG_DBG("IC-SB Port_Binding '%s' on ts '%s' corresponds to a " + "switch port, not considering for route collection.", + isb_pb->logical_port, isb_pb->transit_switch); + continue; + } + const char *ts_lrp_name = get_lrp_name_by_ts_port_name(ctx, isb_pb->logical_port); if (!ts_lrp_name) { diff --git a/tests/ovn-ic.at b/tests/ovn-ic.at index fbcfca2e45..db4057f44c 100644 --- a/tests/ovn-ic.at +++ b/tests/ovn-ic.at @@ -2618,3 +2618,164 @@ OVN_CLEANUP_IC([az1], [az2]) AT_CLEANUP ]) + +OVN_FOR_EACH_NORTHD([ +AT_SETUP([spine-leaf: 2 AZs, 2 HVs, 2 LSs, connected via transit spine switch]) +AT_KEYWORDS([spine leaf]) +AT_SKIP_IF([test $HAVE_SCAPY = no]) + +ovn_init_ic_db + +ovn_start az1 +ovn_start az2 + +# Logical network: +# Single network 172.16.1.0/24. Two switches with VIF ports on HVs in two +# separate AZs, connected to a spine transit switch via 'switch' ports. 
+ +ovn-ic-nbctl ts-add spine +ovn_as az1 check ovn-nbctl ls-add ls1 +ovn_as az2 check ovn-nbctl ls-add ls2 + +# Connect ls1 to spine. +ovn_as az1 +check ovn-nbctl lsp-add spine spine-to-ls1 +check ovn-nbctl lsp-add ls1 ls1-to-spine +check ovn-nbctl lsp-set-type spine-to-ls1 switch peer=ls1-to-spine +check ovn-nbctl lsp-set-type ls1-to-spine switch peer=spine-to-ls1 + +# Connect ls2 to spine. +ovn_as az2 +check ovn-nbctl lsp-add spine spine-to-ls2 +check ovn-nbctl lsp-add ls2 ls2-to-spine +check ovn-nbctl lsp-set-type spine-to-ls2 switch peer=ls2-to-spine +check ovn-nbctl lsp-set-type ls2-to-spine switch peer=spine-to-ls2 + +# Create logical port ls1-lp1 in ls1 +ovn_as az1 check ovn-nbctl lsp-add ls1 ls1-lp1 \ +-- lsp-set-addresses ls1-lp1 "f0:00:00:01:02:01 172.16.1.1" +# Create logical port ls1-lp2 in ls1 +ovn_as az1 check ovn-nbctl lsp-add ls1 ls1-lp2 \ +-- lsp-set-addresses ls1-lp2 "f0:00:00:01:02:02 172.16.1.2" + +# Create logical port ls2-lp1 in ls2 +ovn_as az2 check ovn-nbctl lsp-add ls2 ls2-lp1 \ +-- lsp-set-addresses ls2-lp1 "f0:00:00:01:02:03 172.16.1.3" +# Create logical port ls2-lp2 in ls2 +ovn_as az2 check ovn-nbctl lsp-add ls2 ls2-lp2 \ +-- lsp-set-addresses ls2-lp2 "f0:00:00:01:02:04 172.16.1.4" + +# Create hypervisors and OVS ports corresponding to logical ports. +net_add n1 + +sim_add hv1 +as hv1 +check ovs-vsctl add-br br-phys +ovn_az_attach az1 n1 br-phys 192.168.1.1 16 +check ovs-vsctl set open . external-ids:ovn-is-interconn=true +ovs-vsctl -- add-port br-int vif1 -- \ + set interface vif1 external-ids:iface-id=ls1-lp1 \ + options:tx_pcap=hv1/vif1-tx.pcap \ + options:rxq_pcap=hv1/vif1-rx.pcap \ + ofport-request=1 +ovs-vsctl -- add-port br-int vif2 -- \ + set interface vif2 external-ids:iface-id=ls1-lp2 \ + options:tx_pcap=hv1/vif2-tx.pcap \ + options:rxq_pcap=hv1/vif2-rx.pcap \ + ofport-request=2 + +sim_add hv2 +as hv2 +check ovs-vsctl add-br br-phys +ovn_az_attach az2 n1 br-phys 192.168.2.1 16 +check ovs-vsctl set open . 
external-ids:ovn-is-interconn=true +ovs-vsctl -- add-port br-int vif3 -- \ + set interface vif3 external-ids:iface-id=ls2-lp1 \ + options:tx_pcap=hv2/vif3-tx.pcap \ + options:rxq_pcap=hv2/vif3-rx.pcap \ + ofport-request=3 +ovs-vsctl -- add-port br-int vif4 -- \ + set interface vif4 external-ids:iface-id=ls2-lp2 \ + options:tx_pcap=hv2/vif4-tx.pcap \ + options:rxq_pcap=hv2/vif4-rx.pcap \ + ofport-request=4 + +# Bind transit switch ports to their chassis. +check ovn_as az1 ovn-nbctl lsp-set-options spine-to-ls1 requested-chassis=hv1 +check ovn_as az2 ovn-nbctl lsp-set-options spine-to-ls2 requested-chassis=hv2 + +# Pre-populate the hypervisors' ARP tables so that we don't lose any +# packets for ARP resolution (native tunneling doesn't queue packets +# for ARP resolution). +OVN_POPULATE_ARP + +ovn_as az1 +check ovn-nbctl --wait=hv sync +ovn-sbctl dump-flows > az1/sbflows + +#wait_for_ports_up +ovn_as az2 +check ovn-nbctl --wait=hv sync +ovn-sbctl dump-flows > az2/sbflows + +check ovn-ic-nbctl --wait=sb sync + +ovn-ic-nbctl show > ic-nbctl.dump +AT_CAPTURE_FILE([ic-nbctl.dump]) + +(echo "---------ISB dump-----" + ovn-ic-sbctl show + echo "---------------------" + ovn-ic-sbctl list gateway + echo "---------------------" + ovn-ic-sbctl list datapath_binding + echo "---------------------" + ovn-ic-sbctl list port_binding + echo "---------------------" + ovn-ic-sbctl list route + echo "---------------------") > ic-sbctl.dump +AT_CAPTURE_FILE([ic-sbctl.dump]) + +# Send ip packets between the two ports. +src_mac="f0:00:00:01:02:01" +dst_mac="f0:00:00:01:02:03" +src_ip=172.16.1.1 +dst_ip=172.16.1.3 +packet=$(fmt_pkt "Ether(dst='${dst_mac}', src='${src_mac}')/ \ + IP(src='${src_ip}', dst='${dst_ip}')/ \ + UDP(sport=1538, dport=4369)") +check as hv1 ovs-appctl netdev-dummy/receive vif1 $packet + +# Check that datapath is not doing any extra work and sends the packet out +# through the tunnel. 
+AT_CHECK([ovn_as az1 as hv1 ovs-appctl ofproto/trace --names \ + br-int in_port=vif1 $packet > ofproto-trace-1]) +AT_CAPTURE_FILE([ofproto-trace-1]) +AT_CHECK([grep 'Megaflow:' ofproto-trace-1], [0], [dnl +Megaflow: recirc_id=0,eth,ip,in_port=vif1,dl_src=f0:00:00:01:02:01,dl_dst=f0:00:00:01:02:03,nw_ecn=0,nw_frag=no +]) +AT_CHECK([grep -q \ + 'Datapath actions: tnl_push(tnl_port(genev_sys_6081).*out_port(br-phys))' \ + ofproto-trace-1]) + +# It's a little problematic to trace the other side, but we can check +# datapath actions. +AT_CHECK([as hv2 ovs-appctl dpctl/dump-flows --names \ + | grep actions | sed 's/.*\(actions:.*\)/\1/' | sort], [0], [dnl +actions:tnl_pop(genev_sys_6081) +actions:vif3 +]) + +# No modifications expected. +AT_CHECK([echo $packet > expected]) + +AT_CHECK([touch empty]) + +# Check that it is delivered where needed and not delivered where not. +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [empty]) +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected]) +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [empty]) + +OVN_CLEANUP_IC([az1], [az2]) +AT_CLEANUP +])